In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

## Read in Data

In [2]:
data_dir = '../data/aclImdb/train'

data_home = f'{data_dir}/unsup'
file_list = os.listdir(data_home)

In [3]:
# Read in file names
files = [f for f in file_list if os.path.isfile(os.path.join(data_home, f))]
files_df = pd.DataFrame(files, columns=['File'])

# Read in review URLs
url_df = pd.read_csv(f'{data_dir}/urls_unsup.txt', header=None)
url_df = url_df.rename(columns = {0:'URL'})

# Read in IMDb metadata
metadata_df = pd.read_csv(f'../data/movie_results.csv')

### Set up LIB

In [4]:
# Combine reviews with metadata
review_metadata_df = url_df.merge(metadata_df, how='left', on='URL')

# Combine documents (files) with reviews and metadata
LIB = pd.concat([files_df, review_metadata_df], axis=1)

# Remove documents with no metadata
LIB = LIB[LIB['ID'].notnull()].drop(columns=['Error'])

LIB['Genre'] = LIB.Genres.str.split(',').str[0]

LIB['Genre 2'] = LIB.Genres.str.split(',').str[1]

LIB['Genre 3'] = LIB.Genres.str.split(',').str[2]

# Split Genres and store in cols
genres_df = LIB.copy()
genres_df['Genres'] = genres_df['Genres'].str.split(', ')
genres_exploded = genres_df.explode('Genres')

# Create the one-hot encoding
genre_dummies = pd.get_dummies(genres_exploded['Genres'], prefix='', prefix_sep='')

# Combine the dummies with original data by grouping back to original rows
genre_dummies = genre_dummies.groupby(level=0).max()

# Join the original DataFrame with the genre columns
LIB = pd.concat([LIB, genre_dummies], axis=1).reset_index(drop=True)

# Sample 5,000 reviews
LIB = LIB.sample(5000, random_state=42).reset_index(drop=True)

# Set index name
LIB.index.name = 'review_id'

In [5]:
bool_cols = LIB.select_dtypes(include='bool')  # Select only boolean columns
bool_cols.sum()

Action         1044
Adult             8
Adventure       770
Animation       193
Biography       179
Comedy         1681
Crime           857
Documentary     140
Drama          2473
Family          352
Fantasy         563
Film-Noir        63
Game-Show        11
History         162
Horror         1028
Music           177
Musical         137
Mystery         578
News              7
Reality-TV       21
Romance         972
Sci-Fi          693
Short            82
Sport            98
Talk-Show        12
Thriller       1421
War             246
Western         133
dtype: int64

In [6]:
LIB.to_csv('LIB.csv')

### Set up TOKENS

In [7]:
OHCO = ['review_id', 'para_num', 'sent_num', 'token_num']

#### Read PARAS

In [8]:
# List to store the raw content of each file
file_contents = []

files = [file for file in LIB.File]

# Initialize a counter for the review_id
review_id = 0

# Loop through each file
for file in files:
    
    file_path = data_home + '/' + file
    
    # Read the file content
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        file_content = f.read()  # Read the entire file as a string
    
    # Replace <br /> tags with newline characters
    file_content = file_content.replace('<br />', '\n')
    
    # Split content by newlines to create individual lines
    lines = file_content.splitlines()
    
    # Create DataFrame with each line as a row
    df = pd.DataFrame(lines, columns=['para_str'])
    
    df = df[~df['para_str'].str.match(r'^\s*$')]
    
    df = df.reset_index(drop=True)
    
    # Set 'line_num' as the index for each file's DataFrame
    df.index.name = 'para_num'
    
    # Reset the index to make 'line_num' a regular column
    df = df.reset_index('para_num')
    
    # Add a 'review_id' for each file (constant for each file)
    df['review_id'] = review_id
    
    # Append the DataFrame to the list
    file_contents.append(df)
    
    # Increment review_id for the next file
    review_id += 1

# Concatenate all file contents into one DataFrame
PARAS = pd.concat(file_contents)

# Set MultiIndex with 'review_id' and 'line_num'
PARAS.set_index(OHCO[0:2], inplace=True)

In [9]:
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
review_id,para_num,Unnamed: 2_level_1
0,0,This is a mildly enjoyable flick which attempt...
0,1,In trying to be serious and political the film...
1,0,"On the whole, this was a pretty good, pretty f..."
2,0,I cannot wait to see the goofs start on this h...
2,1,I also choked on my drink when a certain perso...


#### Read SENTENCES

In [10]:
SENTS = PARAS.para_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame('sent_str')

SENTS.index.names = OHCO[:3]

In [11]:
SENTS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
review_id,para_num,sent_num,Unnamed: 3_level_1
0,0,0,This is a mildly enjoyable flick which attempt...
0,0,1,This is mainly due to the exceedingly annoying...
0,0,2,OK in this case it is not taken too far but it...
0,1,0,In trying to be serious and political the film...
0,1,1,"And the there's the plot holes, oh my god, the..."


#### Split into TOKENS

In [12]:
TOKENS = SENTS.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')

TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
TOKENS['pos_group'] = TOKENS.pos.str[:2]
TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])
TOKENS['term_str'] = TOKENS.token_str.str.lower().str.replace(r"\W+", "", regex=True)

TOKENS.index.names = OHCO[:4]

In [13]:
TOKENS.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,pos_group,token_str,term_str
review_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
374,2,1,27,"(problem, NN)",NN,NN,problem,problem
1840,3,1,23,"(., .)",.,.,.,
1335,5,2,3,"(by, IN)",IN,IN,by,by
2814,0,1,21,"(make, VB)",VB,VB,make,make
3039,2,0,24,"(Henry, NNP)",NNP,NN,Henry,henry
4489,4,2,27,"(,, ,)",",",",",",",
1636,0,2,11,"(but, CC)",CC,CC,but,but
2824,0,1,4,"(rented, VBN)",VBN,VB,rented,rented
1256,0,4,32,"(like, IN)",IN,IN,like,like
1046,0,6,7,"(is, VBZ)",VBZ,VB,is,is


In [14]:
TOKENS.to_csv('TOKENS.csv')

#### Get VOCAB

In [15]:
VOCAB = TOKENS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'

VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

# Define stopwords
sw = pd.DataFrame({'stop': 1}, index=nltk.corpus.stopwords.words('english'))
sw.index.name = 'term_str'

# Create Porter stems
stemmer1 = PorterStemmer()
VOCAB['porter_stem'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), axis=1)

# Create maximum POS fields
VOCAB['max_pos'] = TOKENS.groupby(['term_str', 'pos']).size().unstack(fill_value=0).idxmax(axis=1)
VOCAB['max_pos_group'] = TOKENS.groupby(['term_str', 'pos_group']).size().unstack(fill_value=0).idxmax(axis=1)

# Join vocab to stop words
if 'stop' not in VOCAB.columns:
    VOCAB = VOCAB.join(sw)
    VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [16]:
VOCAB

Unnamed: 0_level_0,n,p,i,porter_stem,max_pos,max_pos_group,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,158602,1.193317e-01,3.066951,,.,.,0
the,66897,5.033312e-02,4.312348,the,DT,DT,1
and,32261,2.427309e-02,5.364498,and,CC,CC,1
a,32120,2.416700e-02,5.370818,a,DT,DT,1
of,28482,2.142978e-02,5.544239,of,IN,IN,1
...,...,...,...,...,...,...,...
metalsongs,1,7.523973e-07,20.342002,metalsong,NNS,NN,0
goatees,1,7.523973e-07,20.342002,goate,NNS,NN,0
metalpurists,1,7.523973e-07,20.342002,metalpurist,NNS,NN,0
sternn,1,7.523973e-07,20.342002,sternn,NNP,NN,0


In [17]:
VOCAB.to_csv('VOCAB.csv')