In [1]:
import pandas as pd
import numpy as np
import os
from functions import *
import spacy

---

In [2]:
# function to gather list of filenames
def getFileNamesList(directory:str):
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            file_names.append(filename)
            continue
        else:
            continue     
    return file_names

### Create a negative reviews dataframe

In [3]:
# gather negative review filenames
file_names = getFileNamesList('/Users/jason/Flatiron/MOD4/aclImdb/test/neg')

In [4]:
# initialize list to append reviews to
reviews_series = []

#loop through all neg. filenames
for file_name in file_names:
# Access the text with file path and file name 
    with open('/Users/jason/Flatiron/MOD4/aclImdb/test/neg/' + file_name) as f:
        # Read the text in as a Series and append it to reviews_series
        review = pd.Series(f.readlines())
        reviews_series.append(review)

In [5]:
#create df from reviews_series list
df_neg = pd.DataFrame(reviews_series)

In [6]:
df_neg.rename(columns={0:'reviews'}, inplace=True)

In [7]:
#add in sentiment for the reviews (all neg.)
df_neg['sentiment'] = 0

In [8]:
df_neg.head()

Unnamed: 0,reviews,sentiment
0,Alan Rickman & Emma Thompson give good perform...,0
1,I have seen this movie and I did not care for ...,0
2,"In Los Angeles, the alcoholic and lazy Hank Ch...",0
3,"This film is bundled along with ""Gli fumavano ...",0
4,I only comment on really very good films and o...,0


---

### Create a positive reviews dataframe

In [9]:
# gather positive review filenames
file_names = getFileNamesList('/Users/jason/Flatiron/MOD4/aclImdb/test/pos')

In [10]:
# initialize list to append pos. reviews
reviews_series_2 = []

# For each file in files_list
for file_name in file_names:
# Access the text with file path and file name 
    with open('/Users/jason/Flatiron/MOD4/aclImdb/test/pos/' + file_name) as f:
        # Read the text in as a Series and append it to reviews_series
        review = pd.Series(f.readlines())
        reviews_series_2.append(review)

In [11]:
#create df from reviews_series_2 list
df_pos = pd.DataFrame(reviews_series_2)

In [12]:
df_pos.rename(columns={0:'reviews'}, inplace=True)

In [13]:
#add in sentiment for the reviews (all pos.)
df_pos['sentiment'] = 1

In [14]:
df_pos.head()

Unnamed: 0,reviews,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


### Create and clean final dataframe

In [15]:
# concatenate neg. and pos. dataframes
df = pd.concat([df_neg,df_pos],ignore_index=True)

In [16]:
# Removes punctuation, numbers, and makes text lower case
# (found in function.py file)
cleanText(df, 'reviews', 'cleaned_reviews')

In [17]:
df.head()

Unnamed: 0,reviews,sentiment,cleaned_reviews
0,Alan Rickman & Emma Thompson give good perform...,0,alan rickman emma thompson give good performan...
1,I have seen this movie and I did not care for ...,0,i have seen this movie and i did not care for ...
2,"In Los Angeles, the alcoholic and lazy Hank Ch...",0,in los angeles the alcoholic and lazy hank chi...
3,"This film is bundled along with ""Gli fumavano ...",0,this film is bundled along with gli fumavano l...
4,I only comment on really very good films and o...,0,i only comment on really very good films and o...


In [18]:
# use spacy lemmatizer to lemmatize cleaned_reviews column
nlp = spacy.load('en_core_web_md')
df['cleaned_reviews'] = df["cleaned_reviews"].apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))

In [19]:
# export cleaned dataframe to use for model evaluation
df.to_csv('cleaned_holdout_set', index=False)