In [1]:
import pandas as pd
import numpy as np
import os
from functions import *
import spacy

---

In [2]:
# function to gather list of filenames
def getFileNamesList(directory:str):
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            file_names.append(filename)
            continue
        else:
            continue     
    return file_names

### Create a negative reviews dataframe

In [3]:
# gather negative review filenames
file_names = getFileNamesList('/Users/jason/Flatiron/MOD4/aclImdb/train/neg')

In [4]:
# initialize list to append reviews to
reviews_series = []

#loop through all neg. filenames
for file_name in file_names:
# Access the text with file path and file name 
    with open('/Users/jason/Flatiron/MOD4/aclImdb/train/neg/' + file_name) as f:
        # Read the text in as a Series and append it to reviews_series
        review = pd.Series(f.readlines())
        reviews_series.append(review)

In [5]:
#create df from reviews_series list
df_neg = pd.DataFrame(reviews_series)

In [6]:
df_neg.rename(columns={0:'reviews'}, inplace=True)

In [7]:
#add in sentiment for the reviews (all neg.)
df_neg['sentiment'] = 0

In [8]:
df_neg.head()

Unnamed: 0,reviews,sentiment
0,Working with one of the best Shakespeare sourc...,0
1,"Well...tremors I, the original started off in ...",0
2,Ouch! This one was a bit painful to sit throug...,0
3,"I've seen some crappy movies in my life, but t...",0
4,"""Carriers"" follows the exploits of two guys an...",0


---

### Create a positive reviews dataframe

In [9]:
# gather positive review filenames
file_names = getFileNamesList('/Users/jason/Flatiron/MOD4/aclImdb/train/pos')

In [10]:
# initialize list to append pos. reviews
reviews_series_2 = []

# For each file in files_list
for file_name in file_names:
# Access the text with file path and file name 
    with open('/Users/jason/Flatiron/MOD4/aclImdb/train/pos/' + file_name) as f:
        # Read the text in as a Series and append it to reviews_series
        review = pd.Series(f.readlines())
        reviews_series_2.append(review)

In [11]:
#create df from reviews_series_2 list
df_pos = pd.DataFrame(reviews_series_2)

In [12]:
df_pos.rename(columns={0:'reviews'}, inplace=True)

In [13]:
#add in sentiment for the reviews (all pos.)
df_pos['sentiment'] = 1

In [14]:
df_pos.head()

Unnamed: 0,reviews,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Bizarre horror movie filled with famous faces ...,1
2,"A solid, if unremarkable film. Matthau, as Ein...",1
3,It's a strange feeling to sit alone in a theat...,1
4,"You probably all already know this by now, but...",1


### Create and clean final dataframe

In [15]:
# concatenate neg. and pos. dataframes
df = pd.concat([df_neg,df_pos],ignore_index=True)

In [16]:
# Removes punctuation, numbers, and makes text lower case
# (found in function.py file)
cleanText(df, 'reviews', 'cleaned_reviews')

In [17]:
df.head()

Unnamed: 0,reviews,sentiment,cleaned_reviews
0,Working with one of the best Shakespeare sourc...,0,working with one of the best shakespeare sourc...
1,"Well...tremors I, the original started off in ...",0,well tremors i the original started off in and...
2,Ouch! This one was a bit painful to sit throug...,0,ouch this one was a bit painful to sit through...
3,"I've seen some crappy movies in my life, but t...",0,ive seen some crappy movies in my life but thi...
4,"""Carriers"" follows the exploits of two guys an...",0,carriers follows the exploits of two guys and...


In [18]:
# use spacy lemmatizer to lemmatize cleaned_reviews column
nlp = spacy.load('en_core_web_md')
df['cleaned_reviews'] = df["cleaned_reviews"].apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))

In [19]:
# export cleaned dataframe to use for eda/modeling
df.to_csv('cleaned_reviews_dataframe', index=False)