In [7]:
#from wordcloud import WordCloud, STOPWORDS
import re

import matplotlib.pyplot as plt
import pandas as pd
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud

In [2]:
# Read the reviews CSV; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer="reviews_dataframe")

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,reviews,sentiment
0,Working with one of the best Shakespeare sourc...,-1
1,"Well...tremors I, the original started off in ...",-1
2,Ouch! This one was a bit painful to sit throug...,-1
3,"I've seen some crappy movies in my life, but t...",-1
4,"""Carriers"" follows the exploits of two guys an...",-1


### StopWords

In [15]:
# Define the stop words: the built-in English list.
# The name 'english' selects scikit-learn's ENGLISH_STOP_WORDS set. Recent
# scikit-learn releases validate this argument and only accept 'english',
# a list, or None, so the frozenset itself must not be passed directly.
# To add domain-specific words, pass a list instead, e.g.
#   list(ENGLISH_STOP_WORDS.union(['movie', 'film']))

# Build the vectorizer, learn the vocabulary and create the bow
# representation in one pass over the reviews
vect = CountVectorizer(stop_words='english')
X_review = vect.fit_transform(df['reviews'])

# Create the data frame.
# The vocabulary is unrestricted in this cell, so the frame stays
# sparse-backed instead of densifying every (review, term) pair with
# toarray(). get_feature_names_out() replaces get_feature_names(), which was
# removed from scikit-learn.
X_df = pd.DataFrame.sparse.from_spmatrix(X_review, columns=vect.get_feature_names_out())
print(X_df.head())

   00  000  0000000000001  00001  00015  001  007  00am  00pm  00s  ...  \
0   0    0              0      0      0    0    0     0     0    0  ...   
1   0    0              0      0      0    0    0     0     0    0  ...   
2   0    0              0      0      0    0    0     0     0    0  ...   
3   0    0              0      0      0    0    0     0     0    0  ...   
4   0    0              0      0      0    0    0     0     0    0  ...   

   zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz  zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz  \
0                                0                                          0   
1                                0                                          0   
2                                0                                          0   
3                                0                                          0   
4                                0                                          0   

   zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz  ánd  åmål  ém

### Bag of Words

In [16]:
# Build the vectorizer, capping the vocabulary at 100 features and removing
# English stop words. 'english' selects the built-in ENGLISH_STOP_WORDS set;
# recent scikit-learn releases reject the frozenset itself for this argument.
# Optional arguments worth trying:
#   ngram_range=(1, 2)  -> uni/bigram analysis
#   max_df=200          -> ignore terms that occur in more than 200 docs
#   min_df=50           -> ignore terms that occur in fewer than 50 docs
vect = CountVectorizer(max_features=100, stop_words='english')

# Fit the vectorizer and transform the review column in a single pass
X_review = vect.fit_transform(df['reviews'])

# Create the bow representation. With only 100 columns a dense array is fine.
# get_feature_names_out() replaces get_feature_names(), which was removed
# from scikit-learn.
X_df = pd.DataFrame(X_review.toarray(), columns=vect.get_feature_names_out())
print(X_df.head())

   10  acting  action  actors  actually  away  awful  bad  believe  best  ...  \
0   0       0       0       0         0     0      0    0        0     1  ...   
1   0       0       0       0         0     0      0    2        0     0  ...   
2   0       0       0       1         0     1      0    0        0     0  ...   
3   1       0       0       0         0     0      0    0        0     2  ...   
4   0       0       1       0         0     1      1    0        0     0  ...   

   trying  ve  want  wasn  watch  watching  way  work  worst  years  
0       0   0     0     0      0         0    0     0      0      0  
1       0   0     0     0      1         0    0     0      0      0  
2       0   0     0     0      0         0    2     1      0      0  
3       1   1     0     0      0         0    0     0      1      0  
4       0   0     0     0      0         0    0     0      1      0  

[5 rows x 100 columns]


### Tokenization (feature engineering: number of words, sentences, and punctuation marks in a review)

In [9]:
# Split every review in the 'reviews' column into tokens
word_tokens = list(map(word_tokenize, df['reviews']))

# Token count of each review, kept in the same order as the reviews
len_tokens = [len(review_tokens) for review_tokens in word_tokens]

# Store the length of each review as a new feature column
df['n_words'] = len_tokens

In [10]:
# Preview the first five rows, now including the n_words feature
df.head(n=5)

Unnamed: 0,reviews,sentiment,n_words
0,Working with one of the best Shakespeare sourc...,-1,55
1,"Well...tremors I, the original started off in ...",-1,220
2,Ouch! This one was a bit painful to sit throug...,-1,146
3,"I've seen some crappy movies in my life, but t...",-1,481
4,"""Carriers"" follows the exploits of two guys an...",-1,940


### Stemming/Lemmatization

In [18]:
# Instantiate the Porter stemmer
porter = PorterStemmer()

# Tokenize every review in the column
tokens = list(map(word_tokenize, df['reviews']))

# Reduce each token of each review to its stem, keeping one list per review
stemmed_tokens = [
    [porter.stem(token) for token in review_tokens]
    for review_tokens in tokens
]

In [17]:
# Instantiate the WordNet lemmatizer
WNlemmatizer = WordNetLemmatizer()

# Tokenize every review in the column
tokens = list(map(word_tokenize, df['reviews']))

# Lemmatize each token of each review, keeping one list per review
lemmed_tokens = [
    [WNlemmatizer.lemmatize(token) for token in review_tokens]
    for review_tokens in tokens
]

KeyboardInterrupt: 

### TfIdf

In [20]:
# Define the vectorizer and specify the arguments: unigrams and bigrams,
# at most 100 features, English stop words removed.
# 'english' selects the built-in ENGLISH_STOP_WORDS set; recent scikit-learn
# releases reject the frozenset itself for this argument.
# A custom token_pattern (regex) can also be supplied here.
vect = TfidfVectorizer(ngram_range=(1, 2), max_features=100, stop_words='english')

# Fit the vectorizer and transform the reviews into TF-IDF weights
X_txt = vect.fit_transform(df['reviews'])

# Transform to a data frame and specify the column names.
# get_feature_names_out() replaces get_feature_names(), which was removed
# from scikit-learn.
X = pd.DataFrame(X_txt.toarray(), columns=vect.get_feature_names_out())
X.head()

Unnamed: 0,10,acting,action,actors,actually,away,awful,bad,believe,best,...,trying,ve,want,wasn,watch,watching,way,work,worst,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446175,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2598,0.0,0.0,...,0.0,0.0,0.0,0.0,0.15893,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.263806,0.0,0.303831,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.449483,0.279081,0.0,0.0
3,0.084258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166486,...,0.091223,0.077849,0.0,0.0,0.0,0.0,0.0,0.0,0.078806,0.0
4,0.0,0.0,0.144418,0.0,0.0,0.138168,0.137611,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118349,0.0


## Modeling

In [None]:
# Columns of df that must not be used as predictors: the raw review text
# (already encoded numerically in the TF-IDF frame X) and the target itself.
drop = ['reviews', 'sentiment']

# choose X, y
# The frame in this notebook is `df` and its label column is 'sentiment';
# no `movies` frame or 'label' column is defined here.
# Predictors are the TF-IDF features plus whatever engineered columns remain
# in df after the drop (e.g. n_words). Both frames are row-aligned on their
# index. NOTE(review): assumes df still has the default index it was loaded
# with -- confirm if rows are filtered or reordered upstream.
predictors = pd.concat([X, df.drop(columns=drop)], axis=1)
target = df['sentiment']

In [None]:
# Hold out part of the data for testing; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    random_state=333,
)

### Logistic Regression

In [None]:
# INSTANTIATE LOGISTIC REGRESSION
# A regularization strength C can be passed to penalize the model so it
# doesn't overfit.
logreg = LogisticRegression(class_weight='balanced', random_state=333)

# fit to training set
logreg.fit(X_train, y_train)

# predict on test set
y_pred_log = logreg.predict(X_test)

# The metric functions are imported by name at the top of the notebook, so
# they are called directly; no `metrics` module is in scope.

# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, y_pred_log))

# checking F1
# NOTE(review): uses f1_score defaults -- assumes a binary target whose
# positive class is labelled 1; confirm against the sentiment encoding.
print('Test F1 score: ', f1_score(y_test, y_pred_log))

# print confusion matrix, expressed as fractions of the test set
print('Confusion matrix test set: \n', confusion_matrix(y_test, y_pred_log)/len(y_test))

### Decision Tree

In [None]:
# INSTANTIATE DECISION TREE MODEL
tree = DecisionTreeClassifier(class_weight='balanced', random_state=333)

# fit to training set
tree.fit(X_train, y_train)

# predict on test set
y_pred_tree = tree.predict(X_test)

# The metric functions are imported by name at the top of the notebook, so
# they are called directly; no `metrics` module is in scope.

# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, y_pred_tree))

# checking F1
# NOTE(review): uses f1_score defaults -- assumes a binary target whose
# positive class is labelled 1; confirm against the sentiment encoding.
print('Test F1 score: ', f1_score(y_test, y_pred_tree))

# print confusion matrix, expressed as fractions of the test set
print('Confusion matrix test set: \n', confusion_matrix(y_test, y_pred_tree)/len(y_test))

### Random Forest

In [None]:
# INSTANTIATE RANDOM FOREST MODEL
rfc = RandomForestClassifier(random_state=333, class_weight='balanced')

# fit to training set
rfc.fit(X_train, y_train)

# predict on test set
y_pred_forest = rfc.predict(X_test)

# The metric functions are imported by name at the top of the notebook, so
# they are called directly; no `metrics` module is in scope.

# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, y_pred_forest))

# checking F1
# NOTE(review): uses f1_score defaults -- assumes a binary target whose
# positive class is labelled 1; confirm against the sentiment encoding.
print('Test F1 score: ', f1_score(y_test, y_pred_forest))

# print confusion matrix, expressed as fractions of the test set
print('Confusion matrix test set: \n', confusion_matrix(y_test, y_pred_forest)/len(y_test))