In [1]:
#from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from nltk.corpus import stopwords
from  nltk import FreqDist
from functions import *
import string
import spacy
from tqdm import tqdm
from spacy.util import minibatch, compounding

In [2]:
# load the previously cleaned reviews table (CSV content; the file name has no extension
# and is resolved relative to the current working directory)
df = pd.read_csv('cleaned_reviews_dataframe')

In [3]:
# preview the first rows to confirm the expected columns are present
df.head()

Unnamed: 0,reviews,sentiment,cleaned_reviews
0,Working with one of the best Shakespeare sourc...,0,work with one of the good shakespeare source t...
1,"Well...tremors I, the original started off in ...",0,well tremor i the original start off in and i ...
2,Ouch! This one was a bit painful to sit throug...,0,ouch this one be a bit painful to sit through ...
3,"I've seen some crappy movies in my life, but t...",0,-PRON- have see some crappy movie in -PRON- li...
4,"""Carriers"" follows the exploits of two guys an...",0,carrier follow the exploit of two guy and tw...


In [4]:
# getting data into the correct format: the training code below expects a list of
# (review, sentiment) tuples, with the review text first and the sentiment second
# Pairing the two columns with zip avoids a row-by-row df.apply(axis=1) pass.
df['tuples'] = list(zip(df['reviews'], df['sentiment']))
train = df['tuples'].tolist()
train[:1]

[("Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.  Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.",
  0)]

In [5]:
# load spacy with pretrained statistical models for english
# (the 'en_core_web_md' model package must already be installed in this environment)
nlp = spacy.load('en_core_web_md')

In [6]:
#functions from spacy documentation (altered slightly)

#loads our data and splits into training and test reviews
def load_data(limit=0, split=0.8, data=None):
    """Shuffle labelled reviews and split them into training and evaluation parts.

    Parameters
    ----------
    limit : int, default 0
        Keep only the last ``limit`` examples after shuffling. ``0`` keeps
        every example (``seq[-0:]`` is the whole sequence).
    split : float, default 0.8
        Fraction of the kept examples placed in the training part.
    data : sequence of (text, label) pairs, optional
        Examples to split. When omitted, the notebook-level ``train`` list
        is used, as before.

    Returns
    -------
    tuple
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. Texts are
        tuples of the review strings; cats are lists of
        ``{"POSITIVE": bool, "NEGATIVE": bool}`` dicts built from the labels.
    """
    source = train if data is None else data
    # Shuffle a copy: shuffling the source in place would silently reorder the
    # caller's list (the notebook-level `train`) every time this is called.
    examples = list(source)
    np.random.shuffle(examples)
    examples = examples[-limit:]
    if not examples:
        # zip(*[]) cannot be unpacked into two names, so handle "no data" here.
        return ((), []), ((), [])
    texts, labels = zip(*examples)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    cut = int(len(examples) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

# evaluates the model in each epoch
def evaluate(tokenizer, textcat, texts, cats, positive_label=None):
    """Compute precision, recall and F-score of ``textcat`` on held-out texts.

    Each text is passed through ``tokenizer`` and the results are streamed
    through ``textcat.pipe``. Every predicted ``doc.cats`` score is compared
    with the gold value for the same label, using 0.5 as the decision
    threshold on both sides.

    Parameters
    ----------
    tokenizer : callable
        Called once per text; its results are fed to ``textcat.pipe``.
    textcat : object with a ``pipe(docs)`` method
        Must yield objects exposing a ``cats`` mapping of label -> score.
    texts : iterable of str
        Texts to score.
    cats : sequence of dict
        Gold label -> value mappings, indexed in the same order as ``texts``.
    positive_label : str, optional
        When given, only this label is scored. By default every label present
        in the gold dict is pooled. With two mutually exclusive labels each
        mistake is then counted once as a false positive and once as a false
        negative, so precision, recall and F-score are all equal.

    Returns
    -------
    dict
        Keys ``'textcat_p'``, ``'textcat_r'`` and ``'textcat_f'``.
    """
    docs = (tokenizer(text) for text in texts)
    # Tiny non-zero starting values keep the ratios below defined when a
    # count would otherwise be zero.
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if positive_label is not None and label != positive_label:
                continue
            predicted = score >= 0.5
            actual = gold[label] >= 0.5
            if predicted and actual:
                tp += 1.
            elif predicted:
                fp += 1.
            elif actual:
                fn += 1.
            # True negatives are not needed for precision or recall, so they
            # are not tallied.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

# Number of texts to train from (passed to load_data as `limit` in a later cell)
# can change this but we used all of our reviews to train the model
n_texts=25000


# Number of training iterations (how many times the training loop runs over train_data)
# we played around with this number and settled on 10 iterations;
# with more iterations the loss only fluctuated around 1 and the F1 score stayed essentially flat
n_iter=10

In [7]:
#functions from spacy documentation (altered slightly)

# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

# add the two labels that load_data() emits for every example
textcat.add_label('POSITIVE')
textcat.add_label("NEGATIVE")

# load the dataset
print("Loading movie reviews data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
# Report the number of examples actually returned: load_data() yields fewer than
# n_texts when the dataset is smaller than the requested limit.
print("Using {} examples ({} training, {} evaluation)"
      .format(len(train_texts) + len(dev_texts),
              len(train_texts), len(dev_texts)))
# pair every training text with its annotation dict in the {'cats': ...} shape
# that is handed to nlp.update() in the training loop
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

Loading movie reviews data...
Using 25000 examples (20000 training, 5000 evaluation)


In [8]:
#functions from spacy documentation (altered slightly)

# get names of other pipes to disable them during training,
# so that only the 'textcat' component is updated
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

# begin training loop
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        # per-epoch loss accumulator; passed to nlp.update() and read back below
        losses = {}
        # batch up the examples using spaCy's minibatch
        # NOTE(review): the batch-size generator is rebuilt every epoch and the
        # examples are not reshuffled between epochs -- confirm this is intended
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            # split the batch back into parallel tuples of texts and annotation dicts
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        # score the model with the optimizer's averaged parameters swapped in
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  
14.473	0.897	0.901	0.899
5.902	0.903	0.906	0.904
3.856	0.903	0.905	0.904
2.576	0.905	0.904	0.905
1.869	0.901	0.900	0.901
1.861	0.904	0.904	0.904
1.311	0.902	0.903	0.903
1.301	0.903	0.903	0.903
1.099	0.900	0.900	0.900
0.995	0.897	0.897	0.897


## Testing model with easy and mixed reviews

In [9]:
# sanity check: an unambiguously negative review
test_text = "This movie is horrible"
doc = nlp(test_text)
# print the review followed by the category that received the highest score
print(test_text, max(doc.cats, key=doc.cats.get))

This movie is horrible NEGATIVE


In [10]:
# sanity check: a negative review that also contains "nice" words
test_text = "while there are good points in this movie overall its pretty meh"
doc = nlp(test_text)
# print the review followed by the category that received the highest score
print(test_text, max(doc.cats, key=doc.cats.get))

while there are good points in this movie overall its pretty meh NEGATIVE


In [13]:
# test the model on a movie review that has an IMDB rating of 6, to see how it handles a mixed opinion
test_text = "Originally, I wrote a very long review of Shawshank Redemption, but i've decided not to post it. Instead i'd just like to say that i would compare the movie to a poem that i wrote in 5th grade. i'm not going to write the poem out because i don't have it with me right now, but it had to do with rainbows and waterfalls and following your dreams. In other words, it was complete b.s. I just wrote what i thought were some typical poetic themes and threw them together, hoping people would like it. My plan succeeded, the teacher thought it was the best in the class, and it got published in the school newspaper. Seems like the director of The Shawshank Redemption, Frank Darabont, and the writer of the book, Stephen King, did the same thing. The movie is technically well made and does succeed at telling the story. I admit that I got caught up in the ending. But no matter how much you try to defend it, it's still pure Hollywood. Cliche here and there. I recognize this for what it is: a director's attempt to manipulate people's emotions, which, due to a large amount of naivite in the world, succeeds. He did the same thing with The Green Mile with more success. Now, with The Majestic, I am glad to see that it seems that most reviewers and many moviegoers have recognized this, and given it generally lower reviews than his first two films. Finally! Now I only hope people will start to give Shawshank lower ratings and get it down from the heights of the top 250 list."
doc = nlp(test_text)
# print the review followed by the category that received the highest score
print(test_text, max(doc.cats, key=doc.cats.get))
# the model only scores how negative or positive a review is; it may be possible
# to update it so that it can also classify neutral reviews
print()
# show the raw per-category scores behind the label printed above
print(doc.cats)

Originally, I wrote a very long review of Shawshank Redemption, but i've decided not to post it. Instead i'd just like to say that i would compare the movie to a poem that i wrote in 5th grade. i'm not going to write the poem out because i don't have it with me right now, but it had to do with rainbows and waterfalls and following your dreams. In other words, it was complete b.s. I just wrote what i thought were some typical poetic themes and threw them together, hoping people would like it. My plan succeeded, the teacher thought it was the best in the class, and it got published in the school newspaper. Seems like the director of The Shawshank Redemption, Frank Darabont, and the writer of the book, Stephen King, did the same thing. The movie is technically well made and does succeed at telling the story. I admit that I got caught up in the ending. But no matter how much you try to defend it, it's still pure Hollywood. Cliche here and there. I recognize this for what it is: a director'

## Saving model to predict on holdout set

In [12]:
# %pwd is an IPython magic that returns the current working directory, so this cell
# only runs under IPython/Jupyter and writes the model files directly into that directory
output_dir=%pwd
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to /Users/jason/Flatiron/MOD4/mod_4_nlp_project
