In [1]:
#from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from nltk.corpus import stopwords
from  nltk import FreqDist
from mod_4_nlp_project.functions import *
import string
import spacy
from tqdm import tqdm
from spacy.util import minibatch, compounding

In [2]:
# Read the reviews dataset from movies.csv, using the first CSV column as the index.
train_df = pd.read_csv('movies.csv', index_col = 0)

In [3]:
# cleanText is not defined in this notebook; presumably it is provided by the
# star import of mod_4_nlp_project.functions.
# NOTE(review): the return value is discarded, so the helper presumably cleans
# the 'reviews' column of train_df in place -- confirm against its source.
cleanText(train_df, 'reviews')

In [4]:
# Preview the first rows of train_df after the cleanText call.
train_df.head()

Unnamed: 0,reviews,sentiment
0,working with one of the best shakespeare sourc...,0
1,welltremors i the original started off in and...,0
2,ouch this one was a bit painful to sit through...,0
3,ive seen some crappy movies in my life but thi...,0
4,carriers follows the exploits of two guys and ...,0


In [5]:
# Pair every review with its sentiment label as (text, label) tuples, the
# shape that load_data() unpacks. Zipping the two columns avoids a slow
# row-wise DataFrame.apply and also behaves sensibly for an empty frame.
train = list(zip(train_df['reviews'], train_df['sentiment']))
# Keep the 'tuples' column on the frame for any later cell that reads it.
train_df['tuples'] = pd.Series(train, index=train_df.index, dtype=object)
train[:1]

[('working with one of the best shakespeare sources this film manages to be creditable to its source whilst still appealing to a wider audience  branagh steals the film from under fishburnes nose and theres a talented cast on good form',
  0)]

In [6]:
# Helper functions taken from the spaCy documentation
def load_data(limit=0, split=0.8, data=None):
    """Shuffle the labelled examples and split them into train and dev parts.

    Args:
        limit: Maximum number of examples to keep after shuffling; ``0``
            (the default) keeps every example.
        split: Fraction of the kept examples that goes to the training part.
        data: Optional sequence of ``(text, label)`` pairs. When omitted, the
            notebook-level ``train`` list is used.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. The texts are
        tuples of the example texts; the cats are lists of
        ``{"POSITIVE": bool, "NEGATIVE": bool}`` dicts built from the labels.

    Raises:
        ValueError: If there are no examples to split.
    """
    source = train if data is None else data
    # Shuffle a copy: shuffling the shared list in place would silently
    # reorder `train` for every other cell that uses it.
    train_data = list(source)
    if not train_data:
        raise ValueError("load_data() needs at least one (text, label) example")
    np.random.shuffle(train_data)
    # With limit == 0 the slice [-0:] is the whole list, i.e. "no limit".
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

def evaluate(tokenizer, textcat, texts, cats):
    """Score a text classifier against gold category dicts.

    Each text is tokenized, the resulting docs are streamed through
    ``textcat.pipe``, and every predicted label score is compared with the
    gold value for the same label using a 0.5 cut-off on both sides. Labels
    missing from the gold dict are ignored. The counts are pooled over all
    labels of all docs.

    Args:
        tokenizer: Callable that turns one text into a doc.
        textcat: Object whose ``pipe(docs)`` yields docs with a ``cats``
            mapping of label -> score.
        texts: Iterable of texts to evaluate.
        cats: Gold category dicts, indexed in the same order as ``texts``.

    Returns:
        Dict with the keys ``'textcat_p'`` (precision), ``'textcat_r'``
        (recall) and ``'textcat_f'`` (F-score).
    """
    # Every counter starts at a tiny epsilon so the ratios below can never
    # divide by zero, even when a count stays empty.
    tally = {'tp': 1e-8, 'fp': 1e-8, 'fn': 1e-8, 'tn': 1e-8}
    tokenized = (tokenizer(text) for text in texts)
    for idx, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[idx]
        for label, score in doc.cats.items():
            if label in expected:
                truth = expected[label]
                if score >= 0.5:
                    # Predicted positive for this label.
                    if truth >= 0.5:
                        tally['tp'] += 1.
                    elif truth < 0.5:
                        tally['fp'] += 1.
                elif score < 0.5:
                    # Predicted negative for this label.
                    if truth < 0.5:
                        tally['tn'] += 1
                    elif truth >= 0.5:
                        tally['fn'] += 1
    precision = tally['tp'] / (tally['tp'] + tally['fp'])
    recall = tally['tp'] / (tally['tp'] + tally['fn'])
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

# Number of texts to train from; passed to load_data() as the upper limit.
# Raise it if you have more data and more computational power.
n_texts = 25000

# Number of training iterations (passes over the training data).
n_iter = 10

In [7]:
# Load the 'en_core_web_md' English model as the `nlp` pipeline object used by the cells below.
nlp = spacy.load('en_core_web_md')  # loads a model by name rather than creating a blank English pipeline

In [8]:
# Reuse the pipeline's text classifier if it already has one; otherwise
# create the built-in 'textcat' component (nlp.create_pipe works for
# built-ins registered with spaCy) and append it as the last pipe.
if 'textcat' in nlp.pipe_names:
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)

# Register the two labels that appear in the category dicts from load_data()
textcat.add_label('POSITIVE')
textcat.add_label("NEGATIVE")

# load the dataset
print("Loading movie reviews data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
# Report the number of examples actually loaded: n_texts is only an upper
# bound (and 0 means "no limit"), so printing it could misstate the count.
print("Using {} examples ({} training, {} evaluation)"
      .format(len(train_texts) + len(dev_texts),
              len(train_texts), len(dev_texts)))
# Pair each training text with the {'cats': ...} annotation dict that is
# handed to nlp.update() in the training loop.
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

Loading movie reviews data...
Using 25000 examples (20000 training, 5000 evaluation)


In [9]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        # Losses are accumulated per epoch, keyed by pipe name.
        losses = {}
        # Reshuffle at the start of every epoch so the minibatches are not
        # presented in the same order on each pass over the data.
        np.random.shuffle(train_data)
        # batch up the examples using spaCy's minibatch, with a compounding
        # batch-size schedule (4., 32., 1.001)
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        # Evaluate with the optimizer's averaged parameters swapped in.
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  
14.187	0.889	0.888	0.889
5.955	0.899	0.896	0.897
3.787	0.897	0.895	0.896
2.719	0.895	0.897	0.896
2.227	0.897	0.896	0.897
1.855	0.894	0.897	0.895
1.581	0.891	0.893	0.892
1.218	0.889	0.891	0.890
1.219	0.889	0.889	0.889
1.334	0.889	0.890	0.890
1.180	0.887	0.890	0.888
1.049	0.891	0.891	0.891
0.895	0.887	0.887	0.887
1.061	0.887	0.888	0.888
0.733	0.889	0.889	0.889
0.829	0.888	0.887	0.887
0.763	0.885	0.885	0.885
0.771	0.885	0.885	0.885
0.799	0.887	0.887	0.887
0.847	0.886	0.887	0.887


In [12]:
# Sanity-check the trained classifier on one hand-written review and print
# the label whose score is highest.
test_text = "This movie is horrible"
doc = nlp(test_text)
best_label = max(doc.cats, key=doc.cats.get)
print(test_text, best_label)

This movie is horrible NEGATIVE
