In [1]:
#! unzip rotten-tomatoes.zip

# Import Necessary Libraries

In [2]:
import pandas as pd
import csv
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
np.random.seed(0)

# Load Data into DataFrame

In [3]:
data = []
with open('reviews.tsv', encoding="utf8", errors='ignore') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for row in reader:
        data.append(row)

In [4]:
reviews = pd.DataFrame(data[1:], columns=data[0])

In [5]:
reviews.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


# Clean the DataFrame

In [6]:
score_dict = {'fresh': 1, 'rotten': 0}
reviews['fresh'] = reviews['fresh'].map(score_dict)

In [7]:
reviews.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,1,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,0,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,1,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,1,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,1,,0,Cinema Scope,"October 12, 2017"


In [8]:
reviews = reviews[['review', 'fresh']]

In [9]:
reviews.head()

Unnamed: 0,review,fresh
0,A distinctly gallows take on contemporary fina...,1
1,It's an allegory in search of a meaning that n...,0
2,... life lived in a bubble in financial dealin...,1
3,Continuing along a line introduced in last yea...,1
4,... a perverse twist on neorealism...,1


In [10]:
X = reviews['review']
y = reviews['fresh']

# Perform Train, Test, Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Count the Words in a Review

In [12]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(X_train)

X_train_counts = count_vectorizer.transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# TF-IDF Transform the Word Counts

In [13]:
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)

x_train_tfidf = tfidf_transformer.transform(X_train_counts)
x_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Create a Classifier to Score Reviews

In [14]:
classifier = MultinomialNB()
classifier.fit(x_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Make Predictions Using Classifier

In [15]:
classifier.predict(x_test_tfidf)

array([1, 1, 1, ..., 1, 1, 0])

In [16]:
classifier.predict_proba(x_test_tfidf)

array([[0.45622796, 0.54377204],
       [0.4879811 , 0.5120189 ],
       [0.23872698, 0.76127302],
       ...,
       [0.14758638, 0.85241362],
       [0.46843436, 0.53156564],
       [0.72596988, 0.27403012]])

In [17]:
classifier.predict(count_vectorizer.transform(['This is an awesome movie']))

array([1])

In [18]:
classifier.predict_proba(count_vectorizer.transform(['This is an awesome movie']))
# first represent prob that the score is rotten (0), second is fresh (1)

array([[0.36188785, 0.63811215]])

# Alternative Method Using Pipeline after the Train, Test, Split

In [19]:
text_classifier = Pipeline([('count_vectorizer', CountVectorizer(stop_words='english')),
                            ('tfidf_vectorizer', TfidfTransformer()),
                            ('clf', MultinomialNB())])

text_classifier.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='englis...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [20]:
text_classifier.predict(X_test)

array([1, 1, 1, ..., 1, 1, 0])

In [21]:
text_classifier.predict_proba(X_test)

array([[0.45622796, 0.54377204],
       [0.4879811 , 0.5120189 ],
       [0.23872698, 0.76127302],
       ...,
       [0.14758638, 0.85241362],
       [0.46843436, 0.53156564],
       [0.72596988, 0.27403012]])

In [22]:
text_classifier.predict(['This is an awesome movie'])

array([1])

In [23]:
text_classifier.predict_proba(['This is an awesome movie'])

array([[0.34995927, 0.65004073]])

The probabilities are a little different, but very close to the step by step method.