In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# **Part 1 - Data Preprocessing**

In [2]:
df = pd.read_csv('moviereviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
len(df) #we got 2000 reviews

2000

In [4]:
print(df['review'][7]) #here's a positive review

tim robbins and martin lawernce team up in this road movie comedy . 
robbins plays an exec who discovers his wife having sex with his boss . 
he goes into depression , and drives around his neighbourhood until he arrives inside the usual 'ghetto' side of every american city . 
there , lawernce attempts to steal his car , but to no avail , and is dragged along with robbin's on a trip to arizona . 
there , they hold up a store , are mistaken for two other robbers ( just like in my cousin vinny ) and are chased by the police , and the other robbers . 
of course , there's gags along the way , usually from lawernce . 
although the film is midly funny , and quite watchable , there's something so horribly familiar about it all . 
this film should really be called beverly hills midnight run there's lawernce with his wisecracking and heavy profanity , just like eddie murphy in beverly hills cop , and pratically the same idea as midnight run . 
it's full of all the road movie cliches , and even 

In [5]:
df.isnull().sum() #some reviews are empty, so they were probably randomly assigned labels

label      0
review    35
dtype: int64

In [6]:
df.dropna(inplace=True) #dropping null values
df.isnull().sum()

label     0
review    0
dtype: int64

In [7]:
#so we took care of null values, but we also want to get rid of reviews that are just empty strings. We can use the .isspace() attribute to do this
mystring = '117'
myempty = '                  '

print(mystring.isspace()) #see it's true
print(myempty.isspace()) #see its s false

False
True


In [8]:
blanks = [] #initializing our list of blanks, so that once we iterate through our dataframe, we can add empty string reviews to this

for index,label,review in df.itertuples():
  #returning index location, label, review through our dataframe using the .itertuples() callback
  if review.isspace():
    blanks.append(index) #appending the index number for each review that's just an empty string

len(blanks)

27

In [9]:
df.drop(blanks,inplace=True) #so we're dropping the indexes listed within blanks
len(df) #2000 - 35 null - 27 blanks = 1938

1938

# **Part 2 - Fit and Transform**

In [10]:
X = df['review'] #passing in the review column as X
y = df['label'] #y is the label on if the review was positive or negative

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [11]:
#Below is to show what TfidfVectorizer does to the train data
X_train.shape #so original shape of 1453 messages

(1453,)

In [12]:
#TfidfVectorizer first performs a counter vectorization on each unique word, then applies a Tfidf transform on the unique words
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape #resulting shape now detected 34600 unique words over 1453 messages

(1453, 34737)

In [13]:
#Our pipeline does the following steps:
#Step 1: Count vectorize each unique word then apply TFIDF feature extraction transform on X_train data. This is done using TfidfVectorizer() imported above 
#Step 2: Fit the data using LinearSVC. The purpose of the Support Vector Classifier is to fit to the data and return a best fit hyperplane that categorizes our data
text_clf = Pipeline([('tfidf', TfidfVectorizer()), 
                     ('clf', LinearSVC())]) #so each tuple in the pipeline is a step, starting with the TfidfVectorizer, then utilizing LinearSVC

In [14]:
text_clf.fit(X_train,y_train) #now we just called our pipelines and ran our X_train & y_train to it. We didn't need to create an instance of each item and fit them separately

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

# **Part 3 - Predicting and Results**

In [15]:
predictions = text_clf.predict(X_test)

df = pd.DataFrame(metrics.confusion_matrix(y_test,predictions), index=['pos','neg'], columns=['pos','neg'])
df #performing way better now

Unnamed: 0,pos,neg
pos,198,38
neg,37,212


In [16]:
print(classification_report(y_test,predictions)) #performing way better now

              precision    recall  f1-score   support

         neg       0.84      0.84      0.84       236
         pos       0.85      0.85      0.85       249

    accuracy                           0.85       485
   macro avg       0.85      0.85      0.85       485
weighted avg       0.85      0.85      0.85       485



In [17]:
print(metrics.accuracy_score(y_test,predictions))

0.845360824742268


In [18]:
text_clf.predict(['The director did a great job']) #tells me this is positive

array(['pos'], dtype=object)

In [19]:
text_clf.predict(["The acting was very well done"]) #tells me this is positive

array(['pos'], dtype=object)

In [20]:
text_clf.predict(["The acting was very well done, but the movie's itself was boring"]) #notice how despite me praising the actor, it picked up on my final negative review message

array(['neg'], dtype=object)

In [21]:
text_clf.predict(['I love this movie!']) #tells me this is negative, which is wrong

array(['neg'], dtype=object)