In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# **Part 1 - Data Preprocessing**

In [3]:
df = pd.read_csv('amazonreviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [5]:
len(df) #we got 10000 reviews

10000

In [7]:
print(df['review'][117]) #here's a negative review

I got fooled...: I did not know this was the audio cd of the game, I thought it could be used on the playstation. Maybe the other guys are from the US or somewhere where it is actually easy to get the game,and play it,but this was all in japanese and was not compatible with my machine. I am deeply disappointed as i love the arcade game, but the Dance stage euromix (Dance Dance Revolution equivalent) is lacking in decent tracks, many are jungle-y and are difficult to dance to them. Also, most of these tracks are unheard of here and are therefore not very enjoyable. However, my favourite are Keep on moving, make a Jam, and Video Killed the Radio star. But it gets a bit pedantic if you keep playing on these tracks. If anyone is in the same situation as I am, or those of you who are lucky enough the play on the newest version, can you please let me know where I can order one? I am in dance deprivation/boredom!


In [8]:
df.isnull().sum() #no empty reviews

label     0
review    0
dtype: int64

# **Part 2 - Fit and Transform**

In [9]:
X = df['review'] #passing in the review column as X
y = df['label'] #y is the label on if the review was positive or negative

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [10]:
#Below is to show what TfidfVectorizer does to the train data
X_train.shape #so original shape of 8000 messages

(8000,)

In [12]:
#TfidfVectorizer first performs a counter vectorization on each unique word, then applies a Tfidf transform on the unique words
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape #resulting shape now detected 28262 unique words over 8000 messages

(8000, 28262)

In [13]:
#Our pipeline does the following steps:
#Step 1: Count vectorize each unique word then apply TFIDF feature extraction transform on X_train data. This is done using TfidfVectorizer() imported above 
#Step 2: Fit the data using LinearSVC. The purpose of the Support Vector Classifier is to fit to the data and return a best fit hyperplane that categorizes our data
text_clf = Pipeline([('tfidf', TfidfVectorizer()), 
                     ('clf', LinearSVC())]) #so each tuple in the pipeline is a step, starting with the TfidfVectorizer, then utilizing LinearSVC

In [14]:
text_clf.fit(X_train,y_train) #now we just called our pipelines and ran our X_train & y_train to it. We didn't need to create an instance of each item and fit them separately

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

# **Part 3 - Predicting and Results**

In [15]:
predictions = text_clf.predict(X_test)

df = pd.DataFrame(metrics.confusion_matrix(y_test,predictions), index=['pos','neg'], columns=['pos','neg'])
df #performing way better now

Unnamed: 0,pos,neg
pos,845,142
neg,135,878


In [16]:
print(classification_report(y_test,predictions)) #performing way better now

              precision    recall  f1-score   support

         neg       0.86      0.86      0.86       987
         pos       0.86      0.87      0.86      1013

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



In [17]:
print(metrics.accuracy_score(y_test,predictions))

0.8615


In [19]:
text_clf.predict(["Halo 3 is the best game of all time, I've never had more fun playing any other game. So many hours put into this masterpiece!"]) #tells me this is positive

array(['pos'], dtype=object)

In [21]:
text_clf.predict(["This product is so boring! I want a refund for this scam"]) #tells me this is negative

array(['neg'], dtype=object)

In [23]:
text_clf.predict(["The part itself was made well, but functionality is not working"]) #notice how despite me praising the manufacturing, it picked up on my final negative review message

array(['neg'], dtype=object)

In [26]:
text_clf.predict(['awesome product, would buy again']) #tells me this is positive

array(['pos'], dtype=object)