In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [27]:
sentiment= pd.read_csv('labeledTrainData.tsv',sep='\t')

In [28]:
sentiment.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [29]:
from collections import Counter

In [30]:
count = Counter()

for corpus in sentiment['review']:
    for word in corpus.split():
        count[word] += 1
count.most_common(20)

[('the', 287032),
 ('a', 155096),
 ('and', 152664),
 ('of', 142972),
 ('to', 132568),
 ('is', 103228),
 ('in', 85580),
 ('I', 65973),
 ('that', 64560),
 ('this', 57196),
 ('it', 54429),
 ('/><br', 50935),
 ('was', 46698),
 ('as', 42510),
 ('with', 41721),
 ('for', 41070),
 ('but', 33790),
 ('The', 33762),
 ('on', 30766),
 ('movie', 30500)]

In [31]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lillianphan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

print(stop)
count_reduced = Counter()

for w, c in count.items():
    if not w.lower() in stop:
        count_reduced[w] = c

count_reduced.most_common(20)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[('/><br', 50935),
 ('movie', 30500),
 ('film', 27397),
 ('one', 20688),
 ('like', 18133),
 ('would', 11922),
 ('good', 11435),
 ('really', 10815),
 ('even', 10607),
 ('see', 10155),
 ('-', 9355),
 ('get', 8777),
 ('story', 8526),
 ('much', 8507),
 ('time', 7764),
 ('make', 7485),
 ('could', 7462),
 ('also', 7422),
 ('first', 7339),
 ('people', 7335)]

In [33]:
import re
def preprocessor(text):
    """ Return a cleaned version of text
    """
   
    text = re.sub('<[^>]*>', '', text)
    
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text



In [34]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [35]:
from sklearn.model_selection import train_test_split

X = sentiment['review']
y = sentiment['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=stop,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)


In [37]:
clf = Pipeline([('vect', tfidf),
                ('clf', LogisticRegression(random_state=0))])
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [40]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

predictions = clf.predict(X_test)
print('accuracy:',accuracy_score(y_test,predictions))
print('confusion matrix:\n',confusion_matrix(y_test,predictions))
print('classification report:\n',classification_report(y_test,predictions))

accuracy: 0.8853333333333333
confusion matrix:
 [[3293  503]
 [ 357 3347]]
classification report:
              precision    recall  f1-score   support

          0       0.90      0.87      0.88      3796
          1       0.87      0.90      0.89      3704

avg / total       0.89      0.89      0.89      7500



In [41]:
twits = [
    "I love this movie",
    "I don't like it so much!",
    "Worth watching",
]

preds = clf.predict_proba(twits)

for i in range(len(twits)):
    print(f'{twits[i]} --> Negative, Positive = {preds[i]}')
sentiment.sample(10)

I love this movie --> Negative, Positive = [0.04 0.96]
I don't like it so much! --> Negative, Positive = [0.7 0.3]
Worth watching --> Negative, Positive = [0.22 0.78]


Unnamed: 0,id,sentiment,review
17802,11238_1,0,By reading the box at the video store this mov...
6029,1057_9,1,I thought this had the right blend of characte...
2716,11765_4,0,You know what you are getting when you purchas...
7147,4789_4,0,"\Phantasm\"" of 1979 was a highly atmospheric, ..."
8848,7327_10,1,I have just recently been through a stage wher...
4387,11746_2,0,Well this film has certainly had a fair amount...
734,4919_7,1,Kazan's early film noir won an Oscar. Some of ...
22411,3726_7,1,I wasn't expecting the highest calibre of film...
24251,2771_7,1,David and Bathsheba is a lavish Hollywood Bibl...
7277,1609_9,1,"\Moonstruck\"" is a movie that I liked the firs..."
