In [1]:
import spacy
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

In [2]:
pos_rev = pd.read_csv('datasets/netflix/pos.txt', sep='\n', encoding='latin-1', header=None)

In [3]:
pos_rev.head()

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."


In [4]:
# add new column - mood
pos_rev['mood'] = 1
# rename 0 column
pos_rev.rename(columns={0:'review'}, inplace=True)
pos_rev.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [33]:
neg_rev = pd.read_csv('datasets/netflix/negative.txt', sep='\n', encoding='latin-1', header=None)

In [34]:
# add new column - mood
neg_rev['mood'] = 0
# rename 0 column
neg_rev.rename(columns={0:'review'}, inplace=True)
neg_rev.head()

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [7]:
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [23]:
def remove_punctuations_stopwords(input_text):
    doc = nlp(input_text)
    output = []
    for token in doc:
        if not token.is_punct and not token.is_stop:
            output.append(str(token))
    return ' '.join(output)

In [25]:
# 1. Lowercase
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(lambda x: x.lower())
# print(pos_rev)

# remove @... tags since it doesn't help in sentimental analysis. Ex. @ICC, @Moviename
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(lambda x: re.sub(r'@\S+', "", x))
        
# 2. Removing punctuations and stop words
pos_rev['review'] = pos_rev.apply(lambda row: remove_punctuations_stopwords(row['review']), axis=1)
pos_rev.head()

Unnamed: 0,review,mood
0,rock destined 21st century new conan going spl...,1
1,gorgeously elaborate continuation lord rings t...,1
2,effective tepid biopic,1
3,like movies fun wasabi good place start,1
4,emerges rare issue movie honest keenly observe...,1


In [35]:
# 1. Lowercase
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(lambda x: x.lower())
# print(pos_rev)

# remove @... tags since it doesn't help in sentimental analysis. Ex. @ICC, @Moviename
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(lambda x: re.sub(r'@\S+', "", x))
        
# 2. Removing punctuations and stop words
neg_rev['review'] = neg_rev.apply(lambda row: remove_punctuations_stopwords(row['review']), axis=1)
neg_rev.head()

Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,laddish juvenile teenage boys possibly find funny,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discards potential pathological study e...,0
4,visually flashy narratively opaque emotionally...,0


In [37]:
# Merging both pos_rev and neg_rev

com_rev = pd.concat([pos_rev, neg_rev], axis=0).reset_index()
com_rev

Unnamed: 0,index,review,mood
0,0,rock destined 21st century new conan going spl...,1
1,1,gorgeously elaborate continuation lord rings t...,1
2,2,effective tepid biopic,1
3,3,like movies fun wasabi good place start,1
4,4,emerges rare issue movie honest keenly observe...,1
...,...,...,...
10657,5326,terrible movie people find moving,0
10658,5327,definitions time waster movie surely,0
10659,5328,stands crocodile hunter hurried badly cobbled ...,0
10660,5329,thing looks like home video quickie,0


In [38]:
# 4. train_test_split
X = com_rev['review'].values
y = com_rev['mood'].values

X_train, X_test, y_train, y_test = train_test_split(com_rev['review'].values, com_rev['mood'].values, test_size=0.2, random_state=101)

In [39]:
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [40]:
# For reading purpose
train_data = pd.DataFrame({'review': X_train, 'mood':y_train})
test_data = pd.DataFrame({'review': X_test, 'mood': y_test})

In [41]:
train_data

Unnamed: 0,review,mood
0,puts washington honest working man john q arch...,0
1,poignant familiar story young person suspended...,1
2,timely director dreamed quietly lyrical tale p...,1
3,film virtually chokes self consciousness,0
4,film takes inside rhythms subject experience w...,1
...,...,...
8524,branagh forceful non shakespeare screen perfor...,1
8525,movie friday fans critics damned like sort thi...,0
8526,heaviest joyless movie giant dragons taking ...,0
8527,film rival live fine little amuse bouche appet...,1


In [42]:
test_data

Unnamed: 0,review,mood
0,important movie reminder power film examine va...,1
1,seen heard like film recommend originality,1
2,ending leave unfulfilled performances enjoy me...,1
3,surface lovers run crime flick lot common pies...,1
4,walk remember shrewd activate girlish tear duc...,0
...,...,...
2128,bullock good job working natural likability,1
2129,results memorable interesting,1
2130,apparently designed reverie memory regret thin...,0
2131,movie insecure capacity excite churns flagrant...,0


In [44]:
# 6. BOW/TFID
vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'(?u)\b[A-Za-z]+\b', 
    ngram_range=(1, 1),
    max_features=30000)
train_vector = vectorizer.fit_transform(train_data['review'])
test_vector = vectorizer.transform(test_data['review'])

In [45]:
train_vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [46]:
# to see vocabulary 
vectorizer.get_feature_names()

['aaa',
 'aaliyah',
 'abagnale',
 'abandon',
 'abandone',
 'abandoned',
 'abandono',
 'abbass',
 'abbott',
 'abbreviated',
 'abc',
 'abderrahmane',
 'abel',
 'aberration',
 'abhorrent',
 'abhors',
 'abiding',
 'abilities',
 'ability',
 'abject',
 'able',
 'ably',
 'abomination',
 'aborbing',
 'aboriginal',
 'aboul',
 'abound',
 'abrahams',
 'abrams',
 'abridged',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absolutamente',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'absorption',
 'abstract',
 'absurd',
 'absurdist',
 'absurdities',
 'absurdity',
 'absurdly',
 'abundance',
 'abundant',
 'abundantly',
 'aburrido',
 'abuse',
 'abused',
 'abuses',
 'abysmal',
 'abysmally',
 'abyss',
 'acaba',
 'acabamos',
 'academic',
 'academy',
 'accelerated',
 'accent',
 'accents',
 'accentuating',
 'accept',
 'acceptable',
 'acceptance',
 'accepting',
 'accepts',
 'access',
 'accessibility',
 'accessible',
 'accident',
 'accidental',
 'acclaim',
 'accl

In [47]:
len(vectorizer.get_feature_names())

15966

In [48]:
# 7. SVM

from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vector, train_data['mood'])

SVC(kernel='linear')

In [49]:
pred = classifier.predict(test_vector)

In [50]:
accuracy_score(pred, test_data['mood'])

0.7416783872480075