In [2]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [3]:
pos_rev = pd.read_csv('datasets/netflix/pos.txt', sep='\n', encoding='latin-1', header=None)

In [4]:
pos_rev.head()

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."


In [5]:
# add new column - mood
pos_rev['mood'] = 1

In [6]:
# rename 0 column
pos_rev.rename(columns={0:'review'}, inplace=True)
pos_rev.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [7]:
neg_rev = pd.read_csv('datasets/netflix/negative.txt', sep='\n', encoding='latin-1', header=None)

In [8]:
# add new column - mood
neg_rev['mood'] = 0
# rename 0 column
neg_rev.rename(columns={0:'review'}, inplace=True)
neg_rev.head()

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


Pipeline

1. Lowercase
2. Tokenization
3. Remove stopwords
4. Remove punctuations
5. Lemma/Stem
6. BOW/TFIDF
7. train_test_split
8. Naive Bayes / SVM
9. Evaluate model
10. Save model
11. Test model

In [9]:
# 1. Lowercase
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(lambda x: x.lower())
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [10]:
# remove @... tags since it doesn't help in sentimental analysis. Ex. @ICC, @Moviename
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(lambda x: re.sub(r'@\S+', "", x))
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [11]:
# 2. Removing punctuations
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(lambda x: " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))

In [12]:
# 3. Removing stopwords
wordnet = WordNetLemmatizer()

pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(lambda x: " ".join([wordnet.lemmatize(word, 'v') for word in nltk.word_tokenize(x) if word not in stopwords.words('english')]))
pos_rev

Unnamed: 0,review,mood
0,rock destine 21st century 's new `` conan `` '...,1
1,gorgeously elaborate continuation `` lord ring...,1
2,effective too-tepid biopic,1
3,sometimes like go movies fun wasabi good place...,1
4,emerge something rare issue movie 's honest ke...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family 's joyous life act yiddi...,1
5328,stand shadow motown best kind documentary one ...,1
5329,'s nice see piscopo years chaykin headly price...,1


In [13]:
# making same changes for neg_rev
# 1. Lowercase
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(lambda x: x.lower())
# remove @... tags since it doesn't help in sentimental analysis. Ex. @ICC, @Moviename
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(lambda x: re.sub(r'@\S+', "", x))
# 2. Removing punctuations
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(lambda x: " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
# 3. Removing stopwords
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(lambda x: " ".join([wordnet.lemmatize(word, 'v') for word in nltk.word_tokenize(x) if word not in stopwords.words('english')]))
neg_rev






# lemma = WordNetLemmatizer()
# pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : x.lower())
# pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : re.sub(r'@\S+' , "" , x))
# pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([word for word in nltk.word_tokenize(x) if word not in string.punctuation]))
# pos_rev.loc[: , 'review'] = pos_rev.loc[: , 'review'].apply(lambda x : " ".join([lemma.lemmatize(word , 'v') for word in nltk.word_tokenize(x) if word not in stopwords.words('english')]))


Unnamed: 0,review,mood
0,simplistic silly tedious,0
1,'s laddish juvenile teenage boys could possibl...,0
2,exploitative largely devoid depth sophisticati...,0
3,garbus discard potential pathological study ex...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find move,0
5327,many definitions 'time waster movie must surel...,0
5328,stand crocodile hunter hurry badly cobble look...,0
5329,thing look like made-for-home-video quickie,0


In [14]:
# Merging both pos_rev and neg_rev

com_rev = pd.concat([pos_rev, neg_rev], axis=0).reset_index()
com_rev

Unnamed: 0,index,review,mood
0,0,rock destine 21st century 's new `` conan `` '...,1
1,1,gorgeously elaborate continuation `` lord ring...,1
2,2,effective too-tepid biopic,1
3,3,sometimes like go movies fun wasabi good place...,1
4,4,emerge something rare issue movie 's honest ke...,1
...,...,...,...
10657,5326,terrible movie people nevertheless find move,0
10658,5327,many definitions 'time waster movie must surel...,0
10659,5328,stand crocodile hunter hurry badly cobble look...,0
10660,5329,thing look like made-for-home-video quickie,0


In [15]:
# 4. train_test_split
X = com_rev['review'].values
y = com_rev['mood'].values

X_train, X_test, y_train, y_test = train_test_split(com_rev['review'].values, com_rev['mood'].values, test_size=0.2, random_state=101)

In [16]:
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [17]:
# For reading purpose
train_data = pd.DataFrame({'review': X_train, 'mood':y_train})
test_data = pd.DataFrame({'review': X_test, 'mood': y_test})

In [18]:
train_data

Unnamed: 0,review,mood
0,put washington honest work man john q archibal...,0
1,poignant familiar story young person suspend t...,1
2,timely director could ever dream quietly lyric...,1
3,film virtually choke self-consciousness,0
4,film take inside rhythms subject experience watch,1
...,...,...
8524,branagh forceful non-shakespeare screen perfor...,1
8525,movie friday fan critics damn already like sor...,0
8526,perhaps heaviest joyless movie ever make giant...,0
8527,film rival live fine little amuse-bouche keep ...,1


In [19]:
test_data

Unnamed: 0,review,mood
0,important movie reminder power film move us ma...,1
1,'ve never see hear anything quite like film re...,1
2,end leave unfulfilled performances enjoy memor...,1
3,surface 's lovers-on-the-run crime flick lot c...,1
4,walk remember shrewd enough activate girlish t...,0
...,...,...
2128,bullock good job work natural likability,1
2129,result memorable least interest,1
2130,apparently design reverie memory regret thing ...,0
2131,movie insecure capacity excite churn one two f...,0


In [37]:
# 6. BOW/TFID

stop_words1 = stopwords.words('english')
vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'(?u)\b[A-Za-z]+\b', 
    ngram_range=(1, 1),
    max_features=30000)
train_vector = vectorizer.fit_transform(train_data['review'])
test_vector = vectorizer.transform(test_data['review'])

In [39]:
train_vector.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [40]:
# to see vocabulary 
vectorizer.get_feature_names()

['aaa',
 'aaliyah',
 'abagnale',
 'abandon',
 'abandone',
 'abandono',
 'abbass',
 'abbott',
 'abbreviate',
 'abc',
 'abderrahmane',
 'abel',
 'aberration',
 'abhor',
 'abhorrent',
 'abide',
 'abilities',
 'ability',
 'abject',
 'able',
 'ably',
 'abomination',
 'aborbing',
 'aboriginal',
 'aboul',
 'abound',
 'abrahams',
 'abrams',
 'abridge',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absolutamente',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorption',
 'abstract',
 'absurd',
 'absurdist',
 'absurdities',
 'absurdity',
 'absurdly',
 'abundance',
 'abundant',
 'abundantly',
 'aburrido',
 'abuse',
 'aby',
 'abysmal',
 'abysmally',
 'acaba',
 'acabamos',
 'academic',
 'academy',
 'accelerate',
 'accent',
 'accentuate',
 'accept',
 'acceptable',
 'acceptance',
 'access',
 'accessibility',
 'accessible',
 'accident',
 'accidental',
 'acclaim',
 'accomodates',
 'accompany',
 'accomplish',
 'accomplishment',
 'accomplishments',
 'accord',
 'accordion',
 'accor

In [41]:
len(vectorizer.get_feature_names())

13541

In [42]:
# TODO - Remove unnecessary words like 00, 000.... - DONE

In [43]:
# 7. SVM

from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

classifier = svm.SVC(kernel = 'linear')
classifier.fit(train_vector, train_data['mood'])

SVC(kernel='linear')

In [44]:
pred = classifier.predict(test_vector)

In [45]:
accuracy_score(pred, test_data['mood'])

0.7444913267698078

In [46]:
import joblib
joblib.dump(classifier, 'datasets/netflix/NetFlix.pkl')
joblib.dump(vectorizer, 'datasets/netflix/transform.pkl')

['datasets/netflix/transform.pkl']

In [263]:
# flask

model = joblib.load('datasets/netflix/NetFlix.pkl')
vector = joblib.load('datasets/netflix/transform.pkl')

review = input('Enter the review: ')
open('datasets/retraining.csv', 'a')
    
tfidf = vector.transform([review]).toarray()
my_pred = model.predict(tfidf)

if my_pred == 1:
    print('Positive review')
else:
    print('Negative review')

Enter the review: it was bad movie
Negative review


In [47]:
# task
# regex to remove the digits
# use spacy stopword to remove the stopword
# use naive bayes and try to compare the accuracy

# flask model
# twitter data - train the model


SyntaxError: invalid syntax (<ipython-input-47-9f2025e22296>, line 2)