#### Negative Review Text

In [1]:
with open('neg.txt.neg') as f:
    lines = f.read()

In [43]:
# spliting text
neg_review = lines.split('\n')
neg_review[:5]

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . ',
 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ',
 'effective but too-tepid biopic',
 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . ',
 "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . "]

In [4]:
type(neg_review)

list

In [5]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [6]:
# convert neg_review list into Dataframe
neg_df = pd.DataFrame(neg_review)
neg_df.head(2)

Unnamed: 0,0
0,"simplistic , silly and tedious ."
1,"it's so laddish and juvenile , only teenage bo..."


In [7]:
# rename column name
neg_df.rename(columns={0:'text'}, inplace=True)

In [8]:
neg_df.head(2)

Unnamed: 0,text
0,"simplistic , silly and tedious ."
1,"it's so laddish and juvenile , only teenage bo..."


In [9]:
neg_df['label'] = 0

In [10]:
neg_df.head(2)

Unnamed: 0,text,label
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0


In [11]:
neg_df.shape

(5332, 2)

#### Positive Review Text

In [12]:
with open('pos.txt.pos') as f:
    lines = f.read()

In [14]:
pos_review = lines.split('\n')

In [44]:
pos_review[:5]

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . ',
 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ',
 'effective but too-tepid biopic',
 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . ',
 "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . "]

In [16]:
type(pos_review)

list

In [17]:
pos_df = pd.DataFrame(pos_review)

In [18]:
pos_df.head(2)

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."


In [19]:
pos_df.rename(columns={0:'text'}, inplace=True)

In [20]:
pos_df.head(2)

Unnamed: 0,text
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."


In [21]:
pos_df['label'] = 1

In [22]:
pos_df.head(2)

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1


In [23]:
pos_df.shape

(5332, 2)

In [24]:
# concat two dataframe: neg_df and pos_df
df = pd.concat([pos_df, neg_df], ignore_index=True)

In [25]:
df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [26]:
# Specifying target and feature
x = df['text']
y = df['label']

#### Train, Test and Split Data

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [29]:
x_train.shape

(8531,)

In [30]:
x_test.shape

(2133,)

#### Pipeline Module and TfidfVectorizer

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

#### Build Model Using Logistic Regression Algorithm

In [32]:
from sklearn.linear_model import LogisticRegression

#### classification_report Metrics

In [33]:
from sklearn.metrics import classification_report, confusion_matrix

In [34]:
model = Pipeline([
    ('tfidf_vect', TfidfVectorizer(max_df= 0.5, min_df= 7, stop_words= 'english', sublinear_tf=True)),
    ('logistic regression', LogisticRegression(solver='liblinear', multi_class='auto'))
])

model.fit(x_train, y_train)

y_test_pred = model.predict(x_test)
y_train_pred = model.predict(x_train)

print(classification_report(y_test, y_test_pred))
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73      1061
           1       0.73      0.76      0.75      1072

    accuracy                           0.74      2133
   macro avg       0.74      0.74      0.74      2133
weighted avg       0.74      0.74      0.74      2133

              precision    recall  f1-score   support

           0       0.83      0.84      0.84      4271
           1       0.84      0.83      0.84      4260

    accuracy                           0.84      8531
   macro avg       0.84      0.84      0.84      8531
weighted avg       0.84      0.84      0.84      8531



In [35]:
confusion_matrix(y_test, y_test_pred)

array([[766, 295],
       [258, 814]], dtype=int64)

#### SGDClassifier Model

In [36]:
from sklearn.linear_model import SGDClassifier

In [37]:
model1 = Pipeline([
    ('tfidf_vect',TfidfVectorizer(max_df= 0.5, min_df= 7, stop_words= 'english', sublinear_tf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                          max_iter=5, tol=None))
])

model1.fit(x_train, y_train)

y_test_pred = model1.predict(x_test)
y_train_pred = model1.predict(x_train)

print(classification_report(y_test, y_test_pred))
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.74      0.71      0.72      1061
           1       0.72      0.75      0.74      1072

    accuracy                           0.73      2133
   macro avg       0.73      0.73      0.73      2133
weighted avg       0.73      0.73      0.73      2133

              precision    recall  f1-score   support

           0       0.80      0.78      0.79      4271
           1       0.79      0.80      0.79      4260

    accuracy                           0.79      8531
   macro avg       0.79      0.79      0.79      8531
weighted avg       0.79      0.79      0.79      8531



#### MultinomialNB Naive Bayes Model

In [38]:
from sklearn.naive_bayes import MultinomialNB

In [40]:
clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('multi_NB', MultinomialNB())
])

clf.fit(x_train, y_train)

yTestPred = clf.predict(x_test)
yTrainPred = clf.predict(x_train)

print(classification_report(y_test, yTestPred))
print(classification_report(y_train, yTrainPred ))

              precision    recall  f1-score   support

           0       0.77      0.78      0.78      1061
           1       0.78      0.78      0.78      1072

    accuracy                           0.78      2133
   macro avg       0.78      0.78      0.78      2133
weighted avg       0.78      0.78      0.78      2133

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      4271
           1       0.94      0.91      0.93      4260

    accuracy                           0.93      8531
   macro avg       0.93      0.93      0.93      8531
weighted avg       0.93      0.93      0.93      8531

