In [11]:
# Importing Libraries
import os
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
# Reading Data

def load_reviews(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each line has leading/trailing whitespace (including the newline)
    stripped. The file is opened in a ``with`` block so the handle is
    closed as soon as reading finishes, even if an error occurs.

    Parameters
    ----------
    path : str
        Location of the text file, one review per line.

    Returns
    -------
    list of str
        The stripped lines, in file order.
    """
    with open(path, 'r', encoding="utf8") as handle:
        return [line.strip() for line in handle]


train_reviews = load_reviews('./movie_data/full_train.txt')
test_reviews = load_reviews('./movie_data/full_test.txt')

In [8]:
# Peek at one raw training review (index 3) before any cleaning is applied.
train_reviews[3]

'This is easily the most underrated film inn the Brooks cannon. Sure, its flawed. It does not give a realistic view of homelessness (unlike, say, how Citizen Kane gave a realistic view of lounge singers, or Titanic gave a realistic view of Italians YOU IDIOTS). Many of the jokes fall flat. But still, this film is very lovable in a way many comedies are not, and to pull that off in a story about some of the most traditionally reviled members of society is truly impressive. Its not The Fisher King, but its not crap, either. My only complaint is that Brooks should have cast someone else in the lead (I love Mel as a Director and Writer, not so much as a lead).'

In [9]:
# Cleaning and Pre Processing

# Raw strings keep the regex escapes (\d, \s, \[ ...) out of Python's own
# string-escape processing, which otherwise warns about invalid escapes.
# Punctuation characters and runs of digits are deleted outright.
replace_with_no_space = re.compile(r"[.;:!'?,\"()\[\]]|\d+")
# A doubled "<br /><br />" tag, a hyphen, or a slash is turned into one space.
# Note: only the doubled tag is matched as a unit; a lone "<br />" just has
# its slash replaced by the second alternative.
replace_with_space = re.compile(r"<br\s*/><br\s*/>|[-/]")
no_space = ""
space = " "

def cleaning_rev(reviews):
    """Normalise review texts for bag-of-words vectorisation.

    Each review is lower-cased once, then punctuation and digits are
    removed, and finally doubled ``<br />`` tags, hyphens and slashes are
    replaced by a single space.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        Cleaned reviews, in the same order as the input.
    """
    cleaned = []
    for line in reviews:
        lowered = line.lower()
        without_punct = replace_with_no_space.sub(no_space, lowered)
        cleaned.append(replace_with_space.sub(space, without_punct))
    return cleaned

# Run both splits through the same cleaning routine.
train_reviews1, test_reviews1 = map(cleaning_rev, (train_reviews, test_reviews))


In [10]:
# Show the review at index 3 after cleaning_rev, for comparison with its raw text.
train_reviews1[3]

'this is easily the most underrated film inn the brooks cannon sure its flawed it does not give a realistic view of homelessness unlike say how citizen kane gave a realistic view of lounge singers or titanic gave a realistic view of italians you idiots many of the jokes fall flat but still this film is very lovable in a way many comedies are not and to pull that off in a story about some of the most traditionally reviled members of society is truly impressive its not the fisher king but its not crap either my only complaint is that brooks should have cast someone else in the lead i love mel as a director and writer not so much as a lead'

In [12]:
# Vectorization
# binary=True records only whether a word occurs in a review, not how often.
vec = CountVectorizer(binary=True)

# Learn the vocabulary from the training reviews and encode them in one step,
# then encode the test reviews with that same vocabulary.
X = vec.fit_transform(train_reviews1)
# `X_tets` (sic) keeps its original spelling because later cells refer to it.
X_tets = vec.transform(test_reviews1)

In [18]:
# Model Classifier
# (LogisticRegression, accuracy_score and train_test_split are imported in
# the notebook's import cell.)

# Label vector: 1 for the first 12,500 rows, 0 for the remaining 12,500.
# NOTE(review): presumably the training file lists all positive reviews
# first and all negative reviews second — confirm against the data source.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Hold out 25% of the training data for validation. The fixed random_state
# makes the shuffle, and therefore the accuracies printed below,
# reproducible across "Restart Kernel -> Run All".
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the inverse regularisation strength C and report the hold-out
# accuracy for each value.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.8728
Accuracy for C=0.05: 0.88288
Accuracy for C=0.25: 0.88256
Accuracy for C=0.5: 0.88064
Accuracy for C=1: 0.8792


In [20]:
# Building the final model using C=0.05, refit on the full training matrix.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)

# NOTE(review): `target` (the training labels) doubles as the test ground
# truth here; this presumes the test file has the same number of reviews and
# the same label ordering as the training file — confirm.
test_predictions = final_model.predict(X_tets)
test_accuracy = accuracy_score(target, test_predictions)
print("Final Accuracy: %s" % test_accuracy)

Final Accuracy: 0.88128


In [25]:
# Map every vocabulary word to the weight the final model learned for it.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; prefer the new accessor and fall back to the old
# one only when the installed version does not provide it.
if hasattr(vec, "get_feature_names_out"):
    _feature_names = vec.get_feature_names_out()
else:
    _feature_names = vec.get_feature_names()

# Convert to plain str/float so the printed tuples do not depend on how the
# installed numpy version renders its scalar types.
feature_to_coef = dict(
    zip(map(str, _feature_names), final_model.coef_[0].tolist())
)

# The five words with the largest positive weights.
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda x: x[1],
        reverse=True)[:5]:
    print(best_positive)

('excellent', 0.9288812013932073)
('perfect', 0.7934640990541783)
('great', 0.67504091945357)
('amazing', 0.6160397757973147)
('superb', 0.6063967544747644)


In [23]:
# Rank (word, weight) pairs from most negative weight upwards and print the
# five lowest.
ranked_ascending = sorted(feature_to_coef.items(), key=lambda pair: pair[1])
for best_negative in ranked_ascending[:5]:
    print(best_negative)

('worst', -1.3679783640763301)
('waste', -1.1684450509165956)
('awful', -1.0277000481174152)
('poorly', -0.8748317510773427)
('boring', -0.8587249498132771)


In [26]:
# Predicted class labels from the final model for every row of the test matrix.
a=final_model.predict(X_tets)

In [33]:
# Inspect only the first 100 predictions rather than dumping the whole array.
a[0:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0])