# IMDB Movie Reviews Sentiment Analysis

In [1]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-imuu3pll
  Running command git clone --filter=blob:none --quiet https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-imuu3pll
  Resolved https://github.com/laxmimerit/preprocess_kgptalkie.git to commit 9ca68d37027af9f6a30d54640347ce3b2e2694b3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-py3-none-any.whl size=7620 sha256=437ae31d4152fcf1f3850fe76e916391dabf6828a3e735ac22583c204337215d
  Stored in directory: /tmp/pip-ephem-wheel-cache-mi4jy1a9/wheels/5c/94/34/99d5ff65e88b8d9a6c5e8d8652f2311d87790a61a1b7466e21
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptal

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

In [4]:
import preprocess_kgptalkie as ps

In [6]:
df = pd.read_csv('/content/drive/MyDrive/imdb_reviews.txt',sep = '\t', header= None)

In [8]:
df.head() , df.shape

(                                                   0  1
 0  A very, very, very slow-moving, aimless movie ...  0
 1  Not sure who was more lost - the flat characte...  0
 2  Attempting artiness with black & white and cle...  0
 3       Very little music or anything to speak of.    0
 4  The best scene in the movie was when Gerardo i...  1,
 (748, 2))

In [9]:
df.columns = ['reviews','sentiment']

In [10]:
df.head()

Unnamed: 0,reviews,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [13]:
df['reviews'] = df['reviews'].apply(lambda x: ps.cont_exp(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_special_chars(x))

df['reviews'] = df['reviews'].apply(lambda x: ps.remove_accented_chars(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_emails(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_html_tags(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_urls(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.make_base(x))

In [15]:
df['reviews'] = df['reviews'].apply(lambda x: str(x).lower())

In [16]:
df

Unnamed: 0,reviews,sentiment
0,a very very very slowmoving aimless movie abou...,0
1,not sure who was more lose the flat character ...,0
2,attempt artiness with black white and clever c...,0
3,very little music or anything to speak of,0
4,the good scene in the movie was when gerardo i...,1
...,...,...
743,i just get bored watch jessice lange take her ...,0
744,unfortunately any virtue in this film producti...,0
745,in a word it is embarrass,0
746,exceptionally bad,0


## Data Preparation for Model Training

In [17]:
x = df['reviews']
y = df['sentiment']

In [19]:
x_train , x_test , y_train , y_test = train_test_split(x ,y , test_size=0.2 , random_state =0, stratify = y)

In [20]:
x_train.shape , x_test.shape

((598,), (150,))

## ML Model Building

In [48]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf',LogisticRegression(solver = 'liblinear'))
])

In [49]:
hyperparameters = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    'clf__penalty': ('l2', 'l1'),
    'clf__C': (1,2)
}

In [50]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1 , cv=None)

In [51]:
%%time
clf.fit(x_train ,  y_train)

CPU times: user 853 ms, sys: 35.7 ms, total: 888 ms
Wall time: 19.5 s


In [52]:
clf.best_estimator_

In [53]:
clf.best_params_

{'clf__C': 2,
 'clf__penalty': 'l2',
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 0.5,
 'tfidf__ngram_range': (1, 1),
 'tfidf__use_idf': True}

In [54]:
clf.best_score_

0.7792436974789916

In [55]:
y_pred = clf.predict(x_test)

In [56]:
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.74      0.73      0.73        73
           1       0.74      0.75      0.75        77

    accuracy                           0.74       150
   macro avg       0.74      0.74      0.74       150
weighted avg       0.74      0.74      0.74       150



## SVM

In [57]:
from sklearn.svm import LinearSVC

In [58]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

In [59]:
hyperparameters = {
    'tfidf__max_df': (0.5, 1.0),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    'clf__C': (1,2,2.5,3)
}

In [60]:
clf_svm = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv = 5)

In [61]:
clf_svm.fit(x_train, y_train)

In [65]:
clf_svm.best_params_

{'clf__C': 1,
 'tfidf__analyzer': 'word',
 'tfidf__max_df': 1.0,
 'tfidf__ngram_range': (1, 1),
 'tfidf__use_idf': True}

In [66]:
clf_svm.best_score_

0.7858963585434173

In [68]:
y_pred = clf_svm.predict(x_test)

In [69]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73        73
           1       0.75      0.70      0.72        77

    accuracy                           0.73       150
   macro avg       0.73      0.73      0.73       150
weighted avg       0.73      0.73      0.73       150



## Model testing and saving

In [70]:
X = ['this is great moview. i loved it', 'i have watched this movie. plot is straight. return my money']

In [71]:
clf_svm.predict(X)

array([1, 0])

In [72]:
import pickle as pkl

In [73]:
pkl.dump(clf , open('model.pkl','wb'))