In [1]:
import numpy as np
import pandas as pd

In [2]:
#Loading the dataset
df = pd.read_csv('../data/IMDB Dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review          50000 non-null  object
 1   length of text  50000 non-null  int64 
 2   sentiment       50000 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [3]:
#exploring categories
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [4]:
df.sentiment[:10]

0    positive
1    positive
2    positive
3    positive
4    positive
5    positive
6    positive
7    positive
8    positive
9    negative
Name: sentiment, dtype: object

In [5]:
#changing positive to 1 and negative to 0
y = np.where(df.sentiment=='positive',1,0)
y[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [6]:
#split the dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.review ,y, test_size=0.2, stratify=y)

In [7]:
#vectorize the dataset
from sklearn.feature_extraction.text import TfidfVectorizer

tfid = TfidfVectorizer(lowercase=True,ngram_range=(1,2), min_df=10, max_df=0.8)
X_train_tfid = tfid.fit_transform(X_train)

In [8]:
list(tfid.vocabulary_)[-10:]

['by jeremy',
 'illusive',
 'well sorry',
 'alley cat',
 'temptations',
 'sense why',
 'again since',
 'at little',
 'so likable',
 'the intervening']

In [9]:
#quick evaluation of model compare to dummy
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

score_dummy = cross_val_score(DummyClassifier(strategy='uniform'),X_train_tfid, y_train)
score_lr = cross_val_score(LogisticRegression(max_iter=1000), X_train_tfid, y_train)

print(f'Dummy Score: {score_dummy.mean():0.1f} +- {score_dummy.std():0.2f}')
print(f'LR Score: {score_lr.mean():0.4f} +- {score_lr.std():0.2f}')

Dummy Score: 0.5 +- 0.01
LR Score: 0.9018 +- 0.00


In [10]:
#train and test logistic regressor model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfid,y_train)

X_test_tfid = tfid.transform(X_test)
lr.score(X_test_tfid,y_test)

0.9083

In [11]:
#Assemble pipeline with vectorizer, feature selection, logistic regression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

lr_pipeline = Pipeline([('vect', TfidfVectorizer(lowercase=True,ngram_range=(1,2), min_df=10, max_df=0.5)),
                        ('fs', SelectFromModel(estimator=LogisticRegression(max_iter=1000))),
                        ('lr', LogisticRegression())
                       ])

lr_pipeline.fit(X_train, y_train)
lr_pipeline.score(X_test, y_test)

0.9094

In [12]:
'''
from sklearn.model_selection import GridSearchCV

params={'vect__min_df':[1,5,10,50],
        'vect__max_df':[0.5,0.7,0.8,0.9]
        }
grid_lr_pipeline=GridSearchCV(lr_pipeline, params, cv=3, n_jobs=-1).fit(X_train, y_train)
grid_lr_pipeline.best_params_'''

"\nfrom sklearn.model_selection import GridSearchCV\n\nparams={'vect__min_df':[1,5,10,50],\n        'vect__max_df':[0.5,0.7,0.8,0.9]\n        }\ngrid_lr_pipeline=GridSearchCV(lr_pipeline, params, cv=3, n_jobs=-1).fit(X_train, y_train)\ngrid_lr_pipeline.best_params_"

In [13]:
#testing the pipeline model
from sklearn.metrics import confusion_matrix

y_pred = lr_pipeline.predict(X_train)
confusion = confusion_matrix(y_train, y_pred,normalize='true')
print(f'Training Accuracy:{lr_pipeline.score(X_train,y_train)}')
print(f'Test Accuracy:{lr_pipeline.score(X_test,y_test)}')
print(f'Confusion Matrix:',confusion)

Training Accuracy:0.9442
Test Accuracy:0.9094
Confusion Matrix: [[0.9379 0.0621]
 [0.0495 0.9505]]


In [14]:
#Saving and exporting model
import pickle

filename = 'sentiment_model_pipeline.sav'
pickle.dump(lr_pipeline, open(filename, 'wb'))