In [11]:
import os 
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

import re, string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Data Exploration/Cleaning

In [12]:
path_data = "../data/imdb_data.csv"
data = pd.read_csv(path_data)

In [13]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [14]:
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [15]:
def clean_text(text):
    text = text.replace('<br /><br />','')
    text = text.lower() 
    text = text.strip()  
    text = re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

In [16]:
data["review_clean"] = data["review"].apply(lambda x: clean_text(x))

In [17]:
def remove_stopwords(text, stopwords):
    text = ' '.join([i for i in text.split() if i not in stopwords])
    return text

In [18]:
stopwords_list = list(set(stopwords.words("english")))
data["review_clean"] = data["review"].apply(lambda x: remove_stopwords(x, stopwords_list))

In [19]:
def apply_lemmatizer(text, lemmatizer):
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

In [20]:
lemmatizer = WordNetLemmatizer()
data["review_clean"] = data["review"].apply(lambda x: apply_lemmatizer(x, lemmatizer))

## Feature engineering 

In [21]:
X = data["review_clean"]
y = data["sentiment"]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [36]:
data.iloc[X_test.index][["review"]].sample(n=30)

Unnamed: 0,review
18627,"As a fan of Notorious B.I.G., I was looking fo..."
18084,"Deep Water (2006) ****<br /><br />""It is indif..."
1017,Turn your backs away or you're gonna get in bi...
38062,This is a great ending to the show. The fact t...
48607,"Yes, bad acting isn't only one thing to mentio..."
43104,"This was one of the most dishonest, meaningles..."
38567,Seeing that this got a theatrical release nowh...
16978,The primary aspect of this film which most peo...
47293,I've had this movie on tape for years and star...
40253,"When I think about TV movies, I always think o..."


In [35]:
data.iloc[X_test.index][["review"]].sample(n=30).to_csv(r"../data/test_reviews.csv", index=False)

In [23]:
# Apply TF-IDF on tokens
tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=0.05)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [24]:
joblib.dump(tfidf_vectorizer, "../artifacts/tfidf-vectorizer.sav")

['../artifacts/tfidf-vectorizer.sav']

## Model training 

In [25]:
# Logistic Regression
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)
lr.fit(X_train_tfidf, y_train)

y_pred = lr.predict(X_test_tfidf)
print("Logistic Regression:")
print("train:", accuracy_score(y_train, lr.predict(X_train_tfidf)))
print("test:", accuracy_score(y_test, y_pred))

Logistic Regression:
train: 0.8442666666666667
test: 0.83272


In [26]:
joblib.dump(lr, "../artifacts/logistic_regression.sav")

['../artifacts/logistic_regression.sav']

## Tests with other models

In [27]:
# Multinomial NB
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

y_pred = mnb.predict(X_test_tfidf)
print("Multinomial NB:")
print("train:", accuracy_score(y_train, mnb.predict(X_train_tfidf)))
print("test:", accuracy_score(y_test, y_pred))

Multinomial NB:
train: 0.81584
test: 0.81248


In [28]:
# Random Forest
forest = RandomForestClassifier(max_depth=12, n_estimators=200, min_samples_split=4, criterion="entropy", random_state=0)
forest.fit(X_train_tfidf, y_train)

y_pred = forest.predict(X_test_tfidf)
print("Random Forest:")
print("train:", accuracy_score(y_train, forest.predict(X_train_tfidf)))
print("test:", accuracy_score(y_test, y_pred))

Random Forest:
train: 0.8598133333333333
test: 0.78712
