 ## Movie sentiment analysis based on an IMDB reviews dataset.

In [14]:
import sklearn
import numpy as np
import pandas as pd


class Review:
    def __init__(self, text,sentiment):
        self.text=text
        self.sentiment=sentiment

reviews=[]

df=pd.read_csv('IMDB Dataset.csv')
df.head(20)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


### Removing HTML tags from reviews

In [18]:
df['review']=df['review'].str.replace('<br /><br />','')
df.head(20)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [19]:
for entry in df.itertuples():
    reviews.append(Review(entry.review,entry.sentiment))


 ### Preparing the data for use

In [20]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)
x_train= [x.text for x in training]
y_train= [x.sentiment for x in training]

x_test= [x.text for x in test]
y_test= [x.sentiment for x in test]

# print(len(x_test))
# print(len(y_test))


# print(len(x_train))
# print(len(y_train))



### Vectorization of reviews

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer=TfidfVectorizer()
trainingvectors_x=vectorizer.fit_transform(x_train)
testingvectors_x=vectorizer.transform(x_test)

### Logistic Regression classification algorithm

In [22]:
from sklearn.linear_model import LogisticRegression
lr_classifier=LogisticRegression(max_iter=200)
lr_classifier.fit(trainingvectors_x,y_train)

LogisticRegression(max_iter=200)

### Alternative Linear SVC model

In [23]:
from sklearn.svm import LinearSVC
svc_classifier=LinearSVC()
svc_classifier.fit(trainingvectors_x,y_train)

LinearSVC()

### Evaluation of logistic regression model

In [24]:
from sklearn.metrics import f1_score

print(f1_score(y_test, lr_classifier.predict(testingvectors_x), average=None))
print(lr_classifier.score(testingvectors_x,y_test))

[0.89455803 0.89845934]
0.8965454545454545


### Evaluation of SVC model

In [25]:
print(f1_score(y_test, svc_classifier.predict(testingvectors_x), average=None))
print(svc_classifier.score(testingvectors_x,y_test))

[0.89781337 0.90104384]
0.8994545454545455


Custom datasets example:

In [26]:
inputs=['this movie was meh','great movie','below expectations','i liked it','a waste of time','a letdown','could have been much better']
testing_inputs=vectorizer.transform(inputs)
lr_classifier.predict(testing_inputs)

array(['negative', 'positive', 'negative', 'positive', 'negative',
       'negative', 'negative'], dtype='<U8')

Using logistic regression algorithm, we've managed to train a model that predicts with up to 90% accuracy the sentiment of movie reviews, having such a large dataset is key.

In [27]:
svc_classifier.predict(testing_inputs)

array(['positive', 'positive', 'negative', 'positive', 'negative',
       'negative', 'negative'], dtype='<U8')

Although the SVC model shows better performance metrics, it failed to accurately predict the sentiment of the first text, and we will be checking if our model can be improved further by passing different values for certain parameters to the GridSearchCV function.

### Improving our model

In [28]:
from sklearn.model_selection import GridSearchCV

param={'C':(1,4,8,12,18)}
svc= LinearSVC()
improved_clf=GridSearchCV(svc,param,cv=5)
improved_clf.fit(trainingvectors_x,y_train)

#print(improved_clf.score(testingvectors_x,y_test))


GridSearchCV(cv=5, estimator=LinearSVC(), param_grid={'C': (1, 4, 8, 12, 18)})

In [29]:
import pickle

with open('classifier.pkl', 'wb') as f:
    pickle.dump(lr_classifier, f)
with open('classifier_svm.pkl','wb') as f:
    pickle.dump(svc_classifier,f)