In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


In [2]:
df = pd.read_csv('D:\datasets\IMDB Dataset.csv')
print(df.head(10))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
5  Probably my all-time favorite movie, a story o...  positive
6  I sure would like to see a resurrection of a u...  positive
7  This show was an amazing, fresh & innovative i...  negative
8  Encouraged by the positive comments about this...  negative
9  If you like original gut wrenching laughter yo...  positive


In [3]:
sentiment = LabelEncoder()
df['sentiment_num'] = sentiment.fit_transform(df["sentiment"])
df.head(10)

Unnamed: 0,review,sentiment,sentiment_num
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
5,"Probably my all-time favorite movie, a story o...",positive,1
6,I sure would like to see a resurrection of a u...,positive,1
7,"This show was an amazing, fresh & innovative i...",negative,0
8,Encouraged by the positive comments about this...,negative,0
9,If you like original gut wrenching laughter yo...,positive,1


In [4]:
inputs = df.drop(['sentiment'],axis = 'columns')
inputs.head()

Unnamed: 0,review,sentiment_num
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [5]:
x_train,x_test,y_train,y_test = train_test_split(inputs.review,inputs.sentiment_num,test_size=0.25)

In [6]:
v = CountVectorizer()
x_train_count = v.fit_transform(x_train.values)
x_train_count

<37500x90595 sparse matrix of type '<class 'numpy.int64'>'
	with 5117798 stored elements in Compressed Sparse Row format>

In [7]:
# Naive Bayes
model1 = MultinomialNB()
model1.fit(x_train_count,y_train)

MultinomialNB()

In [8]:
x_test_count = v.transform(x_test)

In [10]:
model1.score(x_test_count,y_test)

0.84832

In [12]:
# KNN
model2 = KNeighborsClassifier(n_neighbors=10)
model2.fit(x_train_count,y_train)
model2.score(x_test_count,y_test)

0.6508

In [11]:
# logistic Regression
model3 = LogisticRegression(solver='liblinear', random_state=0)
model3.fit(x_train_count,y_train)
model3.score(x_test_count,y_test)

0.88872

In [14]:
# Decision Tree
model4 = tree.DecisionTreeClassifier()
model4.fit(x_train_count,y_train)
model4.score(x_test_count,y_test)

0.72504

In [15]:
# SVM
model5 = SVC()
model5.fit(x_train_count,y_train)
model5.score(x_test_count,y_test)

0.87248

In [16]:
# Random Forest
model6 = RandomForestClassifier()
model6.fit(x_train_count,y_train)
model6.score(x_test_count,y_test)

0.85064

In [30]:
review = ['it was boring',
         'i loved it',
         'wasted time, it was quite boring',
         'awesome',
         'loved it']
review_count = v.transform(review)
sol = model1.predict(review_count)

In [31]:
for i in sol:
    if i == 0:
        print("Negative")
    else:
        print("positive")

Negative
positive
Negative
positive
positive


In [13]:
def predict_sentiment(text):
    text = [text]
    sentiments = v.transform(text)
    senti = model3.predict(sentiments)
    if senti == 0:
        return "Negative","\U0001F620"
    else:
        return "positive","\U0001f600"
    

In [18]:
print(predict_sentiment('i loved it'))

('positive', '😀')
