In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as snb

#using os library for don't want to get kernel appears error!!!
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"


import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

In [2]:
data = pd.read_csv("IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data["sentiment"].replace({"positive":1,"negative":0},inplace=True)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
data.shape

(50000, 2)

In [5]:
data_positive = data[data["sentiment"]==1].sample(5000)# getting 5000 samples with shuffle
data_negative = data[data["sentiment"]==0].sample(5000)
new_data = pd.concat([data_positive,data_negative])
new_data.head()

Unnamed: 0,review,sentiment
19451,"for many and many years, gaijin have visited j...",1
40332,Being 15 myself I enjoyed this flick thourough...,1
42883,I find this movie the best movie I have ever s...,1
30938,"Before Barton jumps all over my remarks, let's...",1
5247,For the sake of propaganda during World War II...,1


In [6]:
index_list = []
for i in range(0,new_data.shape[0]):
    index_list.append(i)

In [7]:
new_data = new_data.sample(10000) # setting index numbers after get data with shuffle method
new_data.head()
new_data = new_data.set_axis(index_list,axis ="index")

In [8]:
new_data.sentiment.value_counts()

sentiment
1    5000
0    5000
Name: count, dtype: int64

In [9]:
new_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [10]:
new_data.review[4]

"First of all that I would like to say is that Edison Chen is extremely hot and that Sam Lee is looking much better than before XD! This is probably one of the most original movies I have seen so far; shows a poverty lifestyle background of a Cambodian. The Cambodian(Edison aka Pang) goes around killing people to survive himself; has done it throughout his entire life. Sam Lee's(Wai) duty is to capture the Cambodian for good. There are tons of violent actions but has a good story to it. The movie shows the struggles between those two characters; they both beat each other like angry dogs. GO AND WATCH PPL...STRONGLY SUGGESSTED!!! (GO HK FILMS)"

In [11]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def stop_words(text):
# Process the text using spaCy
    doc = nlp(text)

# Remove stopwords
    filtered_words = [token.text for token in doc if not token.is_stop]

# Join the filtered words to form a clean text
    clean_text = ' '.join(filtered_words)
    return clean_text

In [12]:
stop_words(new_data.review[4])

"like Edison Chen extremely hot Sam Lee looking better XD ! probably original movies seen far ; shows poverty lifestyle background Cambodian . Cambodian(Edison aka Pang ) goes killing people survive ; entire life . Sam Lee's(Wai ) duty capture Cambodian good . tons violent actions good story . movie shows struggles characters ; beat like angry dogs . WATCH PPL ... STRONGLY SUGGESSTED ! ! ! ( HK FILMS )"

In [13]:
#deleting \ , "", <br /><br /> , stopwords

def preprocessing(text):
    text = stop_words(text)
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W+]', ' ', text.lower())
    return text

In [14]:
preprocessing(new_data.review[4])

'like edison chen extremely hot sam lee looking better xd   probably original movies seen far   shows poverty lifestyle background cambodian   cambodian edison aka pang   goes killing people survive   entire life   sam lee s wai   duty capture cambodian good   tons violent actions good story   movie shows struggles characters   beat like angry dogs   watch ppl     strongly suggessted         hk films  '

In [15]:
new_data.review = new_data.review.apply(preprocessing) # apply preprocessing to all review data in dataframe
new_data.head()

Unnamed: 0,review,sentiment
0,ocean probably better ocean know people disa...,1
1,saw sundance figure won directing award pa...,0
2,case script plays audience manner serves exten...,1
3,mccabe mrs miller takes place turn ...,0
4,like edison chen extremely hot sam lee looking...,1


In [16]:
new_data.shape

(10000, 2)

In [17]:
vectorizer = TfidfVectorizer(lowercase=False)

In [18]:
vectorized_text = vectorizer.fit_transform(new_data.review).toarray()
vectorized_text

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
vectorized_text.shape

(10000, 52002)

In [20]:
x_train,x_test,y_train,y_test = train_test_split(vectorized_text,new_data.sentiment,test_size=0.2,random_state=0,stratify=new_data.sentiment)
x_train.shape

(8000, 52002)

In [21]:
model = LogisticRegressionCV(max_iter=10000)
model.fit(x_train,y_train)

In [22]:
model.score(x_test,y_test)

0.881

In [23]:
y_predicted = model.predict(x_test)

In [24]:
from sklearn.metrics import classification_report
cr = classification_report(y_pred=y_predicted,y_true=y_test)

In [25]:
print(cr) # Accuracy and other scores:

              precision    recall  f1-score   support

           0       0.90      0.85      0.88      1000
           1       0.86      0.91      0.88      1000

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000



In [26]:
import pickle
pickle.dump(model,open("model_of_sentiment_analysis.pkl","wb"))
pickle.dump(vectorizer,open("vectorizer_of_sentiment_analysis.pkl","wb"))