In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
nlp=spacy.load("en_core_web_sm")

In [3]:
df=pd.read_csv("imdb_reviews.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [6]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
df['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [8]:
def process_text(text):
    text=re.sub("<br ?/?>", "", text)
    text=re.sub("[^\w\s]", "", text)
    doc=nlp(text)
    processed_text=[token.lemma_.lower() for token in doc if not token.is_stop]
    final_text=" ".join(processed_text)
    return final_text

In [9]:
df['review']=df['review'].apply(process_text)
df.head()

Unnamed: 0,review,sentiment
0,reviewer mention watch 1 oz episode ll hook ri...,1
1,wonderful little production filming technique ...,1
2,think wonderful way spend time hot summer week...,1
3,basically s family little boy jake think s zom...,0
4,petter matteis love time money visually stunni...,1


In [10]:
X=df['review']
y=df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [11]:
tfidf=TfidfVectorizer()

X_train_tfidf=tfidf.fit_transform(X_train)
X_test_tfidf=tfidf.transform(X_test)

In [12]:
model=KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_tfidf, y_train)

KNeighborsClassifier()

In [13]:
y_pred=model.predict(X_test_tfidf)

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.72      0.76      6250
           1       0.75      0.83      0.79      6250

    accuracy                           0.78     12500
   macro avg       0.78      0.78      0.78     12500
weighted avg       0.78      0.78      0.78     12500



In [15]:
confusion_matrix(y_test, y_pred)

array([[4511, 1739],
       [1054, 5196]], dtype=int64)

In [22]:
def final_model_report(y_true, y_prediction):
    print("model accuracy: ", accuracy_score(y_test, y_prediction))
    print("="*100)
    print("classification report: \n", classification_report(y_true, y_prediction))

In [23]:
final_model_report(y_test, y_pred)

model accuracy:  0.77656
classification report: 
               precision    recall  f1-score   support

           0       0.81      0.72      0.76      6250
           1       0.75      0.83      0.79      6250

    accuracy                           0.78     12500
   macro avg       0.78      0.78      0.78     12500
weighted avg       0.78      0.78      0.78     12500

