<a href="https://colab.research.google.com/github/iemAnshuman/AI-Projects/blob/main/movie_review_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv("IMDB Dataset.csv")

In [5]:
!pip install nltk

import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
import re
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

df["sentences"] = df["review"].apply(sent_tokenize)
df[["review", "sentences"]]

Unnamed: 0,review,sentences
0,One of the other reviewers has mentioned that ...,[One of the other reviewers has mentioned that...
1,A wonderful little production. <br /><br />The...,"[A wonderful little production., <br /><br />T..."
2,I thought this was a wonderful way to spend ti...,[I thought this was a wonderful way to spend t...
3,Basically there's a family where a little boy ...,[Basically there's a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter Mattei's ""Love in the Time of Money"" i..."
...,...,...
49995,I thought this movie did a down right good job...,[I thought this movie did a down right good jo...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...","[Bad plot, bad dialogue, bad acting, idiotic d..."
49997,I am a Catholic taught in parochial elementary...,[I am a Catholic taught in parochial elementar...
49998,I'm going to have to disagree with the previou...,[I'm going to have to disagree with the previo...


In [8]:
df["tokens_raw"] = df["review"].apply(word_tokenize)
df[["review", "tokens_raw"]]

Unnamed: 0,review,tokens_raw
0,One of the other reviewers has mentioned that ...,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,"[A, wonderful, little, production, ., <, br, /..."
2,I thought this was a wonderful way to spend ti...,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,"[Basically, there, 's, a, family, where, a, li..."
4,"Petter Mattei's ""Love in the Time of Money"" is...","[Petter, Mattei, 's, ``, Love, in, the, Time, ..."
...,...,...
49995,I thought this movie did a down right good job...,"[I, thought, this, movie, did, a, down, right,..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...","[Bad, plot, ,, bad, dialogue, ,, bad, acting, ..."
49997,I am a Catholic taught in parochial elementary...,"[I, am, a, Catholic, taught, in, parochial, el..."
49998,I'm going to have to disagree with the previou...,"[I, 'm, going, to, have, to, disagree, with, t..."


In [9]:
eng_stop = set(stopwords.words("english"))

def clean_tokens(tokens):
    cleaned = []
    for t in tokens:
        t = t.lower()                          # normalize case
        t = re.sub(r"[^a-z']+", " ", t)        # keep letters and apostrophes; drop digits/punct → spaces
        t = t.strip()
        if t and t not in eng_stop:            # remove empty strings and stopwords
            cleaned.append(t)
    return cleaned

df["tokens_clean"] = df["tokens_raw"].apply(clean_tokens)
df[["review", "tokens_clean"]]

Unnamed: 0,review,tokens_clean
0,One of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, oz, epis..."
1,A wonderful little production. <br /><br />The...,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,"[basically, 's, family, little, boy, jake, thi..."
4,"Petter Mattei's ""Love in the Time of Money"" is...","[petter, mattei, 's, love, time, money, '', vi..."
...,...,...
49995,I thought this movie did a down right good job...,"[thought, movie, right, good, job, n't, creati..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,"[catholic, taught, parochial, elementary, scho..."
49998,I'm going to have to disagree with the previou...,"['m, going, disagree, previous, comment, side,..."


In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df["tokens_clean"], df["sentiment"], test_size=0.2)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

x_train_str = [" ".join(tokens) for tokens in x_train]
x_test_str = [" ".join(tokens) for tokens in x_test]

tfidf_vectorizer = TfidfVectorizer()

x_train_tfidf = tfidf_vectorizer.fit_transform(x_train_str)
x_test_tfidf = tfidf_vectorizer.transform(x_test_str)

print("TF-IDF vectorized data shape (train):", x_train_tfidf.shape)
print("TF-IDF vectorized data shape (test):", x_test_tfidf.shape)

TF-IDF vectorized data shape (train): (40000, 90645)
TF-IDF vectorized data shape (test): (10000, 90645)


In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train_tfidf, y_train)

In [17]:
from sklearn.naive_bayes import MultinomialNB

model2 = MultinomialNB()
model2.fit(x_train_tfidf, y_train)

In [18]:
y1_prediction = model.predict(x_test_tfidf)
y2_prediction = model2.predict(x_test_tfidf)

In [20]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

def evaluate(y_true, y_pred, name="model"):
    acc = accuracy_score(y_true, y_pred)
    # Explicitly set pos_label to 'positive' since the labels are strings
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", pos_label="positive")
    print(f"=== {name} ===")
    print(f"Accuracy: {acc:.4f}  Precision: {p:.4f}  Recall: {r:.4f}  F1: {f1:.4f}")
    print("\nClassification report:\n", classification_report(y_true, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))

evaluate(y_test, y1_prediction, "LogisticRegression")
evaluate(y_test, y2_prediction, "MultinomialNB")

=== LogisticRegression ===
Accuracy: 0.8980  Precision: 0.8912  Recall: 0.9088  F1: 0.8999

Classification report:
               precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4954
    positive       0.89      0.91      0.90      5046

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion matrix:
 [[4394  560]
 [ 460 4586]]
=== MultinomialNB ===
Accuracy: 0.8671  Precision: 0.8781  Recall: 0.8553  F1: 0.8666

Classification report:
               precision    recall  f1-score   support

    negative       0.86      0.88      0.87      4954
    positive       0.88      0.86      0.87      5046

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Confusion matrix:
 [[4355  599]
 [ 730 4316]]
