In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from pathlib import Path
import re, nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [37]:
df = pd.read_csv("imdb_cleaned.csv")

In [38]:
df

Unnamed: 0,review,cleaned_review,label
0,"Silent Night, Deadly Night 5 is the very last ...",silent night deadly night last series like par...,0
1,The idea ia a very short film with a lot of in...,idea short film lot information interesting en...,1
2,"For me, this movie just seemed to fall on its ...",movie seemed fall face main problem casting gl...,0
3,Was this based on a comic-book? A video-game? ...,based comic book video game drawing year old n...,1
4,Caution: May contain spoilers...<br /><br />I'...,caution may contain spoiler seen movie time li...,1
...,...,...,...
9995,This is the best movie I've come across in a l...,best movie come across long best movie kind sc...,1
9996,**Possible Spoiler*** Adam Sandler is usually ...,possible spoiler adam sandler usually typecast...,1
9997,This was a new alltime low among westerns. The...,new alltime low among western writing excrucia...,0
9998,"I enjoyed ""American Movie"", so I rented Chris ...",enjoyed american movie rented chris smith firs...,0


In [39]:


# Keep required columns
_req = {"cleaned_review", "label"}
missing = _req - set(df.columns)
if missing:
    raise KeyError(f"Missing columns: {missing}. Expected {_req}")

data = df[["cleaned_review", "label"]].dropna().copy()
texts = data["cleaned_review"].astype(str)
y = data["label"].astype("int8")


In [40]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50_000, min_df=2)
X = vectorizer.fit_transform(texts)


In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [42]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
}

for name, clf in models.items():
    clf.fit(X_train, y_train)


In [43]:
def eval_model(clf):
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )
    return acc, p, r, f1, y_pred

rows, reports = [], {}
for name, clf in models.items():
    acc, p, r, f1, y_pred = eval_model(clf)
    rows.append({"Model": name, "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})
    reports[name] = classification_report(
        y_test, y_pred, target_names=["negative", "positive"], zero_division=0
    )

comparison_df = pd.DataFrame(rows).sort_values("F1", ascending=False).reset_index(drop=True)

print("\n=== Classification Reports ===")
for n, rep in reports.items():
    print(f"\n{n}\n{rep}")

comparison_df



=== Classification Reports ===

Logistic Regression
              precision    recall  f1-score   support

    negative       0.90      0.86      0.88      1008
    positive       0.86      0.90      0.88       992

    accuracy                           0.88      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.88      0.88      0.88      2000


Decision Tree
              precision    recall  f1-score   support

    negative       0.71      0.67      0.69      1008
    positive       0.69      0.73      0.71       992

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000


Random Forest
              precision    recall  f1-score   support

    negative       0.84      0.84      0.84      1008
    positive       0.84      0.84      0.84       992

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84  

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.879,0.863372,0.898185,0.880435
1,Random Forest,0.8385,0.836858,0.837702,0.83728
2,Decision Tree,0.7,0.686667,0.726815,0.70617


In [44]:
best_model_name = comparison_df.iloc[0]["Model"]
best_model = models[best_model_name]

sample_texts = data["cleaned_review"].sample(5, random_state=7).tolist()
sample_preds = best_model.predict(vectorizer.transform(sample_texts))
label_inv = {0: "negative", 1: "positive"}

print(f"Best model by F1: {best_model_name}\n")
for t, p in zip(sample_texts, sample_preds):
    print(f"REVIEW: {t[:250]}{'...' if len(t) > 250 else ''}")
    print(f"PRED  : {label_inv[int(p)]}")
    print("-" * 80)


Best model by F1: Logistic Regression

REVIEW: cant describe terrible movie one find way animated totaly disgusting men rohan look fake poorly colored hair doesnt even come close matching actor look like orcs looked terrible gandalf walk bad limp cant pronounce thing correctly saruman isnt aruman...
PRED  : negative
--------------------------------------------------------------------------------
REVIEW: knowledgeable fan recommend film faithful fact well acted year old living istanbul heard friend talking new music sensation caused girl scream thought hmmmm girl like must crap record haley mill everly brother ricky nelson soon vacation family milita...
PRED  : positive
--------------------------------------------------------------------------------
REVIEW: know minority uwe boll talented frog even toad frog reminiscent hundred talent hack churn one useless crap fest another movie crap fest slater talent minimally utilized leading one believe got thing like failed relationship mind reid

In [45]:
comparison_df.to_csv("imdb_model_comparison.csv", index=False)
print("Saved to imdb_model_comparison.csv")


Saved to imdb_model_comparison.csv
