In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [2]:
FILE_PATH = "imdb_cleaned_sample.csv"

# Read only the two columns the experiment needs, discard rows with a missing
# value in either of them, then normalise both columns to trimmed strings
# (labels are additionally lower-cased).
df = (
    pd.read_csv(FILE_PATH)[["cleaned_review", "label"]]
    .dropna()
    .assign(
        cleaned_review=lambda frame: frame["cleaned_review"].astype(str).str.strip(),
        label=lambda frame: frame["label"].astype(str).str.strip().str.lower(),
    )
)

In [3]:
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,cleaned_review,label
0,like ronald colman dashing debonair fellow see...,1
1,found film fascinating study family crisis leo...,1
2,thief liar present naturalistic depiction leve...,1
3,cant understand decided release film introduce...,1
4,screwball comedy romantic mismatch new york ci...,1
...,...,...
9995,film tell story extrovert frannie suddenly ret...,0
9996,lous film purple butterfly pit secret organiza...,0
9997,biggest mystery veronica mar one tackle screen...,0
9998,live salt lake city mormon rent movie well liv...,0


In [4]:
# Replace the string codes "1"/"0" with readable class names. The mapping is
# skipped entirely when the column holds anything other than those two codes.
code_to_name = {"1": "positive"}
if set(df["label"]).issubset({"0", "1"}):
    df["label"] = df["label"].map(lambda code: code_to_name.get(code, "negative"))

In [6]:
# Hold out 20% of the reviews for evaluation. Stratifying on the label keeps
# the class proportions the same in both partitions; the fixed random_state
# makes the split repeatable.
reviews = df["cleaned_review"].values
sentiments = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(
    reviews, sentiments, test_size=0.2, random_state=42, stratify=sentiments
)

In [7]:
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 10,000 features.
tfidf_options = {
    "stop_words": "english",
    "ngram_range": (1, 2),
    "max_features": 10000,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer so no test information leaks into the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [8]:
# Candidate classifiers, all seeded with random_state=42 for repeatable fits.
log_reg = LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)
decision_tree = DecisionTreeClassifier(max_depth=40, random_state=42)
random_forest = RandomForestClassifier(
    n_estimators=200, max_depth=25, n_jobs=-1, random_state=42
)

# Display name -> estimator; insertion order is the order they are trained in.
models = {
    "Logistic Regression": log_reg,
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
}

In [9]:
# Train every candidate on the TF-IDF training matrix, score it on the test
# matrix, print its per-class report, and collect one summary record per model.
metric_columns = ("Accuracy", "Precision (weighted)", "Recall (weighted)", "F1 (weighted)")
rows = []   # summary records, one dict per model
preds = {}  # model name -> test-set predictions (used for the example table)

for name, clf in models.items():
    # fit() hands back the estimator itself, so prediction can be chained on it.
    y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)
    preds[name] = y_pred

    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="weighted", zero_division=0
    )

    print(f"\n=== Classification Report: {name} ===")
    print(classification_report(y_test, y_pred, digits=3, zero_division=0))

    # Every score is stored rounded to four decimals.
    rounded = [round(score, 4) for score in (acc, prec, rec, f1)]
    rows.append({"Model": name, **dict(zip(metric_columns, rounded))})


=== Classification Report: Logistic Regression ===
              precision    recall  f1-score   support

    negative      0.853     0.844     0.849      1000
    positive      0.846     0.855     0.850      1000

    accuracy                          0.850      2000
   macro avg      0.850     0.849     0.849      2000
weighted avg      0.850     0.850     0.849      2000


=== Classification Report: Decision Tree ===
              precision    recall  f1-score   support

    negative      0.692     0.672     0.682      1000
    positive      0.681     0.701     0.691      1000

    accuracy                          0.686      2000
   macro avg      0.687     0.686     0.686      2000
weighted avg      0.687     0.686     0.686      2000


=== Classification Report: Random Forest ===
              precision    recall  f1-score   support

    negative      0.821     0.797     0.809      1000
    positive      0.803     0.826     0.814      1000

    accuracy                          

In [11]:
# Rank the models by weighted F1, best first, then renumber the rows from 0
# so that position 0 is the top-scoring model.
ranked = pd.DataFrame(rows).sort_values(by="F1 (weighted)", ascending=False)
metrics_df = ranked.reset_index(drop=True)
display(metrics_df)

Unnamed: 0,Model,Accuracy,Precision (weighted),Recall (weighted),F1 (weighted)
0,Logistic Regression,0.8495,0.8495,0.8495,0.8495
1,Random Forest,0.8115,0.8118,0.8115,0.8115
2,Decision Tree,0.6865,0.6867,0.6865,0.6864


In [14]:
# Show a handful of randomly chosen test reviews with the true label and each
# model's prediction side by side.
np.random.seed(42)  # fixed seed so the same reviews are drawn on every run
k = min(5, len(X_test))  # never ask for more samples than the test set holds
# An integer population is sampled as if it were np.arange(len(X_test)),
# so this draws the same indices as sampling from range(len(X_test)).
idx = np.random.choice(len(X_test), size=k, replace=False)

examples = []
for i in idx:
    review = X_test[i]
    row = {
        # Long reviews are cut to their first 200 characters plus an ellipsis.
        "review_snippet": (review[:200] + "...") if len(review) > 200 else review,
        "true_label": y_test[i],
    }
    for name in models:
        row[f"{name}_pred"] = preds[name][i]
    examples.append(row)

examples_df = pd.DataFrame(examples)
# Report the number of rows actually sampled rather than a hard-coded 5,
# which would be wrong whenever the test set has fewer than five reviews.
print(f"\n=== Example Predictions ({k} random test reviews) ===")
display(examples_df)


=== Example Predictions (5 random test reviews) ===


Unnamed: 0,review_snippet,true_label,Logistic Regression_pred,Decision Tree_pred,Random Forest_pred
0,twenty five year ago showed film childrens cla...,positive,positive,positive,positive
1,man farthest reach earth traveling new world n...,negative,negative,negative,negative
2,first get personal feeling way let start sayin...,negative,negative,negative,negative
3,likely voted best comedy year many coincidence...,positive,positive,negative,positive
4,real challenge make movie baby devoured wild c...,negative,positive,negative,negative


In [15]:
# The metrics table is sorted best-first, so its first row is the winner.
best_row = metrics_df.iloc[0]
best_summary = (
    f"Acc={best_row['Accuracy']}, P={best_row['Precision (weighted)']}, "
    f"R={best_row['Recall (weighted)']}, F1={best_row['F1 (weighted)']}"
)
print(f"\nBest model by weighted F1: {best_row['Model']} ({best_summary})")


Best model by weighted F1: Logistic Regression (Acc=0.8495, P=0.8495, R=0.8495, F1=0.8495)
