In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [3]:
# Location of the cleaned IMDb training sample. Setting the IMDB_TRAIN_CSV
# environment variable overrides it, so the notebook can run on another machine
# without editing this cell; the default is the original local path.
data_path = os.environ.get(
    "IMDB_TRAIN_CSV",
    r"C:\Users\bbuser\Desktop\aclImdb_cleaned\imdb_train_cleaned_sample.csv",
)
df = pd.read_csv(data_path)

In [4]:
# Keep only the two modelling columns and drop incomplete rows. The explicit
# copy makes the label assignment below write to an independent frame, which
# avoids pandas' chained-assignment (SettingWithCopy) warning.
df = df[['cleaned_review', 'label']].dropna().copy()

# Encode text labels as 0/1. Testing for "not numeric" (instead of
# dtype == object) also covers pandas' dedicated string/categorical dtypes.
if not pd.api.types.is_numeric_dtype(df['label']):
    label_codes = df['label'].map({'neg': 0, 'pos': 1})
    unmapped = df.loc[label_codes.isna(), 'label'].unique()
    if len(unmapped) > 0:
        # Name the offending values instead of failing later with an opaque
        # NaN-to-int cast error.
        raise ValueError(f"Unexpected label values: {sorted(map(str, unmapped))}")
    df['label'] = label_codes.astype(int)

# Model inputs: review text as str, target as int.
x_text = df['cleaned_review'].astype(str)
y = df['label'].astype(int)

In [5]:
# Stratified 80/20 hold-out split; the fixed seed keeps the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_text,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Unigram + bigram TF-IDF features. Terms that occur in fewer than 3 training
# documents are ignored and the vocabulary is capped at 50,000 entries.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_features=50000,
)

# Fit the vocabulary and IDF weights on the training split only, then apply the
# fitted vectorizer unchanged to the test split.
x_train_tf = vectorizer.fit_transform(x_train)
x_test_tf = vectorizer.transform(x_test)

In [7]:
# Candidate classifiers, keyed by the short name used in the reports and tables.
models = dict(
    logreg=LogisticRegression(max_iter=2000, n_jobs=None),
    dtree=DecisionTreeClassifier(random_state=42),
    rf=RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
)

In [8]:
# Fit every candidate on the TF-IDF training matrix, score it on the held-out
# split, and collect one summary row per model.
rows = []
preds = {}

for name, model in models.items():
    model.fit(x_train_tf, y_train)
    y_pred = model.predict(x_test_tf)
    preds[name] = y_pred  # kept so individual predictions can be compared afterwards

    acc = accuracy_score(y_test, y_pred)
    # Precision / recall / F1 here describe the positive class (label 1) only.
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)

    rows.append({
        "model": name,
        "accuracy": acc,
        "precision": p,
        "recall": r,
        "f1": f1
    })

    print(f"\n=== {name.upper()} — classification report ===")
    # labels=[0, 1] pins "negative"/"positive" to the 0/1 codes instead of
    # relying on both classes being inferred from y_test and y_pred.
    print(classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=["negative", "positive"],
    ))

# Rank the models by positive-class F1, best first.
metrics_df = pd.DataFrame(rows).sort_values("f1", ascending=False).reset_index(drop=True)
print("\n=== Comparison table (higher is better) ===")
print(metrics_df.to_string(index=False))


=== LOGREG — classification report ===
              precision    recall  f1-score   support

    negative       0.90      0.87      0.88      1000
    positive       0.87      0.90      0.89      1000

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000


=== DTREE — classification report ===
              precision    recall  f1-score   support

    negative       0.70      0.70      0.70      1000
    positive       0.70      0.70      0.70      1000

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000


=== RF — classification report ===
              precision    recall  f1-score   support

    negative       0.86      0.88      0.87      1000
    positive       0.87      0.85      0.86      1000

    accuracy                           0.86      2000
   macro avg 

In [9]:
def _sentiment_names(codes):
    """Turn label codes into display strings: 1 -> "positive", anything else -> "negative"."""
    return ["positive" if code == 1 else "negative" for code in codes]


# Draw five distinct test-set positions with a fixed seed so the examples are repeatable.
ex_idx = np.random.RandomState(42).choice(len(x_test), size=5, replace=False)

# One row per sampled review: the text, its true label, and each model's prediction.
example_df = pd.DataFrame({
    "cleaned_review": x_test.iloc[ex_idx].values,
    "true": _sentiment_names(y_test.iloc[ex_idx].values),
    **{
        model_name: _sentiment_names(preds[model_name][ex_idx])
        for model_name in ("logreg", "dtree", "rf")
    },
})

In [10]:
# Plain-text dump of the sample table: a banner line followed by the rows without the index.
print(
    "\n=== Example predictions (5 samples) ===",
    example_df.to_string(index=False),
    sep="\n",
)


=== Example predictions (5 samples) ===
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     cleaned_review     true   logreg    dtree       rf
                                                                                                                                                                                                                                                                                                                                       

In [12]:
# Bare last expression: the notebook renders the sample table as cell output.
example_df

Unnamed: 0,cleaned_review,true,logreg,dtree,rf
0,mystery men one movie get funnier time naive i...,positive,positive,positive,positive
1,show dull lame basically rip sort various thin...,negative,negative,positive,negative
2,interesting hardly scientific evidence movie p...,negative,negative,negative,negative
3,anyone wish get impression soviet view modern ...,positive,positive,positive,positive
4,early group suspiciously old looking teen mani...,negative,negative,positive,negative


##### Conclusion

Logistic Regression is the best model for these cleaned IMDb reviews on the 2,000-review held-out test set: it gets the highest scores (accuracy 0.89, F1 0.888), is fast, and works well with TF-IDF text because a simple linear decision boundary fits the data and is less prone to overfitting. Random Forest comes second (accuracy 0.86, F1 0.862): it’s solid but heavier and harder to explain (you only get rough feature importances). Decision Tree is last (accuracy 0.70, F1 0.703): it’s easy to understand, but a single tree tends to overfit sparse TF-IDF features, so it generalizes worse.