In [1]:
import os

import pandas as pd

In [2]:
# Location of the cleaned IMDB reviews CSV.
# The IMDB_DATA_PATH environment variable, when set, takes precedence so the
# notebook can be run on machines other than the one it was written on;
# otherwise the original local path is used unchanged.
data_path = os.environ.get(
    "IMDB_DATA_PATH",
    r"C:\Users\bbuser\Downloads\imdb_movie_cleaned.csv",
)

# Load the whole file into memory as a DataFrame.
df = pd.read_csv(data_path)

In [3]:
# Bare last expression: the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,id,rating,txt,label,cleaned_review
0,4506,10,I want to add to the praise for the production...,1,want add praise production film especially lum...
1,537,10,To me A Matter of Life and Death is just that-...,1,matter life death simply best film ever madefr...
2,6577,8,"With few exceptions, most of George Bernard Sh...",1,exception george bernard shaw play virtually d...
3,10992,10,"Damn, I've seen this movie for at least 4 time...",1,damn ive seen movie least time still dont get ...
4,324,8,I didn't expect to like this film as much as I...,1,didnt expect like film much got simply saw lis...
...,...,...,...,...,...
9995,2214,4,"this movie wasn't absolutely atrocious, but it...",0,movie wasnt absolutely atrocious pretty bad ac...
9996,7842,1,Caught this by accident on a t.v. showing - an...,0,caught accident showing could hardly believe u...
9997,890,2,I saw this regurgitated pile of vignettes toni...,0,saw regurgitated pile vignette tonight preview...
9998,516,2,Ever notice how in his later movies Burt Reyno...,0,ever notice later movie burt reynolds laugh so...


In [4]:
# Fail fast if the loaded frame lacks either column the following cells select.
assert set(df.columns) >= {'cleaned_review', 'label'}, (
    "The file must contain 'cleaned_review' and 'label' columns"
)

In [5]:
# Keep only the text and target columns, drop rows with a missing value in
# either, discard reviews of 3 characters or fewer, and renumber the index.
df = (
    df.loc[:, ['cleaned_review', 'label']]
    .dropna()
    .loc[lambda frame: frame['cleaned_review'].str.len().gt(3)]
    .reset_index(drop=True)
)

In [6]:
# Plain-text preview: first five rows followed by the (rows, columns) shape.
print(df.head(n=5), df.shape, sep=" ")

                                      cleaned_review  label
0  want add praise production film especially lum...      1
1  matter life death simply best film ever madefr...      1
2  exception george bernard shaw play virtually d...      1
3  damn ive seen movie least time still dont get ...      1
4  didnt expect like film much got simply saw lis...      1 (10000, 2)


In [7]:
#Feature Extraction
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Inputs are the cleaned review text; the target is the label column.
X, y = df['cleaned_review'], df['label']

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Unigram + bigram TF-IDF features, capped at 50,000 vocabulary entries.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    stop_words='english',
    lowercase=True,
)

# The vocabulary is learned from the training split only; the test split is
# transformed with that fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Train shape: {X_train_tfidf.shape}")
print(f"Test shape: {X_test_tfidf.shape}")


Train shape: (8000, 50000)
Test shape: (2000, 50000)


In [9]:
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Candidate classifiers keyed by display name. Insertion order is kept, so any
# later iteration over `models` visits them in this order.
models = dict([
    (
        "LogisticRegression",
        LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42),
    ),
    (
        "DecisionTree",
        # Defaults are kept on purpose so this serves as the baseline.
        DecisionTreeClassifier(random_state=42),
    ),
    (
        "RandomForest",
        RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42),
    ),
])

# Fit every model in place on the TF-IDF training matrix.
for name, clf in models.items():
    clf.fit(X_train_tfidf, y_train)


In [11]:
# Evaluation
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

In [12]:
# Score each fitted model on the held-out split: one summary row per model for
# the comparison table, plus the full per-class text report keyed by model name.
metric_cols = ["Accuracy", "Precision_w", "Recall_w", "F1_w"]
rows, reports = [], {}

for name, clf in models.items():
    y_pred = clf.predict(X_test_tfidf)

    # Precision / recall / F1 use the "weighted" average; the fourth returned
    # value is not needed here.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="weighted", zero_division=0
    )
    acc = accuracy_score(y_test, y_pred)

    rows.append(
        {"Model": name, "Accuracy": acc, "Precision_w": prec, "Recall_w": rec, "F1_w": f1}
    )
    reports[name] = classification_report(y_test, y_pred, zero_division=0)

# Numeric table, best weighted F1 first.
metrics_df = (
    pd.DataFrame(rows)
    .sort_values("F1_w", ascending=False)
    .reset_index(drop=True)
)

# Display copy with every metric rendered as a 3-decimal string; metrics_df
# itself keeps the raw floats.
metrics_show = metrics_df.assign(
    **{col: metrics_df[col].map("{:.3f}".format) for col in metric_cols}
)

metrics_show


Unnamed: 0,Model,Accuracy,Precision_w,Recall_w,F1_w
0,LogisticRegression,0.852,0.854,0.852,0.851
1,RandomForest,0.841,0.841,0.841,0.841
2,DecisionTree,0.687,0.687,0.687,0.687


In [13]:
# Print each model's classification report under a banner with its name.
for name, rep in reports.items():
    print(f"\n=== {name} ===", rep, sep="\n")


=== LogisticRegression ===
              precision    recall  f1-score   support

           0       0.88      0.81      0.85      1000
           1       0.83      0.89      0.86      1000

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000


=== DecisionTree ===
              precision    recall  f1-score   support

           0       0.69      0.68      0.68      1000
           1       0.68      0.70      0.69      1000

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69      0.69      2000


=== RandomForest ===
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1000
           1       0.84      0.85      0.84      1000

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
we