In [111]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

In [100]:
df = pd.read_csv("Data/cleaned_IMDB_data.csv")

In [101]:
df

Unnamed: 0,id,rating,category,review
0,0,9,positive,bromwell high cartoon comedy run time program ...
1,1,7,positive,like adult comedy cartoon like south park near...
2,2,9,positive,bromwell high nothing short brilliant expertly...
3,3,10,positive,world stage people actor itor something like h...
4,4,8,positive,futz show preserve experimental theatre moveme...
...,...,...,...,...
9995,4995,3,negative,find real task sit film sound track good accen...
9996,4996,3,negative,really enjoy first half hour movie wow turn co...
9997,4997,2,negative,certainly without merit already writerdirector...
9998,4998,3,negative,one good celebritys reality show ever see see ...


In [102]:
df.duplicated().sum()
df = df.drop_duplicates()

## Feature Extraction

Use TF-IDF vectorization on the cleaned reviews.

In [103]:
review_col = df["review"]
label_col = df["category"]

In [104]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, 2),
    max_df=0.90,
    min_df=2,
    lowercase=False
)

X_tfidf = vectorizer.fit_transform(review_col)

print(X_tfidf.shape)  # (num_docs, num_features)

(10000, 141049)


## Encoding

In [105]:
df = df.drop(columns=["id"])

In [106]:
le = LabelEncoder()
y = le.fit_transform(df["category"])

## Modeling & Model Evaluation

In [107]:
X = X_tfidf
y = label_col

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression:
logreg = LogisticRegression(max_iter=500)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
print("Logistic Regression")
print(classification_report(y_test, y_pred_lr))

Logistic Regression
              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      1000
    positive       0.89      0.91      0.90      1000

    accuracy                           0.90      2000
   macro avg       0.90      0.90      0.90      2000
weighted avg       0.90      0.90      0.90      2000



In [108]:
# Decision Tree:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree")
print(classification_report(y_test, y_pred_dt))

Decision Tree
              precision    recall  f1-score   support

    negative       0.71      0.72      0.72      1000
    positive       0.72      0.71      0.71      1000

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000



In [109]:
# Random Forest:
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest")
print(classification_report(y_test, y_pred_rf))

Random Forest
              precision    recall  f1-score   support

    negative       0.87      0.86      0.87      1000
    positive       0.86      0.87      0.87      1000

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000



Comparison Table of Metrics

In [110]:
acc_lr  = accuracy_score(y_test, y_pred_lr)
prec_lr, rec_lr, f1_lr, _ = precision_recall_fscore_support(y_test, y_pred_lr, average="macro", zero_division=0)

acc_dt  = accuracy_score(y_test, y_pred_dt)
prec_dt, rec_dt, f1_dt, _ = precision_recall_fscore_support(y_test, y_pred_dt, average="macro", zero_division=0)

acc_rf  = accuracy_score(y_test, y_pred_rf)
prec_rf, rec_rf, f1_rf, _ = precision_recall_fscore_support(y_test, y_pred_rf, average="macro", zero_division=0)


comparison = pd.DataFrame({
    "Model": ["Logistic Regression", "Decision Tree", "Random Forest"],
    "Accuracy": [acc_lr, acc_dt, acc_rf],
    "Precision_macro": [prec_lr, prec_dt, prec_rf],
    "Recall_macro": [rec_lr, rec_dt, rec_rf],
    "F1_macro": [f1_lr, f1_dt, f1_rf]
}).sort_values("F1_macro", ascending=False).reset_index(drop=True)

print(comparison)

                 Model  Accuracy  Precision_macro  Recall_macro  F1_macro
0  Logistic Regression    0.8965         0.896834        0.8965  0.896478
1        Random Forest    0.8675         0.867544        0.8675  0.867496
2        Decision Tree    0.7135         0.713548        0.7135  0.713484


## Report    

1. Summarize which model performs best: <br>
Best model: **Logistic Regression.**


2. Interpretability vs. Performance Trade-offs:

#### **Logistic Regression:**

Performance: Best results.

Interpretability: Medium.

#### **Random Forest:**

Performance: Strong, but not quite as good as Logistic Regression.

Interpretability: Harder to explain than a single decision tree because predictions come from an ensemble of 300+ trees. Still, feature importance scores can be extracted.

#### **Decision Tree:**

Performance: Weakest of the three.

Interpretability: Easiest to interpret, since you can visualize the tree and explain predictions step by step.