In [18]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [19]:
# Load the pre-cleaned IMDB review sample. The CSV carries a saved index that
# pandas reads back as an "Unnamed: 0" column, next to id, rating, txt, label
# and cleaned_review.
csv_path = "data/imdb_cleaned_sample.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,id,rating,txt,label,cleaned_review
0,6784,8,"I like my Ronald Colman dashing and debonair, ...",1,like ronald colman dashing debonair fellow see...
1,11884,8,I found this film to be a fascinating study of...,1,found film fascinating study family crisis leo...
2,1656,9,"""Thieves and Liars"" presents us with a very na...",1,thief liar present naturalistic depiction leve...
3,4745,7,I can't understand why they decided to release...,1,cant understand decided release film introduce...
4,305,8,Screwball comedy about romantic mismatches in ...,1,screwball comedy romantic mismatch new york ci...
...,...,...,...,...,...
9995,2510,4,This TV film tells the story of extrovert Fran...,0,film tell story extrovert frannie suddenly ret...
9996,5041,2,Ye Lou's film Purple Butterfly pits a secret o...,0,lous film purple butterfly pit secret organiza...
9997,8517,2,The biggest mystery of Veronica Mars is not on...,0,biggest mystery veronica mar one tackle screen...
9998,5903,1,"I live in Salt Lake City and I'm not a Mormon,...",0,live salt lake city mormon rent movie well liv...


In [20]:
# Drop the raw review text; only the pre-cleaned version is used for modelling.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after `txt` is already gone no longer raises
# a KeyError.
df = df.drop(columns=["txt"], errors="ignore")

In [21]:
# Normalise the header: remove leading/trailing whitespace from every column name.
df.columns = df.columns.map(str.strip)

In [22]:
# Features: the cleaned review text. Target: the `label` column (0/1 in the preview above).
# An empty cleaned review is stored as an empty CSV field, which read_csv parses
# as NaN; TfidfVectorizer rejects NaN documents, so map any missing text to "".
X = df["cleaned_review"].fillna("")
y = df["label"]

In [23]:
# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [24]:
# TF-IDF features: vocabulary capped at 5000 terms, English stop words removed.
# The vectorizer is fitted on the training reviews only and then applied as-is
# to the test reviews (the tuple is evaluated left to right, so fitting happens
# before the test transform).
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [None]:
# This cell used to be an exact copy of the TF-IDF cell directly before it and
# re-fitted the vectorizer for no benefit. `tfidf`, `X_train_tfidf` and
# `X_test_tfidf` already exist at this point, so validate them instead.
assert X_train_tfidf.shape[0] == len(y_train), "train features/labels are misaligned"
assert X_test_tfidf.shape[0] == len(y_test), "test features/labels are misaligned"
assert X_train_tfidf.shape[1] == X_test_tfidf.shape[1], "train/test vocabularies differ"

In [None]:
# Logistic Regression (solver iterations capped at 1000)
log_reg = LogisticRegression(max_iter=1000)
# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred = log_reg.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("\n=== Logistic Regression ===", classification_report(y_test, y_pred), sep="\n")

# Decision Tree (seeded for reproducibility)
dt = DecisionTreeClassifier(random_state=42)
y_pred = dt.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("\n=== Decision Tree ===", classification_report(y_test, y_pred), sep="\n")


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       988
           1       0.84      0.89      0.86      1012

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000


=== Decision Tree ===
              precision    recall  f1-score   support

           0       0.68      0.69      0.69       988
           1       0.70      0.68      0.69      1012

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69      0.69      2000



In [27]:
# Random Forest: 100 trees, seeded for reproducibility.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred = rf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("\n=== Random Forest ===", classification_report(y_test, y_pred), sep="\n")




=== Random Forest ===
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       988
           1       0.83      0.82      0.82      1012

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000



In [28]:
# Linear Support Vector Machine, seeded for reproducibility.
svm = LinearSVC(random_state=42)
# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("\n=== Linear SVM ===", classification_report(y_test, y_pred), sep="\n")


=== Linear SVM ===
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       988
           1       0.84      0.86      0.85      1012

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



Among the four models tested on the 2,000-review held-out test set, Logistic Regression performed best with 86% accuracy and well-balanced precision and recall across both classes. Linear SVM was close behind at 84%, while Random Forest reached 82%. The single Decision Tree lagged at 69%, suggesting that an unpruned tree is a poor fit for high-dimensional, sparse TF-IDF features.

Logistic Regression is therefore the top choice: it offers the highest accuracy of the four models together with interpretable coefficients, making it the most reliable option for this sentiment-classification task.