In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
# Location of the pre-cleaned IMDB review sample (CSV).
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook cannot be re-run on another machine without editing it.
path = r"C:\Users\bbuser\Desktop\DataScience-Brain-Bytes\Team_members\from_Hajer\data\imdb_cleaned_sample.csv"

# Read the whole file into memory and display it as the cell's output.
df = pd.read_csv(path)
df

Unnamed: 0,id,rating,txt,label,cleaned_review
0,6784,8,"I like my Ronald Colman dashing and debonair, ...",1,like ronald colman dashing debonair fellow see...
1,11884,8,I found this film to be a fascinating study of...,1,found film fascinating study family crisis leo...
2,1656,9,"""Thieves and Liars"" presents us with a very na...",1,thief liar present naturalistic depiction leve...
3,4745,7,I can't understand why they decided to release...,1,cant understand decided release film introduce...
4,305,8,Screwball comedy about romantic mismatches in ...,1,screwball comedy romantic mismatch new york ci...
...,...,...,...,...,...
9995,2510,4,This TV film tells the story of extrovert Fran...,0,film tell story extrovert frannie suddenly ret...
9996,5041,2,Ye Lou's film Purple Butterfly pits a secret o...,0,lous film purple butterfly pit secret organiza...
9997,8517,2,The biggest mystery of Veronica Mars is not on...,0,biggest mystery veronica mar one tackle screen...
9998,5903,1,"I live in Salt Lake City and I'm not a Mormon,...",0,live salt lake city mormon rent movie well liv...


In [3]:
# Print the column index to confirm which fields the loaded CSV provides.
print(df.columns)

Index(['id', 'rating', 'txt', 'label', 'cleaned_review'], dtype='object')


# **1. Features and Labels**

In [4]:
# Strip stray whitespace from the header names *before* the lookups below, so
# a padded header such as " label" cannot raise a KeyError in this cell.
# The operation is idempotent, so re-running the cell is safe.
df.columns = df.columns.str.strip()

# Model input: the pre-processed review text.
X = df["cleaned_review"]
# Target: the `label` column (0 or 1 in the rows displayed when the file was loaded).
y = df["label"]

In [5]:
# Remove leading/trailing whitespace from every column name.
# NOTE(review): this cell runs after X and y have already been selected by
# name, so it cannot protect those lookups from a padded header.
df.columns = df.columns.str.strip()

# **2. Train-Test Split**

In [6]:
# Hold out 20% of the reviews for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# **3. TF-IDF Vectorization**

In [7]:
# TF-IDF over unigrams and bigrams, capped at 5000 vocabulary terms, with
# scikit-learn's built-in English stop-word list applied.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted transform to the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF vectorization complete.")
print(f"Number of features: {X_train_tfidf.shape[1]}")

TF-IDF vectorization complete.
Number of features: 5000


# **4. Model Training**

In [8]:
# Candidate classifiers, keyed by the display name used in the reports.
# Every estimator is seeded so repeated runs give the same numbers.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

# One summary record per model; turned into a comparison table afterwards.
results = []

for name, model in models.items():
    # Fit on the training features, then predict the held-out test set.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

    # Precision, recall and F1 are reported for the positive class (label 1).
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, pos_label=1),
        "Recall": recall_score(y_test, y_pred, pos_label=1),
        "F1-score": f1_score(y_test, y_pred, pos_label=1),
    })

    # Full per-class breakdown for this model.
    print(f"\n===== {name} =====")
    print(classification_report(y_test, y_pred))


===== Logistic Regression =====
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1000
           1       0.84      0.85      0.84      1000

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000


===== Decision Tree =====
              precision    recall  f1-score   support

           0       0.67      0.68      0.67      1000
           1       0.68      0.67      0.67      1000

    accuracy                           0.67      2000
   macro avg       0.67      0.67      0.67      2000
weighted avg       0.67      0.67      0.67      2000


===== Random Forest =====
              precision    recall  f1-score   support

           0       0.80      0.83      0.81      1000
           1       0.82      0.79      0.80      1000

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.

# **5. Results Table**

In [9]:
# Collect the per-model records into a table: one row per model, one column
# per metric, in the order the records were appended.
results_df = pd.DataFrame(data=results)

print("\nComparison Table:\n")

# Bare last expression so the notebook renders the table as the cell output.
results_df


Comparison Table:



Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,0.8415,0.836453,0.849,0.84268
1,Decision Tree,0.6735,0.675076,0.669,0.672024
2,Random Forest,0.809,0.823899,0.786,0.804504


Logistic Regression achieved the highest score on every metric (accuracy 0.842, precision 0.836, recall 0.849, F1-score 0.843), making it the best performer on this dataset.

Random Forest performed reasonably well (accuracy 0.809, F1-score 0.805), but its recall (0.786) was noticeably lower than Logistic Regression's (0.849).

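Decision Tree had the lowest performance across all metrics (accuracy 0.674, F1-score 0.672). A likely cause is overfitting, since the tree was grown without any depth or leaf-size limit; however, this notebook does not measure training-set performance, so comparing training and test accuracy would be needed to confirm it.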