In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## 1. Data Loading

In [None]:
# Read the review CSV and keep just the two columns the rest of the notebook
# uses: the review text and its label.
df = pd.read_csv("data/cleaned_imdb_reviews.csv").loc[:, ['cleaned_review', 'label']]
df

Unnamed: 0,cleaned_review,label
0,bromwell high cartoon comedy ran time program ...,1
1,homelessness houselessness george carlin state...,1
2,brilliant overacting lesley ann warren best dr...,1
3,easily underrated film inn brook cannon sure f...,1
4,typical mel brook film much less slapstick mov...,1
...,...,...
24995,towards end movie felt technical felt like cla...,0
24996,kind movie enemy content watch time bloody tru...,0
24997,saw descent last night stockholm film festival...,0
24998,film pick pound turn rather good 23rd century ...,0


## 2. Train/Test Split and Vectorization

In [19]:
# Features are the review text column; the target is the `label` column.
X, y = df['cleaned_review'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# TF-IDF features with the vectorizer's default settings.
vectorizer = TfidfVectorizer()
# Fit on the training reviews only, so the held-out reviews do not influence
# the learned vocabulary or weights.
X_train_vec = vectorizer.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the held-out reviews (transform only, no refit).
X_test_vec = vectorizer.transform(X_test)

## 3. Model Training

In [21]:
# The three classifiers compared in this notebook.
# max_iter is raised to 1000 for the logistic regression solver.
logistic_regression = LogisticRegression(max_iter=1000)

# Tree-based estimators draw random numbers while fitting (feature order at
# each split; bootstrap samples and feature subsets for the forest). Without a
# seed their scores differ on every Restart & Run All, so pin them to the same
# seed the train/test split uses.
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

In [22]:
# Train each classifier on the same TF-IDF training matrix, in the same order
# as before (logistic regression, decision tree, random forest).
for estimator in (logistic_regression, decision_tree, random_forest):
    estimator.fit(X_train_vec, y_train)

# Leave the fitted forest as the cell's value, as the final fit() call did.
random_forest

## 4. Model Evaluation

In [23]:
def evaluate_predictions(y_true, y_pred):
    """Summarise one model's predictions against the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by a fitted model for the same rows.

    Returns
    -------
    dict
        ``'Accuracy'``: the value returned by ``accuracy_score``.
        ``'Classification Report'``: the nested dict returned by
        ``classification_report(..., output_dict=True)``.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Classification Report': classification_report(y_true, y_pred, output_dict=True),
    }


# Predict once per model on the held-out TF-IDF matrix. The prediction arrays
# keep their own names so they stay available for further inspection.
y_pred_logistic = logistic_regression.predict(X_test_vec)
y_pred_tree = decision_tree.predict(X_test_vec)
y_pred_forest = random_forest.predict(X_test_vec)

# One entry per model, keyed by display name; every entry is produced by the
# same helper so the three evaluations cannot drift apart.
metrics = {
    'Logistic Regression': evaluate_predictions(y_test, y_pred_logistic),
    'Decision Tree': evaluate_predictions(y_test, y_pred_tree),
    'Random Forest': evaluate_predictions(y_test, y_pred_forest),
}

In [24]:
# One row per model: overall accuracy plus the weighted-average precision,
# recall and F1 taken from that model's classification report.
comparison_table = pd.DataFrame([
    {
        'Model': model_name,
        'Accuracy': metrics[model_name]['Accuracy'],
        **{
            column: metrics[model_name]['Classification Report']['weighted avg'][report_key]
            for column, report_key in (
                ('Precision', 'precision'),
                ('Recall', 'recall'),
                ('F1-Score', 'f1-score'),
            )
        },
    }
    for model_name in ('Logistic Regression', 'Decision Tree', 'Random Forest')
])

In [25]:
# Display the per-model metric table built in the previous cell.
comparison_table

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.8864,0.886592,0.8864,0.886377
1,Decision Tree,0.7068,0.707247,0.7068,0.706705
2,Random Forest,0.846,0.846314,0.846,0.845981


## 5. Reporting

In [27]:
# Locate the row with the highest F1-Score (the weighted-average F1 column of
# the comparison table) and pull it out as a Series.
top_f1_index = comparison_table['F1-Score'].idxmax()
best_model = comparison_table.loc[top_f1_index]

print("Best Performing Model:")
best_model

Best Performing Model:


Model        Logistic Regression
Accuracy                  0.8864
Precision               0.886592
Recall                    0.8864
F1-Score                0.886377
Name: 0, dtype: object