#### IMDb Movie Reviews Classification

##### As a Data Scientist, I want to use the cleaned IMDb movie reviews to train sentiment classification models, so that I can compare the performance of Logistic Regression, Decision Tree, and Random Forest on predicting review sentiment.

In [1]:
# Step 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score



In [3]:
# Step 2: Load the cleaned IMDb dataset
# The CSV location lives in a single named constant; edit DATA_PATH to point
# at your own copy of the cleaned reviews file.
DATA_PATH = r"C:\Users\bbuser\Downloads\cleaned_imdb.csv"
df = pd.read_csv(DATA_PATH)



In [6]:
# Keep only the two columns the models use, and drop rows where either is
# missing: a review that was emptied by cleaning is read back from CSV as NaN,
# and TfidfVectorizer raises on NaN documents.
df = df[['cleaned_review', 'label']].dropna()

# Step 3: Train-test split
# 80/20 hold-out split; the fixed seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_review'], df['label'], test_size=0.2, random_state=42
)

# Step 4: TF-IDF Vectorization
# The vocabulary (capped at 5000 terms) and IDF weights are learned from the
# training reviews only; the test reviews are merely transformed, so nothing
# from the held-out set leaks into the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Initialize models
# Both tree-based estimators are seeded so that the comparison table is
# reproducible on a fresh kernel (an unseeded DecisionTreeClassifier can give
# different scores on every run).
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}



In [8]:
# Step 6: Train and evaluate models
# Each model is fitted on the TF-IDF training matrix and scored on the held-out
# matrix; one summary record per model is collected for the comparison table.
results = []

for name, model in models.items():
    # scikit-learn's fit() returns the estimator itself, so fit/predict chain.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

    # Precision, recall and F1 are reported for the class labelled 1.
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, pos_label=1),
        'Recall': recall_score(y_test, y_pred, pos_label=1),
        'F1-Score': f1_score(y_test, y_pred, pos_label=1),
    })

    # Full per-class breakdown, followed by blank lines between models.
    print(f"=== {name} ===")
    print(classification_report(y_test, y_pred))
    print("\n")

# Step 7: Create a comparison table
# One row per model, columns in the order the record keys were written.
results_df = pd.DataFrame(results)
print("=== Model Comparison ===")
print(results_df)




=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       988
           1       0.86      0.88      0.87      1012

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



=== Decision Tree ===
              precision    recall  f1-score   support

           0       0.70      0.76      0.73       988
           1       0.74      0.68      0.71      1012

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000



=== Random Forest ===
              precision    recall  f1-score   support

           0       0.81      0.86      0.84       988
           1       0.86      0.81      0.83      1012

    accuracy                           0.83      2000
   macro avg       0.83      0.83      0.83      2000

In [9]:
# Step 8: Example predictions
# Two hand-written reviews, one clearly positive and one clearly negative, are
# used as a quick sanity check of every fitted model.
# NOTE(review): these strings are vectorised as-is; they do not go through
# whatever cleaning produced the 'cleaned_review' column -- confirm that the
# preprocessing mismatch is acceptable for this demo.
sample_reviews = [
    "The movie was fantastic! I loved the acting and the story.",
    "I hated this film. It was boring and too long."
]

# Reuse the already-fitted vectorizer so the samples share the training vocabulary.
sample_tfidf = vectorizer.transform(sample_reviews)

for name, model in models.items():
    preds = model.predict(sample_tfidf)
    review_label_pairs = list(zip(sample_reviews, preds))
    print(f"{name} predictions: {review_label_pairs}")

Logistic Regression predictions: [('The movie was fantastic! I loved the acting and the story.', np.int64(1)), ('I hated this film. It was boring and too long.', np.int64(0))]
Decision Tree predictions: [('The movie was fantastic! I loved the acting and the story.', np.int64(1)), ('I hated this film. It was boring and too long.', np.int64(0))]
Random Forest predictions: [('The movie was fantastic! I loved the acting and the story.', np.int64(1)), ('I hated this film. It was boring and too long.', np.int64(0))]
