# IMDb Movie Reviews Classification

## 1. Data Loading

In [3]:
import pandas as pd

# Load the pre-cleaned IMDb reviews (adjust the path if the CSV lives elsewhere)
df = pd.read_csv("cleaned_imdb_reviews.csv")

# Keep only the two columns the rest of the notebook uses
df = df[['label', 'cleaned_review']]

# read_csv parses an empty text cell as NaN, and TfidfVectorizer refuses NaN
# documents, so remove such rows here rather than failing during vectorization.
# The message is printed only when rows are actually dropped.
n_missing = int(df['cleaned_review'].isna().sum())
if n_missing:
    print(f"Dropping {n_missing} rows with a missing cleaned_review")
    df = df.dropna(subset=['cleaned_review']).reset_index(drop=True)

print(df.head())

   label                                     cleaned_review
0      1  bromwell high cartoon comedy ran time program ...
1      1  homelessness houselessness george carlin state...
2      1  brilliant overacting lesley ann warren best dr...
3      1  easily underrated film inn brook cannon sure f...
4      1  typical mel brook film much less slapstick mov...


## 2. Feature Extraction (TF-IDF Vectorization)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Model input is the review text; the target is the sentiment label
X, y = df['cleaned_review'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training text only,
# then reuse the fitted vectorizer to encode the held-out text
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## 3. Model Training

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers keyed by display name; every estimator gets the same
# seed so repeated runs produce the same fitted models
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=200, random_state=42)

# Fit each classifier on the TF-IDF training matrix
for classifier in models.values():
    classifier.fit(X_train_tfidf, y_train)

## 4. Model Evaluation

In [8]:
# Show the distinct label values present in the target column
label_values = y.unique()
print(label_values)

[1 0]


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# One row per model: [name, accuracy, precision, recall, f1]
results = []

for name, model in models.items():
    y_pred = model.predict(X_test_tfidf)

    # Precision, recall and F1 are computed with class 1 as the positive label
    scores = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, pos_label=1),
        recall_score(y_test, y_pred, pos_label=1),
        f1_score(y_test, y_pred, pos_label=1),
    ]
    results.append([name, *scores])

    # Per-class breakdown for this model, followed by blank spacing
    print(f"--- {name} ---")
    print(classification_report(y_test, y_pred))
    print("\n")

--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      2500
           1       0.86      0.88      0.87      2500

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



--- Decision Tree ---
              precision    recall  f1-score   support

           0       0.69      0.70      0.69      2500
           1       0.69      0.69      0.69      2500

    accuracy                           0.69      5000
   macro avg       0.69      0.69      0.69      5000
weighted avg       0.69      0.69      0.69      5000



--- Random Forest ---
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      2500
           1       0.85      0.84      0.84      2500

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0.84      0.84      0.84      5000

## 5. Comparison Table

In [10]:
# Tabulate the per-model scores collected in `results` so the models can be
# compared side by side
metric_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-Score"]
results_df = pd.DataFrame(results, columns=metric_columns)
print(results_df)

                 Model  Accuracy  Precision  Recall  F1-Score
0  Logistic Regression    0.8690   0.861340  0.8796  0.870374
1        Decision Tree    0.6932   0.694288  0.6904  0.692339
2        Random Forest    0.8448   0.845630  0.8436  0.844614


## 6. Example Predictions

In [14]:
# Step 6: run every trained model on a few hand-written reviews

# Human-readable names for the numeric class labels
label_map = {0: "negative", 1: "positive"}

# NOTE(review): these are raw sentences, whereas the models were fit on the
# `cleaned_review` column — presumably pre-processed upstream. Confirm whether
# the same cleaning should be applied here before vectorizing.
sample_reviews = [
    "the movie was absolutely fantastic and thrilling",
    "it was boring, I fell asleep after 20 minutes",
    "not bad, but could have been better"
]

# Encode with the already-fitted vectorizer (transform only, no refit)
sample_tfidf = vectorizer.transform(sample_reviews)

# Predict with each model and print the results as label text
for name, model in models.items():
    preds = model.predict(sample_tfidf)
    # Cast each numpy integer to a plain int before the dictionary lookup
    preds_labels = [label_map[code] for code in map(int, preds)]
    print(f"{name} predictions: {preds_labels}")

Logistic Regression predictions: ['positive', 'negative', 'negative']
Decision Tree predictions: ['positive', 'negative', 'negative']
Random Forest predictions: ['positive', 'negative', 'negative']


## Task Is Done!