# **IMDB Movie Reviews Classification**

# **Importing Liabraries**

In [38]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# **Load Cleaned Dataset**

In [42]:
df = pd.read_csv(r"C:\Users\bbuser\Desktop\NumPy\DataScience-Brain-Bytes\DataScience-Brain-Bytes\Team_members\from_deena\data\imdb_cleaned.xls")

# Keep only cleaned review text and label
X = df["cleaned_review"].values
y = df["label"].values

df

Unnamed: 0,review,label,cleaned_review
0,Bromwell High is a cartoon comedy. It ran at t...,1,bromwell high cartoon comedy ran time program ...
1,Homelessness (or Houselessness as George Carli...,1,homelessness houselessness george carlin state...
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,brilliant overacting lesley ann warren best dr...
3,This is easily the most underrated film inn th...,1,easily underrated film inn brook cannon sure f...
4,This is not the typical Mel Brooks film. It wa...,1,typical mel brook film much less slapstick mov...
...,...,...,...
9995,The plot of 'House of Games' is the strongest ...,0,plot house game strongest thing successful aut...
9996,I was seriously looking forward to seeing this...,0,seriously looking forward seeing film seemed t...
9997,"Why did I go to see this film? Honestly, becau...",0,see film honestly jim carrey past made hilario...
9998,Jim Carrey is one of the funniest and most gif...,0,jim carrey one funniest gifted comedian film t...


# **TF-IDF Vectorization**

In [43]:
# Convert text into TF-IDF numerical features
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)
print(f"TF-IDF matrix shape: {X_tfidf.shape}")

TF-IDF matrix shape: (10000, 5000)


# **Train/Test Split**

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# **Define Models**

In [45]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# **Train & Evaluate**

In [49]:
results = []

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="binary", pos_label=1
    )

    
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1
    })
    
    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_test, y_pred))


Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1000
           1       0.86      0.89      0.87      1000

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000


Decision Tree Classification Report:

              precision    recall  f1-score   support

           0       0.71      0.68      0.70      1000
           1       0.69      0.72      0.71      1000

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000


Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1000
           1       0.83      0.84      0.83      1000

    accuracy                           0.83      2000
   ma

# **Comparison Table**

In [51]:
results_df = pd.DataFrame(results)
print("\nModel Comparison:\n")
print(results_df)


Model Comparison:

                 Model  Accuracy  Precision  Recall  F1-score
0  Logistic Regression    0.8720   0.861165   0.887  0.873892
1        Decision Tree    0.7025   0.693780   0.725  0.709046
2        Random Forest    0.8320   0.829365   0.836  0.832669


# **Example Predictions**

In [52]:
sample_reviews = df["cleaned_review"].sample(5, random_state=42)

for review in sample_reviews:
    vec = vectorizer.transform([review])
    pred = models["Logistic Regression"].predict(vec)[0]
    print(f"Review: {review[:100]}...")
    print(f"Predicted Sentiment: {pred}\n")

Review: sure diane silver thinking making movie obviously nothing richard wright novel movie based onwe read...
Predicted Sentiment: 0

Review: ernest borgnine wasted moviethere point putting great actor movieone greatest actor world wastedand ...
Predicted Sentiment: 0

Review: odd couple comic gem one funniest script ever committed celluloid exceeded strangelove spinal tap le...
Predicted Sentiment: 1

Review: movie one exception rule sequel worser original comedy best movie fast action slapstick comedy somet...
Predicted Sentiment: 1

Review: game really great quite challenge great spooky story line graphic also good would recommend game hor...
Predicted Sentiment: 1

