# IMDb Movie Reviews Classification

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Relative path to the CSV holding the IMDb review sample used by this notebook.
path = 'cleaned_imdb_sample.csv'

# Read the CSV into a DataFrame; the bare `df` on the last line makes the
# frame the cell's displayed output.
df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0,review,sentiment,cleaned_review
0,In Panic In The Streets Richard Widmark plays ...,positive,panic street richard widmark play navy doctor ...
1,If you ask me the first one was really better ...,negative,ask first one really better one look sarah rea...
2,I am a big fan a Faerie Tale Theatre and I've ...,positive,big fan faerie tale theatre ive seen one best ...
3,I just finished reading a book about Dillinger...,negative,finished reading book dillinger movie horribly...
4,Greg Davis and Bryan Daly take some crazed sta...,negative,greg davis bryan daly take crazed statement te...
...,...,...,...
9995,Many have stated that Orca  Killer Whale is a...,negative,many stated orca killer whale jaw ripoff reall...
9996,What a wasted cast.<br /><br />This is one of ...,negative,wasted castthis one disappointing film ive see...
9997,"First of all the movie, is an ingenious work o...",positive,first movie ingenious work artmovie plot fille...
9998,Wow. Saw this last night and I'm still reeling...,positive,wow saw last night still reeling good every ch...


In [3]:
# Narrow the frame to the two columns used downstream: the text in
# `cleaned_review` (model input) and the `sentiment` label (target).
df = df.loc[:, ['cleaned_review', 'sentiment']]
df.head()

Unnamed: 0,cleaned_review,sentiment
0,panic street richard widmark play navy doctor ...,positive
1,ask first one really better one look sarah rea...,negative
2,big fan faerie tale theatre ive seen one best ...,positive
3,finished reading book dillinger movie horribly...,negative
4,greg davis bryan daly take crazed statement te...,negative


In [4]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned review text; the target is the sentiment label.
X, y = df['cleaned_review'], df['sentiment']

# Hold out 20% of the rows for evaluation. `stratify=y` keeps the label
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit features for efficiency: at most 1000 terms are kept.
tfidf = TfidfVectorizer(max_features=1000)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Build each classifier with a fixed random_state and fit it on the TF-IDF
# training matrix. `fit` returns the estimator itself, so construction and
# training can be chained for the first two models.
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)
dtree = DecisionTreeClassifier(random_state=42).fit(X_train_tfidf, y_train)

# The forest is fitted in a separate, final statement so that the fitted
# estimator is the cell's displayed output.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
from sklearn.metrics import classification_report, accuracy_score

# Display name -> fitted estimator; iteration order defines the row order of
# the comparison table.
models = {
    'Logistic Regression': logreg,
    'Decision Tree': dtree,
    'Random Forest': rf,
}

# One record per model, collected as dicts and turned into a DataFrame once.
results = []
for name, model in models.items():
    y_pred = model.predict(X_test_tfidf)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Precision, recall and F1 are taken from the report's 'weighted avg' row.
    weighted = report['weighted avg']
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': weighted['precision'],
        'Recall': weighted['recall'],
        'F1-Score': weighted['f1-score'],
    })

comparison_df = pd.DataFrame(results)
comparison_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.84,0.840137,0.84,0.839967
1,Decision Tree,0.703,0.703007,0.703,0.702969
2,Random Forest,0.817,0.817252,0.817,0.816988


In [8]:
import random

# Tunables for the qualitative spot check below.
N_EXAMPLES = 5            # how many held-out reviews to show
EXAMPLE_SEED = 42         # fixed seed so the same reviews are picked on every run
MAX_REVIEW_CHARS = 300    # longer reviews are truncated for display

# Draw from a dedicated, seeded Random instance so the selection is repeatable
# without touching the global `random` state. min() avoids the ValueError that
# random.sample raises when asked for more items than the population holds.
sample_size = min(N_EXAMPLES, len(X_test))
sample_idx = random.Random(EXAMPLE_SEED).sample(range(len(X_test)), sample_size)
sample_reviews = X_test.iloc[sample_idx]
sample_labels = y_test.iloc[sample_idx]

# Every emitted line is collected here so the saved file mirrors the console.
output_lines = []


def _emit(line):
    """Print ``line`` and record it in ``output_lines`` for the saved report."""
    print(line)
    output_lines.append(line)


_emit("=== Example Predictions ===\n")

for i, (review, true_label) in enumerate(zip(sample_reviews, sample_labels), 1):
    _emit(f"Review {i}:")

    # Show at most MAX_REVIEW_CHARS characters, marking truncation with "...".
    review_text = review if len(review) <= MAX_REVIEW_CHARS else review[:MAX_REVIEW_CHARS] + "..."
    _emit(f"Review Text: {review_text}")
    _emit(f"True Sentiment: {true_label}")

    # Vectorize once per review; the result does not depend on the model, so
    # it is hoisted out of the per-model loop. The full (untruncated) review
    # text is what gets classified.
    review_vec = tfidf.transform([review])
    for name, model in models.items():
        pred = model.predict(review_vec)[0]
        _emit(f"{name} Prediction: {pred}")

    _emit("-" * 80)

# Persist exactly what was printed, one entry per line.
with open("example_predictions.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in output_lines)



=== Example Predictions ===

Review 1:
Review Text: giallo fan seek rare film well written full sort usual low life populate film dont want give anything away wont even say anything plot whole movie creates bizarre atmosphere dont know expect suspect recommended place ive seen get film english european trash cinema
True Sentiment: positive
Logistic Regression Prediction: negative
Decision Tree Prediction: positive
Random Forest Prediction: positive
--------------------------------------------------------------------------------
Review 2:
Review Text: underground comedy movie possibly worst train wreck ive ever seen luckily didnt pay movie friend reluctantly agreed watch siting awful needed prove awful love color comedy figured least would would entertained instead acting awful joke extremely cheesy plot found maybe wasnt supposed plot cant hold ...
True Sentiment: negative
Logistic Regression Prediction: negative
Decision Tree Prediction: negative
Random Forest Prediction: negative
---

##### Reporting Summary
1. Logistic Regression: Highest accuracy in this comparison (0.840) and interpretable, but may misclassify nuanced reviews.
2. Decision Tree: Interpretable with clear rules, but tends to overfit and underperforms on text data (lowest accuracy here, 0.703).
3. Random Forest: Second-best accuracy (0.817, below Logistic Regression's 0.840) and far more robust than a single Decision Tree, but less interpretable than Logistic Regression.