#### IMDb Movie Reviews Classification

###### As a Data Scientist, I want to use the cleaned IMDb movie reviews to train sentiment classification models, so that I can compare the performance of Logistic Regression, Decision Tree, and Random Forest on predicting review sentiment.

In [16]:
import os

import cv2
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [17]:
# loading dataset
# The CSV location can be overridden with the IMDB_DATA_PATH environment
# variable, so the notebook is not tied to one machine's folder layout.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get("IMDB_DATA_PATH", r"C:\Users\bbuser\Downloads\cleaned_imdb.csv")
df = pd.read_csv(DATA_PATH)
# Bare last expression so the notebook renders the first rows as a table
df.head()

Unnamed: 0,cleaned_review,label
0,liked lot fact see againand plan may love ill ...,1
1,teta luna symbolic spain film everything film ...,1
2,slight charming little movie sure superblycraf...,1
3,much thing happen movie lot meaning woman thou...,1
4,panic never got good theatrical release easily...,1


In [10]:
# Print the column index of df to check which columns were loaded
print(df.columns)

Index(['cleaned_review', 'label'], dtype='object')


In [11]:
# Split the frame into model input (review text) and target.
# 'label' is assumed to be the column holding the sentiment labels.
X, y = df['cleaned_review'], df['label']

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# TF-IDF features: unigrams and bigrams, English stop words removed,
# vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
)
# Fit the vectorizer on the training reviews only, then apply that fitted
# vectorizer to the test reviews.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [19]:
#model training
# Candidate classifiers, all fitted on the same TF-IDF features.
# random_state is pinned on the two tree-based models so their scores are
# the same on every fresh run of the notebook.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}
# One dict of metrics per model, turned into a DataFrame after the loop
results = []
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    #evaluation
    # Build the report once and read every metric from it. Its 'accuracy'
    # entry is derived from y_pred, so the test set is not predicted a
    # second time just to obtain the accuracy.
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted_avg = report['weighted avg']
    accuracy = report['accuracy']
    print(f"Accuracy of {model_name}: {accuracy:.4f}")
    precision = weighted_avg['precision']
    recall = weighted_avg['recall']
    f1 = weighted_avg['f1-score']
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    #store all metrics
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })
    #print all
    # Rows are true classes, columns are predicted classes
    print(confusion_matrix(y_test, y_pred))
print(pd.DataFrame(results))

Accuracy of Logistic Regression: 0.8575
Precision: 0.8578, Recall: 0.8575, F1-Score: 0.8574
[[830 158]
 [127 885]]
Accuracy of Random Forest: 0.8275
Precision: 0.8278, Recall: 0.8275, F1-Score: 0.8275
[[829 159]
 [186 826]]
Accuracy of Random Forest: 0.8275
Precision: 0.8278, Recall: 0.8275, F1-Score: 0.8275
[[829 159]
 [186 826]]
Accuracy of K-Nearest Neighbors: 0.7070
Precision: 0.7103, Recall: 0.7070, F1-Score: 0.7054
[[628 360]
 [226 786]]
Accuracy of K-Nearest Neighbors: 0.7070
Precision: 0.7103, Recall: 0.7070, F1-Score: 0.7054
[[628 360]
 [226 786]]
Accuracy of Decision Tree: 0.7180
Precision: 0.7199, Recall: 0.7180, F1-Score: 0.7176
[[749 239]
 [325 687]]
                 Model  Accuracy  Precision  Recall  F1-Score
0  Logistic Regression    0.8575   0.857791  0.8575  0.857439
1        Random Forest    0.8275   0.827795  0.8275  0.827497
2  K-Nearest Neighbors    0.7070   0.710316  0.7070  0.705439
3        Decision Tree    0.7180   0.719914  0.7180  0.717624
Accuracy of Decisi

In [20]:
#result table
# Collect the per-model metric dicts gathered in `results` into one DataFrame
results_df = pd.DataFrame(results)
# Bare last expression so the notebook displays the table
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.8575,0.857791,0.8575,0.857439
1,Random Forest,0.8275,0.827795,0.8275,0.827497
2,K-Nearest Neighbors,0.707,0.710316,0.707,0.705439
3,Decision Tree,0.718,0.719914,0.718,0.717624
