# **Movie Recommendation System**

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [10]:
# Step 1: Load the dataset
# Assuming the dataset is in a CSV file
data = pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
# Step 2: Data Cleaning
# Check for missing values
print(data.isnull().sum())  # Should be none in this dataset

review       0
sentiment    0
dtype: int64


In [13]:
# Drop any rows with missing values
data = data.dropna()

In [15]:
# Step 3: Data Preprocessing
# Convert the 'sentiment' column to numerical values (positive = 1, negative = 0)
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])  # 1 = positive, 0 = negative

In [17]:
# Vectorize the 'review' column using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 most important features
X = vectorizer.fit_transform(data['review']).toarray()
# Separate features (X) and target (y)
y = data['sentiment']

In [18]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Step 4: Model Building
# 1. Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

In [20]:
# 2. Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

In [21]:
# 3. K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [22]:
# 4. XGBoost Classifier
xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False)
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [23]:
# 5. Logistic Regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

In [24]:
# 6. Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)

In [25]:
# Step 5: Model Evaluation
# Function to evaluate model performance
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_true, y_pred))

In [26]:
# Evaluate each model
evaluate_model(y_test, y_pred_nb, "Naive Bayes Classifier")
evaluate_model(y_test, y_pred_rfc, "Random Forest Classifier")
evaluate_model(y_test, y_pred_knn, "K-Nearest Neighbors")
evaluate_model(y_test, y_pred_xgb, "XGBoost Classifier")
evaluate_model(y_test, y_pred_log_reg, "Logistic Regression")
evaluate_model(y_test, y_pred_dtc, "Decision Tree Classifier")

Naive Bayes Classifier Accuracy: 0.85
Classification Report for Naive Bayes Classifier:

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.86      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Random Forest Classifier Accuracy: 0.86
Classification Report for Random Forest Classifier:

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

K-Nearest Neighbors Accuracy: 0.73
Classification Report for K-Nearest Neighbors:

              precision    recall  f1-score   support

           0       0.75   

In [27]:
# Step 6: Compare and Conclusion
models = ['Naive Bayes', 'Random Forest', 'KNN', 'XGBoost', 'Logistic Regression', 'Decision Tree']
accuracies = [
    accuracy_score(y_test, y_pred_nb),
    accuracy_score(y_test, y_pred_rfc),
    accuracy_score(y_test, y_pred_knn),
    accuracy_score(y_test, y_pred_xgb),
    accuracy_score(y_test, y_pred_log_reg),
    accuracy_score(y_test, y_pred_dtc)
]

# Print model accuracies
for model, accuracy in zip(models, accuracies):
    print(f"{model} Accuracy: {accuracy:.2f}")

Naive Bayes Accuracy: 0.85
Random Forest Accuracy: 0.86
KNN Accuracy: 0.73
XGBoost Accuracy: 0.86
Logistic Regression Accuracy: 0.89
Decision Tree Accuracy: 0.72
