In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
df = pd.read_csv("IMDB Dataset.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocess_text(text):
    text = text.lower() 
    text = re.sub(r'[^a-zA-Z]', ' ', text) 
    tokens = word_tokenize(text) 
    tokens = [word for word in tokens if word not in stopwords.words('english')]  
    return ' '.join(tokens)

df['cleaned_review'] = df['review'].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment'].map({'positive': 1, 'negative': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

def train_and_evaluate(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

print("\nLogistic Regression Results")
lr_model = LogisticRegression()
train_and_evaluate(lr_model)

print("\nNaive Bayes Results")
nb_model = MultinomialNB()
train_and_evaluate(nb_model)

print("\nSupport Vector Machine Results")
svm_model = SVC()
train_and_evaluate(svm_model)



Logistic Regression Results
Accuracy: 0.8931
Precision: 0.8839458413926499
Recall: 0.9069259773764636
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


Naive Bayes Results
Accuracy: 0.8556
Precision: 0.854606431248767
Recall: 0.8596943838063108
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85      4961
           1       0.85      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000


Support Vector Machine Results
Accuracy: 0.8956
Precision: 0.8842085016349298
Recall: 0.912284183369