### Setting Up Libraries & Import

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
df.duplicated().sum()

418

### Text Preprocessing

In [8]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [9]:
# Download the stopwords dataset
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HMT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Step 1: Convert all values in the DataFrame to lowercase
df = df.apply(lambda x: x.astype(str).str.lower())

# Step 2: Remove HTML tags from the 'review' column
df['review'] = df['review'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

# Step 3: Remove square brackets and their content
df['review'] = df['review'].apply(lambda x: re.sub(r'\[.*?\]', '', x))

# Step 4: Remove special characters
df['review'] = df['review'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', x))

# Step 5: Remove English stopwords
stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Step 6: Perform text stemming
stemmer = PorterStemmer()
df['review'] = df['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

  df['review'] = df['review'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


In [11]:
df['review']

0        one review mention watch 1 oz episod youll hoo...
1        wonder littl product film techniqu unassum old...
2        thought wonder way spend time hot summer weeke...
3        basic there famili littl boy jake think there ...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job wasnt creativ orig...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    im go disagre previou comment side maltin one ...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object

### Text Representation

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

#### Split the data into training and testing sets

In [18]:
X = df.iloc[:,0:1]

print(X)

                                                  review
0      one review mention watch 1 oz episod youll hoo...
1      wonder littl product film techniqu unassum old...
2      thought wonder way spend time hot summer weeke...
3      basic there famili littl boy jake think there ...
4      petter mattei love time money visual stun film...
...                                                  ...
49995  thought movi right good job wasnt creativ orig...
49996  bad plot bad dialogu bad act idiot direct anno...
49997  cathol taught parochi elementari school nun ta...
49998  im go disagre previou comment side maltin one ...
49999  one expect star trek movi high art fan expect ...

[50000 rows x 1 columns]


In [19]:
y = df['sentiment']

print(y)

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object


In [20]:
encoder = LabelEncoder()

y = encoder.fit_transform(y)

print(y)

[1 1 1 ... 0 0 0]


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Bag of Words

In [22]:
count_vectorizer = CountVectorizer()
X_train_bow = count_vectorizer.fit_transform(X_train['review'])
X_test_bow = count_vectorizer.transform(X_test['review'])

#### TF-IDF

In [23]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['review'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['review'])

### Model Training & Evaluation

In [24]:
# Multinomial Naive Bayes
nb_classifier = MultinomialNB()

In [25]:
# Train and predict using Bag of Words
nb_classifier.fit(X_train_bow, y_train)
y_pred_bow = nb_classifier.predict(X_test_bow)

# Evaluate Bag of Words model
print("Bag of Words Classification Report:")
print(classification_report(y_test, y_pred_bow))
print("Accuracy:", accuracy_score(y_test, y_pred_bow))

Bag of Words Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      4961
           1       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Accuracy: 0.8565


In [26]:
# Train and predict using TF-IDF
nb_classifier.fit(X_train_tfidf, y_train)
y_pred_tfidf = nb_classifier.predict(X_test_tfidf)

# Evaluate TF-IDF model
print("\nTF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))


TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.87      0.85      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Accuracy: 0.8622
