Step1: Import libraries

In [1]:
import pandas as pd
import re
import string
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df_movie=pd.read_csv('/content/drive/MyDrive/Projects/IMDB Dataset.csv')

In [8]:
df_movie.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
df_movie['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


Step2: Cleaning of data

In [10]:
import re

def clean_text(review):
    review = review.lower()  # Convert to lowercase

    review = re.sub(r"\n", " ", review)  # Replace newline characters with space
    review = re.sub(r"https?://\S+|www\.\S+", "", review)  # Remove URLs
    review = re.sub(r"<.*?>", "", review)  # Remove HTML tags
    review = re.sub(r"[^\w\s]", "", review)  # Remove punctuation
    review = re.sub(r"\s+", " ", review).strip()  # Remove extra whitespace
    review = re.sub(r'\w*\d\w*', '', review)  # Remove words with numbers (e.g. abc123)

    return review


In [11]:
df_movie['review']=df_movie['review'].apply(clean_text)

Step3: PreProcess Text

In [21]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [22]:
stop_words=set(stopwords.words('english'))
stemming=PorterStemmer()
Lemitizer=WordNetLemmatizer()

In [23]:
def preprocess(review):
  words=word_tokenize(review)
  words=[Lemitizer.lemmatize(stemming.stem(word)) for word in words if word not in stop_words]
  return " ".join(words)

In [24]:
df_movie['review']=df_movie['review'].apply(preprocess)

Step3: Feature Extraction

In [25]:
df_movie['sentiment_numeric'] = df_movie['sentiment'].map({'negative': 0, 'positive': 1})

In [26]:
df_movie.head()

Unnamed: 0,review,sentiment,sentiment_numeric
0,one review mention watch oz episod youll hook ...,positive,1
1,wonder littl product film techniqu unassum old...,positive,1
2,thought wonder way spend time hot summer weeke...,positive,1
3,basic there famili littl boy jake think there ...,negative,0
4,petter mattei love time money visual stun film...,positive,1


In [27]:
vector=TfidfVectorizer()
X_tfidf=vector.fit_transform(df_movie['review'])
y_tfidf=df_movie['sentiment_numeric']

Step4: Train Test Split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_tfidf, test_size=0.2, random_state=42)

In [29]:
# Train Naïve Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")
print(f"F1-score: {f1:.2f}")

Model Accuracy: 0.86
F1-score: 0.86
