In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk import WordNetLemmatizer, SnowballStemmer
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
movies = pd.read_csv("IMDB Dataset.csv")
movies.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
movies["sentiment"].nunique()

2

In [5]:
 # balanced dataset
movies["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
movies["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## Text preprocessing

In [7]:
movies["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## testing TF-IDF / Bag of words vectorization



In [4]:
from tqdm import tqdm

In [5]:
# Initialize stemmer and stopwords set
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))
corpus = []

# Use tqdm for a simple iterable loop
for review in tqdm(movies["review"].values, desc="Processing Reviews"):
    # Your review processing code here
    review = re.sub(r"[^a-zA-Z0-9]", " ", review.lower())
    tokens = nltk.word_tokenize(review)
    sent = [stemmer.stem(word) for word in tokens if word not in stop_words]
    corpus.append(sent)

print(f"Processed {len(corpus)} reviews!")

Processing Reviews: 100%|██████████| 50000/50000 [02:18<00:00, 360.62it/s]

Processed 50000 reviews!





In [18]:
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , classification_report

def naive_bayes_model(X1 , y1):
    X_train, X_test, y_train, y_test =  train_test_split(X1, y1, test_size=0.2, random_state=0)
    bayes_classifier = MultinomialNB().fit(X_train, y_train)
    y_pred = bayes_classifier.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    return bayes_classifier
    

In [72]:
# for continus features
from sklearn.naive_bayes import GaussianNB
def Gaussian_naive_bayes_model(X1, y1):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=0)
    Gaussian_bayes_classifier = GaussianNB().fit(X_train, y_train)  # Use GaussianNB instead of MultinomialNB
    y_pred = Gaussian_bayes_classifier.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    return Gaussian_bayes_classifier

In [10]:
y1 = pd.get_dummies(movies["sentiment"])
y1= y1.iloc[:,1].values
y1

array([ True,  True,  True, ..., False, False, False])

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X1 = cv.fit_transform(movies["review"])
print(X1.shape)
cv.vocabulary_

(50000, 101895)


{'one': 64131,
 'of': 63757,
 'the': 90160,
 'other': 64776,
 'reviewers': 75511,
 'has': 40745,
 'mentioned': 57558,
 'that': 90137,
 'after': 2970,
 'watching': 98226,
 'just': 48473,
 'oz': 65469,
 'episode': 30113,
 'you': 101096,
 'll': 53223,
 'be': 8758,
 'hooked': 42850,
 'they': 90347,
 'are': 5707,
 'right': 75915,
 'as': 6166,
 'this': 90455,
 'is': 46765,
 'exactly': 30975,
 'what': 98847,
 'happened': 40445,
 'with': 99740,
 'me': 56982,
 'br': 12041,
 'first': 33526,
 'thing': 90399,
 'struck': 86597,
 'about': 1866,
 'was': 98149,
 'its': 46954,
 'brutality': 12974,
 'and': 4541,
 'unflinching': 94620,
 'scenes': 78746,
 'violence': 97142,
 'which': 98951,
 'set': 80312,
 'in': 44763,
 'from': 35408,
 'word': 100095,
 'go': 37795,
 'trust': 92962,
 'not': 62917,
 'show': 81437,
 'for': 34443,
 'faint': 31918,
 'hearted': 41171,
 'or': 64417,
 'timid': 90979,
 'pulls': 71592,
 'no': 62551,
 'punches': 71639,
 'regards': 74108,
 'to': 91217,
 'drugs': 27443,
 'sex': 80408,

In [38]:
naive_bayes_model(X1,y1)

0.8442
              precision    recall  f1-score   support

       False       0.82      0.88      0.85      5035
        True       0.87      0.81      0.84      4965

    accuracy                           0.84     10000
   macro avg       0.85      0.84      0.84     10000
weighted avg       0.85      0.84      0.84     10000



In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X2 = tfidf.fit_transform(movies["review"])
print(X2.shape)
X2

(50000, 101895)


<50000x101895 sparse matrix of type '<class 'numpy.float64'>'
	with 6826529 stored elements in Compressed Sparse Row format>

In [42]:
naive_bayes_model(X2 , y1)

0.8618
              precision    recall  f1-score   support

       False       0.85      0.89      0.87      5035
        True       0.88      0.84      0.86      4965

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



# Testing Word2Vec

In [6]:
import gensim

In [20]:
# training Word2vec cbow model  from scratch 
model = gensim.models.Word2Vec(corpus , min_count= 2 , window=5 ,vector_size=150) 

In [21]:
# Extracting the embedding matrix
embedding_matrix = np.zeros((len(model.wv), model.vector_size))

# Populate the embedding matrix
for i, word in enumerate(model.wv.index_to_key):
    embedding_matrix[i] = model.wv[word]
embedding_matrix[0]

array([-1.45579875e+00,  5.21815717e-01,  6.49187863e-02,  3.64712358e-01,
       -5.77859163e-01,  3.76251072e-01, -9.71547049e-03,  1.58895743e+00,
       -1.17577851e+00,  1.05337453e+00,  1.13063037e+00, -1.22830319e+00,
       -4.59179193e-01,  9.55084920e-01, -8.76758039e-01, -4.59074497e-01,
        4.16691780e-01,  5.54818630e-01, -1.02391028e+00, -1.33414280e+00,
        2.86422342e-01, -8.78238440e-01,  5.37320912e-01, -3.86113822e-01,
       -4.09302533e-01, -9.09711346e-02, -5.51654577e-01, -1.67350173e-01,
        3.74846250e-01, -7.64800370e-01, -2.36731827e-01,  1.02887833e+00,
        4.67065573e-01, -3.86504799e-01,  5.79262674e-01,  6.81369454e-02,
        3.44045907e-01,  9.43597183e-02, -2.33599156e-01,  1.90742582e-01,
        1.67468712e-01,  7.44840086e-01,  6.86025560e-01, -2.85680085e-01,
       -4.12607670e-01, -4.18745816e-01, -9.76781487e-01,  6.80891991e-01,
       -7.66358197e-01, -3.95425456e-03, -1.10035336e+00,  1.47772223e-01,
        1.44287229e+00,  

In [14]:
model.wv.most_similar("love")

[('ador', 0.5009018182754517),
 ('hate', 0.4869915544986725),
 ('asleep', 0.4843272566795349),
 ('touch', 0.47324052453041077),
 ('lover', 0.4598483741283417),
 ('friendship', 0.4596560001373291),
 ('happi', 0.4467249810695648),
 ('romanc', 0.4459366500377655),
 ('apart', 0.4414093792438507),
 ('sweet', 0.4391103684902191)]

In [15]:
model.wv.most_similar("hate")

[('dislik', 0.662932813167572),
 ('despis', 0.6575043201446533),
 ('complain', 0.5160329341888428),
 ('offend', 0.5131930112838745),
 ('disagre', 0.4923933148384094),
 ('love', 0.4869915246963501),
 ('afraid', 0.48119625449180603),
 ('spoil', 0.4759232997894287),
 ('suck', 0.47455722093582153),
 ('think', 0.47109416127204895)]

In [16]:
# vocabulary
model.wv.index_to_key

['br',
 'movi',
 'film',
 'one',
 'like',
 'time',
 'good',
 'make',
 'charact',
 'see',
 'get',
 'watch',
 'even',
 'stori',
 'would',
 'realli',
 'well',
 'scene',
 'look',
 'show',
 'bad',
 'much',
 'end',
 'great',
 'peopl',
 'love',
 'go',
 'also',
 'first',
 'think',
 'act',
 'play',
 'way',
 'thing',
 'made',
 'could',
 'know',
 'say',
 'seem',
 'work',
 'plot',
 'actor',
 'two',
 'mani',
 'seen',
 'come',
 'year',
 'want',
 'take',
 'never',
 'life',
 'best',
 'tri',
 'littl',
 'ever',
 'man',
 'better',
 'give',
 'still',
 'find',
 'perform',
 'part',
 'feel',
 'use',
 'someth',
 'director',
 'actual',
 'back',
 'interest',
 'lot',
 'real',
 'guy',
 'old',
 'funni',
 'cast',
 'though',
 'live',
 '10',
 'anoth',
 'music',
 'enjoy',
 'star',
 'noth',
 'role',
 'start',
 'new',
 'point',
 'set',
 'everi',
 'girl',
 'day',
 'believ',
 'direct',
 'world',
 'origin',
 'turn',
 'thought',
 'horror',
 'quit',
 'comedi',
 'minut',
 'kill',
 'us',
 'fact',
 'pretti',
 'effect',
 'got',


In [17]:
# every word is associated to a 150 dim vector thanx to the embedding matrix W
print(model.wv["love"].shape)
model.wv["love"]

(150,)


array([ 1.1522502e+00, -1.6956602e+00, -6.2574118e-01,  4.0677778e-02,
       -1.9096283e+00, -4.8479107e-01, -1.5280989e+00, -2.7071598e-01,
        1.2877877e-01, -4.0396857e-01, -1.3128494e-01, -6.1893322e-02,
        1.3506856e+00,  1.2247601e+00, -9.3067312e-01, -1.2340944e-02,
       -2.3092160e+00,  3.1086162e-01, -4.7584733e-01,  2.7380252e-01,
       -5.2389777e-01,  6.4011657e-01,  3.0424383e+00, -8.9927459e-01,
       -1.8285276e+00,  1.0825268e+00,  7.3879802e-01, -7.3065303e-02,
       -1.1666458e+00,  1.1414434e+00,  1.5594392e+00, -2.7168099e-02,
       -1.4950775e+00, -7.4620652e-01,  1.2359978e+00, -1.3189460e-01,
        8.9568239e-01, -3.1382504e-01, -4.7846162e-01, -9.3360180e-01,
        7.9774147e-01,  8.2574296e-01, -2.4820788e-01, -7.1443957e-01,
        1.0444144e+00,  1.9651921e+00,  1.3165137e+00, -1.3672788e+00,
       -7.6065439e-01,  3.7811583e-01, -1.0270007e+00, -1.0428925e+00,
        8.7765448e-02,  9.5212877e-01, -1.5604712e+00,  1.3940193e-01,
      

In [18]:
# Transforming each sentence into a vector by averaging its word embeddings to address the classification problem.
def avg_word2vec(model,sent) : 
     return np.mean([model.wv[word] for word in sent if word in model.wv.index_to_key ],axis=0)

In [68]:
X = np.array([avg_word2vec(model, sent) for sent in tqdm(corpus, desc="Processing Sentences")])

Processing Sentences: 100%|██████████| 50000/50000 [07:59<00:00, 104.24it/s]


In [70]:
X

array([[-3.2805690e-01,  1.1576964e-01,  2.2945741e-02, ...,
        -5.1859516e-01,  5.4574471e-02, -4.0319967e-01],
       [-1.8941358e-01, -1.2645641e-01,  1.5103716e-01, ...,
        -6.6748393e-01,  9.0746731e-02, -3.5737920e-01],
       [-2.9284549e-01, -2.9590022e-04, -4.8942544e-02, ...,
        -3.8965854e-01,  2.6219344e-02, -6.7498702e-01],
       ...,
       [-2.6333302e-02,  2.5525552e-01, -1.2062365e-01, ...,
        -3.9388487e-01, -6.4300254e-02, -5.6986189e-01],
       [-2.9674369e-01, -3.5446901e-02,  1.2522838e-01, ...,
        -4.5978647e-01,  2.5025839e-01, -2.6778144e-01],
       [-5.2191299e-01,  1.8569449e-01,  3.2205856e-01, ...,
        -2.3709612e-01,  3.7495235e-01, -8.8911486e-01]], dtype=float32)

In [73]:
Gaussian_naive_bayes_model(X , y1)

Accuracy: 0.7635
              precision    recall  f1-score   support

       False       0.77      0.75      0.76      5035
        True       0.76      0.77      0.76      4965

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000



######  Word2Vec + GaussianNB (Poor Accuracy)
###### ❌ Why does this perform worse?
###### Word2Vec embeddings are continuous, dense, and do NOT represent word frequencies.
###### GaussianNB assumes feature independence and a Gaussian (normal) distribution, but Word2Vec embeddings don’t follow this assumption.
###### Word2Vec captures semantic meaning rather than simple term importance, which is not ideal for Naive Bayes.

In [28]:
from sklearn.linear_model import LogisticRegression
def logistic_regression_model(X1, y1):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=0)
    log_reg_classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    y_pred = log_reg_classifier.predict(X_test)
    
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    return log_reg_classifier


In [29]:
from sklearn.svm import SVC
def svm_model(X1, y1):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=0)
    svm_classifier = SVC(kernel='linear').fit(X_train, y_train)  # Linear kernel
    y_pred = svm_classifier.predict(X_test)
    
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    return svm_classifier

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

def random_forest_model(X1, y1):
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=0)
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    return rf_classifier


In [92]:
random_forest_model(X, y1)

Unnamed: 0,precision,recall,f1-score,support,model
False,0.848585,0.815889,0.831916,5035.0,Random Forest
True,0.820314,0.852367,0.836033,4965.0,Random Forest
accuracy,0.834,0.834,0.834,0.834,Random Forest
macro avg,0.83445,0.834128,0.833974,10000.0,Random Forest
weighted avg,0.834548,0.834,0.83396,10000.0,Random Forest


In [91]:
print("SVM model :/n")
svm_model(X, y1)

SVM model :/n


Unnamed: 0,precision,recall,f1-score,support,model
False,0.865791,0.863555,0.864671,5035.0,SVM
True,0.861993,0.86425,0.86312,4965.0,SVM
accuracy,0.8639,0.8639,0.8639,0.8639,SVM
macro avg,0.863892,0.863902,0.863896,10000.0,SVM
weighted avg,0.863905,0.8639,0.863901,10000.0,SVM


In [90]:
print("Logistic Regression model :/n")
logistic_regression_model(X, y1)

Logistic Regression model :/n
0.8648
              precision    recall  f1-score   support

       False       0.86      0.87      0.87      5035
        True       0.87      0.86      0.86      4965

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

