In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score 
from sklearn.preprocessing import LabelEncoder

In [2]:
import gensim.downloader as api
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [3]:
wv_pretrained = api.load("word2vec-google-news-300")




In [4]:
wv_pretrained.most_similar(positive=["king","woman"], negative=["man"])

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]

In [5]:
def perform_analogy(word1, word2, word3):
    result = wv_pretrained.most_similar(positive=[word1, word3], negative=[word2])
    return result[0]

analogies = [
    ("king", "man", "woman"),  
    ("brother", "man", "woman"),  
    ("uncle", "man", "woman"),  
    ("nephew", "man", "woman"),  
    ("actor", "man", "woman"),  
    ("hero", "man", "woman") ,
    ("doctor", "hospital", "school"),  # doctor - hospital + school ≈ teacher
    ("painter", "canvas", "stage"),  # painter - canvas + stage ≈ actor
    ("pilot", "airplane", "ship"),  # pilot - airplane + ship ≈ captain
    ("chef", "kitchen", "laboratory"),  # chef - kitchen + laboratory ≈ scientist
    ("author", "book", "song"),  # author - book + song ≈ singer
    ("hitler","Germany", "India")
]

analogy_results = {analogy: perform_analogy(*analogy) for analogy in analogies}

print("Analogy Results:")
for analogy, result in analogy_results.items():
    print(f"{analogy[0]} - {analogy[1]} + {analogy[2]} ≈ {result[0]}")
    print(result)


Analogy Results:
king - man + woman ≈ queen
('queen', 0.7118193507194519)
brother - man + woman ≈ sister
('sister', 0.8103213906288147)
uncle - man + woman ≈ aunt
('aunt', 0.8022665977478027)
nephew - man + woman ≈ niece
('niece', 0.8202236890792847)
actor - man + woman ≈ actress
('actress', 0.8602624535560608)
hero - man + woman ≈ heroine
('heroine', 0.68734210729599)
doctor - hospital + school ≈ guidance_counselor
('guidance_counselor', 0.5969595313072205)
painter - canvas + stage ≈ cabaret_performer
('cabaret_performer', 0.4397791624069214)
pilot - airplane + ship ≈ ships
('ships', 0.4948738217353821)
chef - kitchen + laboratory ≈ lab
('lab', 0.5366887450218201)
author - book + song ≈ anthem
('anthem', 0.5882687568664551)
hitler - Germany + India ≈ gandhi
('gandhi', 0.6194782853126526)


In [6]:
df = pd.read_csv("movie_reviews.csv")
df.shape

(50000, 2)

In [7]:
import string 
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords_and_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

df['cleaned_text'] = df['review'].apply(remove_stopwords_and_punctuation)
df.head()

Unnamed: 0,review,sentiment,cleaned_text
0,One of the other reviewers has mentioned that ...,positive,"[One, reviewers, mentioned, watching, 1, Oz, e..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, br, br, The..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, wonderful, way, spend, time, hot,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, theres, family, little, boy, Jake,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Matteis, Love, Time, Money, visually,..."


In [8]:
def get_embedding(words):
    embedding = [wv_pretrained[word] for word in words if word in wv_pretrained]
    return np.mean(embedding, axis=0)
 
embedding=df['cleaned_text'].apply(get_embedding)  

data=pd.DataFrame(embedding.tolist())
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.050317,0.053415,0.038032,0.076007,-0.053392,-0.002179,0.034439,-0.057433,0.076233,0.097119,...,-0.065795,0.010038,-0.116087,0.017102,-0.040221,-0.023342,0.002918,-0.075081,0.041535,-0.019835
1,0.066517,0.071847,-0.025412,0.046563,-0.047924,0.037785,0.033784,-0.072095,0.089296,0.087283,...,-0.1337,0.024485,-0.059384,0.021904,-0.046019,-0.067627,0.066743,-0.064045,0.041706,-0.014796
2,0.03672,0.048992,-0.00907,0.100696,-0.034379,-0.002581,0.046219,-0.048808,0.087476,0.094753,...,-0.07296,0.044974,-0.114639,0.014727,-0.047793,-0.060343,0.034904,-0.066511,0.021394,-0.018467
3,0.068221,0.031535,-0.022251,0.091936,-0.045524,0.062335,0.051493,-0.050584,0.091021,0.099479,...,-0.106683,0.033853,-0.139438,0.034197,-0.022073,-0.059132,-0.019083,-0.053927,0.019353,0.023621
4,0.039528,0.03358,-0.006584,0.058266,-0.020921,0.010489,0.040179,-0.054073,0.07007,0.041623,...,-0.108452,0.044189,-0.087717,0.020076,-0.043566,-0.03158,0.023033,-0.04116,0.047247,-0.022942


In [9]:
df1=pd.concat([df,data],axis=1)
df1.drop(columns=['cleaned_text'],inplace=True)

In [10]:
le = LabelEncoder()
df1['sentiment'] = le.fit_transform(df1['sentiment'])
X = df1.drop(columns=['review','sentiment'])
y = df1['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")


Accuracy: 0.8511
F1 Score: 0.8527055099416362


In [11]:
sentences = df['cleaned_text'].tolist()

In [12]:
skipgram_model = Word2Vec(
    sentences=sentences,
    sg=1,
    vector_size=50,
    window=5,
    min_count=1,
)

In [13]:
word_vector = skipgram_model.wv['wonderful']
print(f"Vector for 'wonderful': {word_vector}")

Vector for 'wonderful': [ 0.03441551  0.39873645 -0.36233056 -0.02523674  0.02290742 -0.32009062
  0.4691728   1.0497317  -0.87000555 -0.21361372 -0.07528584 -0.5301443
 -0.5063691  -0.16853376  0.05286307  0.22423881  0.7701386   0.11465402
 -0.66667956 -0.92209643  0.17873845  0.14453901  0.9097889  -0.77363205
  0.4593626   0.2826991  -0.12487649  0.05664964 -0.03575842  0.09887584
  0.4279547   0.00762904 -0.06376167  0.21833904 -0.5457569   0.02686579
  0.44380885 -0.13114996 -0.11421734 -0.33974338  0.6559251  -0.15064533
 -0.6525172   0.3062481  -0.11914285 -0.13604423  0.0116007  -0.7699943
 -0.0292401  -0.08773963]


In [14]:
skipgram_model.save("skipgram_model.model")

In [15]:
loaded_model = Word2Vec.load("skipgram_model.model")
loaded_model=loaded_model.wv

In [16]:
def skip_embedding(words):
    embedding = [loaded_model[word] for word in words if word in loaded_model]
    return np.mean(embedding, axis=0)
 
embedding = df['cleaned_text'].apply(skip_embedding)  
 
 
data=pd.DataFrame(embedding.tolist())
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.063343,0.244839,-0.067982,0.060084,-0.073624,-0.163179,0.426667,0.615599,-0.815102,-0.135929,...,0.336702,-0.117952,0.092497,-0.126463,0.743028,0.172608,-0.287929,-0.291094,0.174713,0.278893
1,-0.024392,0.182389,-0.190436,0.03805,-0.013198,-0.127765,0.310726,0.711857,-0.771701,-0.077946,...,0.477269,-0.084128,0.02273,0.027071,0.610105,0.214239,-0.270501,-0.360278,0.157368,0.226567
2,-0.016882,0.198719,-0.052379,-0.007732,-0.017361,-0.166297,0.45706,0.694028,-0.857787,-0.102636,...,0.328677,-0.170674,0.056612,-0.063626,0.601959,0.167699,-0.355524,-0.40406,0.14956,0.238432
3,-0.066131,0.338758,-0.102425,0.079144,-0.067801,-0.195056,0.429991,0.643728,-0.82605,-0.125206,...,0.348844,-0.225357,0.070194,-0.077345,0.750696,0.202124,-0.331156,-0.364595,0.164018,0.21941
4,-0.037951,0.254834,-0.031797,0.021243,-0.03411,-0.236147,0.393718,0.707003,-0.796103,-0.10475,...,0.389474,-0.130841,0.086422,-0.038045,0.558677,0.231148,-0.183375,-0.348792,0.174064,0.197762


In [17]:
df2=pd.concat([df,data],axis=1)
df2.drop(columns=['cleaned_text'],inplace=True)

In [18]:
le = LabelEncoder()
df2['sentiment'] = le.fit_transform(df2['sentiment'])
X = df2.drop(columns=['review','sentiment'])
y = df2['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.8655
F1 Score: 0.8670554512207176


In [19]:
cbow_model = Word2Vec(
    sentences=sentences,
    sg=0,
    vector_size=50,
    window=5,
    min_count=1,
)

In [20]:
cbow_model.save("cbow_model.model")

In [21]:
cbow_model = Word2Vec.load("cbow_model.model")
loaded_model=cbow_model.wv

In [22]:
word_vector = cbow_model.wv['wonderful']
print(f"Vector for 'wonderful': {word_vector}")

Vector for 'wonderful': [-0.81846875 -0.9106931  -2.3695414  -2.097507    0.21988308 -1.7993802
  2.1262617   1.2597257  -0.5110926  -2.1595979  -1.1689672  -1.8074325
 -1.5526415   0.7066251   1.4114614   0.72903687  1.7051877   2.7015388
 -0.81129    -0.30341464 -2.116086   -1.7197975   1.390849   -2.229772
 -3.05413     2.3331795   2.5287955   0.07866892  2.693956    2.149588
  0.4201168   2.025552    1.3107519   2.9648268  -0.74227196  1.06344
  2.2967203  -2.003037    1.4405121  -1.1280355   2.6575353  -0.04896227
 -3.2848303   1.1505247  -0.6700001  -2.3484464   1.965812   -1.2625608
 -1.7398496  -0.8539397 ]


In [23]:
def cbow_embedding(words):
    embedding = [loaded_model[word] for word in words if word in loaded_model]
    return np.mean(embedding, axis=0)
 
embedding = df['cleaned_text'].apply(cbow_embedding)  
 
data=pd.DataFrame(embedding.tolist())
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.009375,-0.083878,0.129996,0.190679,-0.227386,-0.037609,0.015466,1.251132,-0.772087,-0.46409,...,1.001043,0.352904,0.214728,-0.456391,1.062356,-0.306075,0.248367,-0.412648,-0.145744,0.44169
1,-0.42908,-0.30083,-0.424902,0.210352,-0.537999,-0.112745,0.222454,0.838348,-0.655557,-0.321907,...,1.780913,-0.021433,-0.293082,0.034497,0.879431,-0.125925,0.680777,-0.777004,-0.667192,0.457551
2,-0.027704,-0.14517,-0.051468,0.036498,-0.435205,-0.130171,0.443805,1.297957,-0.611076,-0.516509,...,1.636828,0.180638,0.389332,-0.41312,0.601427,-0.60108,0.362092,-0.735718,-0.520859,0.209026
3,-0.041352,0.026649,0.125734,0.032092,-0.056897,-0.194724,0.402916,0.898428,-0.599401,-0.397633,...,1.668607,-0.232929,0.333019,-0.748645,1.197575,-0.456281,0.755339,-1.138464,-0.3522,0.395871
4,-0.661996,-0.455954,0.045143,-0.236234,-0.483727,-0.391741,-0.042647,1.01938,-0.565574,-0.569736,...,1.673474,0.069063,0.308174,-0.247994,0.665041,-0.14075,0.903648,-0.702217,-0.507646,0.281876


In [24]:
df3=pd.concat([df,data],axis=1)
df3.drop(columns=['cleaned_text'],inplace=True)

In [25]:
le = LabelEncoder()
df3['sentiment'] = le.fit_transform(df3['sentiment'])
X = df3.drop(columns=['review','sentiment'])
y = df3['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.842
F1 Score: 0.8439660280466127
