# Inport the libraries

In [1]:
import tensorflow as tf
print(tf.__version__)

2.11.0


In [2]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [25]:
#import the necessary dependencies
import nltk
import pandas as pd
import gensim
from gensim.models import Word2Vec, keyedvectors 

# Data Preprocessing

In [4]:
#google pretrained model
#https://www.kaggle.com/datasets/rootuser/worldnews-on-reddit

In [5]:
##install the kaggle api
! pip install -q kaggle

In [6]:
#create a directory as kaggle
! mkdir -p ~/.kaggle 

In [7]:
#copy api key to kaggle directory
! cp kaggle.json ~/.kaggle

In [8]:
#disable the api key
! chmod 600 /root/.kaggle/kaggle.json

In [9]:
#import the dataset
! kaggle datasets download -d rootuser/worldnews-on-reddit 

Downloading worldnews-on-reddit.zip to /content
100% 26.6M/26.6M [00:00<00:00, 133MB/s] 
100% 26.6M/26.6M [00:00<00:00, 116MB/s]


In [11]:
#unzip the dataset
! unzip /content/worldnews-on-reddit

Archive:  /content/worldnews-on-reddit.zip
  inflating: reddit_worldnews_start_to_2016-11-22.csv  


In [12]:
#read the csv file
world_news_df= pd.read_csv("/content/reddit_worldnews_start_to_2016-11-22.csv")

In [13]:
#read first five rows
world_news_df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [17]:
#how many rows & columns present
world_news_df.shape

(509236, 8)

In [14]:
#take the columns
news_title=world_news_df['title'].values

In [15]:
news_title

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border', ...,
       'Professor receives Arab Researchers Award',
       'Nigel Farage attacks response to Trump ambassador tweet',
       'Palestinian wielding knife shot dead in West Bank: Israel police'],
      dtype=object)

In [18]:
news_title[:5]

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border',
       'Jump-start economy: Give health care to all ',
       'Council of Europe bashes EU&UN terror blacklist'], dtype=object)

In [16]:
#tokenize the word
new_vec=[nltk.word_tokenize(title) for title in news_title]

In [22]:
new_vec[0]

['Scores', 'killed', 'in', 'Pakistan', 'clashes']

# Build the model

In [26]:
#text, min word count & size of the vector
model= Word2Vec(new_vec,min_count=1,size=32)

In [27]:
model

<gensim.models.word2vec.Word2Vec at 0x7f2097888dc0>

# Predict the output

In [28]:
#find 10 closet words in the vector space that we have created
model.wv.most_similar('man')

[('woman', 0.9760644435882568),
 ('girl', 0.9261178970336914),
 ('boy', 0.9085848331451416),
 ('teenager', 0.8820904493331909),
 ('couple', 0.8768048286437988),
 ('doctor', 0.8589649200439453),
 ('mother', 0.8584377765655518),
 ('teacher', 0.858078122138977),
 ('pair', 0.8167272806167603),
 ('father', 0.8162134289741516)]

In [29]:
## see the vector
model.wv['man']

array([-3.7071738 ,  1.1618817 ,  3.0424178 , -2.602523  , -3.3714185 ,
       -0.6085686 , -2.7443643 , -3.7664673 ,  1.2150885 , -0.33372217,
        7.112428  ,  0.38500306, -1.2666839 ,  3.921967  ,  1.614928  ,
        4.0404983 ,  0.64496917,  2.6381562 , -1.6300232 , -4.274463  ,
        2.2748547 , -2.1007318 ,  0.03498438,  0.68358564,  0.26074117,
        4.4307346 ,  0.6054774 ,  3.1468499 ,  1.6359432 ,  3.8529866 ,
       -1.0981548 ,  0.9625799 ], dtype=float32)

In [31]:
#let us create the relationship
vec= model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.most_similar([vec])

[('religious', 0.6573885679244995),
 ('freedoms', 0.6517857313156128),
 ('women', 0.6355881690979004),
 ('Lecturers', 0.6295549869537354),
 ('rights', 0.6234898567199707),
 ('clerics', 0.6094865798950195),
 ('education', 0.6070539951324463),
 ('unions', 0.5915075540542603),
 ('gender', 0.5909205675125122),
 ('marriages', 0.5905488729476929)]

In [32]:
#relationship
vec= model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])

[('France', 0.8613652586936951),
 ('Belgium', 0.8580750226974487),
 ('Germany', 0.8407019376754761),
 ('Paris', 0.8209952116012573),
 ('Brussels', 0.820104718208313),
 ('Sweden', 0.8132696747779846),
 ('UK', 0.7949399948120117),
 ('Britain', 0.78262859582901),
 ('Switzerland', 0.7391773462295532),
 ('Norway', 0.6927999258041382)]

In [33]:
#relationship
vec= model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

[('napalm', 0.7540183067321777),
 ('resuscitates', 0.7534840106964111),
 ('non-dalit', 0.7415737509727478),
 ('Siachen', 0.7403758764266968),
 ('Belmina', 0.734356701374054),
 ('17-minute', 0.7217748165130615),
 ('Commuted', 0.7207337617874146),
 ('185,000', 0.7194732427597046),
 ('Koppal', 0.7126858830451965),
 ('Iver', 0.7064398527145386)]

#Embeded with Pre-trained Model