### Load Gensim Library

In [1]:
#!pip install gensim



In [17]:
import gensim
import warnings
warnings.filterwarnings('ignore')

In [18]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [19]:
#This is needed only if you have uploaded data to Google drive
from google.colab import drive
drive.mount('/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
import pandas as pd

#change file path to point to where you have stored the zip file.
#df = pd.read_csv('/gdrive/My Drive/Statistical NLP AIML/unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)
df = pd.read_csv('unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)
print('Number of examples in Dataset: ', df.shape)
df.head()

### Function to Clean up data

In [20]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

### Clean the Data using routine above

In [21]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


In [22]:
df['clean_review'][0].split(' ')

['watching',
 'time',
 'chasers',
 'it',
 'obvious',
 'that',
 'it',
 'was',
 'made',
 'by',
 'a',
 'bunch',
 'of',
 'friends',
 'maybe',
 'they',
 'were',
 'sitting',
 'around',
 'one',
 'day',
 'in',
 'film',
 'school',
 'and',
 'said',
 'hey',
 'let',
 's',
 'pool',
 'our',
 'money',
 'together',
 'and',
 'make',
 'a',
 'really',
 'bad',
 'movie',
 'or',
 'something',
 'like',
 'that',
 'what',
 'ever',
 'they',
 'said',
 'they',
 'still',
 'ended',
 'up',
 'making',
 'a',
 'really',
 'bad',
 'movie',
 'dull',
 'story',
 'bad',
 'script',
 'lame',
 'acting',
 'poor',
 'cinematography',
 'bottom',
 'of',
 'the',
 'barrel',
 'stock',
 'music',
 'etc',
 'all',
 'corners',
 'were',
 'cut',
 'except',
 'the',
 'one',
 'that',
 'would',
 'have',
 'prevented',
 'this',
 'film',
 's',
 'release',
 'life',
 's',
 'like',
 'that']

### Convert Review to a Word List

In [23]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [24]:
print(len(documents[0]))

90


In [25]:
len(documents[108])

121

### Build the Model

In [28]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=5, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPU Cores
                               vector_size=300,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               epochs=20   #Number of iterations over the text corpus
                              )  

2021-07-04 12:31:12,758 : INFO : collecting all words and their counts
2021-07-04 12:31:12,758 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-07-04 12:31:13,195 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2021-07-04 12:31:13,616 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2021-07-04 12:31:14,096 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2021-07-04 12:31:14,648 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2021-07-04 12:31:15,074 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2021-07-04 12:31:15,074 : INFO : Creating a fresh vocabulary
2021-07-04 12:31:15,256 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 39730 unique words (39.54060052349247%% of original 100479, drops 60749)', 'datetime': '2021-07-04T12:3

2021-07-04 12:31:52,159 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-07-04 12:31:52,162 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-07-04 12:31:52,163 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-07-04 12:31:52,174 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-07-04 12:31:52,175 : INFO : EPOCH - 4 : training on 12084660 raw words (8901430 effective words) took 7.7s, 1155438 effective words/s
2021-07-04 12:31:53,187 : INFO : EPOCH 5 - PROGRESS: at 12.08% examples, 1056312 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:31:54,194 : INFO : EPOCH 5 - PROGRESS: at 23.85% examples, 1044644 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:31:55,196 : INFO : EPOCH 5 - PROGRESS: at 35.14% examples, 1033011 words/s, in_qsize 8, out_qsize 0
2021-07-04 12:31:56,204 : INFO : EPOCH 5 - PROGRESS: at 47.60% examples, 1054420 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:31:57,206 : INFO : EPOCH 5 

2021-07-04 12:32:42,120 : INFO : EPOCH 10 - PROGRESS: at 19.18% examples, 844363 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:32:43,121 : INFO : EPOCH 10 - PROGRESS: at 31.46% examples, 926319 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:32:44,126 : INFO : EPOCH 10 - PROGRESS: at 43.30% examples, 960733 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:32:45,135 : INFO : EPOCH 10 - PROGRESS: at 55.86% examples, 991576 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:32:46,135 : INFO : EPOCH 10 - PROGRESS: at 68.23% examples, 1010788 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:32:47,139 : INFO : EPOCH 10 - PROGRESS: at 81.10% examples, 1027938 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:32:48,140 : INFO : EPOCH 10 - PROGRESS: at 93.60% examples, 1037708 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:32:48,693 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-07-04 12:32:48,694 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-07-04 12:32:

2021-07-04 12:33:35,174 : INFO : EPOCH 15 - PROGRESS: at 98.50% examples, 1091238 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:33:35,287 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-07-04 12:33:35,294 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-07-04 12:33:35,296 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-07-04 12:33:35,303 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-07-04 12:33:35,304 : INFO : EPOCH - 15 : training on 12084660 raw words (8900328 effective words) took 8.2s, 1090116 effective words/s
2021-07-04 12:33:36,309 : INFO : EPOCH 16 - PROGRESS: at 10.48% examples, 924927 words/s, in_qsize 8, out_qsize 0
2021-07-04 12:33:37,324 : INFO : EPOCH 16 - PROGRESS: at 22.32% examples, 975758 words/s, in_qsize 6, out_qsize 1
2021-07-04 12:33:38,329 : INFO : EPOCH 16 - PROGRESS: at 31.70% examples, 928685 words/s, in_qsize 7, out_qsize 0
2021-07-04 12:33:39,339 : INFO : EPOCH 

2021-07-04 12:34:25,467 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-07-04 12:34:25,474 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-07-04 12:34:25,481 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-07-04 12:34:25,484 : INFO : EPOCH - 20 : training on 12084660 raw words (8900861 effective words) took 14.4s, 618216 effective words/s
2021-07-04 12:34:25,485 : INFO : Word2Vec lifecycle event {'msg': 'training on 241693200 raw words (178016766 effective words) took 189.6s, 939058 effective words/s', 'datetime': '2021-07-04T12:34:25.485728', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
2021-07-04 12:34:25,485 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=39730, vector_size=300, alpha=0.025)', 'datetime': '2021-07-04T12:34:25.485728', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2

# Exploring the model

### How many words in the model

### Get an embedding for a word

In [32]:
model.wv['great']

array([-1.3748035 ,  0.40693563,  0.17694889,  1.6558163 , -1.2846459 ,
       -1.347553  , -0.8702519 ,  2.4961493 ,  0.6487469 ,  0.47790593,
       -0.6889029 ,  1.7646244 , -0.60635936, -1.1106366 , -0.77722454,
       -1.0976961 ,  0.37087515,  0.354815  ,  1.1730213 ,  2.9021182 ,
       -2.8885605 ,  0.93890285,  0.48709238,  0.02864273,  0.31310117,
       -1.9950837 ,  1.835493  , -0.33238164,  0.8860662 , -1.1117389 ,
        0.61216503, -0.2104881 ,  2.2876387 ,  0.66033214, -0.45275614,
       -0.02361203,  1.3707657 ,  1.607087  , -1.8434318 , -1.3194755 ,
       -0.642264  ,  0.883949  ,  0.3474199 ,  1.5869032 , -1.1017574 ,
       -0.8828091 ,  0.9528602 ,  2.0473504 ,  0.5597224 ,  0.36629182,
       -0.6778213 , -0.93309855, -1.1415274 , -1.9782115 , -0.11496329,
       -0.373279  ,  0.78701484,  0.964797  , -0.17658205,  0.2471578 ,
        0.847256  , -0.2285553 ,  1.7269222 ,  0.86046946,  1.2873129 ,
       -0.35332093, -1.2887973 ,  1.0080508 ,  1.3432077 , -0.59

In [33]:
model.wv['amazing']

array([-1.9704866 ,  2.0275326 , -0.4924014 ,  0.4802158 , -0.29558533,
       -0.6489331 ,  0.93969667,  2.1106415 , -0.3543536 ,  0.75575936,
       -0.65854883,  1.2827026 , -0.20587799, -2.235863  ,  1.1926695 ,
        0.06958152,  0.20748693, -0.3236101 ,  0.57620186,  0.7624651 ,
       -1.8098365 ,  2.9703863 , -0.4788256 ,  0.8353299 , -0.03560901,
       -1.212328  ,  2.1461914 , -0.56034744,  2.956667  , -0.6205489 ,
       -0.1705835 ,  1.1124654 ,  1.9235274 ,  0.94449544, -1.1804732 ,
        0.11251862,  0.582796  ,  0.2738058 , -0.19790131,  0.34085554,
       -0.8484373 ,  0.8362351 , -1.1548775 ,  0.6066461 ,  0.43042344,
        1.0372915 , -0.17820224,  0.5365912 , -1.127135  , -0.6205496 ,
       -1.9730452 , -0.29284242,  0.55407596, -0.03047536, -0.43120968,
       -0.68871284, -1.0005534 ,  0.48866707,  1.3917851 ,  0.6315337 ,
       -0.12639649, -0.8131704 ,  3.0814626 ,  1.7588593 ,  1.8616524 ,
        0.58225167,  0.83807224,  0.5808407 ,  0.7534169 ,  0.86

### Finding Words which have similar meaning

In [40]:
model.wv.most_similar('amazing')

[('incredible', 0.7218039035797119),
 ('awesome', 0.6939853429794312),
 ('outstanding', 0.6379446983337402),
 ('excellent', 0.6205117702484131),
 ('fantastic', 0.6204419136047363),
 ('astounding', 0.6135815978050232),
 ('exceptional', 0.6026242971420288),
 ('wonderful', 0.5866407752037048),
 ('astonishing', 0.573358416557312),
 ('great', 0.5440261363983154)]

### Find the word which is not like others

### Saving the model

In [42]:
model.save('word2vec-movie-50')

2021-07-04 12:38:54,772 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec-movie-50', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-07-04T12:38:54.772816', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'saving'}
2021-07-04 12:38:54,773 : INFO : storing np array 'vectors' to word2vec-movie-50.wv.vectors.npy
2021-07-04 12:38:54,827 : INFO : storing np array 'syn1neg' to word2vec-movie-50.syn1neg.npy
2021-07-04 12:38:54,887 : INFO : not storing attribute cum_table
2021-07-04 12:38:54,910 : INFO : saved word2vec-movie-50


In [24]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2021-06-13 06:59:02,429 : INFO : loading Word2Vec object from word2vec-movie-50
2021-06-13 06:59:02,532 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2021-06-13 06:59:02,534 : INFO : setting ignored attribute vectors_norm to None
2021-06-13 06:59:02,539 : INFO : loading vocabulary recursively from word2vec-movie-50.vocabulary.* with mmap=None
2021-06-13 06:59:02,542 : INFO : loading trainables recursively from word2vec-movie-50.trainables.* with mmap=None
2021-06-13 06:59:02,543 : INFO : setting ignored attribute cum_table to None
2021-06-13 06:59:02,548 : INFO : loaded word2vec-movie-50


1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [43]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('princess', 0.3559854030609131),
 ('commoner', 0.33295556902885437),
 ('camelot', 0.31230178475379944),
 ('kings', 0.3097738027572632),
 ('queen', 0.3037213385105133),
 ('throne', 0.3017286956310272),
 ('mistress', 0.30098479986190796),
 ('prince', 0.29532286524772644),
 ('flavia', 0.29517504572868347),
 ('guenevere', 0.29114076495170593)]

In [44]:
model.wv.most_similar(positive=['woman', 'hero'], negative=['man'], topn=5)

[('heroine', 0.5597978830337524),
 ('protagonist', 0.43526992201805115),
 ('girl', 0.3650706112384796),
 ('brunette', 0.32930877804756165),
 ('fatale', 0.32776519656181335)]

In [48]:
model.wv.most_similar(positive=['woman','father'], negative=['man'])

[('daughter', 0.6057916283607483),
 ('mother', 0.5633425712585449),
 ('sister', 0.5373031497001648),
 ('wife', 0.53084796667099),
 ('mom', 0.5166723728179932),
 ('aunt', 0.5162851810455322),
 ('parents', 0.5048230886459351),
 ('grandmother', 0.5040150284767151),
 ('spouse', 0.498273104429245),
 ('mum', 0.4803133010864258)]

In [49]:
model.wv['king'] + model.wv['man'] - model.wv['queen']

array([-2.2852557 ,  3.4312801 ,  3.6474965 , -1.308317  , -2.2951822 ,
        0.89837897, -3.431485  ,  0.15043867,  2.2970724 ,  1.0535132 ,
        0.29580843,  1.4091816 ,  1.4258128 ,  0.50019705, -0.7747109 ,
        2.073884  , -1.5886389 ,  0.48537135, -2.189196  ,  1.3262185 ,
        0.6093854 , -1.3228862 , -0.9547868 , -0.56881094, -0.29576302,
        0.02925873,  0.16643918,  1.698378  , -0.9380687 ,  2.0002759 ,
        0.14137068, -2.3957534 , -0.81501937,  1.7643278 ,  0.3247606 ,
       -3.0619738 ,  2.9473891 ,  0.9710996 ,  1.5182089 ,  0.7880571 ,
       -1.5941441 ,  4.163824  , -0.20149136,  2.2668216 ,  2.432766  ,
        1.1591753 , -2.5121155 ,  2.5135255 , -1.7075282 , -4.067596  ,
       -0.05669153, -1.6587718 , -3.1284475 ,  1.8263104 , -2.815324  ,
       -2.9794335 ,  0.65219694, -0.43006563,  0.21741545,  1.9882355 ,
       -0.04917753,  0.5129795 ,  1.036392  , -1.5877602 , -0.60934615,
       -0.62659305, -2.0203156 , -0.6628905 , -1.1472855 ,  1.01