In [7]:
import os
import re
from multiprocessing import Pool

import numpy as np
from gensim.models import Word2Vec
from nltk import download
from nltk.corpus import gutenberg
from scipy import spatial

In [9]:
# Fetch the NLTK "book" data collection (a bundle of corpora, as the download log shows).
# NOTE(review): presumably this bundle is what provides the Gutenberg corpus that is
# read right after this cell -- confirm; if so, a narrower download would be faster.
download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown.zip.
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzippi

True

In [10]:
# Load Hamlet as a plain Python list of sentences; each sentence is itself a
# list of word/punctuation tokens.
sentences = [sentence for sentence in gutenberg.sents('shakespeare-hamlet.txt')]

In [11]:
# Sanity check: report the corpus container type and how many sentences it holds.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ('Type of corpus: ', type(sentences)),
            ('Length of corpus: ', len(sentences)),
        )
    ),
    sep='\n',
)

Type of corpus:  <class 'list'>
Length of corpus:  3106


In [12]:
# Peek at a few raw sentences, one per line; index 0 is the header line
# (title, author, and year).
print(*(sentences[k] for k in (0, 1, 10)), sep='\n')

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']
['Actus', 'Primus', '.']
['Fran', '.']


In [13]:
# Preprocess the corpus with the re module:
#   * convert every token to lowercase
#   * drop punctuation, numbers, and any token that is not purely alphabetic
#
# re.fullmatch requires the WHOLE token to consist of ASCII letters. A pattern
# that is only anchored at the start (re.match('^[a-zA-Z]+', ...)) would accept
# any token that merely begins with a letter, e.g. 'a1' or 'word_2'.
# Sentences that contain no alphabetic token are kept as empty lists so that
# sentence indices stay aligned with the raw corpus.

sentences = [
    [word.lower() for word in sentence if re.fullmatch('[a-zA-Z]+', word)]
    for sentence in sentences
]

In [16]:
# Show sentences 0, 1 and 10 again, one per line, to inspect the effect of the
# preprocessing step.
print(*(sentences[k] for k in (0, 1, 10)), sep='\n')

['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']
['actus', 'primus']
['fran']


In [17]:
# Train a Word2Vec model on the preprocessed sentences.
#   size:      dimensionality of the word vectors
#   sg:        training algorithm -- CBOW if 0, skip-gram if 1
#   window:    number of context words considered on each side of the target
#              (with a window size of 3, the 3 words in the left neighborhood and
#              the 3 words in the right neighborhood are considered)
#   min_count: ignore words that occur fewer times than this (1 keeps every word)
#   iter:      number of passes (epochs) over the corpus
#   workers:   number of training threads -- one per CPU core here
#
# The core count comes from os.cpu_count() instead of creating a
# multiprocessing.Pool only to read its private `_processes` attribute: that
# spawned a pool of worker processes that was never closed.
# NOTE: `size` and `iter` are the gensim 3.x parameter names; gensim >= 4.0
# calls them `vector_size` and `epochs`.

model = Word2Vec(
    sentences = sentences,
    size = 100,
    sg = 1,
    window = 3,
    min_count = 1,
    iter = 10,
    workers = os.cpu_count() or 1,
)

In [18]:
# Precompute L2-normalised word vectors. replace = True asks gensim to overwrite
# the raw vectors with the normalised ones.
# NOTE(review): per the gensim docs this saves memory but means the model cannot
# be trained any further afterwards -- confirm that is acceptable before saving.
model.init_sims(replace = True)

In [19]:
# Persist the trained model to 'word2vec_model' (a path relative to the current
# working directory) so it can be reloaded without retraining.
model.save('word2vec_model')

In [20]:
# Reload the model from disk; this rebinds `model` to the deserialised copy and
# doubles as a check that the saved file is readable.
model = Word2Vec.load('word2vec_model')

In [21]:
# Ten nearest neighbours of 'hamlet' by cosine similarity.
# Query through `model.wv` (the KeyedVectors): calling most_similar directly on
# the Word2Vec object is deprecated and was removed in gensim 4.0.
model.wv.most_similar('hamlet')

  """Entry point for launching an IPython kernel.


[('horatio', 0.9978235960006714),
 ('oh', 0.9970029592514038),
 ('deere', 0.9969128370285034),
 ('laertes', 0.9965823888778687),
 ('sweet', 0.996576726436615),
 ('meane', 0.996478796005249),
 ('newes', 0.9964228272438049),
 ('ho', 0.9963844418525696),
 ('mother', 0.996315598487854),
 ('friends', 0.9962758421897888)]

In [33]:
# Embedding vector of the word 'newes'.
# Index `model.wv` rather than the model itself: Word2Vec.__getitem__ is
# deprecated and was removed in gensim 4.0.
model.wv['newes']

  """Entry point for launching an IPython kernel.


array([-0.17700185, -0.01337246,  0.05283739, -0.07893739, -0.08954146,
       -0.06472877,  0.02169863, -0.0685598 , -0.05223832, -0.14221135,
        0.02858667,  0.22449775,  0.02084588,  0.23659717, -0.02166394,
        0.1756883 , -0.08405194,  0.08038558, -0.02777424,  0.20328912,
        0.15305907, -0.13680145,  0.08143277,  0.05294335,  0.06492546,
        0.03371138,  0.04917241,  0.04616524, -0.22632883, -0.05879657,
        0.13974394, -0.04615613, -0.09486467, -0.01979573,  0.05794812,
        0.06483195, -0.01666242, -0.04214269, -0.00246159, -0.05768608,
       -0.08598642, -0.23555246,  0.04228641, -0.1491498 , -0.14082508,
        0.0572255 ,  0.07073558,  0.0869325 , -0.03261286, -0.07107798,
       -0.01986452,  0.04311923, -0.15613984,  0.16595913,  0.00305908,
       -0.10381171,  0.07233582,  0.06295778, -0.01115292,  0.13046473,
       -0.00790747,  0.06434999, -0.04338047, -0.07979916, -0.00706278,
       -0.12311943, -0.15577108,  0.06554712, -0.04228636,  0.09

In [24]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    SciPy provides the cosine *distance*; the similarity is its complement,
    i.e. ``1 - distance``.

    Args:
        v1: First vector (for example, a word embedding).
        v2: Second vector, of the same length as ``v1``.

    Returns:
        One minus SciPy's cosine distance between ``v1`` and ``v2``.
    """
    cosine_distance = spatial.distance.cosine(v1, v2)
    return 1 - cosine_distance

In [25]:
# Look up the embeddings for 'king' and 'queen'.
# Index `model.wv` rather than the model itself: Word2Vec.__getitem__ is
# deprecated and was removed in gensim 4.0.
v3 = model.wv['king']
v4 = model.wv['queen']

  """Entry point for launching an IPython kernel.
  


In [26]:
# Cosine similarity between the two vectors looked up in the previous cell.
cosine_similarity(v1 = v3, v2 = v4)

0.9959511756896973

In [35]:
# Similarity between 'king' and 'newes'.
# Vectors are read from `model.wv`: indexing the Word2Vec object directly is
# deprecated and was removed in gensim 4.0.
cosine_similarity(model.wv['king'], model.wv['newes'])

  """Entry point for launching an IPython kernel.


0.9956969022750854

In [31]:
# Similarity between 'king' and 'mother'.
# Vectors are read from `model.wv`: indexing the Word2Vec object directly is
# deprecated and was removed in gensim 4.0.
cosine_similarity(model.wv['king'], model.wv['mother'])

  """Entry point for launching an IPython kernel.


0.9927608966827393