# Introduction

In [None]:
"""
What? Training embedding

Word embeddings are an approach to representing text in NLP. In this notebook we will demonstrate how to train 
embeddings using Gensim.

Reference: https://github.com/practical-nlp/practical-nlp/blob/master/Ch3/06_Training_embeddings_using_gensim.ipynb
           Harshit Surana, Practical Natural Language Processing
"""

# Import libraries/modules

In [32]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')
import os
import requests
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
import time

# Define training data

In [None]:
"""
Gensim's Word2Vec expects the training data as a 'list of lists': the corpus is a list of documents, and 
every document is itself a list containing the tokens of that document.
"""

In [45]:
# Toy corpus: one inner list of tokens per document.
corpus = [
    ["dog", "bites", "man"],
    ["man", "bites", "dog"],
    ["dog", "eats", "meat"],
    ["man", "eats", "food"],
]
corpus

[['dog', 'bites', 'man'],
 ['man', 'bites', 'dog'],
 ['dog', 'eats', 'meat'],
 ['man', 'eats', 'food']]

# Training the model

In [None]:
"""
Two different learning models were introduced as part of the word2vec approach to learn the word embedding; 
they are:
    [1] Continuous Bag-of-Words, or CBOW model.
    [2] Continuous Skip-Gram Model
"""

In [3]:
# Train on the toy corpus with the CBOW architecture (sg=0).
# min_count=1 keeps every word, since each token occurs only a few times.
model_cbow = Word2Vec(sentences=corpus, min_count=1, sg=0)
# Train on the same corpus with the skip-gram architecture (sg=1).
model_skipgram = Word2Vec(sentences=corpus, min_count=1, sg=1)

# Continuous Bag of Words (CBOW) 

In [None]:
"""
In CBOW, the primary task is to build a language model that correctly predicts the center word given the context 
words in which the center word appears.
"""

In [15]:
# Print the model summary (vocabulary size, vector size, alpha)
print(model_cbow)

# List the vocabulary learned by the CBOW model
words = list(model_cbow.wv.index_to_key)
print(words)

# Look up the embedding vector of a single word
print(model_cbow.wv['dog'])

Word2Vec(vocab=6, vector_size=100, alpha=0.025)
['man', 'dog', 'eats', 'bites', 'food', 'meat']
[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419371e-03
  7.4669169e-03 -6.1676763e-03  1.1056137e-03  6.0472824e-03
 -2.8400517e-03 -6.1735227e-03 -4.1022300e-04 -8.3689503e-03
 -5.6000138e-03  7.1045374e-03  3.3525396e-03  7.2256685e-03
  6.8002464e-03  7.5307419e-03 -3.7891555e-03 -5.6180713e-04
  2.3483753e-03 -4.5190332e-03  8.3887316e-03 -9.8581649e-03
  6.7646410e-03  2.9144168e-03 -4.9328329e-03  4.3981862e-03
 -1.7395759e-03  6.7113829e-03  9.9648498e-03 -4.3624449e-03
 -5.9933902e-04 -5.6956387e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384959e-03  9.2734173e-03
  7.8980681e-03 -6.9895051e-03 -9.1558648e-03 -3.5575390e-04
 -3.0998420e-03  7.8943158e-03  5.9385728e-03 -1.5456629e-03
  1.5109634e-03  1.7900396e-03  7.8175711e-03 -9.5101884e-03
 -2.0553112e-04  3.4691954e-03 -9.3897345e-04  8.3817719e-03
  9.0107825e-03  6.5365052e-03 -7.1162224e-04  7.7

In [21]:
# Compare the similarity score of two word pairs (higher means more similar)
print("Similarity between eats and bites:",model_cbow.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_cbow.wv.similarity('eats', 'man'))

Similarity between eats and bites: -0.013497097
Similarity between eats and man: -0.052354384


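From the above similarity scores we can conclude that 'eats' is more similar to 'bites' than to 'man'. Both scores are close to zero because the toy corpus is tiny, so treat this as an illustration only.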

In [22]:
# Rank the other vocabulary words by their similarity to 'meat' (most similar first)
model_cbow.wv.most_similar('meat')

[('food', 0.13887985050678253),
 ('bites', 0.13149003684520721),
 ('eats', 0.06422408670186996),
 ('dog', 0.009391186758875847),
 ('man', -0.05987628176808357)]

In [6]:
# Persist the trained CBOW model to disk
model_cbow.save('model_cbow.bin')

# Reload it into a new object and print its summary to confirm the round trip
new_model_cbow = Word2Vec.load('model_cbow.bin')
print(new_model_cbow)

Word2Vec(vocab=6, size=100, alpha=0.025)


# SkipGram

In [None]:
"""
In skipgram, the task is to predict the context words from the center word.
"""

In [27]:
# Print the model summary (vocabulary size, vector size, alpha)
print(model_skipgram)

# List the vocabulary learned by the skip-gram model
words = list(model_skipgram.wv.index_to_key)
print(words)

# Access the embedding vector of a single word
print(model_skipgram.wv['dog'])

Word2Vec(vocab=6, vector_size=100, alpha=0.025)
['man', 'dog', 'eats', 'bites', 'food', 'meat']
[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419371e-03
  7.4669169e-03 -6.1676763e-03  1.1056137e-03  6.0472824e-03
 -2.8400517e-03 -6.1735227e-03 -4.1022300e-04 -8.3689503e-03
 -5.6000138e-03  7.1045374e-03  3.3525396e-03  7.2256685e-03
  6.8002464e-03  7.5307419e-03 -3.7891555e-03 -5.6180713e-04
  2.3483753e-03 -4.5190332e-03  8.3887316e-03 -9.8581649e-03
  6.7646410e-03  2.9144168e-03 -4.9328329e-03  4.3981862e-03
 -1.7395759e-03  6.7113829e-03  9.9648498e-03 -4.3624449e-03
 -5.9933902e-04 -5.6956387e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384959e-03  9.2734173e-03
  7.8980681e-03 -6.9895051e-03 -9.1558648e-03 -3.5575390e-04
 -3.0998420e-03  7.8943158e-03  5.9385728e-03 -1.5456629e-03
  1.5109634e-03  1.7900396e-03  7.8175711e-03 -9.5101884e-03
 -2.0553112e-04  3.4691954e-03 -9.3897345e-04  8.3817719e-03
  9.0107825e-03  6.5365052e-03 -7.1162224e-04  7.7

In [28]:
# Compare the similarity score of two word pairs (higher means more similar)
print("Similarity between eats and bites:",model_skipgram.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_skipgram.wv.similarity('eats', 'man'))

Similarity between eats and bites: -0.01351881
Similarity between eats and man: -0.05234512


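From the above similarity scores we can conclude that 'eats' is more similar to 'bites' than to 'man'. Both scores are close to zero because the toy corpus is tiny, so treat this as an illustration only.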

In [29]:
# Rank the other vocabulary words by their similarity to 'meat' (most similar first)
model_skipgram.wv.most_similar('meat')

[('food', 0.13887986540794373),
 ('bites', 0.1314900517463684),
 ('eats', 0.06406084448099136),
 ('dog', 0.009391188621520996),
 ('man', -0.059876274317502975)]

In [10]:
# Persist the trained skip-gram model to disk
model_skipgram.save('model_skipgram.bin')

# Reload it into a new object and print the *reloaded* model, so the output
# actually confirms the save/load round trip (mirrors the CBOW cell above).
new_model_skipgram = Word2Vec.load('model_skipgram.bin')
print(new_model_skipgram)

Word2Vec(vocab=6, size=100, alpha=0.025)


# Training Your Embedding on Wiki Corpus

In [None]:
"""
The corpus download page : https://dumps.wikimedia.org/enwiki/20200120/
The entire wiki corpus as of 28/04/2020 is just over 16GB in size.
We will take a part of this corpus due to computation constraints and train our word2vec and fasttext embeddings.

The file size is 294MB so it can take a while to download.
"""

In [31]:
# Make sure the target directory exists (no error if it is already there)
os.makedirs('data/en', exist_ok= True)
# Local path where the Wikipedia dump chunk is stored
file_name = "data/en/enwiki-latest-pages-articles-multistream14.xml-p13159683p14324602.bz2"
# Google Drive id passed to download_file_from_google_drive below
file_id = "11804g0GcWnBIVDahjo5fQyc05nQLXGwF"

def download_file_from_google_drive(id, destination):
    """Download a file shared on Google Drive and write it to ``destination``.

    Args:
        id: Google Drive file id. The name shadows the builtin but is kept
            so existing keyword callers continue to work.
        destination: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If either request returns a 4xx/5xx status.
            Failing here prevents an error page from being saved under the
            archive name, where a later "file already exists" check would
            then skip the download for good.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        response.raise_for_status()
        token = get_confirm_token(response)

        if token:
            # A confirmation token was issued: repeat the request with it.
            # Close the first (unread, streamed) response before replacing it.
            response.close()
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
            response.raise_for_status()

        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first 'download_warning*' cookie, or None.

    Args:
        response: Object exposing a ``cookies`` mapping with ``items()``.

    Returns:
        The cookie value of the first cookie whose name starts with
        'download_warning', or ``None`` when no such cookie is present.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)

def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    Args:
        response: Object exposing ``iter_content(chunk_size)`` that yields
            byte chunks.
        destination: Path of the file to (over)write in binary mode.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # filter(None, ...) drops empty keep-alive chunks before writing.
        out_file.writelines(filter(None, response.iter_content(chunk_size)))

# Reuse a previously downloaded copy when present; otherwise fetch the file.
if os.path.exists(file_name):
    print("file already exists, skipping download")
else:
    download_file_from_google_drive(file_id, file_name)

print(f"File at: {file_name}")

File at: data/en/enwiki-latest-pages-articles-multistream14.xml-p13159683p14324602.bz2


In [34]:
# Prepare the training data from the downloaded Wikipedia dump.
# NOTE(review): dictionary={} is presumably passed so WikiCorpus does not build
# its own vocabulary first; confirm against the gensim WikiCorpus docs.
wiki = WikiCorpus(file_name, dictionary={})
# Materialise the texts into a list so the same in-memory corpus can be reused
# by every model trained below.
sentences = list(wiki.get_texts())

In [None]:
"""
1.   sg - Selects the training algorithm: 1 for skip-gram, 0 for CBOW. The default is CBOW.
2.   min_count - Ignores all words with a total frequency lower than this.
There are many more hyperparameters; the full list can be found in the official documentation:
https://radimrehurek.com/gensim/models/word2vec.html
"""

In [35]:
# Word2Vec, CBOW architecture (sg=0); words seen fewer than 10 times are ignored.
# Wall-clock time is measured around the training call.
start = time.time()
word2vec_cbow = Word2Vec(sentences,min_count=10, sg=0)
end = time.time()

# Elapsed seconds are converted to hours for the report
print("CBOW Model Training Complete.\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

CBOW Model Training Complete.
Time taken for training is:0.05 hrs 


In [38]:
# Print the model summary
print(word2vec_cbow)
print("-"*30)

# Summarize the vocabulary: its size and the first 30 entries
words = list(word2vec_cbow.wv.index_to_key)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

# Access the vector for one word and report its dimensionality
print(f"Length of vector: {len(word2vec_cbow.wv['film'])}")
print(word2vec_cbow.wv['film'])
print("-"*30)

# Compare the similarity score of two word pairs
print("Similarity between film and drama:",word2vec_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",word2vec_cbow.wv.similarity('film', 'tiger'))
print("-"*30)

Word2Vec(vocab=111150, vector_size=100, alpha=0.025)
------------------------------
Length of vocabulary: 111150
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'which', 'were', 'are', 'this', 'new', 'first', 'be', 'or', 'one', 'has']
------------------------------
Length of vector: 100
[-1.7285163e+00 -6.6549134e-01 -5.5782366e-01 -1.7490451e+00
 -2.3600571e+00  2.7023544e+00 -2.8793383e+00  3.3324206e+00
  2.5752574e-01  9.2314738e-01 -2.4496200e+00 -9.8786438e-01
  2.8701024e+00 -1.5976252e-02  2.1427767e+00 -3.1982270e-01
 -1.9374223e-01  1.5745129e-04  3.6849504e+00  8.4056181e-01
  1.0534444e+00  1.2753040e+00 -2.7505445e-01  2.6600546e-01
 -1.1139123e+00  3.2871590e+00 -1.7120357e+00 -3.5111151e+00
  1.7522961e-01  1.5176815e+00 -6.6908938e-01 -2.5252836e+00
 -3.8272228e-02  2.5019100e+00  1.9924344e+00  6.7706496e-01
  1.3728181e-01  4.7461057e-01 -5.0196785e-01 -1

In [47]:
# Export only the word vectors of the CBOW model in the word2vec binary format
from gensim.models import Word2Vec, KeyedVectors   
word2vec_cbow.wv.save_word2vec_format('word2vec_cbow.bin', binary=True)

# To load the vectors back:
# a file written by save_word2vec_format is read with
# KeyedVectors.load_word2vec_format, not with Word2Vec.load (which reads files
# written by model.save()).
# word2vec_cbow_vectors = KeyedVectors.load_word2vec_format('word2vec_cbow.bin', binary=True)

In [39]:
# Word2Vec, skip-gram architecture (sg=1); words seen fewer than 10 times are ignored.
# Wall-clock time is measured around the training call.
start = time.time()
word2vec_skipgram = Word2Vec(sentences,min_count=10, sg=1)
end = time.time()

# Elapsed seconds are converted to hours for the report
print("SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

SkipGram Model Training Complete
Time taken for training is:0.16 hrs 


In [40]:
# Print the model summary
print(word2vec_skipgram)
print("-"*30)

# Summarize the vocabulary: its size and the first 30 entries
words = list(word2vec_skipgram.wv.index_to_key)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

# Access the vector for one word and report its dimensionality
print(f"Length of vector: {len(word2vec_skipgram.wv['film'])}")
print(word2vec_skipgram.wv['film'])
print("-"*30)

# Compare the similarity score of two word pairs
print("Similarity between film and drama:",word2vec_skipgram.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",word2vec_skipgram.wv.similarity('film', 'tiger'))
print("-"*30)

Word2Vec(vocab=111150, vector_size=100, alpha=0.025)
------------------------------
Length of vocabulary: 111150
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'which', 'were', 'are', 'this', 'new', 'first', 'be', 'or', 'one', 'has']
------------------------------
Length of vector: 100
[-0.22242647 -0.33629146 -0.09735668  0.3212172   0.20769584 -0.4093927
  0.2601584   0.4353261   0.05722366 -0.04553508 -0.2456233  -0.825595
  0.49114382  0.37693536  0.32347617  0.08100371  0.50564265  0.4605116
 -0.6479015  -0.22675617  0.30894187 -0.37535283 -0.23129302 -0.41158074
 -0.55922776 -0.00926834  0.13279851 -0.0278654   0.00178477  0.31258506
 -0.07889551 -0.6536966   0.3207598   0.01975377  0.06030234  0.45609853
  0.09996798  0.23685017 -0.6418596   0.09482809 -0.20822836 -0.2828217
 -0.37507758  0.2892338  -0.22006956  0.35175303 -0.40061903  0.18101247
  0.12281884  0.07

In [51]:
# Export only the word vectors of the skip-gram model in the word2vec binary
# format. The vectors must come from word2vec_skipgram: exporting
# word2vec_cbow here would write CBOW vectors under the skip-gram file name.
word2vec_skipgram.wv.save_word2vec_format('word2vec_sg.bin', binary=True)

# To load the vectors back:
# a file written by save_word2vec_format is read with
# KeyedVectors.load_word2vec_format, not with Word2Vec.load (which reads files
# written by model.save()).
# word2vec_sg_vectors = KeyedVectors.load_word2vec_format('word2vec_sg.bin', binary=True)

# FastText

In [None]:
"""
When we have a large dataset, and when learning seems infeasible with the approaches described so far, 
fastText is a good option to use to set up a strong working baseline. However, there’s one concern to keep 
in mind when using fastText, as was the case with Word2vec embeddings: it uses pre-trained character n-gram
embeddings. Thus, when we save the trained model, it carries the entire character n-gram embeddings dictionary 
with it. This results in a bulky model and can result in engineering issues. For example, the model stored with 
the name “temp” in the above code snippet has a size close to 450 MB. However, fastText implementation also comes
with options to reduce the memory footprint of its classification models with minimal reduction in classification
performance.


Some of the most popular pre-trained embeddings are
    [1] Word2vec by Google
    [2] GloVe    by Stanford
    [3] fastText by Facebook
"""

In [41]:
# FastText, CBOW architecture (sg=0); words seen fewer than 10 times are ignored.
# Wall-clock time is measured around the training call.
start = time.time()
fasttext_cbow = FastText(sentences, sg=0, min_count=10)
end = time.time()

# Elapsed seconds are converted to hours for the report
print("FastText CBOW Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

FastText CBOW Model Training Complete
Time taken for training is:0.18 hrs 


In [42]:
# Print the model summary
print(fasttext_cbow)
print("-"*30)

# Summarize the vocabulary: its size and the first 30 entries
words = list(fasttext_cbow.wv.index_to_key)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

# Access the vector for one word and report its dimensionality
print(f"Length of vector: {len(fasttext_cbow.wv['film'])}")
print(fasttext_cbow.wv['film'])
print("-"*30)

# Compare the similarity score of two word pairs
print("Similarity between film and drama:",fasttext_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",fasttext_cbow.wv.similarity('film', 'tiger'))
print("-"*30)

FastText(vocab=111150, vector_size=100, alpha=0.025)
------------------------------
Length of vocabulary: 111150
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'which', 'were', 'are', 'this', 'new', 'first', 'be', 'or', 'one', 'has']
------------------------------
Length of vector: 100
[-4.1463914  -1.8119481   1.6104423   1.7290591  -2.0994642  -0.631982
 -2.0095596   2.063677    4.921095    1.2384105   3.405752    1.2418664
 -5.2132835   1.425097    0.7388006   0.83315045  1.4783679  -1.4807229
  2.7253404   5.286211   -1.8287735   4.7407475  -2.7341452   2.166024
 -0.16232926  0.84643435  2.6543214   0.5374928   1.3685168   1.106021
 -0.25491908 -0.82786995  0.4071864  -2.0010269  -2.6835878   2.781833
  4.4211655   3.3429868   5.1445546   0.14225101 -4.0890293  -4.4640427
  0.13031602  0.44952396 -3.9834783   0.645766    3.8364275   4.564231
 -0.17106022  2.961449    

In [43]:
# FastText, skip-gram architecture (sg=1); words seen fewer than 10 times are ignored.
# Wall-clock time is measured around the training call.
start = time.time()
fasttext_skipgram = FastText(sentences, sg=1, min_count=10)
end = time.time()

# Elapsed seconds are converted to hours for the report
print("FastText SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))

FastText SkipGram Model Training Complete
Time taken for training is:0.29 hrs 


In [44]:
# Print the model summary
print(fasttext_skipgram)
print("-"*30)

# Summarize the vocabulary: its size and the first 30 entries
words = list(fasttext_skipgram.wv.index_to_key)
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)

# Access the vector for one word and report its dimensionality
print(f"Length of vector: {len(fasttext_skipgram.wv['film'])}")
print(fasttext_skipgram.wv['film'])
print("-"*30)

# Compare the similarity score of two word pairs
print("Similarity between film and drama:",fasttext_skipgram.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:",fasttext_skipgram.wv.similarity('film', 'tiger'))
print("-"*30)

FastText(vocab=111150, vector_size=100, alpha=0.025)
------------------------------
Length of vocabulary: 111150
Printing the first 30 words.
['the', 'of', 'and', 'in', 'to', 'was', 'is', 'for', 'on', 'as', 'by', 'with', 'he', 'at', 'from', 'that', 'his', 'it', 'an', 'also', 'which', 'were', 'are', 'this', 'new', 'first', 'be', 'or', 'one', 'has']
------------------------------
Length of vector: 100
[ 0.30401322  0.3009671  -0.5875729   0.27169418  0.47659576  0.38756916
  0.09156659 -0.15275064  0.57474303 -0.07595404 -0.13621216  0.25278902
 -0.34578922  1.0397061  -0.5496711  -0.3047049  -0.02724782  0.44049594
 -0.05332035  0.02233534  0.09534271  0.44290674  0.39798424  0.3414503
 -0.2835456   0.09747949 -0.2093019   0.05018875  0.40034446  0.05758088
 -0.33358884  0.14034662 -0.0479691   0.4520111  -0.24378438 -0.39850605
  0.24458605  0.37684715  0.00532099  0.12791194  0.44664317 -0.20575352
 -0.06274722  0.2542009   0.03270878  0.2436347  -0.6695596   0.17473593
  0.17213994  

In [None]:
"""
An interesting observation: CBOW trains faster than SkipGram in both cases (Word2Vec and FastText).
"""