# Introduction


**What?** Pre-trained Word2Vec embedding model



# Import modules

In [25]:
import os
import wget
import gzip
import shutil
#This module ignores the various types of warnings generated
import warnings 
warnings.filterwarnings("ignore") 
#This module helps in retrieving information on running processes and system resource utilization
import psutil 
from psutil import virtual_memory
import time 
from gensim.models import Word2Vec, KeyedVectors
import spacy

# Import pre-trained model


- Let us take an example of a pre-trained word2vec model, and how we can use it to look for most similar words. 
- We will use the Google News vectors embeddings. https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM
- **ATTENTION!** the file sizr is: 1.65GB it will take a while to download. The decompressed size is over 3GB



In [2]:
gn_vec_path = "GoogleNews-vectors-negative300.bin"
if not os.path.exists("GoogleNews-vectors-negative300.bin"):
    if not os.path.exists("./GoogleNews-vectors-negative300.bin"):
        # Downloading the reqired model
        if not os.path.exists("./GoogleNews-vectors-negative300.bin.gz"):
            if not os.path.exists("GoogleNews-vectors-negative300.bin.gz"):
                wget.download("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz")
            gn_vec_zip_path = "GoogleNews-vectors-negative300.bin.gz"
        else:
            gn_vec_zip_path = "./GoogleNews-vectors-negative300.bin.gz"
        # Extracting the required model
        with gzip.open(gn_vec_zip_path, 'rb') as f_in:
            with open(gn_vec_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    else:
        gn_vec_path = "./" + gn_vec_path

print(f"Model at {gn_vec_path}")

Model at GoogleNews-vectors-negative300.bin


In [4]:
process = psutil.Process(os.getpid())
mem = virtual_memory()

In [10]:
pretrainedpath = gn_vec_path

# Load W2V model. This will take some time, but it is a one time effort! 
pre = process.memory_info().rss
# Check memory usage before loading the model
print("Memory used in GB before Loading the Model: %0.2f"%float(pre/(10**9))) 
print('-'*10)

# Start the timer
start_time = time.time() 
# Toal memory available
ttl = mem.total 

# Load the model
w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True) 
# Calculate the total time elapsed since starting the timer
print("%0.2f seconds taken to load"%float(time.time() - start_time)) 
print('-'*10)

print('Finished loading Word2Vec')
print('-'*10)

post = process.memory_info().rss
# Calculate the memory used after loading the model
print("Memory used in GB after Loading the Model: {:.2f}".format(float(post/(10**9)))) 
print('-'*10)

# Percentage increase in memory after loading the model
print("Percentage increase in memory usage: {:.2f}% ".format(float((post/pre)*100))) 
print('-'*10)

# Number of words in the vocabulary. 
print("Numver of words in vocablulary [Mil]: " + str(len(w2v_model.key_to_index)/1.e6)) 

Memory used in GB before Loading the Model: 1.73
----------
36.63 seconds taken to load
----------
Finished loading Word2Vec
----------
Memory used in GB after Loading the Model: 1.76
----------
Percentage increase in memory usage: 101.87% 
----------
Numver of words in vocablulary [Mil]: 3.0


In [None]:
"""
How many things can we do?
we can inspect the methods with dir
"""

In [13]:
dir(w2v_model)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 '_upconvert_old_d2vkv',
 '_upconvert_old_vocab',
 'add_lifecycle_event',
 'add_vector',
 'add_vectors',
 'allocate_vecattrs',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'expandos',
 'fill_norms',
 'get_index',
 'get_normed_vectors',
 'get_vecattr',
 'get_vector',
 'has_index_for',
 'index2entity',
 'index2word',
 'index_to_key',
 'init_sims',
 'intersect_word2vec_

In [15]:
# Let us examine the model by knowing what the most similar words are, for a given word!
w2v_model.most_similar('beautiful')

[('gorgeous', 0.8353005051612854),
 ('lovely', 0.8106936812400818),
 ('stunningly_beautiful', 0.7329413294792175),
 ('breathtakingly_beautiful', 0.7231340408325195),
 ('wonderful', 0.6854086518287659),
 ('fabulous', 0.6700063943862915),
 ('loveliest', 0.6612576246261597),
 ('prettiest', 0.6595001816749573),
 ('beatiful', 0.6593326330184937),
 ('magnificent', 0.6591402888298035)]

In [17]:
# Let us try with another word! 
w2v_model.most_similar('rome')

[('athens', 0.6001025438308716),
 ('albert', 0.5729556083679199),
 ('holmes', 0.5693243145942688),
 ('donnie', 0.5690680146217346),
 ('italy', 0.5673536658287048),
 ('toni', 0.5666349530220032),
 ('spain', 0.566185474395752),
 ('jh', 0.5661598443984985),
 ('pablo', 0.563156008720398),
 ('malta', 0.5620370507240295)]

In [18]:
# What is the vector representation for a word? 
w2v_model['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [None]:
# What if I am looking for a word that is not in this vocabulary?
w2v_model['practicalnlp']


- Two things to note while using pre-trained models: 
- [1] Tokens/Words are always lowercased. If a word is not in the vocabulary,   the model throws an exception.
- [2] So, it is always a good idea to encapsulate those statements in try/except blocks.



# Getting the embedding representation for full text


- We have seen how to get embedding vectors for single words. 
- How do we use them to get such a representation for a full text? 
- A simple way is to just sum or average the embeddings for individual words. 
- Let us see a small example using another NLP library Spacy



In [23]:
%time nlp = spacy.load('en_core_web_sm')
# process a sentence using the model
mydoc = nlp("Canada is a large country")
#Get a vector for individual words
#print(doc[0].vector) #vector for 'Canada', the first word in the text 
print(mydoc.vector) #Averaged vector for the entire sentence

CPU times: user 591 ms, sys: 51.3 ms, total: 642 ms
Wall time: 673 ms
[ 1.1530104   0.04257578 -0.12662673 -0.08265086  0.02096112 -0.32236233
 -0.6240498   0.02519732  0.16935535 -0.7434208   0.27868682 -0.20403433
 -0.26521063  0.15699737 -0.288515    0.3498006   0.06954589 -0.04919723
  0.29010016 -0.19193202 -0.03356849 -0.18861568  0.48819193 -0.10287628
 -0.27089745 -0.35096675  0.12004175 -0.42992252  0.02619261  0.30020046
 -0.08323112 -0.22649841  0.38065207 -0.7358086   0.31856763 -0.13183843
  0.11280444 -0.16284898  0.13759     0.5194619  -0.49620238  0.22728035
 -0.19244835  0.1665419  -0.3557002   0.00745243 -0.0097326   0.33902416
 -0.07566185 -0.2623116   0.38962117 -0.2693131  -0.437186   -0.11987744
  0.8256197  -0.05397683  0.40647787  0.23175475  0.14332609  0.20003267
 -0.62319547 -0.277183   -0.41782817  0.26579994  0.7164182  -0.34532383
 -0.24082482  0.00639551  0.76979893 -0.40577835  0.475596   -0.10088948
  0.09429872 -0.36900702 -0.45953855 -0.06675088  0.08


- What happens when I give a sentence with strange words (and stop words), and try to get its word vector in Spacy?
- Well, at least, this is better than throwing an exception!



In [24]:
temp = nlp('practicalnlp is a newword')
temp[0].vector

array([-0.24652115, -0.00370538,  0.47585845, -0.53857994, -0.10164852,
        0.08054233, -0.78846335,  0.57725704,  0.36892247, -0.20525366,
        0.49322945, -0.06111569, -1.041065  ,  0.65225863,  0.45910472,
        0.3026195 ,  0.4009441 , -0.21229711,  0.4503184 , -0.34189436,
        0.3360495 , -0.49880746, -0.67977595,  1.0171478 , -0.85278463,
        0.22901264, -0.27532893,  0.8168068 ,  0.21267067,  0.9669028 ,
       -0.8624253 , -0.35919917,  0.21813078,  0.4975592 , -0.98805666,
        1.1876267 , -1.0230168 , -0.28025502, -0.7530157 ,  1.0236799 ,
       -0.4719799 , -0.28097963, -1.1994808 , -1.1338023 , -0.30421656,
        0.16297932, -0.05472174,  1.0285486 ,  0.6075866 ,  0.43900877,
        1.018733  ,  0.28485686, -0.13497996,  0.23243935,  0.37535557,
       -0.19549476, -0.03630176, -0.6209484 ,  0.4153436 , -0.47523615,
       -1.0024692 ,  0.84635615, -0.9490654 , -0.6589898 , -0.48663056,
       -0.8837726 ,  0.9758252 ,  2.8941932 ,  0.19651982, -0.73

# References


- https://github.com/practical-nlp/practical-nlp/blob/master/Ch3/05_Pre_Trained_Word_Embeddings.ipynb

