In [4]:
from fastai.text import *
import torch
import pickle

### Get the pretrained AWD-LSTM model (trained on WikiText-103) from fast.ai

In [5]:
# Show the URL stored in URLs.WT103_FWD; its value is the cell output below.
URLs.WT103_FWD

'https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd'

In [6]:
# Fetch the archive behind URLs.WT103_FWD. Per the output below, untar_data
# downloads the .tgz and returns the local data folder as a Path.
AWDLSTM_PATH = untar_data(URLs.WT103_FWD)
AWDLSTM_PATH

Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd.tgz


PosixPath('/Users/jobymacbookpro/.fastai/data/wt103-fwd')

### Load the vocab (list of string tokens)

In [7]:
# Address the vocabulary file by name instead of by its position in ls():
# directory listing order is not guaranteed, so ls()[0] may be a different
# file on another machine or filesystem.
vocab_path = AWDLSTM_PATH/"itos_wt103.pkl"
assert vocab_path.exists(), f"vocabulary file not found: {vocab_path}"
print(vocab_path)

/Users/jobymacbookpro/.fastai/data/wt103-fwd/itos_wt103.pkl


In [8]:
# Read the pickled vocabulary (index -> token string list).
# NOTE: pickle.load can execute arbitrary code; only unpickle trusted files.
with open(vocab_path, "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

# Vocabulary size, followed by a peek at the first 100 tokens.
print(len(vocab))
print(vocab[:100])

60000
['xxunk', 'xxpad', 'xxbos', 'xxeos', 'xxfld', 'xxmaj', 'xxup', 'xxrep', 'xxwrep', 'the', ',', '.', 'of', 'and', 'in', 'to', 'a', '=', '"', 'was', 'on', '-', "'s", 'as', 'for', 'that', 'with', 'by', '\n ', ')', '(', '\n \n ', 'is', 'his', 'at', 'he', 'it', 'from', 'were', 'an', 'had', 'which', 'be', 'this', 'but', "'", 'are', 'not', 'first', 'their', 'after', ';', 'one', 'her', '–', 'also', 'its', ':', 'they', 'two', 'or', 'have', 'who', 'new', 'been', 'has', 'she', 'when', 'time', 'during', 'other', 'would', 'into', 'all', 'more', 'while', 'over', 'i', 'him', 'game', 'only', 'later', 'up', 'most', 'three', 'about', 'out', 'between', 'there', 'than', 'some', 'film', 'may', 'no', 'before', 'made', 'such', 'season', 'second', 'where']


### Load the weights

In [9]:
# Address the checkpoint by name instead of by its position in ls():
# directory listing order is not guaranteed, so ls()[1] may be a different
# file on another machine or filesystem.
weights_path = AWDLSTM_PATH/"lstm_fwd.pth"
assert weights_path.exists(), f"weights file not found: {weights_path}"
print(weights_path)

/Users/jobymacbookpro/.fastai/data/wt103-fwd/lstm_fwd.pth


In [11]:
# Deserialize the checkpoint with all tensors mapped onto the CPU.
model_weights = torch.load(weights_path, map_location="cpu")
# List the parameter names stored in the checkpoint.
print(model_weights.keys())

odict_keys(['0.encoder.weight', '0.encoder_dp.emb.weight', '0.rnns.0.weight_hh_l0_raw', '0.rnns.0.module.weight_ih_l0', '0.rnns.0.module.weight_hh_l0', '0.rnns.0.module.bias_ih_l0', '0.rnns.0.module.bias_hh_l0', '0.rnns.1.weight_hh_l0_raw', '0.rnns.1.module.weight_ih_l0', '0.rnns.1.module.weight_hh_l0', '0.rnns.1.module.bias_ih_l0', '0.rnns.1.module.bias_hh_l0', '0.rnns.2.weight_hh_l0_raw', '0.rnns.2.module.weight_ih_l0', '0.rnns.2.module.weight_hh_l0', '0.rnns.2.module.bias_ih_l0', '0.rnns.2.module.bias_hh_l0', '1.decoder.weight', '1.decoder.bias'])


In [12]:
# Pull the encoder weight matrix out of the checkpoint. Its printed shape is
# (60000, 400), and 60000 matches len(vocab), so each row is used below as
# the embedding of the token at the same index.
embedding = model_weights["0.encoder.weight"]
print(embedding.shape)
print(embedding)

torch.Size([60000, 400])
tensor([[-0.1276,  0.0161,  0.1617,  ..., -0.1396,  0.6899, -0.0399],
        [ 0.0272,  0.0011,  0.0401,  ...,  0.0161,  0.0666, -0.0014],
        [ 0.6069, -0.5239,  0.1544,  ..., -0.2551, -0.3308, -0.0702],
        ...,
        [ 0.0096,  0.0814,  0.0213,  ...,  0.0712,  0.0810, -0.0045],
        [ 0.0283, -0.0176,  0.0361,  ...,  0.0756,  0.1470, -0.0139],
        [ 0.1267, -0.0656,  0.0362,  ..., -0.0191,  0.0673,  0.0154]])


# Exercise: Visualize embeddings in 2 dimensions
We need to apply a dimensionality reduction method to convert the 400 dimensions of the embedding to 2 dimensions. We can use:
- TSNE (in [Sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html))
- UMAP (in [umap-learn](https://umap-learn.readthedocs.io/))

> ### Tip 1
> These methods take time. To get faster results, we can apply t-SNE or UMAP to only the first 1000 embeddings (or some other number).

In [24]:
# The first 9 vocabulary entries are special tokens ('xxunk' ... 'xxwrep');
# index 9 is the first real word ('the'), so the slices start there.
N_SPECIAL_TOKENS = 9
# Exclusive upper bound of the slice; lower it for faster dimensionality reduction.
N_EMBEDDINGS = 10000

embedding_subset = embedding[N_SPECIAL_TOKENS:N_EMBEDDINGS]
vocab_subset     = vocab[N_SPECIAL_TOKENS:N_EMBEDDINGS]

# Rows and tokens are matched by position later on, so they must stay aligned.
assert len(embedding_subset) == len(vocab_subset), "embedding rows and tokens are misaligned"

In [25]:
import umap

In [26]:
# UMAP is stochastic: fix the seed so the 2-D layout (and the plot built from
# it) is reproducible across kernel restarts.
reducer     = umap.UMAP(random_state=42)
embeding_2d = reducer.fit_transform(embedding_subset)
# Display the projected coordinates (one 2-D point per embedding row).
embeding_2d


array([[ 6.02543 , -0.958461],
       [ 5.302713, -1.759714],
       [ 5.271433, -1.769937],
       [ 3.853413, -2.330853],
       ...,
       [ 3.649645,  3.017429],
       [ 2.024768,  1.193316],
       [ 4.006118,  5.08661 ],
       [ 2.095789,  1.524474]], dtype=float32)

> ### Tip 2
> You can use plotly to draw the 2D scatter plot of the embedding cloud. See https://plotly.com/python/text-and-annotations/

In [19]:
import pandas as pd

In [27]:
# One row per token: the two projected coordinates plus the token string,
# matched to the coordinates by position.
plot_data = (
    pd.DataFrame(embeding_2d, columns=["zero", "one"])
    .assign(word=vocab_subset)
)
plot_data.head()

Unnamed: 0,zero,one,word
0,6.02543,-0.958461,the
1,5.302713,-1.759714,","
2,5.271433,-1.769937,.
3,3.853413,-2.330853,of
4,4.970701,-1.987625,and


In [29]:
import plotly.express as px

df = plot_data  # alias kept from the original cell

# Scatter of the 2-D projection with each point labelled by its token.
# log_x=False and size_max=60 were dropped: the former is the default and the
# latter only takes effect when a `size=` column is given.
fig = px.scatter(
    df,
    x="zero",
    y="one",
    text="word",
    labels={"zero": "UMAP dimension 1", "one": "UMAP dimension 2"},
)

# Put each label above its marker so the text does not cover the point.
fig.update_traces(textposition='top center')

fig.update_layout(
    height=800,
    title_text='Wikipedia Word Embeddings'
)

fig.show()