In [None]:
import torch

# Skip-gram Word2Vec

In this notebook, I'll lead you through using PyTorch to implement the [Word2Vec algorithm](https://en.wikipedia.org/wiki/Word2vec) using the skip-gram architecture. By implementing this, you'll learn about embedding words for use in natural language processing. This will come in handy when dealing with things like machine translation.

---
## Loading Data

Next, we'll ask you to load in data and place it in the `data` directory. We'll be using 100 MB of cleaned text from Wikipedia.

1. Load the [text8 dataset](https://s3.amazonaws.com/video.udacity-data.com/topher/2018/October/5bbe6499_text8/text8.zip); a file of cleaned up *Wikipedia article text* from Matt Mahoney. 
2. Place that data in the `data` folder in the home directory.
3. Then extract it and delete the zip archive to save storage space.

After following these steps, you should have one file in your data directory: `data/text8`.

(Sorry that I didn't automate this for you; I know I could have.)

In [None]:
# Load the whole extracted corpus into memory as one string
with open('data/text8') as corpus_file:
    text = corpus_file.read()

# Peek at the beginning of the corpus (first 100 characters)
print(text[:100])

## Pre-processing

Here I'm fixing up the text to make training easier. This comes from the `utils.py` file. The `preprocess` function does a few things:
>* It converts any punctuation into tokens, so a period is changed to ` <PERIOD> `. In this data set, there aren't any periods, but it will help in other NLP problems. 
* It removes all words that show up five or *fewer* times in the dataset. This will greatly reduce issues due to noise in the data and improve the quality of the vector representations. 
* It returns a list of words in the text.

This may take a few seconds to run, since our text file is quite large. If you want to write your own functions for this stuff, go for it!

In [None]:
import utils

# Tokenise the raw corpus with the project helper
words = utils.preprocess(text)

# Preview the first 30 tokens
head_tokens = words[:30]
print(head_tokens)

In [None]:
# Basic corpus statistics: token count and vocabulary size
n_total_words = len(words)
n_unique_words = len(set(words))  # `set` collapses duplicate words
print("Total words in text: {}".format(n_total_words))
print("Unique words: {}".format(n_unique_words))

### Dictionaries

Next, I'm creating two dictionaries to convert words to integers and back again (integers to words). This is again done with a function in the `utils.py` file. `create_lookup_tables` takes in a list of words in a text and returns two dictionaries.
>* The integers are assigned in descending frequency order, so the most frequent word ("the") is given the integer 0 and the next most frequent is 1, and so on. 

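Before building the dictionaries, the code below first removes English stop words (using NLTK's stop-word list) to produce `train_words`. Once we have our dictionaries, those words are converted to integers and stored in `int_words`.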

In [None]:
from collections import Counter
import random
import numpy as np

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# Drop English stop words from the token stream; the result feeds the
# lookup-table construction in a later cell
stop_en = set(stopwords.words('english'))
train_words = list(filter(lambda wd: wd not in stop_en, words))

In [None]:
import matplotlib.pyplot as plt
# Tally how often each remaining word occurs
ctr = Counter()
ctr.update(train_words)

In [None]:
# Histogram of per-word counts, with every count above 200 clipped to 200
# so the long tail collapses into the last bin
clipped_counts = [c if c < 200 else 200 for c in ctr.values()]
plt.hist(clipped_counts, bins=100)
plt.show()

In [None]:
# Build the word <-> integer-id lookup tables and encode the corpus as ids.
vocab_to_int, int_to_vocab = utils.create_lookup_tables(train_words)
int_words = [vocab_to_int[word] for word in train_words]

# No extra rare-word filtering here: it appears to be done already upstream
# (minimum count of 6), so every token in `train_words` is kept.

# Relative frequency of every word id in the encoded corpus.
word_counts = Counter(int_words)
total_count = len(int_words)
freqs = {word: count/total_count for word, count in word_counts.items()}

# Keep the encoded corpus as an array so it can be indexed with index arrays.
int_words = np.array(int_words)

# Sanity check, shown as the cell output: encoded length, token count and
# their ratio (1.0 as long as no tokens are dropped above).
len(int_words), len(train_words), len(int_words)/len(train_words)

In [None]:
# Sizes used by later cells
n_window = 3  # presumably the context-window radius — TODO confirm once sampling is written
n_vocab = len(vocab_to_int)
train_length = len(int_words)

# Corpus position -> word id, materialised as a dict
idx_to_id = dict(enumerate(int_words))

In [None]:
import torch
from torch import nn
import torch.optim as optim

class SkipGram(nn.Module):
    """Skip-gram model that scores (center, outer) word pairs.

    Two embedding tables are learned: one used when a word is the center
    word and one used when it is the outer (context) word. The score of a
    pair is the dot product of the two corresponding vectors.

    Args:
        n_vocab: number of rows in each embedding table (vocabulary size).
        n_embed: dimensionality of each embedding vector.
    """

    def __init__(self, n_vocab, n_embed):
        super().__init__()
        self.center_emb = nn.Embedding(n_vocab, n_embed)
        self.outer_emb = nn.Embedding(n_vocab, n_embed)

        # Start from small weights so the initial dot products stay near zero
        # (sigmoid near 0.5) instead of saturating with the default N(0, 1) init.
        bound = 0.5 / n_embed
        nn.init.uniform_(self.center_emb.weight, -bound, bound)
        nn.init.uniform_(self.outer_emb.weight, -bound, bound)

    def forward(self, x_center, x_outer, target):
        """Score each (center, outer) pair.

        Args:
            x_center: integer tensor of center-word ids.
            x_outer: integer tensor of outer-word ids, same shape as `x_center`.
            target: accepted for interface compatibility; not used to compute
                the predictions.

        Returns:
            Tuple `(scores, probs)`: the raw dot-product score of every pair
            and its sigmoid, both with the same shape as `x_center`.
        """
        center_vecs = self.center_emb(x_center)
        outer_vecs = self.outer_emb(x_outer)
        # Dot product over the embedding dimension gives one logit per pair.
        scores = (center_vecs * outer_vecs).sum(dim=-1)
        probs = torch.sigmoid(scores)
        return scores, probs

    def get_emb(self):
        """Return the center-word embedding table as a NumPy array.

        The result has shape `(n_vocab, n_embed)`, is detached from the
        autograd graph and copied to the CPU, so it can be sliced and passed
        to NumPy-based tools.
        """
        return self.center_emb.weight.detach().cpu().numpy()

In [None]:
# Presumably the number of negative samples per positive pair; not read by any
# finished code in the visible cells — TODO confirm once sampling is written.
n_neg = 8

In [None]:
# Recompute occurrence counts and relative frequencies over the encoded corpus
total_count = len(int_words)
word_counts = Counter(int_words)
freqs = dict(
    (word, count/total_count)
    for word, count in word_counts.items()
)

In [None]:
# Ordered list of the unique word ids; position i here lines up with
# position i in `p_per_wd`.
wd_with_p = sorted(word_counts)

# Unigram distribution raised to the 3/4 power: count ** 0.75 for every word,
# then normalised so the probabilities sum to 1.
p_per_wd = np.array([word_counts[wd] for wd in wd_with_p], dtype=float) ** 0.75
p_per_wd /= p_per_wd.sum()

In [None]:
# Presumably the number of positive (center, outer) pairs per training batch;
# not read by any finished code in the visible cells — TODO confirm.
pos_bs = 128

In [None]:
import itertools
def cosine_similarity_vec(u, v):
    # TODO: Calculate cosine similarity
    return cos
def test_some_similarities(embed):
    word_pairs = [
                    ['father', 'mother'], ['boy', 'girl'], ['football', 'baseball'], ['good', 'bad'],
        ['accurate', 'precise'], ['allude','refer'],['anxious','eager'],['convince','persuade'],
        ['fewer','less'],['poisonous','venomous']
                ]
    # TODO : calculate cosine similarities for each pair above
    similarities = 
    return similarities

In [None]:
# File path that the model's state_dict is written to by the torch.save calls below
PATH = 'skipgram'

In [None]:
# check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding_dim= 64 # you can change, if you want

model = SkipGram(len(vocab_to_int), embedding_dim).to(device)
# Explain why we can or can't define our loss function in advance based on our needs for this specific problem.
# Hint: If we had a static graph, we could define it as an object
optimizer = optim.Adam(model.parameters(), lr=3e-3)
# model.load_state_dict(torch.load(PATH))

In [None]:
# Training loop skeleton. Each iteration is meant to build one batch of
# (center, outer, target) index arrays, run the model on it, and take one
# optimizer step. The assignments with an empty right-hand side are exercise
# TODOs, so this cell does not parse until they are filled in.
print_every = 30
# `steps` is incremented once per iteration but not read anywhere in this cell.
steps = 0
epochs = 5

# NOTE(review): the loop runs epochs * train_length iterations; if each
# iteration processes a whole batch, that is far more than `epochs` passes over
# the corpus — confirm the intended iteration count.
for step in range(epochs * train_length):
    # TODO: Fill out sampling and loss function
    xi_center = 
    xi_outer = 
    xneg_center = 
    xneg_outer = 
    x_center = 
    x_outer = 
    targets = 
    steps += 1
    # All three arrays, including `targets`, are converted to LongTensor and moved to `device`.
    # NOTE(review): a loss that expects float targets would need a cast here — confirm.
    x_center_torch, x_outer_torch, targets_torch = [torch.LongTensor(x).to(device) for x in [x_center, x_outer, targets]]
    
    # Forward pass, then clear old gradients, backpropagate and update the weights.
    logits, probs = model(x_center_torch, x_outer_torch, targets_torch)
    optimizer.zero_grad()
    loss = # apply loss function
    loss.backward()
    optimizer.step()

    # Periodic evaluation: prints one averaged score every `print_every` steps (including step 0).
    if step % print_every == 0:
        # TODO: Evaluate the model's performance based on the mean of the similarities
        embed = 
        scores = 
        score_avg = 
        print(f'eval score: {score_avg:.5f}')
    # Checkpoint the weights to PATH every 10000 steps (step 0 is excluded by `step > 1`).
    if step % 10000 == 0 and step > 1:
        torch.save(model.state_dict(), PATH)

In [None]:
# Persist the final weights to PATH
final_state = model.state_dict()
torch.save(final_state, PATH)

## Visualizing the word vectors

Below we'll use T-SNE to visualize how our high-dimensional word vectors cluster together. T-SNE is used to project these vectors into two dimensions while preserving local structure. Check out [this post from Christopher Olah](http://colah.github.io/posts/2014-10-Visualizing-MNIST/) to learn more about T-SNE and other ways to visualize high-dimensional data.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Fetch the embeddings through the model's get_emb() accessor. Later cells
# slice the result as a 2-D array and label row `idx` with int_to_vocab[idx],
# so rows are expected to be indexed by word id.
embeddings = model.get_emb()

In [None]:
# Project the first `viz_words` embedding rows with t-SNE (default settings)
viz_words = 600
viz_embeddings = embeddings[:viz_words, :]
tsne = TSNE()
embed_tsne = tsne.fit_transform(viz_embeddings)

In [None]:
# Scatter each projected word and label the point with the word itself
fig, ax = plt.subplots(figsize=(16, 16))
for idx in range(viz_words):
    point = embed_tsne[idx, :]
    plt.scatter(*point, color='steelblue')
    plt.annotate(int_to_vocab[idx], (point[0], point[1]), alpha=0.7)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Adjective / comparative / superlative triples to compare in embedding space
selected_words = [
    'big', 'bigger', 'biggest',
    'strong', 'stronger', 'strongest',
    'slow', 'slower', 'slowest',
    'clear', 'clearer', 'clearest',
    'loud', 'louder', 'loudest',
    'dark', 'darker', 'darkest',
]
selected_idxes = [vocab_to_int[wd] for wd in selected_words]

# Fit PCA on the full embedding matrix, then keep the first two components
# of the selected rows only
lowdim_embeddings = PCA().fit_transform(embeddings)
embed_pca = lowdim_embeddings[selected_idxes, :2]

# Plot each selected word at its 2-D PCA position, labelled with the word
fig, ax = plt.subplots(figsize=(16, 16))
for i in range(len(selected_idxes)):
    coords = embed_pca[i, :]
    plt.scatter(*coords, color='steelblue')
    plt.annotate(selected_words[i], (coords[0], coords[1]), alpha=0.7)