## Ex4: Skip-Gram
In this exercise, you must implement and train a Skip-Gram model. The data is provided in two .csv files. You can run the code on your own machine, or you can upload it to Google Colab. We suggest the latter as it supports GPU acceleration. Google Colab is about 4 times faster than my MacBook Pro, and training is still somewhat time-consuming. Google Colab is free and it should be easy to get started: you can simply upload this notebook to Google Colab. Instructions for Google Colab follow here:

Welcome to Google Colab 

To use the data, open the panel on the left, click the bottom pane (folder icon), and use the "Upload to session storage" button to upload data.csv and word_to_idx.csv.

To enable the GPU, go to the Runtime menu, click "Change runtime type", and select GPU.

Otherwise this works as a Jupyter notebook. Run the code and see what happens. On our last run, an epoch took around 143 seconds.

Run 3 epochs or more.

To implement:
* You must implement the forward pass of the skip-gram neural net. Here we have included the application of softmax into the loss function. This means that your implementation should simply compute the output before applying softmax. Thus essentially you are implementing a neural net with one hidden layer, no bias and identity activation, both in the hidden and output layer.

In [None]:
%matplotlib inline
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time
from sklearn.preprocessing import normalize
import pandas as pd

def get_dicts(path='word_to_idx.csv'):
    """Load the vocabulary CSV and build lookups in both directions.

    The file must contain an ``idx`` column and a ``word`` column. The first
    ten rows are printed as a sanity check.

    Args:
        path: location of the vocabulary CSV file.
    Returns:
        (idx_to_word, word_to_idx): a dict mapping each index to its word and
        a dict mapping each word to its index.
    """
    # keep_default_na=False: with pandas' default NA handling, ordinary
    # vocabulary words such as "null", "nan" or "n/a" are parsed as missing
    # values (float NaN) instead of strings, which corrupts both lookups.
    df_dict = pd.read_csv(path, keep_default_na=False)
    print('See word index dataframe head')
    print(df_dict.head(10))
    idx_to_word = dict(zip(df_dict.idx, df_dict.word))
    word_to_idx = dict(zip(df_dict.word, df_dict.idx))
    return idx_to_word, word_to_idx

def get_data(path='data.csv'):
    """Read the training pairs from CSV and wrap them in a dataset and loader.

    The file must contain the columns ``X_in`` and ``X_out``; each column is
    turned into a tensor and the two are paired row by row. The first ten
    rows and the dataset size are printed.

    Args:
        path: location of the training-pair CSV file.
    Returns:
        (dataset, dataloader): the TensorDataset over (X_in, X_out) and a
        DataLoader that walks it in file order in batches of 64.
    """
    frame = pd.read_csv(path)
    print('See data dataframe head')
    print(frame.head(10))

    inputs = torch.from_numpy(frame['X_in'].values)
    targets = torch.from_numpy(frame['X_out'].values)
    dataset = torch.utils.data.TensorDataset(inputs, targets)

    # No shuffling: batches are served in the order the rows appear on disk.
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=64,
        shuffle=False,
        pin_memory=True,
    )
    print('size of dataset:', len(dataset))
    return dataset, dataloader


class KNN():
    """Nearest-neighbour lookup over a fixed embedding matrix.

    Rows are normalised once at construction time, so the dot product between
    two rows is used as their similarity score.
    """

    def __init__(self, embedding, word_to_idx, idx_to_word):
        """Store the row-normalised embedding and the vocabulary lookups.

        Args:
            embedding: object exposing ``.numpy()`` (e.g. a CPU tensor) that
                yields a 2-D array with one row per word.
            word_to_idx: mapping from word to row index.
            idx_to_word: mapping from row index to word.
        """
        print('Create KNN')
        self.embedding = normalize(embedding.numpy(), axis=1)
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word

    def query(self, idx, k=5):
        """Return ``{i: k nearest row indices of i}`` for every i in ``idx``."""
        return {i: self.get_most_similar(i, k) for i in idx}

    def print_nearest(self, words, k=5):
        """Print the k most similar words for each word in ``words``.

        Each word is looked up in ``word_to_idx`` by plain indexing, so a word
        missing from the vocabulary raises the mapping's lookup error.
        """
        for x in words:
            idx = self.word_to_idx[x]
            k_near_idx = self.get_most_similar(idx, k)
            similar_words = [self.idx_to_word[z] for z in k_near_idx]
            print('Most Similar to {0}:'.format(x), ', '.join(similar_words))

    def get_most_similar(self, i, k):
        """Get the indexes of the embedding vectors most similar to row i.

        Args:
            i: int, row index of the query vector
            k: int, number of neighbours to return
        Returns:
            k_nearest: 1-D numpy array of at most k row indexes, ordered from
                most to least similar; never contains i itself.
        """
        embed_i = self.embedding[i, :].reshape(-1, 1)
        scores = (self.embedding @ embed_i).ravel()
        ordered_sims = np.argsort(scores)[::-1]
        # Drop the query row explicitly instead of assuming it is ranked
        # first: duplicate or tied rows can put another index at position 0.
        k_nearest = ordered_sims[ordered_sims != i][:k]
        return k_nearest

def print_nearest(embedding, word_to_idx, idx_to_word):
    """Print the nearest neighbours of a fixed list of probe words.

    Builds a KNN index over ``embedding`` and asks it to print the most
    similar words for each probe word.
    """
    index = KNN(embedding, word_to_idx, idx_to_word)
    probe_words = ["three", "cat", "city", "player", "king", "queen"]
    index.print_nearest(probe_words)
    


In [None]:
%matplotlib inline
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time

# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('Using:', device)

class BasicSkipGram(nn.Module):
    """Skip-gram model: an embedding lookup followed by one linear output layer.

    The weight matrices are created inside ``train`` and handed to ``forward``
    through a ``params`` dict; after training, the embedding matrix is stored
    (detached) in ``self.embedding``.
    """

    def __init__(self, num_embeddings, embedding_dim):
        """Record the model dimensions and create the loss function.

        Args:
            num_embeddings: number of rows of the embedding matrix.
            embedding_dim: length of each embedding vector.
        """
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.embedding = None  # untrained; set at the end of train()
        self.lossfunction = nn.CrossEntropyLoss()

    def forward(self, inputs, params):
        """ Compute the forward pass
            Note that we have put the application of softmax into the loss function
            Thus the forward pass should only compute the output before taking softmax
            Inputs consists of batch_size (denoted n) many indices i_1,...,i_n where i_j is the index of a word in the dictionary.
            The corresponding one-hot encoding would be a vector with a 1 in position i_j and 0 elsewhere.
            Output must have size n x num_embeddings

            Args:
                inputs: index tensor holding n word indices
                params: dict with the keys 'embedding' and 'soft_layer'
            Returns:
                out: n x num_embeddings tensor of scores (before softmax)
        """
        embedding_mat = params['embedding'] # num_embeddings x embedding_dim
        soft_layer = params['soft_layer'] # embedding_dim x num_embeddings
        ### YOUR CODE HERE
        # Selecting rows by index equals multiplying one-hot rows by the matrix.
        embeddings = embedding_mat[inputs, :] # n x embedding_dim
        out = embeddings @ soft_layer
        ### END CODE
        return out

    def loss(self, pred, labels):
        """Return the cross-entropy loss of the scores ``pred`` against ``labels``."""
        return self.lossfunction(pred, labels)

    def train(self, train_loader=True, epochs=1):
        """ fit the neural net using CrossEntropyLoss

            This method shadows ``nn.Module.train(mode)``. To keep ``eval()``
            and ``train()``/``train(False)`` working, a bool argument is
            forwarded to the base-class implementation and no fitting is done.

            Args:
                train_loader: iterable yielding (inputs, labels) batches, or a
                    bool to switch training/evaluation mode instead.
                epochs: number of passes over train_loader.
            Returns:
                None after fitting; ``self`` when called with a bool.

            Reads the module-level ``device`` to place parameters and batches.
        """
        if isinstance(train_loader, bool):
            return super().train(train_loader)

        print('start training emb model')
        ## Initialize parameters
        # Use self rather than a global instance name so that any instance,
        # whatever variable it is bound to, can be trained.
        train_embedding = torch.randn(self.num_embeddings, self.embedding_dim, device=device, requires_grad=True)
        soft_layer = torch.randn(self.embedding_dim, self.num_embeddings, device=device, requires_grad=True)
        params={'embedding': train_embedding, 'soft_layer': soft_layer}
        print_steps = 10000
        ## Create GD optimizer + Adam is your friend
        optimizer = optim.Adam(params.values())
        for epoch in range(epochs):  # loop over the dataset multiple times
            running_loss = 0.0
            start_time = time.time()
            batch_time = time.time()
            for i, data in enumerate(train_loader):
                _inputs, _labels = data
                inputs = _inputs.to(device)
                labels = _labels.to(device)
                optimizer.zero_grad()

                outputs = self.forward(inputs, params)
                loss = self.loss(outputs, labels)
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()

                if i % print_steps == print_steps - 1:
                    mean_loss = running_loss / print_steps
                    print(f'Epoch {epoch}: batch {i+1} mean last {print_steps} loss: {mean_loss:.3f} - time used {time.time() - batch_time:.2f} s')
                    running_loss = 0.0
                    batch_time = time.time()
            end_time = time.time()
            print(f'\nEpoch {epoch} finished - in {end_time - start_time:.2f} seconds')


        print('\nFinished Training')
        self.embedding = train_embedding.detach()
    

    



In [None]:
# Length of each word vector.
embedding_dim = 128

# Vocabulary lookups; the vocabulary size fixes the number of embedding rows.
idx_to_word, word_to_idx = get_dicts('word_to_idx.csv')
vocab_size = len(idx_to_word)
print('vocabulary size', vocab_size)

# Training pairs, then build the model and fit it for four epochs.
dataset, dataloader = get_data('data.csv')
net = BasicSkipGram(vocab_size, embedding_dim)
net.train(dataloader, epochs=4)




In [None]:
# Move the trained embedding to the CPU and print neighbours of the probe words.
print_nearest(
    embedding=net.embedding.cpu(),
    word_to_idx=word_to_idx,
    idx_to_word=idx_to_word,
)