# Assignment 1.3: Naive word2vec (40 points)

This task can be formulated very simply. Follow this [paper](https://arxiv.org/pdf/1411.2738.pdf) and implement word2vec like a two-layer neural network with matrices $W$ and $W'$. One matrix projects words to low-dimensional 'hidden' space and the other - back to high-dimensional vocabulary space.

![word2vec](https://i.stack.imgur.com/6eVXZ.jpg)

You can use TensorFlow/PyTorch and code from your previous task.

## Results of this task: (30 points)
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCE, T-SNE), you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)

## Extra questions: (10 points)
 * Intrinsic evaluation: you can find datasets [here](http://download.tensorflow.org/data/questions-words.txt)
 * Extrinsic evaluation: you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation.

Again. It is **highly recommended** to read this [paper](https://arxiv.org/pdf/1411.2738.pdf)

Example of visualization in tensorboard:
https://projector.tensorflow.org

Example of 2D visualisation:

![2dword2vec](https://www.tensorflow.org/images/tsne.png)

In [1]:
import gc
import string
import re
from collections import Counter
import numpy as np
gc.collect()

0

In [254]:
class Batcher:
    def __init__(self, window_size, corpus_path, min_freq, max_freq, max_voc_size, batch_size):
        self.corpus_path = corpus_path
        self.window_size = window_size
        self.min_freq = min_freq
        self.max_freq = max_freq
        self.max_voc_size = max_voc_size
        self.batch_size = batch_size
        self.words = None
        self.word2index = None
        self.index2word = None
        self.freq = None
        self.voc = None
        self.voc_size = None
        self.corpus = None
        self.corpus_size = None
        
        
    def read_data(self, S):
        if S == None:
            with open(self.corpus_path, 'r') as f:
                S = f.read()
            S = S.lower()[:100000000]
        print('Len of S = ', len(S))
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        S = regex.sub(' ', S)
        words = list(S.split())
        self.words = words
        unique_words = list(set(words))
        self.word2index = {k: v for v, k in enumerate(unique_words)}
        self.word2index['UNK'] = -1
        self.index2word = {v: k for v, k in enumerate(unique_words)}
        self.index2word[-1] = 'UNK'
        words = [self.word2index[word] for word in words]
        
        print('Size of words = ', len(words))
        counter = Counter(words)
        print('Size of counter = ', len(counter))
        if self.min_freq != None:
            counter = {x : counter[x] for x in counter if counter[x] >= self.min_freq}
        print('Size of counter after min_freq = ', len(counter))
        if self.max_freq != None:
            counter = {x : counter[x] for x in counter if counter[x] <= self.max_freq}
        print('Size of counter after max_freq = ', len(counter))
        counter = Counter(counter)

        self.freq = dict(counter.most_common(self.max_voc_size))
        self.voc = set(self.freq)
        self.voc_size = len(self.voc)
        
        unk = set(words).difference(self.voc)
        print('Size of freq dict = ', len(self.voc))
        print('Number of vocabulary words = ', len(self.voc))
        print('Number of unknown words = ', len(unk))

        words = [-1 if word in unk else word for word in words]
        
        if len(words)%self.batch_size == 0:
            padding = self.window_size
        else:
            padding = self.batch_size - len(words)%self.batch_size + self.window_size
            
        self.corpus = [-2]*self.window_size + words + [-2]*padding
        self.corpus_size = len(self.corpus)
            
    def generator0(self):
        i = self.window_size
        x_batch = []
        y_batch = []
        
        while i < self.corpus_size-self.window_size:
            if len(x_batch)==self.batch_size:
                x_batch = []
                y_batch = []
                
            x = self.corpus[i-self.window_size: i] + self.corpus[i+1: i+self.window_size+1]
            y = [self.corpus[i]]
            x_batch.append(x)
            y_batch.append(y)
            i += 1
            if len(x_batch)==self.batch_size:
                yield np.array(x_batch), np.array(y_batch)
                
    def generator1(self):
        i = self.window_size
        x_batch = []
        y_batch = []
        
        while i < self.corpus_size-self.window_size:
            if len(x_batch)==self.batch_size:
                x_batch = []
                y_batch = []
                
            x = self.corpus[i-self.window_size: i] + self.corpus[i+1: i+self.window_size+1]
            x_window = []
            y = self.corpus[i]
            y_window = []
            v_y = [0]*self.voc_size
            v_y[y] = 1
            
            for j in range(self.window_size*2):
                v_x = [0]*self.voc_size
                v_x[x[j]] = 1
                x_window.append(v_x)
                y_window.append(v_y)
            
            x_batch.append(x_window)
            y_batch.append(y_window)
            i += 1
            if len(x_batch)==self.batch_size:
                yield np.array(x_batch), np.array(y_batch)
    
    def generator(self):
        i = self.window_size
        x_batch = []
        y_batch = []
        
        while i < self.corpus_size-self.window_size:
            if len(x_batch)==self.batch_size:
                x_batch = []
                y_batch = []
                
            x = self.corpus[i-self.window_size: i] + self.corpus[i+1: i+self.window_size+1]
            x_window = []
            y = [self.corpus[i]]*self.window_size*2
            
            for j in range(self.window_size*2):
                v_x = [0]*self.voc_size
                v_x[x[j]] = 1
                x_window.append(v_x)
            
            x_batch.append(x_window)
            y_batch.append(y)
            i += 1
            if len(x_batch)==self.batch_size:
                yield np.array(x_batch), np.array(y_batch)

In [255]:
batcher = Batcher(window_size=2, corpus_path='text8', min_freq=None, max_freq=None, max_voc_size=1000000, batch_size=32)
batcher.read_data(S=None)

Len of S =  100000000
Size of words =  17005207
Size of counter =  253854
Size of counter after min_freq =  253854
Size of counter after max_freq =  253854
Size of freq dict =  253854
Number of vocabulary words =  253854
Number of unknown words =  0


In [256]:
for x, y in batcher.generator():
    print(x.shape, y.shape)
    break

(32, 4, 253854) (32, 4)


### CBOW

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fa806a6ef10>

In [259]:
class CBOW(nn.Module):
    def __init__(self, voc_size, embedding_dim, window_size, batch_size):
        super(CBOW, self).__init__()
        self.voc_size = voc_size
        self.W1 = nn.Parameter(torch.tensor(torch.randn(voc_size, embedding_dim).float(), requires_grad=True))
        self.W2 = nn.Parameter(torch.tensor(torch.randn(embedding_dim, voc_size).float(), requires_grad=True))
    
    def forward(self, inputs):
        x = torch.tensor(inputs).float().view(-1, self.voc_size)
        z1 = torch.matmul(x, self.W1)
        z2 = torch.matmul(z1, self.W2)
        log_softmax = F.log_softmax(z2, dim=1)
        return log_softmax

In [260]:
losses = []
loss_function = nn.NLLLoss()
model = CBOW(voc_size=batcher.voc_size, embedding_dim=300, window_size=batcher.window_size, batch_size=batcher.batch_size)
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in [0]:
    total_loss = 0
    for context, target in batcher.generator():
        model.zero_grad()
        
        log_probs = model(context)
        loss = torch.nn.functional.nll_loss(log_probs, torch.tensor(target, dtype=torch.long).view(-1))

        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        print(loss)
        losses.append(loss)
    #losses.append(total_loss)
#print(losses)  # The loss decreased every iteration over the training data!

  """
  


tensor(76.7165, grad_fn=<NllLossBackward>)
tensor(78.9987, grad_fn=<NllLossBackward>)
tensor(78.1081, grad_fn=<NllLossBackward>)
tensor(78.0884, grad_fn=<NllLossBackward>)


KeyboardInterrupt: 