In [131]:
import numpy as np
import pandas as pd
import torch 
import torch.nn.functional as F


# Bag of Words

In [62]:

def bagofWord(corpus):
    """Print the bag-of-words count vector of every document in ``corpus``.

    The vocabulary is the sorted set of lower-cased, whitespace-separated
    tokens found across all documents. It is printed first, followed by one
    line per document holding the original text and its per-word counts in
    vocabulary order.

    Args:
        corpus: sequence of document strings.

    Returns:
        None; the result is only printed.
    """
    vocabulary = np.unique(' '.join(corpus).lower().split())

    doc_vectors = []
    for doc in corpus:
        # Start every vocabulary word at zero so absent words are still reported.
        doc_tuple = dict.fromkeys(vocabulary, 0)
        # Every token comes from the same text the vocabulary was built from,
        # so it always has a key; a dict lookup avoids scanning the NumPy
        # array once per token.
        for word in doc.lower().split():
            doc_tuple[word] += 1
        doc_vectors.append(doc_tuple)

    print(vocabulary)
    for doc, doc_vector in zip(corpus, doc_vectors):
        print(doc, list(doc_vector.values()))




In [63]:
# Demo: three short sentences that share most of their words.
bagofWord(["the cat sat", "the cat sat in the hat", "the cat with the hat"])


['cat' 'hat' 'in' 'sat' 'the' 'with']
the cat sat [1, 0, 0, 1, 1, 0]
the cat sat in the hat [1, 1, 1, 1, 2, 0]
the cat with the hat [1, 1, 0, 0, 2, 1]


# Skipgram

In [69]:
# Toy corpus used for the skip-gram example: one short sentence per entry.
corpus = [
    # people
    "he is a king",
    "she is a queen",
    "he is a man",
    "she is a woman",
    # capitals
    "warsaw is poland capital",
    "berlin is germany capital",
    "paris is france capital",
]


### Vocabulary

In [116]:
# Sorted array of the distinct lower-cased tokens found in the whole corpus.
vocabulary = np.unique(" ".join(corpus).lower().split())
vocabulary_size = vocabulary.shape[0]

(vocabulary_size, vocabulary)


(15,
 array(['a', 'berlin', 'capital', 'france', 'germany', 'he', 'is', 'king',
        'man', 'paris', 'poland', 'queen', 'she', 'warsaw', 'woman'],
       dtype='<U7'))

In [74]:
# Two-way lookup tables between each word and its position in `vocabulary`.
word_idx = dict(zip(vocabulary, range(len(vocabulary))))
idx_word = dict(enumerate(vocabulary))

idx_word

{0: 'a',
 1: 'berlin',
 2: 'capital',
 3: 'france',
 4: 'germany',
 5: 'he',
 6: 'is',
 7: 'king',
 8: 'man',
 9: 'paris',
 10: 'poland',
 11: 'queen',
 12: 'she',
 13: 'warsaw',
 14: 'woman'}

In [108]:
# Build the (center, context) training pairs for skip-gram.
window_size = 2  # how many neighbours are taken on each side of the center word
arr_idx = []     # pairs as (center_index, context_index)
arr_word = []    # the same pairs as (center_word, context_word)

for doc in corpus:
    # Lower-case exactly like the vocabulary cell does, so every token is
    # guaranteed to have an entry in `word_idx`.
    word_per_doc = doc.lower().split()
    for idx, w in enumerate(word_per_doc):
        # Clip the window to the sentence so only valid positions are visited;
        # no further bounds check or clamping is needed afterwards.
        start = max(0, idx - window_size)
        stop = min(len(word_per_doc), idx + window_size + 1)
        for neighbour_idx in range(start, stop):
            if neighbour_idx == idx:
                continue  # a word is not its own context

            neighbour = word_per_doc[neighbour_idx]
            arr_idx.append((word_idx[w], word_idx[neighbour]))
            arr_word.append((w, neighbour))


In [139]:
# First training pair, shown both as words and as vocabulary indices.
arr_word[0], arr_idx[0]

(('he', 'is'), (5, 6))

### One-hot word vectors

In [149]:
def getInput(word_idx, dtype=torch.float, size=None):
    """Build the one-hot vector for a vocabulary index.

    Args:
        word_idx: position that is set to 1 (an index into the vocabulary).
        dtype: dtype of the returned tensor.
        size: length of the vector. When omitted, the notebook-level
            ``vocabulary_size`` is used, which is the original behaviour.

    Returns:
        A 1-D tensor of length ``size`` that is 1 at ``word_idx`` and 0
        everywhere else.
    """
    if size is None:
        # Fall back to the global so existing calls like getInput(8) keep working.
        size = vocabulary_size
    x = torch.zeros(size, dtype=dtype)
    x[word_idx] = 1
    return x


In [150]:
# Redisplay the vocabulary; a word's position in this array is its index.
vocabulary

array(['a', 'berlin', 'capital', 'france', 'germany', 'he', 'is', 'king',
       'man', 'paris', 'poland', 'queen', 'she', 'warsaw', 'woman'],
      dtype='<U7')

In [151]:
# One-hot vector for index 8 ('man' in the vocabulary shown above).
getInput(8)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

### Training

In [236]:
# Two weight matrices with no non-linearity in between:
#   W1: vocabulary_size x embedding_size, maps a one-hot word to its embedding
#   W2: embedding_size x vocabulary_size, maps an embedding to one score per word
torch.manual_seed(0)  # fixed seed so the random initialisation and the losses are repeatable
embedding_size = 5
W1 = torch.randn((vocabulary_size, embedding_size), dtype=torch.float, requires_grad=True)
W2 = torch.randn((embedding_size, vocabulary_size), dtype=torch.float, requires_grad=True)

num_epoch = 100
lr = 0.001
for epoch in range(num_epoch):
    loss_val = 0
    for x_idx, y_idx in arr_idx:
        x = getInput(x_idx)
        y = torch.tensor([y_idx], dtype=torch.long)

        z1 = x @ W1  # the one-hot input picks out row x_idx of W1
        a2 = z1 @ W2

        # Negative log-likelihood of the true context word under a softmax
        # over the whole vocabulary.
        log_softmax = F.log_softmax(a2, dim=0)
        loss = F.nll_loss(log_softmax.reshape(1, -1), y)
        loss_val += loss.item()
        loss.backward()

        # Plain SGD step after every pair. The in-place update runs under
        # no_grad so autograd does not record it, instead of going through
        # `.data`, which bypasses autograd's tracking.
        with torch.no_grad():
            W1 -= lr * W1.grad
            W2 -= lr * W2.grad

        # Drop the gradients so the next backward() starts from scratch.
        W1.grad = None
        W2.grad = None

    if epoch % 10 == 0:
        # Mean loss per training pair over this epoch.
        print(f'epoch at {epoch}: {loss_val/len(arr_idx)}')


epoch at 0: 4.775860179322106
epoch at 10: 4.159932239992278
epoch at 20: 3.778918784005301
epoch at 30: 3.5125010077442442
epoch at 40: 3.3135516132627214
epoch at 50: 3.159255078009197
epoch at 60: 3.0368846314293996
epoch at 70: 2.938060017142977
epoch at 80: 2.8563636686120715
epoch at 90: 2.7868152767419816
