In [137]:
import torch
import torch.nn as nn
import nltk
import urllib.request
import bs4
import string

In [138]:
scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/French_Revolution')
article = scrapped_data.read()

parsed_article = bs4.BeautifulSoup(article,'lxml')
paragraphs = parsed_article.find_all("p")

articles = ""

for p in paragraphs:
    articles += p.text
    
train_data = articles.translate(str.maketrans('','',string.punctuation)).lower()
train_data = train_data.translate(str.maketrans('','',string.digits))

text = nltk.word_tokenize(train_data)
stop_words = set(nltk.corpus.stopwords.words('english'))

text = [words for words in text if words not in stop_words]
print(len(text))

vocab = set(text)
print(len(vocab))
vocab_size = len(set(text))

10625
3740


In [139]:
window = 2
ls = []

for center_index in range(len(text)):
    
    for sub in range(max(0, center_index - window), center_index):
        ls.append((center_index, sub))
        
    for add in range(center_index + 1, min(vocab_size - 1, center_index + window)+1):
        ls.append((center_index, add))

In [140]:
word2int = { word : i for i,word in enumerate(vocab) }
int2word = { i : word for i,word in enumerate(vocab) }

In [206]:
class skipgram(nn.Module):
    
    def __init__(self, vocab_size, embedding_size):
        super(skipgram, self).__init__()
        self.projection = torch.randn((vocab_size, embedding_size), requires_grad = True)
        self.linear = nn.Linear(embedding_size, vocab_size)
        
    def forward(self, target_idx):
        embedding = self.projection[target_idx]
        prediction = self.linear(embedding.cuda())
        
        return prediction
    
net = skipgram(vocab_size, 150).cuda()

In [188]:
params = list(net.parameters())
params.extend(list(net.projection))

In [189]:
alpha = 0.003

lossfn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params, lr=alpha)

In [229]:
epochs = 20
i = 1

for iterations in range(epochs):
    
    for (center, context) in ls:
        
        context = word2int[text[context]]
        center = word2int[text[center]]
        
        target = torch.tensor([context])
        
        score = net.forward(center).reshape(1,-1)
        loss = lossfn(score.cuda(), target.cuda())
        loss.requires_grad = True
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(net.projection)
        
        i += 1
        if i%10 == 0:
            break
    break

tensor([[ 1.0872,  0.6562, -0.3686,  ..., -0.9246, -1.0376, -1.1621],
        [ 1.3623,  1.2609, -0.3707,  ...,  0.1771, -2.2316,  2.1135],
        [-1.1943,  1.1342,  1.0836,  ..., -0.4386,  0.2492, -1.3635],
        ...,
        [-0.9176, -1.0747, -0.1555,  ..., -0.4639, -2.2423, -1.6554],
        [-1.1830,  0.6142,  0.1926,  ...,  0.1227,  1.4026,  0.3817],
        [ 1.4625,  1.3568, -0.0613,  ...,  1.6573, -1.1089, -1.9093]],
       requires_grad=True)
tensor([[ 1.0872,  0.6562, -0.3686,  ..., -0.9246, -1.0376, -1.1621],
        [ 1.3623,  1.2609, -0.3707,  ...,  0.1771, -2.2316,  2.1135],
        [-1.1943,  1.1342,  1.0836,  ..., -0.4386,  0.2492, -1.3635],
        ...,
        [-0.9176, -1.0747, -0.1555,  ..., -0.4639, -2.2423, -1.6554],
        [-1.1830,  0.6142,  0.1926,  ...,  0.1227,  1.4026,  0.3817],
        [ 1.4625,  1.3568, -0.0613,  ...,  1.6573, -1.1089, -1.9093]],
       requires_grad=True)
tensor([[ 1.0872,  0.6562, -0.3686,  ..., -0.9246, -1.0376, -1.1621],
        

In [186]:
len(params)

3742

In [175]:
hex(id(next(net.parameters()))) , hex(id(net.projection))

('0x7f395dd50a50', '0x7f3996b4c460')

In [204]:
loss = nn.CrossEntropyLoss()
inp = torch.randn(3, 5, requires_grad=True)
print(inp, inp.shape)
target = torch.empty(3, dtype=torch.long).random_(5)
print(target, target.shape)
output = loss(inp, target)
output.requires_grad = True
output.backward()

tensor([[ 0.6820, -0.6981,  0.0846,  0.1088, -1.8055],
        [-0.7347, -0.8329, -0.1547,  1.5112,  0.8998],
        [ 1.0291, -1.5034, -0.9791,  0.4642, -0.6520]], requires_grad=True) torch.Size([3, 5])
tensor([1, 1, 1]) torch.Size([3])
