In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import pickle
import datetime

## 1.Load Data

In [2]:
import nltk
nltk.download('reuters')
nltk.download('punkt')  # For tokenization


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.corpus import reuters as rt 
rt.words()

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]

In [4]:
categories = rt.categories() 

print(categories)

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [5]:
#. tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')

# Get raw text from the reuters corpus
raw_text = rt.raw() 

# Tokenize into sentences and then words
sentences = sent_tokenize(raw_text)  # Tokenize the raw text into sentences
corpus = [word_tokenize(sent.lower()) for sent in sentences]  # Tokenize sentences into words

# Check the number of tokenized sentences
print(f"Number of tokenized sentences: {len(corpus)}")


print(corpus[0])


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Number of tokenized sentences: 50981
['asian', 'exporters', 'fear', 'damage', 'from', 'u.s.-japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'u.s.', 'and', 'japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'asia', "'s", 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far-reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']


In [6]:
len(corpus[0])

39

In [7]:
#2. numeralizaition
# find uniquee words
flatten=lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs=list(set(flatten(corpus))) # all the words we have in the system
vocabs

['taxes',
 'fathers',
 'benedetti',
 'marley',
 '4,600,199',
 '100.00',
 'hospitalized',
 'attendance',
 '175,000',
 're-introduction',
 'loma',
 '716,361',
 'shambles',
 'noverco',
 'rata',
 'deflation',
 'deserts',
 '7,075,000',
 'miss.',
 '1,246,992',
 'resrouces',
 'mackay',
 'pnp',
 'mechanically',
 'kazakhstan',
 'wchi',
 'showroom',
 'ivb',
 'pellets',
 'sullivan',
 'malvern',
 '9,514,115',
 'negotiates',
 'mania',
 'hengst',
 'graniere',
 'sandustry',
 '7,450,000',
 'travelling',
 '3.55/65',
 '105.82',
 '47.5',
 '239',
 'uniforce',
 'afraid',
 'succession-issues',
 '5.16',
 'a.soriano',
 '18-april',
 'amnount',
 '1.6300',
 '12,500,000',
 'repeat',
 'tampico/north',
 'monte',
 '225',
 '7,856,000',
 '2,135,315',
 '2.562',
 '667',
 '233',
 '4,548,000',
 'exaias',
 'printronix',
 'candidate',
 '868,000',
 'herley',
 'shifted',
 '292,014',
 'alleges',
 'wanb',
 '694,000',
 '1,297,881',
 '488.3',
 'player',
 '2,603,000',
 'interfunding',
 '43,449,000',
 '5,167,573',
 'participate',
 

In [8]:
# create hand mapping between iteger and word
word2index={v:idx for idx, v in enumerate(vocabs)}


In [9]:
lst_idx=len(vocabs)

In [10]:
lst_idx

52229

In [11]:
#append UNK for unknown
vocabs.append('<UNK>')

In [12]:
word2index['<UNK>']= lst_idx

In [13]:
len(vocabs)

52230

In [14]:
index2word={v:k for k, v in word2index.items()}

## 2. Prepare train data

In [15]:
# create pairs of center word, and outsidee word
def random_batch(batch_size,corpus):
    skipgrams=[]
    # loop  each corpus
    for doc in corpus:
        #look from the 3rd word until third last word since window size =2
        for i in range(2, len(doc)-2):
            #center word
            center=word2index[doc[i]]
            #outside words=2 words
            outside = (word2index[doc[i-2]],word2index[doc[i-1]], 
                       word2index[doc[i+1]],word2index[doc[i+2]])
            #print(center, outside)
            #for each for these two outside words, we gonna append to a list
            for each_out in outside:
                #print(each_out)
                skipgrams.append([center,each_out])
            # center, outeside1; center, outside2

    random_index=np.random.choice(range(len(skipgrams)),batch_size,replace=False)
    input, label=[], []
    for index in random_index:
        input.append([skipgrams[index][0]])
        label.append([skipgrams[index][1]])
    return np.array(input), np.array(label)

x, y=random_batch(2,corpus)

In [16]:
x

array([[41996],
       [ 3037]])

In [17]:
y

array([[21537],
       [26348]])

## 3.Model

### Word2Vec(Skipgram)

In [18]:
voc_size   = len(vocabs)
emb_size = 2

In [19]:
# create skipgram model
class Skipgram(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_center=nn.Embedding(voc_size,emb_size)
        self.embedding_outside=nn.Embedding(voc_size,emb_size)
    def forward(self,center,outside,all_vocabs):
        center_embedding=self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embedding=self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding=self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size

        top_term=torch.exp(outside_embedding.bmm(center_embedding.transpose(1,2)).squeeze(2))  # bmm is dot product (ignore batch size) and reduce dim to 2
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        lower_term=all_vocabs_embedding.bmm(center_embedding.transpose(1,2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum=torch.sum(torch.exp(lower_term),1) #(batch_size,1)
        
        #calculate loss
        loss=-torch.mean(torch.log(top_term/lower_term_sum))
        
        return loss

In [20]:
# prepare all vocabs
batch_size=2
voc_size= len(vocabs)
def prepare_sequence(seq, word2index):
    idxs=list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"],seq))
    return  torch.LongTensor(idxs)
all_vocabs=prepare_sequence(list(vocabs),word2index).expand(batch_size,voc_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 52227, 52228, 52229],
        [    0,     1,     2,  ..., 52227, 52228, 52229]])

In [21]:
model_skipgram=Skipgram(voc_size,2)

In [22]:
x_skipgram, y_skipgram = random_batch(batch_size, corpus)
x_tensor_skipgram = torch.LongTensor(x_skipgram)
y_tensor_skipgram = torch.LongTensor(y_skipgram)

In [23]:
loss_skipgram = model_skipgram(x_tensor_skipgram, y_tensor_skipgram, all_vocabs)
loss_skipgram

tensor(13.2141, grad_fn=<NegBackward0>)

## 4. Training

In [24]:
batch_size     = 2 # mini-batch size
model_skipgram      = Skipgram(voc_size, emb_size)
optimizer_skipgram  = optim.Adam(model_skipgram.parameters(), lr=0.001)

In [25]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    mins = elapsed_time // 60
    secs = elapsed_time % 60
    return int(mins), int(secs)


In [26]:
# Training
import time
num_epochs = 1000
start = time.time()

for epoch in range(num_epochs):
    
    #get batch
    input_batch_skipgram, label_batch_skipgram = random_batch(batch_size, corpus)
    input_tensor_skipgram = torch.LongTensor(input_batch_skipgram)
    label_tensor_skipgram = torch.LongTensor(label_batch_skipgram)
     
    #predict
    loss_skipgram = model_skipgram(input_tensor_skipgram, label_tensor_skipgram, all_vocabs)
    
    #backprogate
    optimizer_skipgram.zero_grad()
    loss_skipgram.backward()
    
    #update alpha
    optimizer_skipgram.step()
    
    #print the loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1} | Loss: {loss_skipgram:2.6f}") #Epoch 6 front space, 0 back space

end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)

print(f"time: {epoch_mins}m {epoch_secs}s")

Epoch 100 | Loss: 11.976206
Epoch 200 | Loss: 10.799342
Epoch 300 | Loss: 10.325340
Epoch 400 | Loss: 11.294731
Epoch 500 | Loss: 10.697835
Epoch 600 | Loss: 11.016223
Epoch 700 | Loss: 11.694645
Epoch 800 | Loss: 10.522962
Epoch 900 | Loss: 10.382369
Epoch 1000 | Loss: 10.521408
time: 59m 39s


In [27]:
# Saving the model for testing
torch.save(model_skipgram.state_dict(), 'models/skipgram.pt')

In [28]:
Data = {
    'corpus': corpus,
    'vocab': vocabs,
    'word2index': word2index,
    'voc_size': voc_size,
    'embedding_size': emb_size
}

In [29]:
pickle.dump(Data,open('./models/Data.pkl', 'wb'))