In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import pickle
import datetime

In [2]:
import nltk
nltk.download('reuters')
nltk.download('punkt')  # For tokenization

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.corpus import reuters as rt 
rt.words()

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]

In [4]:
categories = rt.categories() 

print(categories)

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [5]:
#. tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')

# Get raw text from the reuters corpus
raw_text = rt.raw(categories=['fuel', 'gas','gold', 'grain', 'heat', 'housing', 'income', 'interest']) 

# Tokenize into sentences and then words
sentences = sent_tokenize(raw_text)  # Tokenize the raw text into sentences
corpus = [word_tokenize(sent.lower()) for sent in sentences]  # Tokenize sentences into words

# Check the number of tokenized sentences
print(f"Number of tokenized sentences: {len(corpus)}")


print(corpus[0])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Number of tokenized sentences: 8458
['china', 'daily', 'says', 'vermin', 'eat', '7-12', 'pct', 'grain', 'stocks', 'a', 'survey', 'of', '19', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume', 'between', 'seven', 'and', '12', 'pct', 'of', 'china', "'s", 'grain', 'stocks', ',', 'the', 'china', 'daily', 'said', '.']


In [6]:
#2. numeralizaition
# find uniquee words
flatten=lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs=list(set(flatten(corpus))) # all the words we have in the system

In [7]:
# create hand mapping between iteger and word
word2index={v:idx for idx, v in enumerate(vocabs)}

In [8]:
lst_idx=len(vocabs)
lst_idx

14269

In [9]:
#append UNK for unknown
vocabs.append('<UNK>')

In [10]:
word2index['<UNK>']= lst_idx

In [11]:
index2word={v:k for k, v in word2index.items()}

In [12]:
#vocab size
voc_size = len(vocabs)
voc_size

14270

In [13]:
# create pairs of center word, and outsidee word
def random_batch(batch_size,corpus):
    skipgrams=[]
    # loop  each corpus
    for doc in corpus:
        #look from the 3rd word until third last word since window size =2
        for i in range(2, len(doc)-2):
            #center word
            center=word2index[doc[i]]
            #outside words=2 words
            outside = (word2index[doc[i-2]],word2index[doc[i-1]], 
                       word2index[doc[i+1]],word2index[doc[i+2]])
            #print(center, outside)
            #for each for these two outside words, we gonna append to a list
            for each_out in outside:
                #print(each_out)
                skipgrams.append([center,each_out])
            # center, outeside1; center, outside2

    random_index=np.random.choice(range(len(skipgrams)),batch_size,replace=False)
    input, label=[], []
    for index in random_index:
        input.append([skipgrams[index][0]])
        label.append([skipgrams[index][1]])
    return np.array(input), np.array(label)

x, y=random_batch(2,corpus)

In [14]:
x

array([[ 4953],
       [12783]])

In [15]:
y

array([[ 3225],
       [13529]])

## 3.Negative Sampling
### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [16]:
z=0.001

In [17]:
from collections import Counter

word_count = Counter(flatten(corpus))

#count the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

241109

In [18]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)

## 4. Model

In [19]:
voc_size   = len(vocabs)
emb_size = 2

In [20]:
# prepare all vocabs
batch_size=2
voc_size= len(vocabs)
def prepare_sequence(seq, word2index):
    idxs=list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"],seq))
    return  torch.LongTensor(idxs)
all_vocabs=prepare_sequence(list(vocabs),word2index).expand(batch_size,voc_size)
all_vocabs

tensor([[    0,     1,     2,  ..., 14267, 14268, 14269],
        [    0,     1,     2,  ..., 14267, 14268, 14269]])

In [21]:
import random
# sample 5 words on corpus
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

In [22]:
x_neg, y_neg = random_batch(batch_size, corpus)
x_tensor_neg = torch.LongTensor(x_neg)
y_tensor_neg = torch.LongTensor(y_neg)

In [23]:
k = 5
# check negative_sampling
neg_samples = negative_sampling(y_tensor_neg, unigram_table, k)

In [24]:
y_tensor_neg[1]

tensor([5952])

In [25]:
neg_samples[1]

tensor([ 4204,  9695,  7608,  4435, 13810])

In [26]:
# create skipgram negative sampling
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.center_embedding(center) #(bs, 1, emb_size)
        outside_embed  = self.outside_embedding(outside) #(bs, 1, emb_size)
        negative_embed = self.outside_embedding(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1) #sum on second dim
        
        # calculate loss
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)
    
    def get_vector(self, word):
        id_tensor = torch.LongTensor([word2index[word]])
        id_tensor = id_tensor
        v_embed = self.embedding_center(id_tensor)  # Corrected
        u_embed = self.embedding_outside(id_tensor)  # Corrected
        word_embed = (v_embed + u_embed) / 2 

        return word_embed

In [27]:

#testing my model
test_model_neg = SkipgramNeg(voc_size, emb_size)

In [None]:
loss_neg = test_model_neg(x_tensor_neg, y_tensor_neg, neg_samples)
loss_neg

tensor(0.7395, grad_fn=<NegBackward0>)

## 5. Training

In [29]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    mins = elapsed_time // 60
    secs = elapsed_time % 60
    return int(mins), int(secs)

In [30]:
model_neg     = SkipgramNeg(voc_size, emb_size)
optimizer_neg = optim.Adam(model_neg.parameters(), lr=0.001)

In [31]:
# Training
import time
num_epochs = 1000
start = time.time()
for epoch in range(num_epochs):
    
    #get batch
    input_batch_neg, label_batch_neg = random_batch(batch_size, corpus)
    input_tensor_neg = torch.LongTensor(input_batch_neg)
    label_tensor_neg = torch.LongTensor(label_batch_neg)
    
    #predict
    neg_samples = negative_sampling(label_tensor_neg, unigram_table, k)
    loss_neg = model_neg(input_tensor_neg, label_tensor_neg, neg_samples)
    
    #backprogate
    optimizer_neg.zero_grad()
    loss_neg.backward()
    
    #update alpha
    optimizer_neg.step()
    
    #print the loss
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch+1:6.0f} | Loss: {loss_neg:2.6f}")

end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)

print(f"time: {epoch_mins}m {epoch_secs}s")

Epoch    100 | Loss: 0.752042
Epoch    200 | Loss: 1.440988
Epoch    300 | Loss: 2.100292
Epoch    400 | Loss: 1.283021
Epoch    500 | Loss: 3.125569
Epoch    600 | Loss: 3.268190
Epoch    700 | Loss: 2.173976
Epoch    800 | Loss: 1.126065
Epoch    900 | Loss: 1.351720
Epoch   1000 | Loss: 3.110745
time: 10m 30s


In [32]:
print(f'Training Loss: {loss_neg}, Training Time: {epoch_mins}m {epoch_secs}s')

Training Loss: 3.1107451915740967, Training Time: 10m 30s


In [33]:
# Saving the model for testing
torch.save(model_neg.state_dict(), 'D:/AIT_lecture/NLP/code\Assignment/NLP-2025/NLP-A1/models/skipgramNeg_v1.pt')

In [34]:
# Importing training data
Data = pickle.load(open(r'D:\AIT_lecture\NLP\code\Assignment\NLP-2025\NLP-A1\models\Data.pkl', 'rb'))

corpus = Data['corpus']
vocab = Data['vocab']
word2index = Data['word2index']
voc_size = Data['voc_size']
embed_size = Data['embedding_size']