# 0. Importing Libaries

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import pickle
import datetime

In [2]:
print(torch.cuda.is_available())

False


# 1. Loading Data

In [3]:
import nltk
nltk.download('reuters')
nltk.download('punkt')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk.corpus import reuters
reuters.words()

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...]

In [5]:
categories = reuters.categories() 

print(categories)

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [6]:
#. tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')

# Get raw text from the reuters corpus
raw_text = reuters.raw(categories=['fuel', 'gas','gold', 'grain', 'heat', 'housing', 'income', 'interest']) 

# Tokenize into sentences and then words
sentences = sent_tokenize(raw_text)  # Tokenize the raw text into sentences
corpus = [word_tokenize(sent.lower()) for sent in sentences]  # Tokenize sentences into words

# Check the number of tokenized sentences
print(f"Number of tokenized sentences: {len(corpus)}")


print(corpus[0])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Number of tokenized sentences: 8458
['china', 'daily', 'says', 'vermin', 'eat', '7-12', 'pct', 'grain', 'stocks', 'a', 'survey', 'of', '19', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume', 'between', 'seven', 'and', '12', 'pct', 'of', 'china', "'s", 'grain', 'stocks', ',', 'the', 'china', 'daily', 'said', '.']


In [7]:
# Convert all characters to lowercase to handle case sensitivity
corpus = [[word.lower() for word in sent] for sent in corpus]

In [8]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))

In [9]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}

In [10]:
#vocab size
voc_size = len(vocab)
print(voc_size)

14269


In [11]:
#append UNK
vocab.append('<UNK>')

In [12]:
word2index['<UNK>'] = voc_size # since current word2index[0:voc_size-1]

In [13]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

In [14]:
#vocab size
voc_size = len(vocab)
print(voc_size)

14270


# 2. Co occurance matrix

In [15]:
from collections import Counter

X_i = Counter(flatten(corpus))

In [16]:
skip_grams = []

for doc in corpus:
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-2], doc[i-1],
                   doc[i+1], doc[i-2]]
        for each_out in outside:
            skip_grams.append((center, each_out))

In [17]:
X_ik_skipgrams = Counter(skip_grams)

In [18]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [19]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function
pairs = list(combinations_with_replacement(vocab, 2))

In [20]:
len_of_pairs = len(pairs)

In [21]:
batch_size = 2  # adjust the batch size based on your memory constraints

for i in range(0, len_of_pairs, batch_size):
    batch = pairs[i:i + batch_size]

    for bigram in batch:
        if X_ik_skipgrams.get(bigram):
            co = X_ik_skipgrams[bigram]
            X_ik[bigram] = co + 1  # for stability
            X_ik[(bigram[1], bigram[0])] = co + 1  # basically apple, banana = banana, apple
        else:
            pass

        weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
        weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)


# 3. Preparing training data

### Check corpus (sentences)

In [22]:
2
for c in corpus:
    print(c)
    # Stop after 3
    i+=1
    if i > 2:
        break

['china', 'daily', 'says', 'vermin', 'eat', '7-12', 'pct', 'grain', 'stocks', 'a', 'survey', 'of', '19', 'provinces', 'and', 'seven', 'cities', 'showed', 'vermin', 'consume', 'between', 'seven', 'and', '12', 'pct', 'of', 'china', "'s", 'grain', 'stocks', ',', 'the', 'china', 'daily', 'said', '.']


In [23]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

# 4. Model

In [24]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)
    
    def get_vector(self, word):
        id_tensor = torch.LongTensor([word2index[word]])
        id_tensor = id_tensor
        v_embed = self.center_embedding(id_tensor)  # Corrected
        u_embed = self.outside_embedding(id_tensor)  # Corrected
        word_embed = (v_embed + u_embed) / 2 

        return word_embed

# 5. Training

In [25]:
batch_size     = 2 # mini-batch size
embedding_size = 2 
model          = Glove(voc_size, embedding_size)


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [26]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    mins = elapsed_time // 60
    secs = elapsed_time % 60
    return int(mins), int(secs)

In [27]:
import time

# Training
# Since our training data is small, 1000 epochs instead of between 3 and 50 as per the paper
num_epochs = 1000


start = time.time()

for epoch in range(num_epochs):
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]

  
    optimizer.zero_grad()
    
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
    
        print(f"Epoch: {epoch + 1:6.0f} | Loss: {loss:.6f}")

end = time.time()
epoch_mins, epoch_secs = epoch_time(start, end)

print(f"time: {epoch_mins}m {epoch_secs}s")

Epoch:    100 | Loss: 17.206343
Epoch:    200 | Loss: 1.057487
Epoch:    300 | Loss: 17.068998
Epoch:    400 | Loss: 21.850616
Epoch:    500 | Loss: 0.246774
Epoch:    600 | Loss: 28.551416
Epoch:    700 | Loss: 0.066611
Epoch:    800 | Loss: 33.240028
Epoch:    900 | Loss: 3.650887
Epoch:   1000 | Loss: 0.423747
time: 2m 27s


In [29]:
print(f'Training Loss: {loss}, Training Time: {epoch_mins}m {epoch_secs}s')

Training Loss: 0.4237470328807831, Training Time: 2m 27s


In [32]:
# Saving the model for testing
torch.save(model.state_dict(), 'models/GloVe-v1.pt')