In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import Counter


In [3]:
# Read the whole corpus into memory. The encoding is pinned so decoding does
# not depend on the platform's locale default.
with open("data/text8_20m.txt", encoding="utf-8") as f:
    text = f.read()

# Preview the first 100 characters.
text[:100]

'anarchism originated as a term of abuse first used against early working class radicals including th'

**Text Preprocessing** 


In [4]:
# Tokenise on any run of whitespace. Unlike split(" "), this never yields
# empty-string tokens (from repeated, leading or trailing spaces) and never
# leaves a trailing newline attached to the last word.
words = text.split()

words[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

**Building the Vocabulary**

In [5]:
# Keep the 60,000 most frequent words; every other word maps to <UNK>.
words_count = Counter(words).most_common(60000)

most_freq_words = [word for word, _ in words_count]

# Ids 1..len(most_freq_words) follow descending frequency; id 0 is reserved
# for the unknown-word bucket.
word_idx = {word: i + 1 for i, word in enumerate(most_freq_words)}
word_idx['<UNK>'] = 0

# Convert the corpus to ids (out-of-vocabulary words become 0).
unk_idx = word_idx['<UNK>']
text_idx = [word_idx.get(w, unk_idx) for w in words]

# Record the frequency of the unknown bucket under the same '<UNK>' token
# used in word_idx, and put it FIRST so that position k of words_count is the
# word with id k. Anything built from words_count in order (e.g. a sampling
# table) is then aligned with the ids the model sees.
words_count.insert(0, ('<UNK>', text_idx.count(unk_idx)))

text_idx[:10]

[1336, 2862, 13, 7, 194, 2, 4067, 49, 60, 137]

**Generate training data**

In [6]:
def generate_training_pairs(words, C):
    """Build (center, context) skip-gram pairs from a token sequence.

    For every position ``i`` in ``words``, each token at most ``C`` positions
    to the left or right of ``i`` (clipped at the sequence boundaries) is
    paired with the token at ``i``.

    Args:
        words: Indexable sequence of tokens (e.g. a list of word ids).
        C: Context window radius, i.e. the number of neighbours considered on
            each side of the center token.

    Returns:
        A list of ``(center, context)`` tuples, ordered by center position and
        then by context position.
    """
    training_pairs = []
    n = len(words)

    # Iterate over the argument itself, not a notebook-level global.
    for i, center_word in enumerate(words):
        start_idx = max(0, i - C)
        # range() excludes its end, so clip to n (not n - 1); otherwise the
        # final token could never appear as a context word.
        end_idx = min(n, i + C + 1)

        for j in range(start_idx, end_idx):
            if j != i:  # skip the center word itself
                training_pairs.append((center_word, words[j]))

    return training_pairs

# Context window radius passed to generate_training_pairs.
C= 2

# Build (center, context) id pairs from the id-encoded corpus.
training_pairs= generate_training_pairs(text_idx, C)

# Preview the first ten pairs.
training_pairs[:10]

[(1336, 2862),
 (1336, 13),
 (2862, 1336),
 (2862, 13),
 (2862, 7),
 (13, 1336),
 (13, 2862),
 (13, 7),
 (13, 194),
 (7, 2862)]

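**Calculate the unigram and smoothed unigram distributions**

Negative samples are drawn from the unigram distribution raised to the power $\alpha = 3/4$ and renormalised: $P_\alpha(w) = U(w)^{\alpha} / \sum_{w'} U(w')^{\alpha}$.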

In [7]:
import numpy as np

# Total token count over every vocabulary entry, unknown bucket included.
N = sum(count for _, count in words_count)
alpha = 3 / 4  # smoothing exponent

# Raw unigram probability of each vocabulary entry.
unigram = {word: freq / N for word, freq in words_count}

# Smoothed distribution: U(w)^alpha, renormalised to sum to 1.
unigram_sum = sum(u ** alpha for u in unigram.values())
smoothed_unigram = {word: (uni ** alpha) / unigram_sum for word, uni in unigram.items()}

# Lay the probabilities out BY WORD ID so that unigram_table[k] is the
# sampling probability of id k. Relying on dict order would misalign the
# table whenever words_count is not already ordered by id. The one entry that
# is not a key of word_idx is the unknown-word bucket, which goes to the
# <UNK> id.
unk_id = word_idx['<UNK>']
unigram_table = np.zeros(len(word_idx))
for word, prob in smoothed_unigram.items():
    unigram_table[word_idx.get(word, unk_id)] = prob

assert np.isclose(unigram_table.sum(), 1.0), "sampling table must sum to 1"


**Training Loop**

In [None]:
from torch.utils.data import DataLoader
from torch.optim import Adam
from Skipgram import Skipgram
from NegativeSamplingLoss import NegativeSamplingLoss
import torch
import numpy as np

def train_skipgram(dataset, vocab_size, embedding_dim, unigram, batch_size=32, epochs=5, learning_rate=0.01, num_negatives=20, shuffle=True):
    """Train a Skipgram model with negative sampling and return it.

    Args:
        dataset: Dataset whose batches unpack into ``(center, contexts)``.
            Assumed to be a map-style dataset of id pairs (required for
            ``shuffle=True``) -- TODO confirm against SkipgramDataset.
        vocab_size: Number of ids; negatives are drawn from ``range(vocab_size)``.
        embedding_dim: Embedding size passed to ``Skipgram``.
        unigram: 1-D probability vector of length ``vocab_size`` used as the
            negative-sampling distribution (``p`` of ``np.random.choice``).
        batch_size: Number of pairs per batch.
        epochs: Number of passes over ``dataset``.
        learning_rate: Learning rate for Adam.
        num_negatives: Negative samples drawn per training pair.
        shuffle: Whether the DataLoader reshuffles the pairs every epoch.
            Pairs are typically generated in corpus order, so consecutive
            batches are highly correlated unless shuffled.

    Returns:
        The trained ``Skipgram`` model.

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    # Fail early with a clear message instead of a ZeroDivisionError when the
    # mean epoch loss is computed.
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataset is empty; nothing to train on")

    model = Skipgram(vocab_size, embedding_dim)
    criterion = NegativeSamplingLoss()
    optim = Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        total_loss = 0.0

        for center, contexts in dataloader:
            center, contexts = center.long(), contexts.long()

            # The final batch may be smaller than batch_size, so size the
            # negatives from the batch actually received.
            current_batch_size = len(center)

            # Sample negatives from the supplied distribution.
            negatives = torch.tensor(
                np.random.choice(
                    vocab_size,
                    (current_batch_size, num_negatives),
                    p=unigram,
                ),
                dtype=torch.long,
            )

            # Forward pass
            positive_score, negative_score = model(center, contexts, negatives)

            loss = criterion(positive_score, negative_score)

            optim.zero_grad()
            loss.backward()
            optim.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} -- Loss: {total_loss/num_batches:.4F}")

    return model
            

In [12]:
from SkipgramDataset import SkipgramDataset
import importlib



# Smoke-test the training loop on a small slice of the pairs.
pairs = training_pairs[:100]

dataset = SkipgramDataset(pairs)

# Derive the vocabulary size from the id mapping (kept words + <UNK>) instead
# of hard-coding it, so it cannot drift from the vocabulary built above.
vocab_size = len(word_idx)
embedding_dim = 128

model = train_skipgram(dataset, vocab_size, embedding_dim, unigram_table)


testing
Negative Embedding Shape: torch.Size([32, 20, 128])
Center Embedding Shape (Unsqueezed): torch.Size([32, 128, 1])
testing
Negative Embedding Shape: torch.Size([32, 20, 128])
Center Embedding Shape (Unsqueezed): torch.Size([32, 128, 1])
testing
Negative Embedding Shape: torch.Size([32, 20, 128])
Center Embedding Shape (Unsqueezed): torch.Size([32, 128, 1])
testing
Negative Embedding Shape: torch.Size([32, 20, 128])
Center Embedding Shape (Unsqueezed): torch.Size([4, 128, 1])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [32, 128] but got: [4, 128].