# Skipgram vs sipgramNeg 

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Load data

In [2]:
import pandas as pd

# Read the CSV file which contain spotify song lyric 
df = pd.read_csv("C:\\Users\\ASUS\\My_Journal\\Text\\My-NLP\\spotify_millsongdata.csv")

# Randomly select 30 song
sample = df.sample(30)

In [3]:
# 1. tokenize

import spacy
# clean some data
corpus = sample["text"].replace(r"\\|\r|\n|", "").tolist()

nlp = spacy.load("en_core_web_sm")
corpus_tokenized = [nlp(sent) for sent in corpus]
# convert scapy token to str
corpus_tokenized = [[str(word) for word in sublist] for sublist in corpus_tokenized]

In [4]:
#2. numericalize

#2.1 get all the unique words
#we want to flatten this (basically merge all list)
flatten = lambda l: [item for sublist in l for item in sublist]

vocabs  = list(set(flatten(corpus_tokenized)))  #vocabs is a term defining all unique words your system know

#2.2 assign id to all these vocabs
word2index = {v: idx for idx, v in enumerate(vocabs)}

#add <UNK>, which is a very normal token exists in the world
vocabs.append('<UNK>') #chaky, can it be ##UNK, or UNKKKKKK, or anything

#now we have a way to know what is the id of <UNK>
word2index['<UNK>'] = len(word2index)  #usually <UNK> is 0

#create index2word dictionary
#2 min    
index2word = {v:k for k, v in word2index.items()}


## 2. Prepare train data
You move the window along, and create those tuples as we said in class

In [5]:

def skipgram_random_batch(batch_size, corpus, window_size=1):
    
    skipgrams = []

    #for each corpus
    for sent in corpus_tokenized:
        #for each sent ["apple", "banana", "fruit"]
        # # start from window_size end at window_size before last
        for i in range(window_size,len(sent)-window_size):
            center_word = word2index[sent[i]]
            # outside words 
            outside_words = [word2index[sent[j]] for j in range(max(0, i - window_size), min(len(sent), i + window_size + 1)) if j != i]
            for o in outside_words:
                # append outside word as input center word as output
                skipgrams.append([center_word,o])
                
    #only get a batch, not the entire list
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
             
    #appending some list of inputs and labels
    random_inputs, random_labels = [], []   
    for index in random_index:
        random_inputs.append([skipgrams[index][0]])  #outside words, this will be a shape of (1, ) --> (1, 1) for modeling
        random_labels.append([skipgrams[index][1]])
        
    return np.array(random_inputs), np.array(random_labels)
    

## 3. Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

Defining the probability of sampling negative words

In [6]:
#basically create a distribution of all the words you have in your vocabs

In [7]:
z = 0.001  #scaling up low frequency terms

In [8]:
#count all the occurrences of vocabs
from collections import Counter

word_count = Counter(flatten(corpus_tokenized))
# word_count
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

9512

In [9]:
unigram_table = []

for v in vocabs:
    uw = word_count[v]/num_total_words
    uw_alpha = uw ** 0.75
    uw_alpha_dividebyz = int(uw_alpha / z)
    # print("vocab: ", v)
    # print("distribution: ", uw_alpha_dividebyz)
    unigram_table.extend([v] * uw_alpha_dividebyz)
    
# Counter(unigram_table)

## 4. Negative sampling

A function to get negative samples, based on the current center and outside words in the batch

In [10]:
def prepare_sequence(seq, word2index):
    #map(function, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

In [11]:
import random
#you don't want to pick samples = targets, basically negative samples
#k = number of negative samples - how many? they found 10 is the best
#will be run during training
#after random_batch, 
def negative_sampling(targets, unigram_table, k):
    #targets is already in id.....
    #but the unigram_table is in word....
    #1. get the batch size of this targets
    batch_size = targets.shape[0]
    neg_samples = []
    #2. for each batch
    for i in range(batch_size):
        #randomly pick k negative words from unigram_table
        target_index = targets[i].item()  #looping each of the batch....
        nsample = []
        while len(nsample) < k:
            neg = random.choice(unigram_table)
            #if this word == target, skip this word
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        #append this word to some list
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))  #tensor[], tensor[]
    return torch.cat(neg_samples)  #tensor[[], []]

## 5. Model

SkipgramNeg

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

Skipgram

$$J(\theta) = -\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-m \leq j \leq m \\ j \neq 0}}\log P(w_{t+j} | w_t; \theta)$$

where $P(w_{t+j} | w_t; \theta) = $

$$P(o|c)=\frac{\exp(\mathbf{u_o^{\top}v_c})}{\sum_{w=1}^V\exp(\mathbf{u_w^{\top}v_c})}$$

where $o$ is the outside words and $c$ is the center word

In [12]:

def prepare_voc(seq, word2index):
    #map(function, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

In [13]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center_word  = nn.Embedding(voc_size, emb_size)  #is a lookup table mapping all ids in voc_size, into some vector of size emb_size
        self.embedding_outside_word = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center_word, outside_word, all_vocabs):
        #center_word, outside_word: (batch_size, 1)
        #all_vocabs: (batch_size, voc_size)
        
        #convert them into embedding
        center_word_embed  = self.embedding_center_word(center_word)     #(batch_size, 1, emb_size)
        outside_word_embed = self.embedding_outside_word(outside_word)   #(batch_size, 1, emb_size)
        all_vocabs_embed   = self.embedding_outside_word(all_vocabs)     #(batch_size, voc_size, emb_size)
        
        #bmm is basically @ or .dot , but across batches (i.e., ignore the batch dimension)
        top_term = outside_word_embed.bmm(center_word_embed.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) ===> (batch_size, 1)
        
        top_term_exp = torch.exp(top_term)  #exp(uo vc)
        #(batch_size, 1)
        
        lower_term = all_vocabs_embed.bmm(center_word_embed.transpose(1, 2)).squeeze(2)
         #(batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size)
         
        lower_term_sum = torch.sum(torch.exp(lower_term), 1) #sum exp(uw vc)
        #(batch_size, 1)
        
        loss_fn = -torch.mean(torch.log(top_term_exp / lower_term_sum))
        #(batch_size, 1) / (batch_size, 1) ==mean==> scalar
        
        return loss_fn

In [14]:
class SkipgramNeg(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center_word  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside_word = nn.Embedding(voc_size, emb_size)
        self.logsigmoid = nn.LogSigmoid()
        
    def forward(self, center_words, outside_words, negative_words):
        #center_words, outside_words: (batch_size, 1)
        #negative_words:  (batch_size, k)
        
        center_embed  = self.embedding_center_word(center_words)    #(batch_size, 1, emb_size)
        outside_embed = self.embedding_outside_word(outside_words)  #(batch_size, 1, emb_size)
        neg_embed     = self.embedding_outside_word(negative_words) #(batch_size, k, emb_size)
        
        uovc          =  outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2)  #(batch_size, 1)
        ukvc          = -neg_embed.bmm(center_embed.transpose(1, 2)).squeeze(2)  #(batch_size, k)
        ukvc_sum      =  torch.sum(ukvc, 1).view(-1, 1) #(batch_size, 1)
        
        loss = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)  #(batch_size, 1) + (batch_size, 1)
                
        return -torch.mean(loss)  #scalar, loss should be scalar, to call backward()


## 4. Training

In [15]:
voc_size   = len(vocabs)
batch_size = 10 
emb_size   = 2 

model      = SkipgramNeg(voc_size, emb_size)
num_neg = 10 
window_size = 2
optimizer  = optim.Adam(model.parameters(), lr=0.001)

In [16]:
import time

num_epochs = 5000
#start time
start_time = time.time()
pre_time = start_time
#for epoch
for epoch in range(num_epochs):

    #get random batch
    input_batch, label_batch = skipgram_random_batch(batch_size, corpus,window_size)
    input_batch = torch.LongTensor(input_batch)
    label_batch = torch.LongTensor(label_batch)
    neg_batch   = negative_sampling(label_batch, unigram_table,num_neg)    
    
    #loss = model
    loss = model(input_batch, label_batch, neg_batch)
    
    #backpropagate
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print epoch loss
    if (epoch + 1) % 1000 == 0:
        curr_time = time.time()
        print(f"Epoch {epoch+1} | Loss: {loss:.6f} | Time: {curr_time-pre_time:.2f} sec")
        pre_time = curr_time

print(f"total time : {curr_time-start_time:.2f} sec")

Epoch 1000 | Loss: 1.759729 | Time: 52.91 sec
Epoch 2000 | Loss: 1.093236 | Time: 51.29 sec
Epoch 3000 | Loss: 2.276028 | Time: 52.07 sec
Epoch 4000 | Loss: 3.247149 | Time: 51.36 sec
Epoch 5000 | Loss: 1.234343 | Time: 51.89 sec
total time : 259.52 sec


In [17]:
model = Skipgram(voc_size, emb_size)
optimizer  = optim.Adam(model.parameters(), lr=0.001)
all_vocabs = prepare_voc(list(vocabs), word2index).expand(batch_size, voc_size)
# all_vocabs.shape

In [18]:
num_epochs = 5000

#start time
start_time = time.time()
pre_time = start_time

#for epoch
for epoch in range(num_epochs):

    #get random batch
    input_batch, label_batch = skipgram_random_batch(batch_size, corpus,window_size)
    input_batch = torch.LongTensor(input_batch)
    label_batch = torch.LongTensor(label_batch)
    
    # print(input_batch.shape, label_batch.shape, all_vocabs.shape)
    
    #loss = model
    loss = model(input_batch, label_batch, all_vocabs)
    
    #backpropagate
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print epoch loss
    if (epoch + 1) % 1000 == 0:
        curr_time = time.time()
        print(f"Epoch {epoch+1} | Loss: {loss:.6f} | Time: {curr_time-pre_time:.2f} sec")
        pre_time = curr_time

print(f"total time : {curr_time-start_time:.2f} sec")

Epoch 1000 | Loss: 7.178596 | Time: 53.16 sec
Epoch 2000 | Loss: 7.211318 | Time: 52.91 sec
Epoch 3000 | Loss: 6.714290 | Time: 53.86 sec
Epoch 4000 | Loss: 6.343014 | Time: 56.58 sec
Epoch 5000 | Loss: 5.382660 | Time: 55.34 sec
total time : 271.86 sec
