In [None]:
#Make a single list of collections of strings of partial sentences
#dependencies
import numpy as np
import random
import nltk
from nltk.corpus import stopwords
import csv
import re
nltk.download('stopwords')
path = '/content/Data.csv'

#read the book
def read_corpus(path):
    """Read the first column of a UTF-8 encoded CSV file.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    list of str
        The first field of every non-empty row, in file order.
    """
    book = []
    #the context manager closes the file even if parsing fails part-way;
    #newline="" lets the csv module handle line breaks inside quoted fields
    with open(path, encoding="utf-8", newline="") as f:
        for row in csv.reader(f):
            #csv.reader yields [] for a blank line, where row[0] would raise IndexError
            if row:
                book.append(row[0])

    return book

#processing get desired corpus, finding uniq set of words
def preprocess(corpus):
    """Clean a list of text snippets and turn the remaining words into integer tokens.

    The snippets are joined with spaces, lower-cased, every non-word character is
    replaced by a space, tokens containing a digit are dropped and NLTK's English
    stop words are removed.

    Parameters
    ----------
    corpus : list of str
        Text snippets, e.g. the output of read_corpus.

    Returns
    -------
    tuple
        (whole_corpus, vocab, uniq_words, words_to_ints, vocab_int_pair,
        ints_to_words, int_arr_of_vocab) where
        whole_corpus     -- the joined, uncleaned text,
        vocab            -- cleaned words in corpus order (repeats kept),
        uniq_words       -- list of distinct words; the list position is the token id,
        words_to_ints    -- dict word -> token id,
        vocab_int_pair   -- list of [word, token id] for every entry of vocab,
        ints_to_words    -- dict token id -> word,
        int_arr_of_vocab -- integer numpy array with the token id of every entry of vocab.
    """
    #join the strings into one long single string
    whole_corpus = ' '.join(corpus)

    #lower case and replace all the symbols with spaces
    #then remove every token that contains a digit
    words = re.sub(r'[^\w]', ' ', whole_corpus.lower()).split()
    words_no_dig_punc = [x for x in words if not any(c.isdigit() for c in x)]

    stop_words = set(stopwords.words('english'))

    #Make a corpus without these stop words
    vocab = [x for x in words_no_dig_punc if x not in stop_words]

    #From corpus get all uniq words --> to be indexed and tokenized (to one hot vectors)
    uniq_words = list(set(vocab))

    words_to_ints = {k: v for v, k in enumerate(uniq_words)}
    ints_to_words = {v: k for v, k in enumerate(uniq_words)}

    #Tokenize through the dict: uniq_words.index() would rescan the whole list for every word
    vocab_int_pair = [[word, words_to_ints[word]] for word in vocab]

    #Finally just take the tokenized version of corpus to be loaded into network to train
    #built-in int replaces the np.int alias, which NumPy 1.24 removed;
    #building from the ints directly also copes with an empty vocabulary
    int_arr_of_vocab = np.array([token for _, token in vocab_int_pair], dtype=int)

    return (whole_corpus, vocab, uniq_words, words_to_ints,  vocab_int_pair, ints_to_words, int_arr_of_vocab)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#Load the corpus and tokenize its first 400 rows
corpus = read_corpus(path)
(
    whole_corpus,
    vocab,
    uniq_words,
    words_to_ints,
    vocab_int_pair,
    ints_to_words,
    int_arr_of_vocab,
) = preprocess(corpus[:400])

#Show the first five words next to their token ids
for i in range(5):
    print(vocab[i], "->", int_arr_of_vocab[i])

prime -> 149
minister -> 1394
narendra -> 747
modi -> 716
met -> 1231


In [None]:
###############################################################################################
#Generating pairs of true samples --> co-occurring within a given frame of window size, labeling the pair 1
#int_arr_of_vocab --> Tokenized corpus
#window --> Selected window
#True pairs start at index +w and end at index -w (length - w), as only these have the full complement of pairs
#Much easier to code this way than accommodating all elements for all window sizes

def gen_true(int_arr_of_vocab, window):
    """Build the positive (co-occurring) training pairs.

    Every position that has `window` tokens on both sides is used as a centre
    word and paired with each of the 2 * window tokens around it.

    Parameters
    ----------
    int_arr_of_vocab : sequence of int
        Tokenized corpus.
    window : int
        Number of neighbours taken on each side of the centre word.

    Returns
    -------
    list
        One [centre, neighbour, 1] entry per pair; 1 is the "true pair" label.
    """
    span = window * 2
    true_list = []

    for start in range(len(int_arr_of_vocab) - span):
        centre = int_arr_of_vocab[start + window]

        #the slice holds span + 1 tokens; taking the centre out leaves span neighbours
        #(list.remove deletes the first entry equal to the centre value)
        neighbours = list(int_arr_of_vocab[start:start + span + 1])
        neighbours.remove(centre)

        true_list.extend([centre, neighbours[j], 1] for j in range(span))
    return true_list
###################################################################################################

In [None]:
###############################################################################################

#int_arr_of_vocab --> Tokenized corpus
#window --> Selected window
#k --> is the number of negative samples for each word --> keep 20, as each word has 10 positive pairs when the window spans 10 neighbours (window = 5)
#Simply take k random samples from whole set of uniq_words and pair with each input word
#Note that each input has window *2 true pairs and k has to be proportionately large

#Alternate version, not used here
#speed it up --> draw enough random samples in a range
#concatenate --> the each item in true copied 20 x, random samples, 0s


import random
def gen_false(int_arr_of_vocab, uniq_words, k, window):
    """Build the negative training pairs by random sampling.

    Only positions with `window` tokens on both sides get negatives (the same
    positions gen_true uses as centre words). Each such word is paired with
    k distinct token ids drawn at random from the whole vocabulary.

    Parameters
    ----------
    int_arr_of_vocab : sequence of int
        Tokenized corpus.
    uniq_words : sequence
        Vocabulary; only its length is used, as the range to sample token ids from.
    k : int
        Number of negative samples per word.
    window : int
        Number of positions skipped at each end of the corpus.

    Returns
    -------
    list
        One [word, random token id, 0] entry per pair; 0 is the "false pair" label.

    Raises
    ------
    ValueError
        From random.sample when k is larger than len(uniq_words).
    """
    last = len(int_arr_of_vocab) - window
    #corpus too short to have any word with a full window on both sides
    if last <= window:
        return []

    population = range(len(uniq_words))
    false_pairs = []
    #walk only the interior words: trimming afterwards with [k*window:-k*window]
    #returns nothing when window is 0, because [0:-0] is the empty slice [0:0]
    for word in int_arr_of_vocab[window:last]:
        for negative in random.sample(population, k):
            false_pairs.append([word, negative, 0])

    return false_pairs

#Remove the first few pairs and last few pairs as true pairs start at index +w and ends at index -w (length - w)
##############################################################################################

In [None]:
#Build the positive pairs with a window of 2 tokens on each side of every centre word
true_list = gen_true(int_arr_of_vocab, window=2)


In [None]:
#Preview: the first 11 [word, token id] pairs, the first 11 true pairs and the total number of true pairs
vocab_int_pair[:11], true_list[:11], len(true_list)

([['prime', 149],
  ['minister', 1394],
  ['narendra', 747],
  ['modi', 716],
  ['met', 1231],
  ['majesty', 1863],
  ['queen', 1430],
  ['maxima', 1764],
  ['kingdom', 588],
  ['netherlands', 518],
  ['today', 823]],
 [[747, 149, 1],
  [747, 1394, 1],
  [747, 716, 1],
  [747, 1231, 1],
  [716, 1394, 1],
  [716, 747, 1],
  [716, 1231, 1],
  [716, 1863, 1],
  [1231, 747, 1],
  [1231, 716, 1],
  [1231, 1863, 1]],
 17264)

In [None]:
#Draw k = 100 random negative pairs per word
#NOTE(review): false_list is not referenced again in the cells visible here; gen_joint_list calls gen_false itself
false_list = gen_false(int_arr_of_vocab, uniq_words, k = 100, window = 2)

In [None]:
###########################################################################################################
#Concatenate true_list, false_list
#False list keeps changing each time joint list is drawn
def gen_joint_list(true_list, int_arr_of_vocab, uniq_words, k, window):
    """Stack the true pairs on top of freshly sampled false pairs and shuffle the rows.

    Parameters
    ----------
    true_list : list
        [word, neighbour, 1] entries, e.g. from gen_true.
    int_arr_of_vocab, uniq_words, k, window
        Passed unchanged to gen_false, which is called anew on every invocation.

    Returns
    -------
    numpy.ndarray
        All true and false rows in one array, shuffled along the first axis.
    """
    positives = np.array(true_list)
    negatives = np.array(gen_false(int_arr_of_vocab, uniq_words, k, window))

    joint_list = np.concatenate((positives, negatives), axis=0)
    #in-place shuffle of the rows, driven by numpy's global random state
    np.random.shuffle(joint_list)
    return joint_list
############################################################################################################

In [None]:
#Build one shuffled array of the true pairs plus freshly sampled false pairs (k = 100, window = 2)
joint_list=gen_joint_list(true_list,int_arr_of_vocab,uniq_words,100,window=2)

In [None]:
#########################################################################################
#As joint list is too long and takes a lot of memory to process at one go --> load small batches
# i --> is used to return one batch at a time, it is a counter and a markers for selecting size
#len(joint_list)//batch_size +1 --> gives the total numbers of batches

def gen_batch(joint_list, batch_size, i):
    """Return the i-th slice of `batch_size` rows from `joint_list`.

    Parameters
    ----------
    joint_list : sequence
        Rows to slice; any object that supports len() and slicing.
    batch_size : int
        Number of rows in a full batch.
    i : int
        Zero-based batch counter.

    Returns
    -------
    Same type as a slice of joint_list
        Rows i*batch_size .. i*batch_size + batch_size for a full batch,
        otherwise everything from i*batch_size to the end (possibly empty).
    """
    start = i * batch_size

    #full batches get an explicit end; from the last partial batch on, slice to the end
    is_full_batch = i < len(joint_list) // batch_size
    stop = start + batch_size if is_full_batch else None

    return joint_list[start:stop]
############################################################################################

In [None]:
#Test batch b1: the first (i = 0) slice of 100 rows from joint_list
b1 = gen_batch(joint_list, batch_size = 100, i =0)


In [None]:
###########################################################################################################
import torch

#generate tensors of one hot vector for each tokenized pair in a batch of joint list
#Also note labels are simply 3rd column in each batch

def one_hot_auto_batchwise(batch, uniq_words):
    """Turn a batch of [middle word, surrounding word, label] rows into tensors.

    Parameters
    ----------
    batch : array-like of int
        Rows of three integers: token id of the middle word, token id of the
        surrounding word, and the 0/1 label. An empty batch is accepted.
    uniq_words : sequence
        Vocabulary; only its length is used, as the width of the one-hot vectors.

    Returns
    -------
    tuple of torch.Tensor
        (middle_word_arr, sur_word_arr, labels): two float tensors of shape
        (rows, len(uniq_words)) holding one 1 per row, and a float tensor with
        the third column of the batch.
    """
    #build the integer tensor directly: torch.Tensor(...) goes through float32 first,
    #which cannot hold every integer above 2**24 exactly
    #reshape keeps the three-column layout even when the batch is empty
    iol_tensor = torch.as_tensor(batch, dtype=torch.long).reshape(-1, 3)

    num_pairs = iol_tensor.shape[0]
    vocab_size = len(uniq_words)

    middle_word_arr = torch.zeros(num_pairs, vocab_size)
    sur_word_arr = torch.zeros(num_pairs, vocab_size)

    #set one entry per row with a single indexed assignment instead of a Python loop
    rows = torch.arange(num_pairs)
    middle_word_arr[rows, iol_tensor[:, 0]] = 1
    sur_word_arr[rows, iol_tensor[:, 1]] = 1

    labels = iol_tensor[:, 2].float()
    return (middle_word_arr, sur_word_arr, labels)
#################################################################################################################

In [None]:
#with test batch b1 get the respective one-hot vectors (middle word, surrounding word) and the labels
mh, sh, ll = one_hot_auto_batchwise(b1, uniq_words)

In [None]:
#Defining the network
#2 linear fully connected layers are used, bias units are not used
#fc_midl_word takes all the input words/tokens
#fc_sur_word takes all the targets (true or false counterpart) of the pair
#Using sigmoid activation hence BCE loss
#Also note parameters of both networks are combined in a list which helps in back prop and stepping

import torch.nn as nn
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embed_size = 30
def gen_model(uniq_words, embed_size, LR=0.0001):
    """Create the two embedding layers, the loss and the optimizer.

    Parameters
    ----------
    uniq_words : sequence
        Vocabulary; its length is the input width of both layers.
    embed_size : int
        Output width of both layers (the embedding size).
    LR : float, optional
        Learning rate handed to Adam.

    Returns
    -------
    tuple
        (fc_midl_word, fc_sur_word, criterion, optimizer): two bias-free
        nn.Linear layers placed on the module-level `device`, an nn.BCELoss
        and one Adam optimizer over the weights of both layers.
    """
    vocab_size = len(uniq_words)

    #the middle-word layer is created first, then the surrounding-word layer
    fc_midl_word = nn.Linear(vocab_size, embed_size, bias=False).to(device)
    fc_sur_word = nn.Linear(vocab_size, embed_size, bias=False).to(device)

    criterion = nn.BCELoss()

    #a single optimizer steps the weights of both layers together
    params = [*fc_midl_word.parameters(), *fc_sur_word.parameters()]
    optimizer = optim.Adam(params, lr=LR)

    return (fc_midl_word, fc_sur_word, criterion, optimizer)

In [None]:
#Build the two layers, the loss and the optimizer with an embedding size of 30
#NOTE(review): the same four names are assigned again by the set-up cell before training
fc_midl_word, fc_sur_word, criterion, optimizer = gen_model(uniq_words, embed_size =30, LR=0.0001)

In [None]:
#Collect losses
#k --> proportion of negative samples --> with window of 5, k of 100 --> 10 negative samples for each true pair
#window --> selected/defined window for co-occurrence
#LR - Learning rate
#embed_size - Size of embedding for vector representation
#embed_size --> Arbitrary, May be scaled down by some log when uniq_set is large or by root when uniq_set is not so large

#Get all the variable and network ready before feeding/starting algorithm
#xavier init is vital for better and faster convergence

#Hyper-parameters and containers shared by the training loop
losses = []
k = 100
window = 2
LR = 0.001
embed_size = 30

#Tokenize the first 400 rows of the corpus
whole_corpus, vocab, uniq_words, words_to_ints, vocab_int_pair, ints_to_words, int_arr_of_vocab = preprocess(corpus[:400])

#Use the configured window (not a hard-coded 2) so the true pairs stay aligned with the
#false pairs, which the training loop samples with this same `window`
true_list = gen_true(int_arr_of_vocab, window)


fc_midl_word, fc_sur_word, criterion, optimizer = gen_model(uniq_words, embed_size, LR)
#Replace the default initial weights of both layers with Xavier-uniform values (in place)
torch.nn.init.xavier_uniform_(fc_midl_word.weight)
torch.nn.init.xavier_uniform_(fc_sur_word.weight)

Parameter containing:
tensor([[ 0.0263, -0.0342,  0.0287,  ...,  0.0177,  0.0549, -0.0181],
        [-0.0223,  0.0334,  0.0498,  ...,  0.0293,  0.0514,  0.0071],
        [-0.0428, -0.0257, -0.0249,  ..., -0.0329,  0.0011,  0.0004],
        ...,
        [-0.0533, -0.0244,  0.0492,  ...,  0.0383,  0.0178, -0.0486],
        [-0.0320,  0.0350,  0.0095,  ...,  0.0382, -0.0264, -0.0246],
        [ 0.0017,  0.0112, -0.0447,  ..., -0.0155, -0.0122, -0.0344]],
       requires_grad=True)

In [None]:
#Implement 


epochs = 200
print_every = 20
batch_size = 512


for epoch in range(epochs):

    #Get fresh joint list with different random false samples
    joint_list = gen_joint_list(true_list,int_arr_of_vocab, uniq_words, k, window )

    #Ceiling division: the trailing partial batch is included, but no empty batch is
    #produced when len(joint_list) is an exact multiple of batch_size
    #(an empty batch makes the mean-reduced BCE loss NaN)
    num_batches = (len(joint_list) + batch_size - 1) // batch_size

    #Get i.th batch from joint list and proceed forward, backward
    for i in range(num_batches):

        batch = gen_batch(joint_list, batch_size, i)
        mid_word_oh, sur_word_oh, labels = one_hot_auto_batchwise(batch, uniq_words)

        #the layers were moved to `device` when they were built; the inputs must follow
        mid_word_oh = mid_word_oh.to(device)
        sur_word_oh = sur_word_oh.to(device)
        labels = labels.to(device)

        z_midl = fc_midl_word(mid_word_oh)

        z_sur = fc_sur_word(sur_word_oh)

        #vector product of word as input and word as target, note the product is parallelized and not looped
        #after training product/score for true pairs will be high and low/neg for false pairs
        dot_inp_tar = torch.sum(torch.mul(z_midl, z_sur), dim =1).reshape(-1, 1)

        #sigmoid activation squashes the scores into the range (0, 1)
        sig_logits = nn.Sigmoid()(dot_inp_tar)

        optimizer.zero_grad()
        loss = criterion(sig_logits, labels.view(sig_logits.shape[0], 1))
        loss.backward()
        optimizer.step()


    #record and print the loss of the last batch of every print_every-th epoch
    if epoch % print_every == 0:

        losses.append(loss.item())
        print(loss.item())

0.16694580018520355
0.08674576878547668
0.04377136006951332
0.03595566004514694
0.026900755241513252
0.028999585658311844
0.02330128662288189
0.03742794692516327
0.022730093449354172
0.017504464834928513


In [None]:
#################################################################################################################
#Given the set of uniq_words used to train, the function finds the cosine similarity between the selected word and every word
#top_n words are returned, sim_score holds their cosine similarities (higher means more similar)
import torch
def find_dist(uniq_words, word, top_n):
    """Return the words whose embeddings are most similar to that of `word`.

    The embeddings are read from the module-level `fc_midl_word` layer: row i of
    its transposed weight matrix is used as the vector of uniq_words[i].

    Parameters
    ----------
    uniq_words : list of str
        Vocabulary in token-id order.
    word : str
        Query word.
    top_n : int
        Number of words to return; capped at the number of embedding rows.

    Returns
    -------
    list of str
        The top_n words by cosine similarity, most similar first. Their scores
        are printed as a side effect.

    Raises
    ------
    ValueError
        From list.index when `word` is not in uniq_words.
    """
    idx = uniq_words.index(word)

    #only reading the weights, so no autograd graph is needed
    with torch.no_grad():
        embeddings = fc_midl_word.weight.t()

        #one broadcast call compares the query row with every row at once
        similarities = nn.functional.cosine_similarity(
            embeddings[idx].unsqueeze(0), embeddings, dim=1
        )

        #topk raises when asked for more entries than exist
        top_n = min(top_n, similarities.shape[0])
        sim_score, indices = torch.topk(similarities.cpu(), top_n)

    similar_words = [uniq_words[i] for i in indices.tolist()]
    print(sim_score)
    return similar_words


In [None]:
#Display the vocabulary; the position of a word in this list is its token id
#NOTE(review): this prints the entire list; a slice such as uniq_words[:20] would keep the output short
uniq_words

['elaborated',
 'approach',
 'approximately',
 'non',
 'methanol',
 'generations',
 'spread',
 'stockholding',
 'tried',
 'dollars',
 'star',
 'half',
 'ncc',
 'cabinet',
 'rains',
 'enterprises',
 'bhanupratappur',
 'early',
 'improve',
 'msme',
 'enjoy',
 'advice',
 'caste',
 'breakthroughs',
 'act',
 'bakula',
 'high',
 'alliance',
 'kutumbakam',
 'market',
 'solving',
 'embrace',
 'time',
 'led',
 'globe',
 'operationalization',
 'initially',
 'noted',
 'eleventh',
 'wto',
 'protects',
 'useful',
 'jointly',
 'last',
 'upcoming',
 'sessions',
 'face',
 'foster',
 'globalization',
 'aviation',
 'lalchhahimi',
 'helps',
 'olympian',
 'memorandum',
 'financial',
 'srinagar',
 'conversion',
 'rich',
 'advanced',
 'expert',
 'wide',
 'statue',
 'active',
 'well',
 'forensic',
 'tough',
 'appropriate',
 'stone',
 'several',
 'raj',
 'difference',
 'train',
 'bhimrao',
 'fighter',
 'involvement',
 'girls',
 'decades',
 'primary',
 'working',
 'competitively',
 'objectives',
 'often',
 'ef

In [None]:
#Ten words whose embeddings are most similar to 'prime'
find_dist(uniq_words, 'prime', 10)

tensor([1.0000, 0.7824, 0.6576, 0.5876, 0.5871, 0.5843, 0.5785, 0.5697, 0.5670,
        0.5661])


['prime',
 'minister',
 'saluted',
 'maharashtra',
 'rich',
 'indulging',
 'spiritual',
 'bj',
 'occasion',
 'medical']

In [None]:
#Ten words whose embeddings are most similar to 'president'
find_dist(uniq_words, 'president', 10)

tensor([1.0000, 0.8288, 0.8194, 0.7697, 0.7630, 0.6891, 0.6882, 0.6512, 0.6435,
        0.6354])


['president',
 'tomorrow',
 'sochi',
 'former',
 'russian',
 'prior',
 'putin',
 'huge',
 'electricity',
 'asserted']

In [None]:
#Ten words whose embeddings are most similar to 'develop'
find_dist(uniq_words, 'develop', 10)

tensor([1.0000, 0.6639, 0.6576, 0.6506, 0.6452, 0.6436, 0.6428, 0.6399, 0.6345,
        0.6277])


['develop',
 'shining',
 'mechanism',
 'clean',
 'unnecessary',
 'tunisia',
 'executing',
 'successful',
 'upon',
 'newer']