# Lab 10: Word Embeddings
Thinking of using stuff from here
https://gist.github.com/mbednarski/da08eb297304f7a66a3840e857e060a0

conda install -c conda-forge tqdm

conda install -c conda-forge ipywidgets
conda install -c anaconda nltk 


## Janitorial Work

In [1]:
testCorpus = ["First of all, quit grinnin’ like an idiot. Indians ain’t supposed to smile like that. Get stoic.",
             "No. Like this. You gotta look mean, or people won’t respect you.",
              " people will run all over you if you don’t look mean.",
              "You gotta look like a warrior. You gotta look like you just came back from killing a buffalo.",
             "But our tribe never hunted buffalo. We were fishermen."
             "What? You wanna look like you just came back from catching a fish?",
             "This ain’t dances with salmon, you know. Thomas, you gotta look like a warrior."]
maxDocs = 500

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ob2285/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read in pubmed corpus into a text file

import glob
pubMedDataFolderPath = "data/pubMed_corpus/"
pubMedDataFiles = glob.glob(pubMedDataFolderPath + "*.txt")
pubMedCorpus = [""]*len(pubMedDataFiles)
for idx, pubMedDataPath in enumerate(pubMedDataFiles):
    with open(pubMedDataPath, "r") as pubMedFile:
        text = pubMedFile.read().strip()
        pubMedCorpus[idx] = text
pubMedCorpus = pubMedCorpus[0:maxDocs]
print("{} pub med abstracts".format(len(pubMedCorpus)))

500 pub med abstracts


In [4]:
# Read in the ap corpus
apTextFile = "data/ap.txt"
apCorpus = []
readText = False
with open(apTextFile) as apDataFile:
    for line in apDataFile:
        if readText:
            apCorpus.append(line.strip())
            readText = False
        if line == "<TEXT>\n":
            readText = True
apCorpus = apCorpus[0:maxDocs]
print("{} ap articles".format(len(apCorpus)))

500 ap articles


In [5]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
def removePunctuation(myStr):
    excludedCharacters = string.punctuation + "’"
    newStr = "".join(char for char in myStr if char not in excludedCharacters)
    return(newStr)
def removeStopWords(tokenList):
    newTokenList = [tok for tok in tokenList if tok not in stopwords.words('english')]
    return(newTokenList)
def tokenize_corpus(corpus):
    tokens = [removeStopWords(removePunctuation(x).lower().split()) for x in corpus]
    return tokens

apCorpusTokenized = tokenize_corpus(apCorpus)
pubMedCorpusTokenized = tokenize_corpus(pubMedCorpus)
testCorpusTokenized = tokenize_corpus(testCorpus)

[nltk_data] Downloading package stopwords to /home/ob2285/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import time
from tqdm import tqdm, tqdm_notebook
from collections import Counter

minVocabOccurence = 5

def extractVocabMappers(tokenizedCorpus, vocabSizeMax = None, minVocabOccurence = 0):
    UNK = "<UNK>"
    flattenedCorpus = [item for sublist in tokenizedCorpus for item in sublist]
    wordCounts = Counter(flattenedCorpus).most_common()
    wordCounts = [(w, c) for w,c in wordCounts if c > minVocabOccurence]
#     wordCounts = wordCounts.most_common(vocabSizeMax)
    vocabulary = [word for word, count in wordCounts]
    
    # below is more readable but significantly slower code
    if False:
        vocabulary = []
        for sentence in tqdm(tokenizedCorpus):
            for token in sentence:
                if token not in vocabulary:
                    vocabulary.append(token)
    vocabulary.append(UNK)
    print("Vocab size: {}".format(len(vocabulary)))
    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
    newTokenizedCorpus = []# all words missing from vocab replaced with <UNK>
    for doc in tokenizedCorpus:
        newDoc = [word if word in word2idx else UNK for word in doc]
        newTokenizedCorpus.append(newDoc)
    return(word2idx, idx2word, wordCounts, newTokenizedCorpus)

start = time.time()
print("Building ap corpus vocabulary")
word2Idx_ap, idx2Word_ap, vocabCount_ap, finalTokenizedCorpus_ap = extractVocabMappers(apCorpusTokenized,
#                                                                                        vocabSizeMax = maxVocabSize,
                                                                                      minVocabOccurence = minVocabOccurence)
print("ap data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building pubMed corpus vocabulary")
word2Idx_pubMed, idx2Word_pubMed, vocabCount_pubMed, finalTokenizedCorpus_pubMed = extractVocabMappers(pubMedCorpusTokenized,
#                                                                                                        vocabSizeMax = maxVocabSize,
                                                                                                      minVocabOccurence = minVocabOccurence)
print("pubmed data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building test corpus vocabulary")
word2Idx_test, idx2Word_test, vocabCount_test, finalTokenizedCorpus_test = extractVocabMappers(testCorpusTokenized,
#                                                                                                vocabSizeMax = maxVocabSize,
                                                                                              minVocabOccurence = 0)
print("test data tokenized in {} seconds".format(time.time() - start))

Building ap corpus vocabulary
Vocab size: 4123
ap data tokenized in 0.051291465759277344 seconds

Building pubMed corpus vocabulary
Vocab size: 2333
pubmed data tokenized in 0.026688814163208008 seconds

Building test corpus vocabulary
Vocab size: 36
test data tokenized in 0.0004177093505859375 seconds


## Word2Vec Implementation

In [8]:
import numpy as np
import torch
from torch import nn
import random

In [291]:
def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 2
    idxPairs = []
    # for each sentence
    for sentence in tokenizedCorpus:
#         indices = [word2Idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs) # it will be useful to have this as numpy array
    return(idxPairs)


def generateWordSamplingProb(vocabCount, word2Idx):
    wordSampleProbs = [0.0]*len(vocabCount)
    numWords = np.sum([count**0.75 for word, count in vocabCount])
    for idx in range(len(vocabCount)):
        w,c = vocabCount[idx]
        wordSampleProbs[word2Idx[w]] = (c**0.75)/(numWords)
        
        
        
    wordSampleProbs = []
    numWords = np.sum([count for word, count in vocabCount])
    for w,c in vocabCount:
#         w,c = vocabCount[idx]
        wordSampleProbs.extend([word2Idx[w]] * int(((c/numWords)**0.75)/0.001))
    return(wordSampleProbs)
    
class SkipGram(nn.Module):
    def __init__(self, vocabSize, embedSize, vocabCount, word2Idx):
        super(SkipGram, self).__init__()
        self.vocabSize = vocabSize
        self.word2Idx = word2Idx
#         self.centerEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.contextEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.embeddings = nn.Embedding(vocab_size, embd_size)
        
        self.centerEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                     embedSize).float(), requires_grad=True)
        self.contextEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                      embedSize).float(), requires_grad=True)
        
#         initrange = (2.0 / (vocabSize + embedSize)) ** 0.5  # Xavier init
        nn.init.xavier_uniform_(self.contextEmbeddings)
        nn.init.xavier_uniform_(self.centerEmbeddings)
        
        self.wordSampleProbs = generateWordSamplingProb(vocabCount, word2Idx)
        self.logSigmoid = nn.LogSigmoid()
#         self.paramList = nn.ModuleList([self.centerEmbeddings, self.contextEmbeddings] )
    def getNegSample(self, k, centerWord):
        vocabSizeWithoutUnk = self.vocabSize - 1
#         negSample = np.random.choice(vocabSizeWithoutUnk,
#                                      size = k, replace = True, p = self.wordSampleProbs)
        negSample = random.sample(self.wordSampleProbs, k)
        while self.word2Idx[centerWord] in negSample:
#             negSample = np.random.choice(vocabSizeWithoutUnk,
#                                          size = k, replace = True, p = self.wordSampleProbs)
            negSample = random.sample(self.wordSampleProbs, k)
        return(negSample)
    def forward(self, center, context, negSampleIndices = None):
#         focus = torch.autograd.Variable(torch.LongTensor([0]))
#         context = torch.autograd.Variable(torch.LongTensor([0]))
#         allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))


#         embedCenter = self.centerEmbeddings(center).view((1, -1))
#         embedContext = self.contextEmbeddings(context).view((1, -1))
# #         print(allEmbeddingIdxs)
#         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
#         num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#         denom = torch.exp(torch.mm(allContextEmbeddings, torch.t(embedCenter))).sum()
#         logProb = torch.log(num/denom)
        embedCenter = self.centerEmbeddings[center].view((1, -1))
        embedContext = self.contextEmbeddings[context].view((1, -1))       
        if negSampleIndices is not None:
#             print("hey")
#             posVal = self.logSigmoid(torch.mm(embedContext, torch.t(embedCenter)))
#             print(posVal)
            posVal = self.logSigmoid(torch.sum(embedContext * embedCenter))
#             print(posVal)
#             start = time.time()
#             for i in range(1000):
            negVal = torch.mm(self.contextEmbeddings[negSampleIndices], torch.t(embedCenter))
            negVal = self.logSigmoid(-torch.sum(negVal))
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.mm(self.contextEmbeddings[negSampleIndices], torch.t(embedCenter)).shape)
#             1/0
            logProb = posVal + negVal
        else:
#             allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))



    #         print(allEmbeddingIdxs)
    #         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
            num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#             start = time.time()
#             for i in range(1000):
            denom = torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).sum()
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).shape)
#             1/0
            logProb = torch.log(num/denom)
#         print(logProb)
        return(logProb)


def train_skipgram(embeddingSize, trainingData, vocabCount, word2Idx, idx2Word, k, referenceWords):
    print("training on {} observations".format(len(trainingData)))
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocabSize = len(word2Idx), embedSize = embeddingSize,
                     vocabCount = vocabCount, word2Idx = word2Idx)
#     print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    listNearestWords(model = model, idx2Word = idx2Word,
     referenceWords = referenceWords, topN = 5)
    batchSize = 32
    for epoch in tqdm_notebook(range(n_epoch), position = 0):
#         listNearestWords(model = model, idx2Word = idx2Word,
#                  referenceWords = referenceWords, topN = 5)
        total_loss = .0
        avgLoss = 0.0
        iteration = 0
#         for step in range(0, len(trainingData), batchSize):
        for in_w, out_w in tqdm_notebook(trainingData, position = 1):
#             endIdx = np.min((i+batchSize), len(trainingData))
#             myBatch = trainingData[i:(i+batchSize)]
            if k is not None:
                negSamples = model.getNegSample(k = k, centerWord = in_w)
            else:
                negSamples = None
#             print("neg samples found")
#             print(negSamples)
            in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
            out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             if in_w in word2Idx:
#                 in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
#             else:
#                 in_w_var = word2Idx["<UNK>"]
#             if out_w in word2Idx:
#                 out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             else:
#                 out_w_var = word2Idx["<UNK>"]
            
            model.zero_grad()
            log_probs = model(in_w_var, out_w_var, negSampleIndices = negSamples)
            loss = -log_probs#loss_fn(log_probs[0], torch.autograd.Variable(torch.Tensor([1])))
            
            loss.backward()
            optimizer.step()

            total_loss += loss.data.numpy()
            avgLoss += loss.data.numpy()
            iteration += 1
            if iteration % 10000 == 0:
                avgLoss = total_loss/(iteration)
                print("avg loss: {}".format(avgLoss))
                avgLoss = 0.0
            if iteration % 20000 == 0:
                listNearestWords(model = model, idx2Word = idx2Word,
                 referenceWords = referenceWords, topN = 5)
        losses.append(total_loss)    
        print(f'Loss at epoch {epoch}: {total_loss/len(trainingData)}')
        listNearestWords(model = model, idx2Word = idx2Word,
                     referenceWords = referenceWords, topN = 5)
    return(model, losses)

In [13]:
##### BATCH VERSION ######


def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 5
    idxPairs = []
    # for each sentence
    for sentence in tokenizedCorpus:
#         indices = [word2Idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs) # it will be useful to have this as numpy array
    return(idxPairs)


def generateWordSamplingProb(vocabCount, word2Idx):
    wordSampleProbs = [0.0]*len(vocabCount)
    numWords = np.sum([count**0.75 for word, count in vocabCount])
    for idx in range(len(vocabCount)):
        w,c = vocabCount[idx]
        wordSampleProbs[word2Idx[w]] = (c**0.75)/(numWords)
        
        
        
    wordSampleProbs = []
    numWords = np.sum([count for word, count in vocabCount])
    for w,c in vocabCount:
#         w,c = vocabCount[idx]
        wordSampleProbs.extend([word2Idx[w]] * int(((c/numWords)**0.75)/0.001))
    return(wordSampleProbs)
    
class SkipGram(nn.Module):
    def __init__(self, vocabSize, embedSize, vocabCount, word2Idx):
        super(SkipGram, self).__init__()
        self.vocabSize = vocabSize
        self.word2Idx = word2Idx
#         self.centerEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.contextEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.embeddings = nn.Embedding(vocab_size, embd_size)
        
        self.centerEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                     embedSize).float(), requires_grad=True)
        self.contextEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                      embedSize).float(), requires_grad=True)
        
#         initrange = (2.0 / (vocabSize + embedSize)) ** 0.5  # Xavier init
        nn.init.xavier_uniform_(self.contextEmbeddings)
        nn.init.xavier_uniform_(self.centerEmbeddings)
        
        self.wordSampleProbs = generateWordSamplingProb(vocabCount, word2Idx)
        self.logSigmoid = nn.LogSigmoid()
#         self.paramList = nn.ModuleList([self.centerEmbeddings, self.contextEmbeddings] )
    def getNegSample(self, k, centerWords):
        vocabSizeWithoutUnk = self.vocabSize - 1
#         negSample = np.random.choice(vocabSizeWithoutUnk,
#                                      size = k, replace = True, p = self.wordSampleProbs)
        negSamples = []
        for centerWord in centerWords:
            negSample = random.sample(self.wordSampleProbs, k)
            while self.word2Idx[centerWord] in negSample:
                negSample = random.sample(self.wordSampleProbs, k)
            negSamples.append(negSample)
        return(negSamples)
    def forward(self, center, context, negSampleIndices = None):
#         focus = torch.autograd.Variable(torch.LongTensor([0]))
#         context = torch.autograd.Variable(torch.LongTensor([0]))
#         allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))


#         embedCenter = self.centerEmbeddings(center).view((1, -1))
#         embedContext = self.contextEmbeddings(context).view((1, -1))
# #         print(allEmbeddingIdxs)
#         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
#         num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#         denom = torch.exp(torch.mm(allContextEmbeddings, torch.t(embedCenter))).sum()
#         logProb = torch.log(num/denom)
#         print("center indices\n",center)
#         print("context indices\n",context)
        embedCenter = self.centerEmbeddings[center]#.view((1, -1))
        embedContext = self.contextEmbeddings[context]#.view((1, -1))       
        if negSampleIndices is not None:
#             print("hey")
#             posVal = self.logSigmoid(torch.mm(embedContext, torch.t(embedCenter)))
#             print(posVal)
            posVal = self.logSigmoid(torch.sum(embedContext * embedCenter, dim = 1)).squeeze()
#             print(posVal)
#             start = time.time()
#             for i in range(1000):
#             print(torch.autograd.Variable(torch.LongTensor(negSampleIndices)))
            negSampleIndices = torch.autograd.Variable(torch.LongTensor(negSampleIndices))
#             print(self.contextEmbeddings[negSampleIndices].shape)
#             print(embedCenter.shape)
            negVal = torch.bmm(self.contextEmbeddings[negSampleIndices], embedCenter.unsqueeze(2)).squeeze(2)
            negVal = self.logSigmoid(-torch.sum(negVal, dim = 1)).squeeze()
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.mm(self.contextEmbeddings[negSampleIndices], torch.t(embedCenter)).shape)
#             1/0
#             print(posVal.shape)
#             print(negVal.shape)
#             1/0
            logProb = -(posVal + negVal).mean()
        else:
#             allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))



    #         print(allEmbeddingIdxs)
    #         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
            num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#             start = time.time()
#             for i in range(1000):
            denom = torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).sum()
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).shape)
#             1/0
            logProb = torch.log(num/denom)
#         print(logProb)
        return(logProb)


def train_skipgram(embeddingSize, trainingData, vocabCount, word2Idx, idx2Word, k, referenceWords):
    print("training on {} observations".format(len(trainingData)))
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocabSize = len(word2Idx), embedSize = embeddingSize,
                     vocabCount = vocabCount, word2Idx = word2Idx)
#     print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    listNearestWords(model = model, idx2Word = idx2Word,
     referenceWords = referenceWords, topN = 5)
    batchSize = 512
    for epoch in tqdm_notebook(range(n_epoch), position = 0):
#         listNearestWords(model = model, idx2Word = idx2Word,
#                  referenceWords = referenceWords, topN = 5)
        total_loss = .0
        avgLoss = 0.0
        iteration = 0
        for step in tqdm_notebook(range(0, len(trainingData), batchSize), position = 1):
#         for in_w, out_w in tqdm_notebook(trainingData, position = 1):
            endIdx = np.min([(step+batchSize), len(trainingData)])
            myBatch = trainingData[step:(step+batchSize)]
#             print(myBatch)
            centerWords = [elem[0] for elem in myBatch]
            contextWords = [elem[1] for elem in myBatch]
#             print(centerWords)
            if k is not None:
                negSamples = model.getNegSample(k = k, centerWords = centerWords)
            else:
                negSamples = None
#             print("neg samples found")
#             print(negSamples)
            centerIDs = [word2Idx[idx] for idx in centerWords]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
            contextIDs = [word2Idx[idx] for idx in contextWords]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             if in_w in word2Idx:
#                 in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
#             else:
#                 in_w_var = word2Idx["<UNK>"]
#             if out_w in word2Idx:
#                 out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             else:
#                 out_w_var = word2Idx["<UNK>"]
            
            model.zero_grad()
            loss = model(centerIDs, contextIDs, negSampleIndices = negSamples)
#             print(loss)
#             print(model.centerEmbeddings.mean())
#             loss = -log_probs#loss_fn(log_probs[0], torch.autograd.Variable(torch.Tensor([1])))
            
            loss.backward()
            optimizer.step()

            total_loss += loss.data.numpy()
            avgLoss += loss.data.numpy()
            iteration += 1
            if iteration % 500 == 0:
                avgLoss = avgLoss/(500)
                print("avg loss: {}".format(avgLoss))
#                 avgLoss = 0.0
#             if iteration % 2000 == 0:
#                 listNearestWords(model = model, idx2Word = idx2Word,
#                  referenceWords = referenceWords, topN = 5)
        losses.append(total_loss)    
        print(f'Loss at epoch {epoch}: {total_loss/iteration}')
        listNearestWords(model = model, idx2Word = idx2Word,
                     referenceWords = referenceWords, topN = 5)
    return(model, losses)

In [10]:
from scipy.spatial.distance import cdist
def listNearestWords(model, idx2Word, referenceWords, topN):
    assert len(idx2Word) == len(model.word2Idx), "Possibly passed in two different vocabularies"
    embeddings = model.centerEmbeddings.data.numpy()
#     embeddings = model.contextEmbeddings.data.numpy()
    distMat = cdist(embeddings, embeddings, metric = "cosine")
    for word in referenceWords:
        wordIdx = model.word2Idx[word]
#         print(np.argmin(distMat[wordIdx,:]))
        closestIndices = np.argsort(distMat[wordIdx,:])[0:topN]
        closestWords = [(idx2Word[idx], distMat[wordIdx, idx]) for idx in closestIndices]
        for elem in closestWords:
            print(elem)
        print("*"*50 + "\n")

In [11]:
# embd_size = 100
# learning_rate = 0.001
# n_epoch = 60
# idxPairsTest = generateObservations(tokenizedCorpus = finalTokenizedCorpus_test, word2Idx = word2Idx_test)
# sg_model, sg_losses = train_skipgram(embeddingSize = 5, trainingData = idxPairsTest, vocabCount = vocabCount_test,
#                                      word2Idx = word2Idx_test, k = None)

In [None]:
embeddingSize = 50
learning_rate = 0.1
n_epoch = 60
idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model_ap, sg_losses_ap = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP,
                                     vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, idx2Word = idx2Word_ap, k = 20,
                                          referenceWords = ["bush", "soviet", "president", "military", "american"])

training on 1195636 observations
('bush', 0.0)
('ny', 0.5722449167339645)
('degrees', 0.58268658862842)
('fitzwater', 0.59456595395935)
('review', 0.6011509501625227)
**************************************************

('soviet', 0.0)
('mind', 0.5187260751221663)
('trump', 0.5457773526336366)
('chemicals', 0.5646036039292068)
('shortly', 0.5694139908820548)
**************************************************

('president', 1.1102230246251565e-16)
('service', 0.5355918862435511)
('peaceful', 0.5405652230804536)
('keeping', 0.5586084850826956)
('scheme', 0.5725764880682768)
**************************************************

('military', 1.1102230246251565e-16)
('44', 0.5150267718218834)
('ive', 0.5166936271712037)
('californias', 0.5347822524786181)
('cavazos', 0.5514847430360431)
**************************************************

('american', 0.0)
('relative', 0.463291456093445)
('apartment', 0.4872718948682221)
('search', 0.5156335821608905)
('talks', 0.5300147132595971)
*************

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2336), HTML(value='')))

avg loss: 1.3492616987228394
avg loss: 1.21932355284214
avg loss: 1.1854211774611376
avg loss: 1.161834837811618
Loss at epoch 0: 1.2151192926788983
('bush', 0.0)
('said', 0.1647495999569355)
('also', 0.1761844887434828)
('<UNK>', 0.18910804454657448)
('percent', 0.1922690944961828)
**************************************************

('soviet', 0.0)
('<UNK>', 0.06951568610943837)
('said', 0.06973861499696665)
('would', 0.08786484580133114)
('years', 0.11004755355922058)
**************************************************

('president', 0.0)
('would', 0.07809489798761304)
('<UNK>', 0.08611170296054982)
('said', 0.09290938252018188)
('new', 0.10922523419730479)
**************************************************

('military', 0.0)
('would', 0.2330165741270377)
('percent', 0.23949223567287792)
('also', 0.24962935078846005)
('years', 0.2715237974312834)
**************************************************

('american', 0.0)
('<UNK>', 0.136442419580748)
('said', 0.1470133354347164)
('people', 0

HBox(children=(IntProgress(value=0, max=2336), HTML(value='')))

avg loss: 1.11722132563591
avg loss: 1.0835749207139016


In [None]:
embeddingSize = 50
learning_rate = 0.001
n_epoch = 3
idxPairsPubMed = generateObservations(tokenizedCorpus = finalTokenizedCorpus_pubMed, word2Idx = word2Idx_pubMed)
sg_model_pubMed, sg_losses_pubMed = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsPubMed,
                                     vocabCount = vocabCount_pubMed,
                                     word2Idx = word2Idx_pubMed, idx2Word = idx2Word_pubMed, k = 15,
                                                  referenceWords = ["clinical", "obesity", "microbial", "microbiome"])

## Exploring Word Embeddings

In [48]:
from scipy.spatial.distance import cdist
def listNearestWords(model, idx2Word, referenceWords, topN):
    assert len(idx2Word) == len(model.word2Idx), "Possibly passed in two different vocabularies"
    embeddings = model.centerEmbeddings.data.numpy()
    distMat = cdist(embeddings, embeddings, metric = "cosine")
    for word in referenceWords:
        wordIdx = model.word2Idx[word]
#         print(np.argmin(distMat[wordIdx,:]))
        closestIndices = np.argsort(distMat[wordIdx,:])[0:topN]
        closestWords = [(idx2Word[idx], distMat[wordIdx, idx]) for idx in closestIndices]
        for elem in closestWords:
            print(elem)
        print("*"*50 + "\n")

In [49]:
listNearestWords(model = sg_model_ap, idx2Word = idx2Word_ap,
                 referenceWords = ["bush", "soviet", "stock", "dukakis"], topN = 10)

('bush', 2.220446049250313e-16)
('would', 0.245172107921238)
('one', 0.2542772042354078)
('last', 0.26639998381930596)
('<UNK>', 0.2878025745606383)
('said', 0.2929848618692378)
('government', 0.3014082414158188)
('president', 0.3090025777889278)
('also', 0.31906111583316044)
('two', 0.32561755407928095)
**************************************************

('soviet', 0.0)
('also', 0.26371813187150006)
('us', 0.2754812328259487)
('one', 0.2757421412874528)
('government', 0.28862726565176866)
('said', 0.2917928720369569)
('people', 0.32292569939644367)
('would', 0.3236237758736388)
('could', 0.32972287786631693)
('two', 0.33019233004600756)
**************************************************

('stock', 0.0)
('jacksons', 0.5139301958926246)
('mention', 0.5246623586362416)
('library', 0.5472043281618073)
('produce', 0.5616844124533857)
('crisis', 0.5690514279470098)
('expectations', 0.5712754139059562)
('midday', 0.5726643036563688)
('case', 0.5733000645351246)
('angolan', 0.5740486019632982

In [30]:
listNearestWords(model = sg_model_pubMed, idx2Word = idx2Word_pubMed,
                 referenceWords = ["cancer", "drug", "microbiome"], topN = 10)

('cancer', 1.1102230246251565e-16)
('considerations', 0.5357610920425107)
('mg2deficient', 0.600471133270606)
('day', 0.6008124548358114)
('numerous', 0.6035130472995224)
('classifier', 0.6086428712998291)
('ratio', 0.6112479094677603)
('kappa', 0.611541393621376)
('metagenome', 0.6140737843923152)
('manuscript', 0.6154556158382964)
**************************************************

('drug', 2.220446049250313e-16)
('complexity', 0.5405080936495106)
('59', 0.5560538680379796)
('workers', 0.5597953026126847)
('found', 0.5682576198020208)
('regions', 0.5741363153547547)
('clinical', 0.5891034315026564)
('created', 0.6040641849145176)
('templates', 0.6098548625635937)
('true', 0.610169663686098)
**************************************************

('microbiome', 1.1102230246251565e-16)
('therefore', 0.4981334883145526)
('delivery', 0.5116358247873145)
('eating', 0.5532279306206287)
('identification', 0.5647300339111128)
('experts', 0.5667386588455259)
('middleaged', 0.5876831614139415)
('8

In [87]:
"cancer" in sg_model.word2Idx

True

In [137]:
sg_model_.word2Idx

{'said': 0,
 'percent': 1,
 'year': 2,
 'new': 3,
 'us': 4,
 'years': 5,
 'people': 6,
 'one': 7,
 'would': 8,
 'two': 9,
 'also': 10,
 'soviet': 11,
 'president': 12,
 'police': 13,
 'last': 14,
 'oil': 15,
 'government': 16,
 'bank': 17,
 'officials': 18,
 'could': 19,
 'ago': 20,
 'first': 21,
 'national': 22,
 'state': 23,
 'million': 24,
 'states': 25,
 'prices': 26,
 'three': 27,
 'official': 28,
 'reported': 29,
 'back': 30,
 'monday': 31,
 'dukakis': 32,
 'rose': 33,
 'rate': 34,
 'bush': 35,
 'fire': 36,
 'war': 37,
 'get': 38,
 'military': 39,
 'economic': 40,
 'thursday': 41,
 'company': 42,
 'time': 43,
 'made': 44,
 'saying': 45,
 'today': 46,
 'american': 47,
 'since': 48,
 'roberts': 49,
 'dont': 50,
 'federal': 51,
 'told': 52,
 'mrs': 53,
 'noriega': 54,
 'forces': 55,
 'months': 56,
 'may': 57,
 'rating': 58,
 'good': 59,
 'friday': 60,
 'long': 61,
 'day': 62,
 'top': 63,
 'united': 64,
 'saudi': 65,
 'use': 66,
 'economy': 67,
 'campaign': 68,
 'gorbachev': 69,
 'ki

## How Domains Affect Word Embeddings

In [118]:
w1 = nn.Parameter(torch.randn(1000, 100).float(), requires_grad=True)
w2 = nn.Parameter(torch.randn(1000,  100).float(), requires_grad=True)
nIters = 1000
negSampleSize = 1000
start = time.time()
for i in range(1000):
    temp = torch.mm(w2[0:negSampleSize], torch.t(w2[0].view(1, -1)))
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



w1 = w1.data.numpy()
w2 = w2.data.numpy()
start = time.time()
for i in range(1000):
    temp = np.matmul(w2[0:negSampleSize], w1[0])
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



avg time: 5.7178974151611327e-05
torch.Size([1000, 1])
avg time: 2.7730941772460937e-05
(1000,)


In [139]:
w1 = nn.Parameter(torch.randn(1000, 100).float(), requires_grad=True)
w2 = nn.Parameter(torch.randn(1000,  100).float(), requires_grad=True)
nIters = 1000
negSampleSize = 15
start = time.time()
for i in range(1000):
    temp = torch.mm(w2[0:negSampleSize], torch.t(w2[0].view(1, -1)))
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



# w1 = w1.data.numpy()
# w2 = w2.data.numpy()
# start = time.time()
# for i in range(1000):
#     temp = np.matmul(w2[0:negSampleSize], w1[0])
# print("avg time: {}".format((time.time() - start)/nIters))
# print(temp.shape)

avg time: 1.8839120864868163e-05
torch.Size([15, 1])


In [111]:
w1 = w1.data.numpy()
w2 = w2.data.numpy()

In [114]:
np.matmul(w2[0:15], w1[0]).shape

(15,)

In [126]:
finalTokenizedCorpus_test

[['first',
  'quit',
  'grinnin',
  'like',
  'idiot',
  'indians',
  'aint',
  'supposed',
  'smile',
  'like',
  'get',
  'stoic'],
 ['like', 'gotta', 'look', 'mean', 'people', 'wont', 'respect'],
 ['people', 'run', 'dont', 'look', 'mean'],
 ['gotta',
  'look',
  'like',
  'warrior',
  'gotta',
  'look',
  'like',
  'came',
  'back',
  'killing',
  'buffalo'],
 ['tribe',
  'never',
  'hunted',
  'buffalo',
  'fishermenwhat',
  'wanna',
  'look',
  'like',
  'came',
  'back',
  'catching',
  'fish'],
 ['aint',
  'dances',
  'salmon',
  'know',
  'thomas',
  'gotta',
  'look',
  'like',
  'warrior']]

In [None]:
def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 2
    idxPairs = []
    # for each sentence
    for sentence in tokenizedCorpus:
#         indices = [word2Idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs) # it will be useful to have this as numpy array
    return(idxPairs)


def generateWordSamplingProb(vocabCount, word2Idx):
    wordSampleProbs = [0.0]*len(vocabCount)
    numWords = np.sum([count**0.75 for word, count in vocabCount])
    for idx in range(len(vocabCount)):
        w,c = vocabCount[idx]
        wordSampleProbs[word2Idx[w]] = (c**0.75)/(numWords)
        
        
        
    wordSampleProbs = []
    numWords = np.sum([count for word, count in vocabCount])
    for w,c in vocabCount:
#         w,c = vocabCount[idx]
        wordSampleProbs.extend([word2Idx[w]] * int(((c/numWords)**0.75)/0.001))
    return(wordSampleProbs)
    
class SkipGram(nn.Module):
    def __init__(self, vocabSize, embedSize, vocabCount, word2Idx):
        super(SkipGram, self).__init__()
        self.vocabSize = vocabSize
        self.word2Idx = word2Idx
#         self.centerEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.contextEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.embeddings = nn.Embedding(vocab_size, embd_size)
        
        self.centerEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                     embedSize).float(), requires_grad=True)
        self.contextEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                      embedSize).float(), requires_grad=True)
        
#         initrange = (2.0 / (vocabSize + embedSize)) ** 0.5  # Xavier init
        nn.init.xavier_uniform_(self.contextEmbeddings)
        nn.init.xavier_uniform_(self.centerEmbeddings)
        
        self.wordSampleProbs = generateWordSamplingProb(vocabCount, word2Idx)
        self.logSigmoid = nn.LogSigmoid()
#         self.paramList = nn.ModuleList([self.centerEmbeddings, self.contextEmbeddings] )
    def getNegSample(self, k, centerWords):
        vocabSizeWithoutUnk = self.vocabSize - 1
#         negSample = np.random.choice(vocabSizeWithoutUnk,
#                                      size = k, replace = True, p = self.wordSampleProbs)
        negSamples = []
        for centerWord in centerWords:
        negSample = random.sample(self.wordSampleProbs, k)
            while self.word2Idx[centerWord] in negSample:
    #             negSample = np.random.choice(vocabSizeWithoutUnk,
    #                                          size = k, replace = True, p = self.wordSampleProbs)
                negSample = random.sample(self.wordSampleProbs, k)
                negSamples.append(negSample)
        return(negSamples)
    def forward(self, center, context, negSampleIndices = None):
#         focus = torch.autograd.Variable(torch.LongTensor([0]))
#         context = torch.autograd.Variable(torch.LongTensor([0]))
#         allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))


#         embedCenter = self.centerEmbeddings(center).view((1, -1))
#         embedContext = self.contextEmbeddings(context).view((1, -1))
# #         print(allEmbeddingIdxs)
#         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
#         num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#         denom = torch.exp(torch.mm(allContextEmbeddings, torch.t(embedCenter))).sum()
#         logProb = torch.log(num/denom)
        embedCenter = self.centerEmbeddings[center].view((1, -1))
        embedContext = self.contextEmbeddings[context].view((1, -1))       
        if negSampleIndices is not None:
#             print("hey")
#             posVal = self.logSigmoid(torch.mm(embedContext, torch.t(embedCenter)))
#             print(posVal)
            posVal = self.logSigmoid(torch.sum(embedContext * embedCenter, dim = 1)).squeeze()
#             print(posVal)
#             start = time.time()
#             for i in range(1000):
            negVal = torch.bmm(self.contextEmbeddings[negSampleIndices], embedCenter.unsqueeze(2)).squeeze(2)
            negVal = self.logSigmoid(-torch.sum(negVal, dim = 1)).squeeze()
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.mm(self.contextEmbeddings[negSampleIndices], torch.t(embedCenter)).shape)
#             1/0
            logProb = (posVal + negVal).mean()
        else:
#             allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))



    #         print(allEmbeddingIdxs)
    #         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
            num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#             start = time.time()
#             for i in range(1000):
            denom = torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).sum()
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).shape)
#             1/0
            logProb = torch.log(num/denom)
#         print(logProb)
        return(logProb)


def train_skipgram(embeddingSize, trainingData, vocabCount, word2Idx, idx2Word, k, referenceWords):
    print("training on {} observations".format(len(trainingData)))
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocabSize = len(word2Idx), embedSize = embeddingSize,
                     vocabCount = vocabCount, word2Idx = word2Idx)
#     print(model)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    listNearestWords(model = model, idx2Word = idx2Word,
     referenceWords = referenceWords, topN = 5)
    batchSize = 32
    for epoch in tqdm_notebook(range(n_epoch), position = 0):
#         listNearestWords(model = model, idx2Word = idx2Word,
#                  referenceWords = referenceWords, topN = 5)
        total_loss = .0
        avgLoss = 0.0
        iteration = 0
        for step in range(0, len(trainingData), batchSize):
#         for in_w, out_w in tqdm_notebook(trainingData, position = 1):
            endIdx = np.min((i+batchSize), len(trainingData))
            myBatch = trainingData[i:(i+batchSize)]
            centerWords = [centerword for centerWord, _ in myBatch]
            contextWords = [contextWord for _, contextWord in myBatch]
            if k is not None:
                negSamples = model.getNegSample(k = k, centerWords = centerWords)
            else:
                negSamples = None
#             print("neg samples found")
#             print(negSamples)
            centerIDs = [word2Idx[idx] for idx in centerWords]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
            contextIDs = [word2Idx[idx] for idx in contextWords]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             if in_w in word2Idx:
#                 in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
#             else:
#                 in_w_var = word2Idx["<UNK>"]
#             if out_w in word2Idx:
#                 out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             else:
#                 out_w_var = word2Idx["<UNK>"]
            
            model.zero_grad()
            log_probs = model(centerIDs, contextIDs, negSampleIndices = negSamples)
            loss = -log_probs#loss_fn(log_probs[0], torch.autograd.Variable(torch.Tensor([1])))
            
            loss.backward()
            optimizer.step()

            total_loss += loss.data.numpy()
            avgLoss += loss.data.numpy()
            iteration += 1
            if iteration % 10000 == 0:
                avgLoss = total_loss/(iteration)
                print("avg loss: {}".format(avgLoss))
                avgLoss = 0.0
            if iteration % 20000 == 0:
                listNearestWords(model = model, idx2Word = idx2Word,
                 referenceWords = referenceWords, topN = 5)
        losses.append(total_loss)    
        print(f'Loss at epoch {epoch}: {total_loss/len(trainingData)}')
        listNearestWords(model = model, idx2Word = idx2Word,
                     referenceWords = referenceWords, topN = 5)
    return(model, losses)