# Lab 10: Word Embeddings
Thinking of using stuff from here
https://gist.github.com/mbednarski/da08eb297304f7a66a3840e857e060a0

conda install -c conda-forge tqdm

conda install -c conda-forge ipywidgets
conda install -c anaconda nltk 


## Janitorial Work

In [57]:
testCorpus = ["First of all, quit grinnin’ like an idiot. Indians ain’t supposed to smile like that. Get stoic.",
             "No. Like this. You gotta look mean, or people won’t respect you.",
              " people will run all over you if you don’t look mean.",
              "You gotta look like a warrior. You gotta look like you just came back from killing a buffalo.",
             "But our tribe never hunted buffalo. We were fishermen."
             "What? You wanna look like you just came back from catching a fish?",
             "This ain’t dances with salmon, you know. Thomas, you gotta look like a warrior."]
maxDocs = 1500

In [58]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ob2285/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [59]:
# Read in pubmed corpus into a text file

import glob
pubMedDataFolderPath = "data/pubMed_corpus/"
pubMedDataFiles = glob.glob(pubMedDataFolderPath + "*.txt")
pubMedCorpus = [""]*len(pubMedDataFiles)
for idx, pubMedDataPath in enumerate(pubMedDataFiles):
    with open(pubMedDataPath, "r") as pubMedFile:
        text = pubMedFile.read().strip()
        pubMedCorpus[idx] = text
pubMedCorpus = pubMedCorpus[0:maxDocs]
print("{} pub med abstracts".format(len(pubMedCorpus)))

1500 pub med abstracts


In [60]:
# Read in the ap corpus
apTextFile = "data/ap.txt"
apCorpus = []
readText = False
with open(apTextFile) as apDataFile:
    for line in apDataFile:
        if readText:
            apCorpus.append(line.strip())
            readText = False
        if line == "<TEXT>\n":
            readText = True
apCorpus = apCorpus[0:maxDocs]
print("{} ap articles".format(len(apCorpus)))

1500 ap articles


In [61]:
import string
import nltk
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import re
def removePunctuation(myStr):
    excludedCharacters = string.punctuation + "’"
    newStr = "".join(char for char in myStr if char not in excludedCharacters)
    return(newStr)
def removeStopWords(tokenList):
    newTokenList = [tok for tok in tokenList if tok not in stopwords.words('english')]
    return(newTokenList)
def cleanDocStr(docStr):
    docStr = docStr.lower()
    docStr = removePunctuation(docStr)
    docStr = re.sub('\d', '%d%', docStr)
    docStrTokenized = nltk.tokenize.word_tokenize(docStr)
    myStopWords = set(stopwords.words('english'))
    docStrTokenized = [tok for tok in docStrTokenized if tok not in myStopWords]
    return(docStrTokenized)
def tokenize_corpus(corpus):
    tokens = [cleanDocStr(x) for x in corpus]
    return tokens

apCorpusTokenized = tokenize_corpus(apCorpus)
pubMedCorpusTokenized = tokenize_corpus(pubMedCorpus)
testCorpusTokenized = tokenize_corpus(testCorpus)

[nltk_data] Downloading package stopwords to /home/ob2285/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ob2285/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
import time
from tqdm import tqdm, tqdm_notebook
from collections import Counter

minVocabOccurence = 5

def extractVocabMappers(tokenizedCorpus, vocabSizeMax = None, minVocabOccurence = 0):
    UNK = "<UNK>"
    flattenedCorpus = [item for sublist in tokenizedCorpus for item in sublist]
    wordCounts = Counter(flattenedCorpus).most_common()
    wordCounts = [(w, c) for w,c in wordCounts if c > minVocabOccurence]
#     wordCounts = wordCounts.most_common(vocabSizeMax)
    vocabulary = [word for word, count in wordCounts]
    
    # below is more readable but significantly slower code
    if False:
        vocabulary = []
        for sentence in tqdm(tokenizedCorpus):
            for token in sentence:
                if token not in vocabulary:
                    vocabulary.append(token)
    vocabulary.append(UNK)
    print("Vocab size: {}".format(len(vocabulary)))
    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
    newTokenizedCorpus = []# all words missing from vocab replaced with <UNK>
    for doc in tokenizedCorpus:
        newDoc = [word if word in word2idx else UNK for word in doc]
        newTokenizedCorpus.append(newDoc)
    return(word2idx, idx2word, wordCounts, newTokenizedCorpus)

start = time.time()
print("Building ap corpus vocabulary")
word2Idx_ap, idx2Word_ap, vocabCount_ap, finalTokenizedCorpus_ap = extractVocabMappers(apCorpusTokenized,
#                                                                                        vocabSizeMax = maxVocabSize,
                                                                                      minVocabOccurence = minVocabOccurence)
print("ap data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building pubMed corpus vocabulary")
word2Idx_pubMed, idx2Word_pubMed, vocabCount_pubMed, finalTokenizedCorpus_pubMed = extractVocabMappers(pubMedCorpusTokenized,
#                                                                                                        vocabSizeMax = maxVocabSize,
                                                                                                      minVocabOccurence = minVocabOccurence)
print("pubmed data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building test corpus vocabulary")
word2Idx_test, idx2Word_test, vocabCount_test, finalTokenizedCorpus_test = extractVocabMappers(testCorpusTokenized,
#                                                                                                vocabSizeMax = maxVocabSize,
                                                                                              minVocabOccurence = 0)
print("test data tokenized in {} seconds".format(time.time() - start))

Building ap corpus vocabulary
Vocab size: 8330
ap data tokenized in 0.16865897178649902 seconds

Building pubMed corpus vocabulary
Vocab size: 4527
pubmed data tokenized in 0.07795190811157227 seconds

Building test corpus vocabulary
Vocab size: 38
test data tokenized in 0.00021338462829589844 seconds


## Word2Vec Implementation

In [63]:
import numpy as np
import torch
from torch import nn
import random

In [64]:
##### BATCH VERSION ######


def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 3
    idxPairs = []
    for sentence in tokenizedCorpus:
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make sure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs)
    return(idxPairs)


def generateWordSamplingProb(vocabCount, word2Idx):
    wordSampleProbs = [0.0]*len(vocabCount)
    numWords = np.sum([count**0.75 for word, count in vocabCount])
    for idx in range(len(vocabCount)):
        w,c = vocabCount[idx]
        wordSampleProbs[word2Idx[w]] = (c**0.75)/(numWords)
        
        
        
    wordSampleProbs = []
    numWords = np.sum([count for word, count in vocabCount])
    for w,c in vocabCount:
        wordSampleProbs.extend([word2Idx[w]] * int(((c/numWords)**0.75)/0.001))
    return(wordSampleProbs)
    
class SkipGram(nn.Module):
    def __init__(self, vocabSize, embedSize, vocabCount, word2Idx):
        super(SkipGram, self).__init__()
        self.vocabSize = vocabSize
        self.word2Idx = word2Idx
        self.centerEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                     embedSize).float(), requires_grad=True)
        self.contextEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                      embedSize).float(), requires_grad=True)
        
#         initrange = (2.0 / (vocabSize + embedSize)) ** 0.5  # Xavier init
        nn.init.xavier_uniform_(self.contextEmbeddings)
#         self.output_emb.weight.data.uniform_(-0, 0)
        nn.init.uniform_(self.centerEmbeddings, -0,0)
        
        self.wordSampleProbs = generateWordSamplingProb(vocabCount, word2Idx)
        self.logSigmoid = nn.LogSigmoid()
    def getNegSample(self, k, centerWords):
        vocabSizeWithoutUnk = self.vocabSize - 1
        negSamples = []
        for centerWord in centerWords:
            negSample = random.sample(self.wordSampleProbs, k)
            while self.word2Idx[centerWord] in negSample:
                negSample = random.sample(self.wordSampleProbs, k)
            negSamples.append(negSample)
        return(negSamples)
    def forward(self, center, context, negSampleIndices):
        embedCenter = self.centerEmbeddings[center]#.view((1, -1))
        embedContext = self.contextEmbeddings[context]#.view((1, -1))       
        posVal = self.logSigmoid(torch.sum(embedContext * embedCenter, dim = 1)).squeeze()
        negSampleIndices = torch.autograd.Variable(torch.LongTensor(negSampleIndices))
        negVal = torch.bmm(self.contextEmbeddings[negSampleIndices], embedCenter.unsqueeze(2)).squeeze(2)
        negVal = self.logSigmoid(-torch.sum(negVal, dim = 1)).squeeze()
        logProb = -(posVal + negVal).mean()
        return(logProb)


def train_skipgram(embeddingSize, trainingData, vocabCount, word2Idx, idx2Word, k, referenceWords):
    print("training on {} observations".format(len(trainingData)))
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocabSize = len(word2Idx), embedSize = embeddingSize,
                     vocabCount = vocabCount, word2Idx = word2Idx)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    listNearestWords(model = model, idx2Word = idx2Word,
     referenceWords = referenceWords, topN = 5)
    batchSize = 1024
    #         listNearestWords(model = model, idx2Word = idx2Word,
#                  referenceWords = referenceWords, topN = 5)
    for epoch in tqdm_notebook(range(n_epoch), position = 0):
        total_loss = .0
        avgLoss = 0.0
        iteration = 0
        for step in tqdm_notebook(range(0, len(trainingData), batchSize), position = 1):
            endIdx = np.min([(step+batchSize), len(trainingData)])
            myBatch = trainingData[step:(step+batchSize)]
            centerWords = [elem[0] for elem in myBatch]
            contextWords = [elem[1] for elem in myBatch]
            negSamples = model.getNegSample(k = k, centerWords = centerWords)
            centerIDs = [word2Idx[idx] for idx in centerWords]
            contextIDs = [word2Idx[idx] for idx in contextWords]
            model.zero_grad()
            loss = model(centerIDs, contextIDs, negSampleIndices = negSamples)
            
            loss.backward()
            optimizer.step()

            total_loss += loss.data.numpy()
            avgLoss += loss.data.numpy()
            iteration += 1
            if iteration % 500 == 0:
                avgLoss = avgLoss/(500)
                print("avg loss: {}".format(avgLoss))
#                 avgLoss = 0.0
#             if iteration % 2000 == 0:
#                 listNearestWords(model = model, idx2Word = idx2Word,
#                  referenceWords = referenceWords, topN = 5)
        losses.append(total_loss)    
        print("Loss at epoch {}: {}".format(epoch, total_loss/iteration))
        if epoch % 1 == 0 and epoch != 0:
            listNearestWords(model = model, idx2Word = idx2Word,
                         referenceWords = referenceWords, topN = 5)
    return(model, losses)

In [65]:
from scipy.spatial.distance import cdist
def listNearestWords(model, idx2Word, referenceWords, topN):
    assert len(idx2Word) == len(model.word2Idx), "Possibly passed in two different vocabularies"
    embeddings = model.centerEmbeddings.data.numpy()
    distMat = cdist(embeddings, embeddings, metric = "cosine")
    for word in referenceWords:
        wordIdx = model.word2Idx[word]
        closestIndices = np.argsort(distMat[wordIdx,:])[0:topN]
        closestWords = [(idx2Word[idx], distMat[wordIdx, idx]) for idx in closestIndices]
        for elem in closestWords:
            print(elem)
        print("*"*50 + "\n")

In [66]:
# embd_size = 100
# learning_rate = 0.001
# n_epoch = 60
# idxPairsTest = generateObservations(tokenizedCorpus = finalTokenizedCorpus_test, word2Idx = word2Idx_test)
# sg_model, sg_losses = train_skipgram(embeddingSize = 5, trainingData = idxPairsTest, vocabCount = vocabCount_test,
#                                      word2Idx = word2Idx_test, k = None)

In [None]:
embeddingSize = 50
learning_rate = 0.01
n_epoch = 60
idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model_ap, sg_losses_ap = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP,
                                     vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, idx2Word = idx2Word_ap, k = 20,
                                          referenceWords = ["bush", "soviet", "president", "military", "american"])

training on 2640138 observations
('%', nan)
('coral', nan)
('austerity', nan)
('varied', nan)
('hardware', nan)
**************************************************

('%', nan)
('coral', nan)
('austerity', nan)
('varied', nan)
('hardware', nan)
**************************************************

('%', nan)
('coral', nan)
('austerity', nan)
('varied', nan)
('hardware', nan)
**************************************************

('%', nan)
('coral', nan)
('austerity', nan)
('varied', nan)
('hardware', nan)
**************************************************

('%', nan)
('coral', nan)
('austerity', nan)
('varied', nan)
('hardware', nan)
**************************************************



HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2579), HTML(value='')))

In [None]:
embeddingSize = 50
learning_rate = 0.001
n_epoch = 3
idxPairsPubMed = generateObservations(tokenizedCorpus = finalTokenizedCorpus_pubMed, word2Idx = word2Idx_pubMed)
sg_model_pubMed, sg_losses_pubMed = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsPubMed,
                                     vocabCount = vocabCount_pubMed,
                                     word2Idx = word2Idx_pubMed, idx2Word = idx2Word_pubMed, k = 15,
                                                  referenceWords = ["clinical", "obesity", "microbial", "microbiome"])

## Exploring Word Embeddings

In [48]:
from scipy.spatial.distance import cdist
def listNearestWords(model, idx2Word, referenceWords, topN):
    assert len(idx2Word) == len(model.word2Idx), "Possibly passed in two different vocabularies"
    embeddings = model.centerEmbeddings.data.numpy()
    distMat = cdist(embeddings, embeddings, metric = "cosine")
    for word in referenceWords:
        wordIdx = model.word2Idx[word]
#         print(np.argmin(distMat[wordIdx,:]))
        closestIndices = np.argsort(distMat[wordIdx,:])[0:topN]
        closestWords = [(idx2Word[idx], distMat[wordIdx, idx]) for idx in closestIndices]
        for elem in closestWords:
            print(elem)
        print("*"*50 + "\n")

In [None]:
# listNearestWords(model = sg_model_ap, idx2Word = idx2Word_ap,
#                  referenceWords = ["bush", "soviet", "stock", "dukakis"], topN = 10)

In [None]:
# listNearestWords(model = sg_model_pubMed, idx2Word = idx2Word_pubMed,
#                  referenceWords = ["cancer", "drug", "microbiome"], topN = 10)

## How Domains Affect Word Embeddings

In [118]:
w1 = nn.Parameter(torch.randn(1000, 100).float(), requires_grad=True)
w2 = nn.Parameter(torch.randn(1000,  100).float(), requires_grad=True)
nIters = 1000
negSampleSize = 1000
start = time.time()
for i in range(1000):
    temp = torch.mm(w2[0:negSampleSize], torch.t(w2[0].view(1, -1)))
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



w1 = w1.data.numpy()
w2 = w2.data.numpy()
start = time.time()
for i in range(1000):
    temp = np.matmul(w2[0:negSampleSize], w1[0])
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



avg time: 5.7178974151611327e-05
torch.Size([1000, 1])
avg time: 2.7730941772460937e-05
(1000,)


In [139]:
w1 = nn.Parameter(torch.randn(1000, 100).float(), requires_grad=True)
w2 = nn.Parameter(torch.randn(1000,  100).float(), requires_grad=True)
nIters = 1000
negSampleSize = 15
start = time.time()
for i in range(1000):
    temp = torch.mm(w2[0:negSampleSize], torch.t(w2[0].view(1, -1)))
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



# w1 = w1.data.numpy()
# w2 = w2.data.numpy()
# start = time.time()
# for i in range(1000):
#     temp = np.matmul(w2[0:negSampleSize], w1[0])
# print("avg time: {}".format((time.time() - start)/nIters))
# print(temp.shape)

avg time: 1.8839120864868163e-05
torch.Size([15, 1])


In [111]:
w1 = w1.data.numpy()
w2 = w2.data.numpy()

In [114]:
np.matmul(w2[0:15], w1[0]).shape

(15,)

In [126]:
finalTokenizedCorpus_test

[['first',
  'quit',
  'grinnin',
  'like',
  'idiot',
  'indians',
  'aint',
  'supposed',
  'smile',
  'like',
  'get',
  'stoic'],
 ['like', 'gotta', 'look', 'mean', 'people', 'wont', 'respect'],
 ['people', 'run', 'dont', 'look', 'mean'],
 ['gotta',
  'look',
  'like',
  'warrior',
  'gotta',
  'look',
  'like',
  'came',
  'back',
  'killing',
  'buffalo'],
 ['tribe',
  'never',
  'hunted',
  'buffalo',
  'fishermenwhat',
  'wanna',
  'look',
  'like',
  'came',
  'back',
  'catching',
  'fish'],
 ['aint',
  'dances',
  'salmon',
  'know',
  'thomas',
  'gotta',
  'look',
  'like',
  'warrior']]