# Lab 10: Word Embeddings
Thinking of using stuff from here
https://gist.github.com/mbednarski/da08eb297304f7a66a3840e857e060a0

conda install -c conda-forge tqdm

conda install -c conda-forge ipywidgets

## Janitorial Work

In [11]:
testCorpus = ["First of all, quit grinnin’ like an idiot. Indians ain’t supposed to smile like that. Get stoic.",
             "No. Like this. You gotta look mean, or people won’t respect you.",
              " people will run all over you if you don’t look mean.",
              "You gotta look like a warrior. You gotta look like you just came back from killing a buffalo.",
             "But our tribe never hunted buffalo. We were fishermen."
             "What? You wanna look like you just came back from catching a fish?",
             "This ain’t dances with salmon, you know. Thomas, you gotta look like a warrior."]
maxDocs = 250

In [12]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/oliver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Read in pubmed corpus into a text file

import glob
pubMedDataFolderPath = "data/pubMed_corpus/"
pubMedDataFiles = glob.glob(pubMedDataFolderPath + "*.txt")
pubMedCorpus = [""]*len(pubMedDataFiles)
for idx, pubMedDataPath in enumerate(pubMedDataFiles):
    with open(pubMedDataPath, "r") as pubMedFile:
        text = pubMedFile.read().strip()
        pubMedCorpus[idx] = text
pubMedCorpus = pubMedCorpus[0:maxDocs]
print("{} pub med abstracts".format(len(pubMedCorpus)))

250 pub med abstracts


In [14]:
# Read in the ap corpus
apTextFile = "data/ap.txt"
apCorpus = []
readText = False
with open(apTextFile) as apDataFile:
    for line in apDataFile:
        if readText:
            apCorpus.append(line.strip())
            readText = False
        if line == "<TEXT>\n":
            readText = True
apCorpus = apCorpus[0:maxDocs]
print("{} ap articles".format(len(apCorpus)))

250 ap articles


In [15]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
def removePunctuation(myStr):
    excludedCharacters = string.punctuation + "’"
    newStr = "".join(char for char in myStr if char not in excludedCharacters)
    return(newStr)
def removeStopWords(tokenList):
    newTokenList = [tok for tok in tokenList if tok not in stopwords.words('english')]
    return(newTokenList)
def tokenize_corpus(corpus):
    tokens = [removeStopWords(removePunctuation(x).lower().split()) for x in corpus]
    return tokens

apCorpusTokenized = tokenize_corpus(apCorpus)
pubMedCorpusTokenized = tokenize_corpus(pubMedCorpus)
testCorpusTokenized = tokenize_corpus(testCorpus)

[nltk_data] Downloading package stopwords to /home/oliver/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
import time
from tqdm import tqdm, tqdm_notebook
from collections import Counter

maxVocabSize = None

def extractVocabMappers(tokenizedCorpus, vocabSizeMax = None):
    UNK = "<UNK>"
    flattenedCorpus = [item for sublist in tokenizedCorpus for item in sublist]
    wordCounts = Counter(flattenedCorpus)
    wordCounts = wordCounts.most_common(vocabSizeMax)
    vocabulary = [word for word, count in wordCounts]
    
    # below is more readable but significantly slower code
    if False:
        vocabulary = []
        for sentence in tqdm(tokenizedCorpus):
            for token in sentence:
                if token not in vocabulary:
                    vocabulary.append(token)
    vocabulary.append(UNK)
    print("Vocab size: {}".format(len(vocabulary)))
    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
    newTokenizedCorpus = []# all words missing from vocab replaced with <UNK>
    for doc in tokenizedCorpus:
        newDoc = [word if word in word2idx else UNK for word in doc]
        newTokenizedCorpus.append(newDoc)
    return(word2idx, idx2word, wordCounts, newTokenizedCorpus)

start = time.time()
print("Building ap corpus vocabulary")
word2Idx_ap, idx2Word_ap, vocabCount_ap, finalTokenizedCorpus_ap = extractVocabMappers(apCorpusTokenized,
                                                                                       vocabSizeMax = maxVocabSize)
print("ap data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building pubMed corpus vocabulary")
word2Idx_pubMed, idx2Word_pubMed, vocabCount_pubMed, finalTokenizedCorpus_pubMed = extractVocabMappers(pubMedCorpusTokenized,
                                                                                                       vocabSizeMax = maxVocabSize)
print("pubmed data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building test corpus vocabulary")
word2Idx_test, idx2Word_test, vocabCount_test, finalTokenizedCorpus_test = extractVocabMappers(testCorpusTokenized,
                                                                                               vocabSizeMax = maxVocabSize)
print("test data tokenized in {} seconds".format(time.time() - start))

Building ap corpus vocabulary
Vocab size: 12864
ap data tokenized in 0.04893183708190918 seconds

Building pubMed corpus vocabulary
Vocab size: 6888
pubmed data tokenized in 0.02375960350036621 seconds

Building test corpus vocabulary
Vocab size: 36
test data tokenized in 0.0003077983856201172 seconds


## Word2Vec Implementation

In [17]:
import numpy as np
import torch
from torch import nn
import random

In [18]:
def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 5
    idxPairs = []
    # for each sentence
    for sentence in tokenizedCorpus:
#         indices = [word2Idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs) # it will be useful to have this as numpy array
    return(idxPairs)


def generateWordSamplingProb(vocabCount, word2Idx):
    wordSampleProbs = [0.0]*len(vocabCount)
    numWords = np.sum([count**0.75 for word, count in vocabCount])
    for idx in range(len(vocabCount)):
        w,c = vocabCount[idx]
        wordSampleProbs[word2Idx[w]] = (c**0.75)/(numWords)
        
        
        
    wordSampleProbs = []
    numWords = np.sum([count for word, count in vocabCount])
    for w,c in vocabCount:
#         w,c = vocabCount[idx]
        wordSampleProbs.extend([word2Idx[w]] * int(((c/numWords)**0.75)/0.001))
    return(wordSampleProbs)
    
class SkipGram(nn.Module):
    def __init__(self, vocabSize, embedSize, vocabCount, word2Idx):
        super(SkipGram, self).__init__()
        self.vocabSize = vocabSize
        self.word2Idx = word2Idx
#         self.centerEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.contextEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.embeddings = nn.Embedding(vocab_size, embd_size)
        
        self.centerEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                     embedSize).float(), requires_grad=True)
        self.contextEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                      embedSize).float(), requires_grad=True)
        self.wordSampleProbs = generateWordSamplingProb(vocabCount, word2Idx)
        self.logSigmoid = nn.LogSigmoid()
#         self.paramList = nn.ModuleList([self.centerEmbeddings, self.contextEmbeddings] )
    def getNegSample(self, k, centerWord):
        vocabSizeWithoutUnk = self.vocabSize - 1
#         negSample = np.random.choice(vocabSizeWithoutUnk,
#                                      size = k, replace = True, p = self.wordSampleProbs)
        negSample = random.sample(self.wordSampleProbs, k)
        while self.word2Idx[centerWord] in negSample:
#             negSample = np.random.choice(vocabSizeWithoutUnk,
#                                          size = k, replace = True, p = self.wordSampleProbs)
            negSample = random.sample(self.wordSampleProbs, k)
        return(negSample)
    def forward(self, center, context, negSampleIndices = None):
#         focus = torch.autograd.Variable(torch.LongTensor([0]))
#         context = torch.autograd.Variable(torch.LongTensor([0]))
#         allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))


#         embedCenter = self.centerEmbeddings(center).view((1, -1))
#         embedContext = self.contextEmbeddings(context).view((1, -1))
# #         print(allEmbeddingIdxs)
#         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
#         num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#         denom = torch.exp(torch.mm(allContextEmbeddings, torch.t(embedCenter))).sum()
#         logProb = torch.log(num/denom)
        embedCenter = self.centerEmbeddings[center].view((1, -1))
        embedContext = self.contextEmbeddings[context].view((1, -1))       
        if negSampleIndices is not None:
#             print("hey")
            posVal = self.logSigmoid (torch.mm(embedContext, torch.t(embedCenter)))
#             start = time.time()
#             for i in range(1000):
            negVal = torch.mm(self.contextEmbeddings[negSampleIndices], torch.t(embedCenter))
            negVal = self.logSigmoid (-torch.sum(negVal))
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.mm(self.contextEmbeddings[negSampleIndices], torch.t(embedCenter)).shape)
#             1/0
            logProb = posVal + negVal
        else:
#             allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))



    #         print(allEmbeddingIdxs)
    #         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
            num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#             start = time.time()
#             for i in range(1000):
            denom = torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).sum()
#             print("avg time: {}".format((time.time() - start)/100))
#             print(torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).shape)
#             1/0
            logProb = torch.log(num/denom)
    
        return(logProb)


def train_skipgram(embeddingSize, trainingData, vocabCount, word2Idx, k):
    print("training on {} observations".format(len(trainingData)))
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocabSize = len(word2Idx), embedSize = embeddingSize,
                     vocabCount = vocabCount, word2Idx = word2Idx)
#     print(model)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    
    for epoch in tqdm_notebook(range(n_epoch), position = 0):
#         print("entered epoch")
        total_loss = .0
        for in_w, out_w in tqdm_notebook(trainingData, position = 1):
            if k is not None:
                negSamples = model.getNegSample(k = k, centerWord = in_w)
            else:
                negSamples = None
#             print("neg samples found")
#             print(negSamples)
            in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
            out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             if in_w in word2Idx:
#                 in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
#             else:
#                 in_w_var = word2Idx["<UNK>"]
#             if out_w in word2Idx:
#                 out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             else:
#                 out_w_var = word2Idx["<UNK>"]
            
            model.zero_grad()
            log_probs = model(in_w_var, out_w_var, negSampleIndices = negSamples)
            loss = -log_probs[0]#loss_fn(log_probs[0], torch.autograd.Variable(torch.Tensor([1])))
            
            loss.backward()
            optimizer.step()

            total_loss += loss.data.numpy()
        losses.append(total_loss)
        if epoch % 10 == 0 or n_epoch <= 10:    
            print(f'Loss at epoch {epoch}: {total_loss/len(trainingData)}')
    return(model, losses)

In [123]:
embd_size = 100
learning_rate = 0.001
n_epoch = 60
idxPairsTest = generateObservations(tokenizedCorpus = finalTokenizedCorpus_test, word2Idx = word2Idx_test)
sg_model, sg_losses = train_skipgram(embeddingSize = 5, trainingData = idxPairsTest, vocabCount = vocabCount_test,
                                     word2Idx = word2Idx_test, k = None)

training on 380 observations


HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

Loss at epoch 0: [5.711178]


HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

Loss at epoch 10: [4.792516]


HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

Loss at epoch 20: [4.375297]


HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

Loss at epoch 30: [4.117554]


HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

Loss at epoch 40: [3.9415078]


HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

Loss at epoch 50: [3.8134084]


HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

HBox(children=(IntProgress(value=0, max=380), HTML(value='')))

In [20]:
embeddingSize = 50
learning_rate = 0.001
n_epoch = 6
idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP,
                                     vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = None)

training on 587990 observations


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=587990), HTML(value='')))

Loss at epoch 0: [24.51307335]


HBox(children=(IntProgress(value=0, max=587990), HTML(value='')))

Loss at epoch 1: [21.3692954]


HBox(children=(IntProgress(value=0, max=587990), HTML(value='')))

Loss at epoch 2: [19.56708107]


HBox(children=(IntProgress(value=0, max=587990), HTML(value='')))

Loss at epoch 3: [18.27246042]


HBox(children=(IntProgress(value=0, max=587990), HTML(value='')))

Loss at epoch 4: [17.27497066]


HBox(children=(IntProgress(value=0, max=587990), HTML(value='')))

Loss at epoch 5: [16.4724298]


In [102]:
embeddingSize = 20
learning_rate = 0.001
n_epoch = 1
idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP,
                                     vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = None)

training on 238040 observations


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=238040), HTML(value='')))

avg time: 0.001267857551574707
torch.Size([7492, 1])


ZeroDivisionError: division by zero

In [78]:
embeddingSize = 20
learning_rate = 0.001
n_epoch = 1
idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP,
                                     vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = 15)

training on 238040 observations


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=238040), HTML(value='')))

Loss at epoch 0: [18.14535792]


In [89]:
embeddingSize = 20
learning_rate = 0.001
n_epoch = 5
idxPairsPubMed = generateObservations(tokenizedCorpus = finalTokenizedCorpus_pubMed, word2Idx = word2Idx_pubMed)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsPubMed,
                                     vocabCount = vocabCount_pubMed,
                                     word2Idx = word2Idx_pubMed, k = 15)

training on 128126 observations


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

Loss at epoch 0: [-9.896313]


HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

In [92]:
embeddingSize = 20
learning_rate = 0.001
n_epoch = 5
idxPairsPubMed = generateObservations(tokenizedCorpus = finalTokenizedCorpus_pubMed, word2Idx = word2Idx_pubMed)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsPubMed,
                                     vocabCount = vocabCount_pubMed,
                                     word2Idx = word2Idx_pubMed, k = None)

training on 128126 observations


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

Loss at epoch 0: [15.96258663]


HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

HBox(children=(IntProgress(value=0, max=128126), HTML(value='')))

In [93]:
sg_losses

[array([2045222.4], dtype=float32),
 array([1862422.1], dtype=float32),
 array([1746688.], dtype=float32),
 array([1662227.6], dtype=float32),
 array([1596516.5], dtype=float32)]

## Exploring Word Embeddings

In [24]:
from scipy.spatial.distance import cdist
def listNearestWords(model, idx2Word, referenceWords, topN):
    assert len(idx2Word) == len(model.word2Idx), "Possibly passed in two different vocabularies"
    embeddings = model.centerEmbeddings.data.numpy()
    distMat = cdist(embeddings, embeddings, metric = "cosine")
    for word in referenceWords:
        wordIdx = model.word2Idx[word]
#         print(np.argmin(distMat[wordIdx,:]))
        closestIndices = np.argsort(distMat[wordIdx,:])[0:topN]
        closestWords = [(idx2Word[idx], distMat[wordIdx, idx]) for idx in closestIndices]
        for elem in closestWords:
            print(elem)
        print("*"*50 + "\n")

In [25]:
listNearestWords(model = sg_model, idx2Word = idx2Word_ap,
                 referenceWords = ["bush", "soviet", "stock", "dukakis"], topN = 10)

('bush', 1.1102230246251565e-16)
('president', 0.13244959422732927)
('would', 0.13662114021683736)
('people', 0.1433898401874425)
('government', 0.18918568623643628)
('also', 0.19677553426517247)
('said', 0.20960274614864016)
('us', 0.21823901837751392)
('years', 0.22026618420593613)
('could', 0.2219275275544691)
**************************************************

('soviet', 0.0)
('union', 0.19148760713372004)
('people', 0.24888523336252888)
('officials', 0.2694699100602378)
('bush', 0.27084279256233157)
('government', 0.27761080050898956)
('us', 0.2848540779235168)
('could', 0.2928006077097679)
('also', 0.303375641062432)
('national', 0.3039809919761105)
**************************************************

('stock', 0.0)
('figures', 0.4962425348903996)
('firearms', 0.499091976045016)
('capiz', 0.5366904780385411)
('drugrelated', 0.5462987123293082)
('calypso', 0.5514191343034507)
('roses', 0.5667759478604235)
('incorporated', 0.5673369273215838)
('advanced', 0.5682758029526187)
('nelso

In [142]:
for elem in finalTokenizedCorpus_ap:
    if "dukakis" in elem:
        print(elem)

['organized', 'union', 'boost', 'behind', 'single', 'candidate', 'saturdays', 'democratic', 'caucuses', 'michigan', 'state', 'union', 'members', 'wield', 'clout', 'almost', 'anywhere', 'else', 'national', 'labor', 'leaders', 'assuming', 'michael', 'dukakis', 'eventual', 'nominee', 'prevented', 'endorsing', 'appears', 'growing', 'rankandfile', 'support', 'jesse', 'jackson', 'gotten', 'union', 'votes', 'candidates', 'primaries', 'far', 'richard', 'gephardt', 'also', 'considerable', 'union', 'support', 'none', 'democratic', 'candidates', 'appears', 'hearts', 'votes', 'majority', 'states', '750000', 'rankandfile', 'union', 'workers', 'nearly', 'half', 'members', 'united', 'auto', 'workers']
['republican', 'nominee', 'george', 'bush', 'said', 'felt', 'nervous', 'voted', 'today', 'adopted', 'home', 'state', 'texas', 'ended', 'presidential', 'campaign', 'telling', 'voters', 'election', 'referendum', 'philosophy', 'way', 'life', 'vice', 'president', 'wife', 'barbara', 'voted', 'hotel', 'confer

In [87]:
"cancer" in sg_model.word2Idx

True

In [137]:
sg_model.word2Idx

{'said': 0,
 'percent': 1,
 'year': 2,
 'new': 3,
 'us': 4,
 'years': 5,
 'people': 6,
 'one': 7,
 'would': 8,
 'two': 9,
 'also': 10,
 'soviet': 11,
 'president': 12,
 'police': 13,
 'last': 14,
 'oil': 15,
 'government': 16,
 'bank': 17,
 'officials': 18,
 'could': 19,
 'ago': 20,
 'first': 21,
 'national': 22,
 'state': 23,
 'million': 24,
 'states': 25,
 'prices': 26,
 'three': 27,
 'official': 28,
 'reported': 29,
 'back': 30,
 'monday': 31,
 'dukakis': 32,
 'rose': 33,
 'rate': 34,
 'bush': 35,
 'fire': 36,
 'war': 37,
 'get': 38,
 'military': 39,
 'economic': 40,
 'thursday': 41,
 'company': 42,
 'time': 43,
 'made': 44,
 'saying': 45,
 'today': 46,
 'american': 47,
 'since': 48,
 'roberts': 49,
 'dont': 50,
 'federal': 51,
 'told': 52,
 'mrs': 53,
 'noriega': 54,
 'forces': 55,
 'months': 56,
 'may': 57,
 'rating': 58,
 'good': 59,
 'friday': 60,
 'long': 61,
 'day': 62,
 'top': 63,
 'united': 64,
 'saudi': 65,
 'use': 66,
 'economy': 67,
 'campaign': 68,
 'gorbachev': 69,
 'ki

## How Domains Affect Word Embeddings

In [118]:
w1 = nn.Parameter(torch.randn(1000, 100).float(), requires_grad=True)
w2 = nn.Parameter(torch.randn(1000,  100).float(), requires_grad=True)
nIters = 1000
negSampleSize = 1000
start = time.time()
for i in range(1000):
    temp = torch.mm(w2[0:negSampleSize], torch.t(w2[0].view(1, -1)))
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



w1 = w1.data.numpy()
w2 = w2.data.numpy()
start = time.time()
for i in range(1000):
    temp = np.matmul(w2[0:negSampleSize], w1[0])
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



avg time: 5.7178974151611327e-05
torch.Size([1000, 1])
avg time: 2.7730941772460937e-05
(1000,)


In [120]:
w1 = nn.Parameter(torch.randn(1000, 100).float(), requires_grad=True)
w2 = nn.Parameter(torch.randn(1000,  100).float(), requires_grad=True)
nIters = 1000
negSampleSize = 15
start = time.time()
for i in range(1000):
    temp = torch.mm(w2[0:negSampleSize], torch.t(w2[0].view(1, -1)))
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)



w1 = w1.data.numpy()
w2 = w2.data.numpy()
start = time.time()
for i in range(1000):
    temp = np.matmul(w2[0:negSampleSize], w1[0])
print("avg time: {}".format((time.time() - start)/nIters))
print(temp.shape)

avg time: 3.156447410583496e-05
torch.Size([15, 1])
avg time: 5.325794219970703e-06
(15,)


In [None]:
np.matmul()

In [111]:
w1 = w1.data.numpy()
w2 = w2.data.numpy()

In [114]:
np.matmul(w2[0:15], w1[0]).shape

(15,)

In [126]:
finalTokenizedCorpus_test

[['first',
  'quit',
  'grinnin',
  'like',
  'idiot',
  'indians',
  'aint',
  'supposed',
  'smile',
  'like',
  'get',
  'stoic'],
 ['like', 'gotta', 'look', 'mean', 'people', 'wont', 'respect'],
 ['people', 'run', 'dont', 'look', 'mean'],
 ['gotta',
  'look',
  'like',
  'warrior',
  'gotta',
  'look',
  'like',
  'came',
  'back',
  'killing',
  'buffalo'],
 ['tribe',
  'never',
  'hunted',
  'buffalo',
  'fishermenwhat',
  'wanna',
  'look',
  'like',
  'came',
  'back',
  'catching',
  'fish'],
 ['aint',
  'dances',
  'salmon',
  'know',
  'thomas',
  'gotta',
  'look',
  'like',
  'warrior']]