# Lab 10: Word Embeddings
Thinking of using stuff from here
https://gist.github.com/mbednarski/da08eb297304f7a66a3840e857e060a0

conda install -c conda-forge tqdm
conda install -c conda-forge ipywidgets

## Janitorial Work

In [58]:
testCorpus = ["First of all, quit grinnin’ like an idiot. Indians ain’t supposed to smile like that. Get stoic.",
             "No. Like this. You gotta look mean, or people won’t respect you.",
              " people will run all over you if you don’t look mean.",
              "You gotta look like a warrior. You gotta look like you just came back from killing a buffalo.",
             "But our tribe never hunted buffalo. We were fishermen."
             "What? You wanna look like you just came back from catching a fish?",
             "This ain’t dances with salmon, you know. Thomas, you gotta look like a warrior."]
maxDocs = 50

In [59]:
# Read in pubmed corpus into a text file

import glob
pubMedDataFolderPath = "data/pubMed_corpus/"
pubMedDataFiles = glob.glob(pubMedDataFolderPath + "*.txt")
pubMedCorpus = [""]*len(pubMedDataFiles)
for idx, pubMedDataPath in enumerate(pubMedDataFiles):
    with open(pubMedDataPath, "r") as pubMedFile:
        text = pubMedFile.read().strip()
        pubMedCorpus[idx] = text
pubMedCorpus = pubMedCorpus[0:maxDocs]
print("{} pub med abstracts".format(len(pubMedCorpus)))

50 pub med abstracts


In [60]:
# Read in the ap corpus
apTextFile = "data/ap.txt"
apCorpus = []
readText = False
with open(apTextFile) as apDataFile:
    for line in apDataFile:
        if readText:
            apCorpus.append(line.strip())
            readText = False
        if line == "<TEXT>\n":
            readText = True
apCorpus = apCorpus[0:maxDocs]
print("{} ap articles".format(len(apCorpus)))

50 ap articles


In [61]:
import string
def removePunctuation(myStr):
    excludedCharacters = string.punctuation + "’"
    newStr = "".join(char for char in myStr if char not in excludedCharacters)
    return(newStr)
def tokenize_corpus(corpus):
    tokens = [removePunctuation(x).split() for x in corpus]
    return tokens

apCorpusTokenized = tokenize_corpus(apCorpus)
pubMedCorpusTokenized = tokenize_corpus(pubMedCorpus)
testCorpusTokenized = tokenize_corpus(testCorpus)

In [62]:
import time
from tqdm import tqdm, tqdm_notebook
from collections import Counter

maxVocabSize = 10000

def extractVocabMappers(tokenizedCorpus, vocabSizeMax = None):
    UNK = "<UNK>"
    flattenedCorpus = [item for sublist in tokenizedCorpus for item in sublist]
    wordCounts = Counter(flattenedCorpus)
    wordCounts = wordCounts.most_common(vocabSizeMax)
    vocabulary = [word for word, count in wordCounts]
    
    # below is more readable but significantly slower code
    if False:
        vocabulary = []
        for sentence in tqdm(tokenizedCorpus):
            for token in sentence:
                if token not in vocabulary:
                    vocabulary.append(token)
    vocabulary.append(UNK)
    print("Vocab size: {}".format(len(vocabulary)))
    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
    newTokenizedCorpus = []# all words missing from vocab replaced with <UNK>
    for doc in tokenizedCorpus:
        newDoc = [word if word in word2idx else UNK for word in doc]
        newTokenizedCorpus.append(newDoc)
    return(word2idx, idx2word, wordCounts, newTokenizedCorpus)

start = time.time()
print("Building ap corpus vocabulary")
word2Idx_ap, idx2Word_ap, vocabCount_ap, finalTokenizedCorpus_ap = extractVocabMappers(apCorpusTokenized,
                                                                                       vocabSizeMax = maxVocabSize)
print("ap data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building pubMed corpus vocabulary")
word2Idx_pubMed, idx2Word_pubMed, vocabCount_pubMed, finalTokenizedCorpus_pubMed = extractVocabMappers(pubMedCorpusTokenized,
                                                                                                       vocabSizeMax = maxVocabSize)
print("pubmed data tokenized in {} seconds\n".format(time.time() - start))
start = time.time()
print("Building test corpus vocabulary")
word2Idx_test, idx2Word_test, vocabCount_test, finalTokenizedCorpus_test = extractVocabMappers(testCorpusTokenized,
                                                                                               vocabSizeMax = maxVocabSize)
print("test data tokenized in {} seconds".format(time.time() - start))

Building ap corpus vocabulary
Vocab size: 5168
ap data tokenized in 0.011765718460083008 seconds

Building pubMed corpus vocabulary
Vocab size: 2847
pubmed data tokenized in 0.0056684017181396484 seconds

Building test corpus vocabulary
Vocab size: 59
test data tokenized in 0.0002300739288330078 seconds


## Word2Vec Implementation

In [63]:
import numpy as np
import torch
from torch import nn
import random

In [64]:
def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 1
    idxPairs = []
    # for each sentence
    for sentence in tokenizedCorpus:
#         indices = [word2Idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs) # it will be useful to have this as numpy array
    return(idxPairs)


def generateWordSamplingProb(vocabCount, word2Idx):
    wordSampleProbs = [0.0]*len(vocabCount)
    numWords = np.sum([count**0.75 for word, count in vocabCount])
    for idx in range(len(vocabCount)):
        w,c = vocabCount[idx]
        wordSampleProbs[word2Idx[w]] = (c**0.75)/(numWords)
        
        
        
    wordSampleProbs = []
    numWords = np.sum([count for word, count in vocabCount])
    for w,c in vocabCount:
#         w,c = vocabCount[idx]
        wordSampleProbs.extend([word2Idx[w]] * int(((c/numWords)**0.75)/0.001))
    return(wordSampleProbs)
    
class SkipGram(nn.Module):
    def __init__(self, vocabSize, embedSize, vocabCount, word2Idx):
        super(SkipGram, self).__init__()
        self.vocabSize = vocabSize
        self.word2Idx = word2Idx
#         self.centerEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.contextEmbeddings = nn.Embedding(vocab_size, embd_size)
#         self.embeddings = nn.Embedding(vocab_size, embd_size)
        
        self.centerEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                     embedSize).float(), requires_grad=True)
        self.contextEmbeddings = nn.Parameter(torch.randn(vocabSize,
                                                      embedSize).float(), requires_grad=True)
        self.wordSampleProbs = generateWordSamplingProb(vocabCount, word2Idx)
        self.logSigmoid = nn.LogSigmoid()
#         self.paramList = nn.ModuleList([self.centerEmbeddings, self.contextEmbeddings] )
    def getNegSample(self, k, centerWord):
        vocabSizeWithoutUnk = self.vocabSize - 1
#         negSample = np.random.choice(vocabSizeWithoutUnk,
#                                      size = k, replace = True, p = self.wordSampleProbs)
        negSample = random.sample(self.wordSampleProbs, k)
        while self.word2Idx[centerWord] in negSample:
#             negSample = np.random.choice(vocabSizeWithoutUnk,
#                                          size = k, replace = True, p = self.wordSampleProbs)
            negSample = random.sample(self.wordSampleProbs, k)
        return(negSample)
    def forward(self, center, context, negSampleIndices = None):
#         focus = torch.autograd.Variable(torch.LongTensor([0]))
#         context = torch.autograd.Variable(torch.LongTensor([0]))
#         allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))


#         embedCenter = self.centerEmbeddings(center).view((1, -1))
#         embedContext = self.contextEmbeddings(context).view((1, -1))
# #         print(allEmbeddingIdxs)
#         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
#         num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
#         denom = torch.exp(torch.mm(allContextEmbeddings, torch.t(embedCenter))).sum()
#         logProb = torch.log(num/denom)
        embedCenter = self.centerEmbeddings[center].view((1, -1))
        embedContext = self.contextEmbeddings[context].view((1, -1))       
        if negSampleIndices is not None:
#             print("hey")
            posVal = self.logSigmoid (torch.mm(embedContext, torch.t(embedCenter)))
            negVal = torch.mm(self.contextEmbeddings[negSampleIndices], torch.t(embedCenter))
            negVal = self.logSigmoid (-torch.sum(negVal))
            logProb = -(posVal + negVal)
        else:
#             allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,self.vocabSize)]))



    #         print(allEmbeddingIdxs)
    #         allContextEmbeddings = self.contextEmbeddings(allEmbeddingIdxs).squeeze()
            num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
            denom = torch.exp(torch.mm(self.contextEmbeddings, torch.t(embedCenter))).sum()
            logProb = torch.log(num/denom)
    
        return(logProb)


def train_skipgram(embeddingSize, trainingData, vocabCount, word2Idx, k):
    print("training on {} observations".format(len(trainingData)))
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocabSize = len(word2Idx), embedSize = embeddingSize,
                     vocabCount = vocabCount, word2Idx = word2Idx)
#     print(model)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    
    for epoch in tqdm_notebook(range(n_epoch), position = 0):
#         print("entered epoch")
        total_loss = .0
        for in_w, out_w in tqdm_notebook(trainingData, position = 1):
            if k is not None:
                negSamples = model.getNegSample(k = k, centerWord = in_w)
            else:
                negSamples = None
#             print("neg samples found")
#             print(negSamples)
            in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
            out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             if in_w in word2Idx:
#                 in_w_var = word2Idx[in_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
#             else:
#                 in_w_var = word2Idx["<UNK>"]
#             if out_w in word2Idx:
#                 out_w_var = word2Idx[out_w]#torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
#             else:
#                 out_w_var = word2Idx["<UNK>"]
            
            model.zero_grad()
            log_probs = model(in_w_var, out_w_var, negSampleIndices = negSamples)
            loss = -log_probs[0]#loss_fn(log_probs[0], torch.autograd.Variable(torch.Tensor([1])))
            
            loss.backward()
            optimizer.step()

            total_loss += loss.data.numpy()
        losses.append(total_loss)
        if epoch % 10 == 0:    
            print(f'Loss at epoch {epoch}: {total_loss/len(trainingData)}')
    return(model, losses)

In [8]:
embd_size = 100
learning_rate = 0.001
n_epoch = 60
idxPairsTest = generateObservations(tokenizedCorpus = finalTokenizedCorpus_test, word2Idx = word2Idx_test)
sg_model, sg_losses = train_skipgram(embeddingSize = 5, trainingData = idxPairsTest, vocabCount = vocabCount_test,
                                     word2Idx = word2Idx_test, k = 5)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

Loss at epoch 0: [2.925287]


HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

Loss at epoch 10: [2.6161726]


HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

Loss at epoch 20: [2.5666747]


HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

Loss at epoch 30: [2.4666207]


HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

Loss at epoch 40: [2.2420926]


HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

Loss at epoch 50: [2.0216222]


HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))




In [65]:
embeddingSize = 100
learning_rate = 0.001
n_epoch = 1
idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP, vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = 10)

training on 39788 observations


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=39788), HTML(value='')))

Loss at epoch 0: [-26.419334]


In [40]:
1413/11

128.45454545454547

In [41]:
embeddingSize = 100
learning_rate = 0.001
n_epoch = 1
# idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP, vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = 5)

training on 3627632 observations


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3627632), HTML(value='')))

KeyboardInterrupt: 

In [42]:
1705/11

155.0

In [36]:
embeddingSize = 100
learning_rate = 0.001
n_epoch = 1
# idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP, vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = None)

training on 3627632 observations


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3627632), HTML(value='')))

KeyboardInterrupt: 

In [37]:
embeddingSize = 100
learning_rate = 0.001
n_epoch = 1
idxPairsPubMed = generateObservations(tokenizedCorpus = finalTokenizedCorpus_pubMed, word2Idx = word2Idx_pubMed)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsPubMed, vocabCount = vocabCount_pubMed,
                                     word2Idx = word2Idx_pubMed, k = 20)

training on 1467190 observations


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1467190), HTML(value='')))

KeyboardInterrupt: 

In [14]:
embeddingSize = 100
learning_rate = 0.001
n_epoch = 1
# idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP, vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = 20)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3627632), HTML(value='')))

Loss at epoch 0: [2.2236288]


In [19]:
embeddingSize = 100
learning_rate = 0.001
n_epoch = 1
# idxPairsAP = generateObservations(tokenizedCorpus = finalTokenizedCorpus_ap, word2Idx = word2Idx_ap)
sg_model, sg_losses = train_skipgram(embeddingSize = embeddingSize, trainingData = idxPairsAP, vocabCount = vocabCount_ap,
                                     word2Idx = word2Idx_ap, k = None)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3627632), HTML(value='')))

Loss at epoch 0: [nan]


In [148]:
word2Idx_test

{'the': 0,
 'of': 1,
 'to': 2,
 'and': 3,
 'a': 4,
 'in': 5,
 'said': 6,
 'for': 7,
 'The': 8,
 'that': 9,
 'was': 10,
 'on': 11,
 'is': 12,
 'with': 13,
 'by': 14,
 'at': 15,
 'he': 16,
 'from': 17,
 'as': 18,
 'be': 19,
 'were': 20,
 'have': 21,
 'it': 22,
 'his': 23,
 'an': 24,
 'has': 25,
 'not': 26,
 'are': 27,
 'who': 28,
 'had': 29,
 'will': 30,
 'would': 31,
 'about': 32,
 'but': 33,
 'been': 34,
 'they': 35,
 'its': 36,
 'I': 37,
 'their': 38,
 'percent': 39,
 'which': 40,
 'or': 41,
 'this': 42,
 'after': 43,
 'He': 44,
 'more': 45,
 'up': 46,
 'people': 47,
 'million': 48,
 'US': 49,
 'also': 50,
 'one': 51,
 'In': 52,
 'other': 53,
 'than': 54,
 'year': 55,
 'two': 56,
 'when': 57,
 'government': 58,
 'A': 59,
 'years': 60,
 'last': 61,
 'But': 62,
 'no': 63,
 'out': 64,
 'all': 65,
 'we': 66,
 'It': 67,
 'could': 68,
 'over': 69,
 'new': 70,
 'into': 71,
 'first': 72,
 'because': 73,
 'Soviet': 74,
 'some': 75,
 'them': 76,
 'she': 77,
 'New': 78,
 'her': 79,
 'United': 80

In [101]:
model = SkipGram(20, 5)
focus = torch.autograd.Variable(torch.LongTensor([0]))
context = torch.autograd.Variable(torch.LongTensor([0]))
allEmbeddingIdxs = torch.autograd.Variable(torch.LongTensor([np.arange(0,20)]))


embedCenter = model.centerEmbeddings(focus).view((1, -1))
embedContext = model.contextEmbeddings(context).view((1, -1))
allContextEmbeddings = model.contextEmbeddings(allEmbeddingIdxs).squeeze()
num = torch.exp(torch.mm(embedContext, torch.t(embedCenter)))
denom = torch.exp(torch.mm(allContextEmbeddings, torch.t(embedCenter))).sum()
logProb = torch.log(num/denom)
# score = torch.mm(embed_focus, torch.t(embed_ctx))
# log_probs = nn.functional.logsigmoid(score)

In [102]:
logProb

tensor([[-4.3745]], grad_fn=<LogBackward>)

In [73]:
torch.exp(logProb)

tensor([[0.0443]], grad_fn=<ExpBackward>)

In [68]:
torch.mm(model.contextEmbeddings[0].view((1, -1)), torch.t(embedCenter))

tensor([[-0.2262]], grad_fn=<MmBackward>)

In [69]:
torch.mm(model.contextEmbeddings[1].view((1, -1)), torch.t(embedCenter))

tensor([[-1.5118]], grad_fn=<MmBackward>)

In [70]:
denom.sum()

tensor(1.8398, grad_fn=<SumBackward0>)

In [59]:
embed_focus

tensor([-0.4778, -0.8661, -0.2023, -0.4160, -0.7439], grad_fn=<SelectBackward>)

In [56]:
model.centerEmbeddings

tensor([[-0.0176,  1.2786, -0.6968,  0.0286, -0.3026],
        [-0.7574, -0.6157, -1.5389,  0.2034,  0.7011],
        [-1.8946,  0.6623,  1.1087,  1.1338,  0.4926],
        [-0.5137, -0.0341,  0.4722, -2.4054,  1.2151],
        [ 1.8809,  0.3533, -0.1564, -0.0763, -0.0259],
        [-0.1247,  0.1701, -0.1384,  0.6218, -0.7557],
        [ 2.1279,  0.4135, -0.1488, -1.1001,  0.6704],
        [ 1.2196,  0.0604,  0.3532, -0.1608,  0.6684],
        [ 0.8554, -2.7541,  0.9763, -1.4335, -0.6975],
        [-0.2119,  0.2934,  0.6183,  2.2487, -0.9169],
        [ 1.0163,  0.9303, -1.9464,  2.3819, -0.9266],
        [-0.5092, -0.3674, -0.6601,  0.6410,  0.2617],
        [-0.3888, -1.4945,  0.2670,  0.3349, -0.4055],
        [ 1.7100,  0.1870,  1.2487, -0.8660, -0.8817],
        [ 0.2530, -1.1636, -0.4466,  0.7912, -2.3919],
        [-0.9038, -0.8937,  0.4035, -0.8282,  0.9962],
        [-0.3595,  0.0266,  1.8298, -1.8399,  1.0233],
        [ 0.0946, -0.3587,  1.8775,  1.1186, -0.6393],
        [ 

In [49]:
allContext = torch.autograd.Variable(torch.LongTensor(np.arange(0, 20)))
model.contextEmbeddings(context).shape

torch.Size([20, 5])

In [45]:
np.arange(0, 20)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [15]:
def test_skipgram(testData, model, word2Idx):
    print('====Test SkipGram===')
    correct_ct = 0
    for in_w, out_w in testData:
        in_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
        out_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))

        model.zero_grad()
        log_probs = model(in_w_var, out_w_var)
        prob = torch.exp(log_probs)
#         print(torch.max(log_probs.data, 1))
#         _, predicted = torch.max(log_probs.data, 1)
#         predicted = predicted[0]
#         print(log_probs.data)
        if prob > 0.5:#predicted == 1:
            correct_ct += 1

    print('Accuracy: {:.1f}% ({}/{})'.format(correct_ct/len(testData)*100, correct_ct, len(testData)))


test_skipgram(idxPairsTest, sg_model, word2Idx = word2Idx_test)

====Test SkipGram===
Accuracy: 80.1% (269/336)


In [None]:
def test_skipgram(testData, model, word2Idx):
    print('====Test SkipGram===')
    correct_ct = 0
    for in_w, out_w in testData:
        in_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
        out_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))

        model.zero_grad()
        log_probs = model(in_w_var, out_w_var)
        prob = torch.exp(log_probs)
#         print(torch.max(log_probs.data, 1))
        _, predicted = torch.max(log_probs.data, 1)
#         predicted = predicted[0]
#         print(log_probs.data)
        if prob > 0.5:#predicted == 1:
            correct_ct += 1

    print('Accuracy: {:.1f}% ({}/{})'.format(correct_ct/len(testData)*100, correct_ct, len(testData)))


test_skipgram(idxPairsTest, sg_model, word2Idx = word2Idx_test)

In [34]:
torch.max(log_probs.data, 1)

(tensor([-2.1355]), tensor([0]))

In [35]:
torch.max(log_probs.data, 1)

TypeError: max() received an invalid combination of arguments - got (int, int), but expected one of:
 * (Tensor input)
 * (Tensor input, Tensor other, Tensor out)
 * (Tensor input, int dim, bool keepdim, tuple of Tensors out)


## Exploring Word Embeddings

## How Domains Affect Word Embeddings