# Lab 10: Word Embeddings
Thinking of using stuff from here
https://gist.github.com/mbednarski/da08eb297304f7a66a3840e857e060a0

conda install -c conda-forge tqdm

## Janitorial Work

In [1]:
testCorpus = ["First of all, quit grinnin’ like an idiot. Indians ain’t supposed to smile like that. Get stoic.",
             "No. Like this. You gotta look mean, or people won’t respect you.",
              " people will run all over you if you don’t look mean.",
              "You gotta look like a warrior. You gotta look like you just came back from killing a buffalo.",
             "But our tribe never hunted buffalo. We were fishermen."
             "What? You wanna look like you just came back from catching a fish?",
             "This ain’t dances with salmon, you know. Thomas, you gotta look like a warrior."]

In [2]:
# Read in pubmed corpus into a text file
import glob
pubMedDataFolderPath = "data/pubMed_corpus/"
pubMedDataFiles = glob.glob(pubMedDataFolderPath + "*.txt")
pubMedCorpus = [""]*len(pubMedDataFiles)
for idx, pubMedDataPath in enumerate(pubMedDataFiles):
    with open(pubMedDataPath, "r") as pubMedFile:
        text = pubMedFile.read().strip()
        pubMedCorpus[idx] = text
print("{} pub med abstracts".format(len(pubMedCorpus)))

1767 pub med abstracts


In [3]:
# Read in the ap corpus
apTextFile = "data/ap.txt"
apCorpus = []
readText = False
with open(apTextFile) as apDataFile:
    for line in apDataFile:
        if readText:
            apCorpus.append(line.strip())
            readText = False
        if line == "<TEXT>\n":
            readText = True
print("{} ap articles".format(len(apCorpus)))

2246 ap articles


In [5]:
import string
def removePunctuation(myStr):
    excludedCharacters = string.punctuation + "’"
    newStr = "".join(char for char in myStr if char not in excludedCharacters)
    return(newStr)
def tokenize_corpus(corpus):
    tokens = [removePunctuation(x).split() for x in corpus]
    return tokens

apCorpusTokenized = tokenize_corpus(apCorpus)
pubMedCorpusTokenized = tokenize_corpus(pubMedCorpus)
testCorpusTokenized = tokenize_corpus(testCorpus)

In [121]:
import time
from tqdm import tqdm
def extractVocabMappers(tokenizedCorpus):
    vocabulary = []
    for sentence in tqdm(tokenizedCorpus):
        for token in sentence:
#             print(token)
            if token not in vocabulary:
                vocabulary.append(token)

    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
    return(word2idx, idx2word)

# start = time.time()
print("Building ap corpus vocabulary")
word2Idx_ap, idx2Word_ap = extractVocabMappers(apCorpusTokenized)
# print("ap data tokenized in {} seconds".format(time.time() - start))
# start = time.time()
print("Building pubMed corpus vocabulary")
word2Idx_pubMed, idx2Word_pubMed = extractVocabMappers(pubMedCorpusTokenized)
# print("pubmed data tokenized in {} seconds".format(time.time() - start))
# start = time.time()
print("Building test corpus vocabulary")
word2Idx_test, idx2Word_test = extractVocabMappers(testCorpusTokenized)
# print("test data tokenized in {} seconds".format(time.time() - start))

  1%|          | 16/2246 [00:00<00:14, 157.40it/s]

Building ap corpus vocabulary


100%|██████████| 2246/2246 [02:34<00:00, 12.87it/s]
  2%|▏         | 39/1767 [00:00<00:04, 382.22it/s]

Building pubMed corpus vocabulary


100%|██████████| 1767/1767 [00:27<00:00, 63.72it/s]
100%|██████████| 6/6 [00:00<00:00, 16090.68it/s]

Building test corpus vocabulary





In [17]:
testCorpus

['First of all, quit grinnin’ like an idiot. Indians ain’t supposed to smile like that. Get stoic.',
 'No. Like this. You gotta look mean, or people won’t respect you.',
 ' people will run all over you if you don’t look mean.',
 'You gotta look like a warrior. You gotta look like you just came back from killing a buffalo.',
 'But our tribe never hunted buffalo. We were fishermen.What? You wanna look like you just came back from catching a fish?',
 'This ain’t dances with salmon, you know. Thomas, you gotta look like a warrior.']

## Word2Vec Implementation

In [43]:
import numpy as np
import torch
from torch import nn

In [73]:
testCorpusTokenized

[['First',
  'of',
  'all',
  'quit',
  'grinnin',
  'like',
  'an',
  'idiot',
  'Indians',
  'aint',
  'supposed',
  'to',
  'smile',
  'like',
  'that',
  'Get',
  'stoic'],
 ['No',
  'Like',
  'this',
  'You',
  'gotta',
  'look',
  'mean',
  'or',
  'people',
  'wont',
  'respect',
  'you'],
 ['people',
  'will',
  'run',
  'all',
  'over',
  'you',
  'if',
  'you',
  'dont',
  'look',
  'mean'],
 ['You',
  'gotta',
  'look',
  'like',
  'a',
  'warrior',
  'You',
  'gotta',
  'look',
  'like',
  'you',
  'just',
  'came',
  'back',
  'from',
  'killing',
  'a',
  'buffalo'],
 ['But',
  'our',
  'tribe',
  'never',
  'hunted',
  'buffalo',
  'We',
  'were',
  'fishermenWhat',
  'You',
  'wanna',
  'look',
  'like',
  'you',
  'just',
  'came',
  'back',
  'from',
  'catching',
  'a',
  'fish'],
 ['This',
  'aint',
  'dances',
  'with',
  'salmon',
  'you',
  'know',
  'Thomas',
  'you',
  'gotta',
  'look',
  'like',
  'a',
  'warrior']]

In [76]:
def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 2
    idxPairs = []
    # for each sentence
    for sentence in tokenizedCorpus:
        indices = [word2Idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs) # it will be useful to have this as numpy array
    return(idxPairs)
idxPairsTest = generateObservations(tokenizedCorpus = testCorpusTokenized, word2Idx = word2Idx_test)

In [77]:
idxPairsTest

array([['First', 'of'],
       ['First', 'all'],
       ['of', 'First'],
       ['of', 'all'],
       ['of', 'quit'],
       ['all', 'First'],
       ['all', 'of'],
       ['all', 'quit'],
       ['all', 'grinnin'],
       ['quit', 'of'],
       ['quit', 'all'],
       ['quit', 'grinnin'],
       ['quit', 'like'],
       ['grinnin', 'all'],
       ['grinnin', 'quit'],
       ['grinnin', 'like'],
       ['grinnin', 'an'],
       ['like', 'quit'],
       ['like', 'grinnin'],
       ['like', 'an'],
       ['like', 'idiot'],
       ['an', 'grinnin'],
       ['an', 'like'],
       ['an', 'idiot'],
       ['an', 'Indians'],
       ['idiot', 'like'],
       ['idiot', 'an'],
       ['idiot', 'Indians'],
       ['idiot', 'aint'],
       ['Indians', 'an'],
       ['Indians', 'idiot'],
       ['Indians', 'aint'],
       ['Indians', 'supposed'],
       ['aint', 'idiot'],
       ['aint', 'Indians'],
       ['aint', 'supposed'],
       ['aint', 'to'],
       ['supposed', 'Indians'],
       ['suppose

In [118]:

def generateObservations(tokenizedCorpus, word2Idx):
    window_size = 2
    idxPairs = []
    # for each sentence
    for sentence in tokenizedCorpus:
        indices = [word2Idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(sentence)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                    continue
                idxPairs.append((sentence[center_word_pos], sentence[context_word_pos]))

    idxPairs = np.array(idxPairs) # it will be useful to have this as numpy array
    return(idxPairs)

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embd_size):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
    
    def forward(self, focus, context):
        embed_focus = self.embeddings(focus).view((1, -1))
        embed_ctx = self.embeddings(context).view((1, -1))
        score = torch.mm(embed_focus, torch.t(embed_ctx))
        log_probs = nn.functional.logsigmoid(score)
    
        return(log_probs)

idxPairsTest = generateObservations(tokenizedCorpus = testCorpusTokenized, word2Idx = word2Idx_test)
embd_size = 100
learning_rate = 0.001
n_epoch = 30


def train_skipgram(vocabSize, embeddingSize, trainingData, word2Idx):
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocabSize, embeddingSize)
    print(model)
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    
    for epoch in range(n_epoch):
        total_loss = .0
        for in_w, out_w in trainingData:
            in_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
            out_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))
            
            model.zero_grad()
            log_probs = model(in_w_var, out_w_var)
            loss = loss_fn(log_probs[0], torch.autograd.Variable(torch.Tensor([1])))
            
            loss.backward()
            optimizer.step()

            total_loss += loss.data.numpy()
        losses.append(total_loss)
        if epoch % 10 == 0:    
            print(f'Loss at epo {epoch}: {total_loss/len(trainingData)}')
    return(model, losses)
    
sg_model, sg_losses = train_skipgram(vocabSize = len(word2Idx_test), embeddingSize = 5,
                                    trainingData = idxPairsTest, word2Idx = word2Idx_test)

SkipGram(
  (embeddings): Embedding(58, 5)
)
Loss at epo 0: 6.003527005513509
Loss at epo 10: 3.265365608036518
Loss at epo 20: 2.597180875284331


In [119]:
def test_skipgram(testData, model, word2Idx):
    print('====Test SkipGram===')
    correct_ct = 0
    for in_w, out_w in testData:
        in_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[in_w]]))
        out_w_var = torch.autograd.Variable(torch.LongTensor([word2Idx[out_w]]))

        model.zero_grad()
        log_probs = model(in_w_var, out_w_var)
        prob = torch.exp(log_probs)
#         print(torch.max(log_probs.data, 1))
#         _, predicted = torch.max(log_probs.data, 1)
#         predicted = predicted[0]
#         print(log_probs.data)
        if prob > 0.5:#predicted == 1:
            correct_ct += 1

    print('Accuracy: {:.1f}% ({}/{})'.format(correct_ct/len(testData)*100, correct_ct, len(testData)))


test_skipgram(idxPairsTest, sg_model, word2Idx = word2Idx_test)

====Test SkipGram===
Accuracy: 79.2% (266/336)


## Exploring Word Embeddings

## How Domains Affect Word Embeddings