In [None]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

In [3]:
import os
import pandas as pd

import torch 
from torch import nn
import multiprocessing as mp
from numpy import random
import numpy as np
import functools
from typing import Dict, List, Iterator, Union, Callable
# In AllenNLP each training example is represented as an Instance containing Fields
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField

from allennlp.data.iterators import BucketIterator

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.tokenizers import Token

from allennlp.models import Model
from allennlp.training.trainer import Trainer
from allennlp.data.vocabulary import Vocabulary

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

from allennlp.predictors import SentenceTaggerPredictor

import time
import re

torch.manual_seed(42)


<torch._C.Generator at 0x7f1d2563a950>

In [24]:
class SentenceDataset():

    def __init__(self, data_series: List[pd.Series], csvs: List[str], 
                 token_indexers: Dict[str, TokenIndexer] = None, 
                 tokenizer: WordTokenizer = SpacyWordSplitter(),
                 sentence_splitter: SpacySentenceSplitter = SpacySentenceSplitter(rule_based=True),
                 context_size: int = 1) -> None:
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.tokenizer = tokenizer
        self.sentence_splitter = sentence_splitter
        self.context_size = context_size
        self.data_series = data_series
        self.csvs = csvs
        self.queue = mp.Queue(100000)
        # self.sentences = []
        # self.next_sentence_ids = []

    @staticmethod
    def text_to_instance(tokens: List[Token], token_indexers, context: List[Token] = None) -> Instance:
        sentence_field = TextField(tokens, token_indexers)
        fields = {"sentence": sentence_field}
        
        if context:
            label_field = TextField(context, token_indexers)
            fields["context"] = label_field
        
        return Instance(fields)
    
    def __iter__(self) -> Iterator[Instance]:
        select_context_part = functools.partial(self.select_context_from_sentence, 
                                                context_size = self.context_size)
        instances = self.iterate_data(select_context_part, True)
        if isinstance(instances, list):
            raise ConfigurationError("For a lazy dataset reader, word_context() must return a generator")
        return instances
         
    @staticmethod       
    def select_context_from_sentence(sentence: List[Token], context_size: int):
        j = random.randint(0, len(sentence))
        context1 = [word for word in sentence[j-context_size:j]]
        while len(context1) < context_size:
            context1.insert(0, Token())
        context2 = [word for word in sentence[j+1:j+context_size+1]]
        while len(context1) < context_size:
            context2.append(Token())
        context1.extend(context2)
        return [sentence[j]], context1

    def it_from_ser(self) -> Iterator[Instance]:
        instances = self.iterate_data(None, False)
        if isinstance(instances, list):
            raise ConfigurationError("For a lazy dataset reader, word_context() must return a generator")
            
        return instances
    
    def process_chunk(self, sentences: List[str], apply_to_sentence: Callable = None, 
                      split_sentences: bool = True):
        if split_sentences:
            sentences = self.sentence_splitter.batch_split_sentences(sentences)
            ret_list = []
            for sent in sentences:
                ret_list.extend(sent)
            sentences = ret_list
        ret_list = self.tokenizer.batch_split_words(sentences)
        return ret_list
                               
    
    def iterate_data(self, apply_to_sentence: Callable,
                     split_sentences: bool = True) -> Iterator[Union[Instance, Instance]]:
        next_sentence_id = 0
        for ser in self.data_series:
            sentences = ser[pd.notnull(ser)].values
            sentence_word_list = self.process_chunk(sentences, apply_to_sentence, split_sentences)
            for sentence in sentence_word_list:
                if len(sentence) < 3:
                    continue
                if apply_to_sentence is not None:
                    sentence, context = apply_to_sentence(sentence)
                    yield SentenceDataset.text_to_instance(sentence, self.token_indexers, context)
                else:
                    yield SentenceDataset.text_to_instance(sentence, self.token_indexers)
        for f in self.csvs:
            reader = pd.read_csv(f, chunksize=int(1e4), dtype=str)
            for chunk in reader:
                sentences = chunk['sentence']
                sentences = sentences[pd.notnull(sentences)].values
                sentence_word_list = self.process_chunk(sentences, apply_to_sentence, False)
                for sentence in sentence_word_list:
                    if len(sentence) < 3:
                        continue
                    if apply_to_sentence is not None:
                        sentence, context = apply_to_sentence(sentence)
                        yield SentenceDataset.text_to_instance(sentence, self.token_indexers, context)
                    else:
                        yield SentenceDataset.text_to_instance(sentence, self.token_indexers)


In [25]:
class SkipGramLanguageModelerSparse(Model):

    def __init__(self, vocab: Vocabulary, embedding_dim, context_size, 
                 embedding_freq, negative_sampling_size):
        super().__init__(vocab)
        vocab_size = vocab.get_vocab_size('tokens')
        print(vocab_size)
        self.embedding_freq = torch.autograd.Variable(torch.Tensor(embedding_freq)**(0.75))
        self.negative_sampling_size = negative_sampling_size
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()
        self.last_loss = None

    def forward(self, sentence, context=None):
        
        word = sentence['tokens']
        
        # Allen NLP demands a dictionary to be returned
        # It has to contain the key loss for training purposes
        output = {}
        word_emb = self.word_embedding(word)
        output['vec'] = word_emb + self.context_embedding(word)

        if context is None:
            #return embedding
            return output
        
        context = context['tokens']
        
        context = self.context_embedding(context)
        word_pos = word_emb.expand_as(context)
        
        product_pos = (word_pos*context).sum(dim=-1).mean()
        target_pos = torch.autograd.Variable(torch.ones(product_pos.size()))
        loss_positive = nn.functional.binary_cross_entropy_with_logits(product_pos, target_pos)
        negative_context = self.embedding_freq.multinomial(
            self.negative_sampling_size*context.size(0)*context.size(1), replacement=True).to(self.device())
        negative_context = negative_context.view(-1,self.negative_sampling_size*context.size(1))
        negative_context = self.context_embedding(negative_context)
        word_neg = word_emb.expand_as(negative_context)
        product_neg = (word_neg*negative_context).sum(dim=-1).mean()
        target_neg = torch.autograd.Variable(torch.zeros(product_neg.size()))
        loss_negative = nn.functional.binary_cross_entropy_with_logits(product_neg,target_neg)
        self.last_loss = loss_positive + loss_negative
        output['loss'] = loss_positive + loss_negative
        output['vec'] = word_pos + context
        return output
    
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"bce": self.last_loss}
    
    def device(self):
        return next(self.parameters()).device
    
    def log_probs(self, inp, out, neg):
        losses = []
        inp = self.embeddings(inp)
        out = self.embeddings(out)
        neg = self.embeddings(neg)
        score = torch.mul(inp, out)
        score = torch.sum(score, dim=1)
        score = nn.functional.logsigmoid(score)
        neg_score = torch.bmm(neg, inp.unsqueeze(2)).squeeze()
        neg_score = torch.sum(neg_score, dim=0)
        neg_score = nn.functional.logsigmoid(-1 * neg_score)
        losses.append(sum(neg_score))
        losses.append(sum(score))
        return -1 * sum(losses)
    
class SkipGramLanguageModelerSparseNoNeg(Model):

    def __init__(self, vocab: Vocabulary, embedding_dim, context_size):
        super().__init__(vocab)
        vocab_size = vocab.get_vocab_size('tokens')
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()
        self.last_loss = None

    def forward(self, sentence, context=None):
        
        word = sentence['tokens']
        
        # Allen NLP demands a dictionary to be returned
        # It has to contain the key loss for training purposes
        output = {}
        word_emb = self.word_embedding(word)
        output['vec'] = word_emb + self.context_embedding(word)

        if context is None:
            #return embedding
            return output
        
        context = context['tokens']
        
        context = self.context_embedding(context)
        word_pos = word_emb.expand_as(context)
        
        product_pos = (word_pos*context).sum(dim=-1).mean()
        target_pos = torch.autograd.Variable(torch.ones(product_pos.size()))
        loss_positive = nn.functional.binary_cross_entropy_with_logits(product_pos, target_pos)
        self.last_loss = loss_positive
        output['loss'] = loss_positive
        output['vec'] = word_pos + context
        return output
    
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"bce": self.last_loss}
    
    def device(self):
        return next(self.parameters()).device
    
    def log_probs(self, inp, out, neg):
        losses = []
        inp = self.embeddings(inp)
        out = self.embeddings(out)
        neg = self.embeddings(neg)
        score = torch.mul(inp, out)
        score = torch.sum(score, dim=1)
        score = nn.functional.logsigmoid(score)
        neg_score = torch.bmm(neg, inp.unsqueeze(2)).squeeze()
        neg_score = torch.sum(neg_score, dim=0)
        neg_score = nn.functional.logsigmoid(-1 * neg_score)
        losses.append(sum(neg_score))
        losses.append(sum(score))
        return -1 * sum(losses)
    

In [26]:
path = '/home/jendrik/git/DLWorkshop/notebooks/southpark_full.csv'
corpus = pd.read_csv(path, sep='\t', dtype=str)
wiki_path = '/home/jendrik/data/enwiki_2018_09_25.csv'
ds = SentenceDataset([corpus['spoken']], [wiki_path])
train_dataset = ds.it_from_ser()
word2vec_dataset = ds

In [None]:
vocab = Vocabulary.from_instances(train_dataset, min_count={'': 30})

In [None]:
vocab.save_to_files('vocab.store')
with open('vocab.dict', 'w') as f:
    for k, v in vocab._retained_counter['tokens'].items():
        f.write(f'{k},{v}\n')

In [9]:
add_dict = {}
with open('vocab.dict', 'r') as f:
    content = f.read()
    lines = content.split('\n')
    # line = f.readline()
    for line in lines:
        try:
            comma_index = line.rfind(',')
            token = line[:comma_index]
            if ('|' in token or '/' in token 
                or len(token) == 1 or 'style="' in token or '<' in token
               or '!colspan=' in token):
                continue
            count = int(line[comma_index+1:])
            if count < 500:
                continue
            add_dict[token] = count
            # line = f.readline()
        except:
            print(line)

            
    
vocab_new = Vocabulary(counter={'tokens': add_dict}, min_count={'tokens': 10}) 
print(len(list(add_dict.values())))
print(vocab_new.get_vocab_size('tokens'))


97151
97153


In [10]:
vocab_new.print_statistics()



----Vocabulary Statistics----


Top 10 most frequent tokens in namespace 'tokens':
	Token: the		Frequency: 129471660
	Token: of		Frequency: 71053610
	Token: and		Frequency: 60957403
	Token: in		Frequency: 53374979
	Token: to		Frequency: 43053589
	Token: was		Frequency: 26424022
	Token: is		Frequency: 20789341
	Token: The		Frequency: 19647930
	Token: for		Frequency: 17131870
	Token: as		Frequency: 16128262

Top 10 longest tokens in namespace 'tokens':
	Token: Verwaltungsgemeinschaft		length: 23	Frequency: 751
	Token: ProcellariiformesFamily		length: 23	Frequency: 678
	Token: PricewaterhouseCoopers		length: 22	Frequency: 824
	Token: CharadriiformesFamily		length: 21	Frequency: 2596
	Token: electionParliamentary		length: 21	Frequency: 1309
	Token: AccipitriformesFamily		length: 21	Frequency: 660
	Token: PelecaniformesFamily		length: 20	Frequency: 953
	Token: internationalization		length: 20	Frequency: 935
	Token: uncharacteristically		length: 20	Frequency: 876
	Token: institutionalizati

In [11]:
vocab_new.save_to_files('vocab_new.store')
with open('vocab_new.dict', 'w') as f:
    for k, v in add_dict.items():
        f.write(f'{k},{v}\n')



In [12]:
# vocab_new = Vocabulary.from_files('vocab_new.store')
add_dict = {}
with open('vocab_new.dict', 'r') as f:
    content = f.read()
    lines = content.split('\n')
    # line = f.readline()
    for line in lines:
        try:
            comma_index = line.rfind(',')
            token = line[:comma_index]
            count = int(line[comma_index+1:])
            add_dict[token] = count
            # line = f.readline()
        except ValueError:
            print(line)
vocab_new = Vocabulary(counter={'tokens': add_dict}, min_count={'tokens': 10}) 




In [13]:
print(len(add_dict.values()))
print(vocab_new.get_vocab_size('tokens'))

97151
97153


In [27]:
embedding_dims = 128
batch_size = 256
context_size = 3
n_batches = 64#len(sentences)//batch_size+1
vec_dims = 128
neg_samples = 10
dict_values = list(add_dict.values())
dict_values.insert(0, 0)
dict_values.insert(0, 0)
losses = []
loss_function = nn.NLLLoss()
model = SkipGramLanguageModelerSparse(
    vocab_new, embedding_dims, context_size, 
    dict_values, 
    neg_samples)
optimizer = torch.optim.SparseAdam(model.parameters(), lr=1e-4)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

97153


SkipGramLanguageModelerSparse(
  (word_embedding): Embedding(97153, 128, sparse=True)
  (context_embedding): Embedding(97153, 128, sparse=True)
  (linear): Linear(in_features=128, out_features=97153, bias=True)
  (relu): ReLU()
)

In [28]:
import sys
for param in model.parameters():
    print(param.shape)

torch.Size([97153, 128])
torch.Size([97153, 128])
torch.Size([97153, 128])
torch.Size([97153])


In [None]:


iterator = BucketIterator(batch_size=batch_size, sorting_keys=[("sentence", "num_tokens")])

iterator.index_with(vocab_new)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=word2vec_dataset,
                  # validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=2,
                  cuda_device=0)

trainer.train()





  0%|          | 0/1 [00:00<?, ?it/s][A[A
bce: 0.8895, loss: 0.8895 ||: 100%|██████████| 1/1 [00:21<00:00, 21.35s/it][A
bce: 0.8836, loss: 1.3152 ||: : 13it [00:21, 14.95s/it]                    [A
bce: 1.2716, loss: 1.1761 ||: : 24it [00:21, 10.47s/it][A
bce: 0.9747, loss: 1.1275 ||: : 36it [00:21,  7.33s/it][A
bce: 0.9131, loss: 1.0851 ||: : 47it [00:21,  5.13s/it][A
bce: 0.3580, loss: 1.0735 ||: : 59it [00:21,  3.60s/it][A
bce: 0.8715, loss: 1.0573 ||: : 71it [00:21,  2.52s/it][A
bce: 0.6480, loss: 1.0160 ||: : 82it [00:22,  1.77s/it][A
bce: 0.6376, loss: 0.9907 ||: : 93it [00:22,  1.24s/it][A
bce: 0.6708, loss: 0.9860 ||: : 104it [00:22,  1.15it/s][A
bce: 0.9630, loss: 0.9596 ||: : 116it [00:22,  1.63it/s][A
bce: 0.8554, loss: 0.9684 ||: : 127it [00:22,  2.32it/s][A
bce: 0.4161, loss: 0.9371 ||: : 139it [00:22,  3.29it/s][A
bce: 0.6441, loss: 0.9169 ||: : 151it [00:22,  4.64it/s][A
bce: 0.8480, loss: 0.8895 ||: : 163it [00:22,  6.51it/s][A
bce: 0.2427, loss: 0.88

bce: 0.0502, loss: 0.3049 ||: : 1526it [00:35, 127.07it/s][A
bce: 0.0601, loss: 0.3032 ||: : 1539it [00:36, 114.38it/s][A
bce: 0.0509, loss: 0.3014 ||: : 1551it [00:36, 99.51it/s] [A
bce: 0.0307, loss: 0.2998 ||: : 1562it [00:36, 90.87it/s][A
bce: 0.0694, loss: 0.2981 ||: : 1572it [00:36, 85.25it/s][A
bce: 0.1205, loss: 0.2968 ||: : 1581it [00:36, 82.32it/s][A
bce: 0.0529, loss: 0.2952 ||: : 1592it [00:36, 87.97it/s][A
bce: 0.0195, loss: 0.2938 ||: : 1602it [00:36, 89.93it/s][A
bce: 0.0473, loss: 0.2920 ||: : 1614it [00:36, 97.05it/s][A
bce: 0.0250, loss: 0.2902 ||: : 1627it [00:36, 104.26it/s][A
bce: 0.0116, loss: 0.2887 ||: : 1639it [00:37, 106.96it/s][A
bce: 0.0250, loss: 0.2866 ||: : 1653it [00:37, 113.19it/s][A
bce: 0.0107, loss: 0.2849 ||: : 1667it [00:37, 118.07it/s][A
bce: 0.0524, loss: 0.2829 ||: : 1681it [00:37, 122.07it/s][A
bce: 0.0557, loss: 0.2811 ||: : 1695it [00:37, 124.87it/s][A
bce: 0.5113, loss: 0.2797 ||: : 1708it [00:37, 122.41it/s][A
bce: 0.0236, l

bce: 0.0008, loss: 0.1671 ||: : 3251it [00:50, 127.07it/s][A
bce: 0.0099, loss: 0.1666 ||: : 3264it [00:50, 127.16it/s][A
bce: 0.1268, loss: 0.1662 ||: : 3277it [00:56,  7.52it/s] [A
bce: 0.3250, loss: 0.1667 ||: : 3290it [00:56, 10.48it/s][A
bce: 0.0111, loss: 0.1672 ||: : 3303it [00:56, 14.44it/s][A
bce: 0.2965, loss: 0.1672 ||: : 3314it [00:56, 19.32it/s][A
bce: 0.5284, loss: 0.1682 ||: : 3326it [00:56, 25.80it/s][A
bce: 0.0546, loss: 0.1686 ||: : 3339it [00:56, 33.97it/s][A
bce: 0.0204, loss: 0.1687 ||: : 3351it [00:56, 43.27it/s][A
bce: 0.0532, loss: 0.1688 ||: : 3364it [00:56, 54.08it/s][A
bce: 0.3033, loss: 0.1689 ||: : 3376it [00:56, 61.53it/s][A
bce: 0.0226, loss: 0.1687 ||: : 3389it [00:56, 72.33it/s][A
bce: 0.0025, loss: 0.1691 ||: : 3401it [00:57, 81.36it/s][A
bce: 0.4487, loss: 0.1693 ||: : 3414it [00:57, 90.65it/s][A
bce: 0.0887, loss: 0.1703 ||: : 3427it [00:57, 99.68it/s][A
bce: 0.0135, loss: 0.1701 ||: : 3440it [00:57, 107.09it/s][A
bce: 0.0030, loss: 0

bce: 0.0191, loss: 0.1467 ||: : 4929it [01:34, 67.12it/s][A
bce: 0.0019, loss: 0.1466 ||: : 4942it [01:34, 76.24it/s][A
bce: 0.0527, loss: 0.1469 ||: : 4955it [01:35, 77.98it/s][A
bce: 0.1327, loss: 0.1468 ||: : 4967it [01:35, 85.61it/s][A
bce: 0.0029, loss: 0.1466 ||: : 4979it [01:35, 93.62it/s][A
bce: 0.0059, loss: 0.1463 ||: : 4992it [01:35, 100.39it/s][A
bce: 0.0225, loss: 0.1461 ||: : 5004it [01:35, 101.60it/s][A
bce: 0.0001, loss: 0.1459 ||: : 5016it [01:35, 104.01it/s][A
bce: 0.0522, loss: 0.1456 ||: : 5028it [01:35, 98.42it/s] [A
bce: 0.0755, loss: 0.1454 ||: : 5039it [01:35, 97.68it/s][A
bce: 0.0715, loss: 0.1452 ||: : 5050it [01:35, 98.54it/s][A
bce: 0.0008, loss: 0.1449 ||: : 5061it [01:36, 99.55it/s][A
bce: 0.1328, loss: 0.1450 ||: : 5072it [01:36, 96.42it/s][A
bce: 0.0858, loss: 0.1448 ||: : 5082it [01:36, 94.31it/s][A
bce: 0.0024, loss: 0.1448 ||: : 5092it [01:36, 91.38it/s][A
bce: 0.1631, loss: 0.1446 ||: : 5102it [01:36, 89.81it/s][A
bce: 0.2725, loss: 0

bce: 0.0191, loss: 0.1227 ||: : 6596it [02:13, 113.38it/s][A
bce: 0.0078, loss: 0.1225 ||: : 6608it [02:13, 109.28it/s][A
bce: 0.0054, loss: 0.1223 ||: : 6621it [02:14, 113.78it/s][A
bce: 0.0003, loss: 0.1221 ||: : 6633it [02:14, 104.95it/s][A
bce: 0.0891, loss: 0.1220 ||: : 6645it [02:14, 108.77it/s][A
bce: 0.0108, loss: 0.1218 ||: : 6658it [02:14, 113.03it/s][A
bce: 0.0084, loss: 0.1216 ||: : 6670it [02:14, 114.83it/s][A
bce: 0.0014, loss: 0.1213 ||: : 6683it [02:14, 118.13it/s][A
bce: 0.0193, loss: 0.1212 ||: : 6695it [02:19,  7.50it/s] [A
bce: 0.0500, loss: 0.1210 ||: : 6704it [02:19, 10.34it/s][A
bce: 0.0111, loss: 0.1208 ||: : 6717it [02:19, 14.27it/s][A
bce: 0.0039, loss: 0.1206 ||: : 6730it [02:20, 19.42it/s][A
bce: 0.0003, loss: 0.1204 ||: : 6741it [02:20, 25.48it/s][A
bce: 0.0000, loss: 0.1202 ||: : 6754it [02:20, 33.55it/s][A
bce: 0.0003, loss: 0.1200 ||: : 6766it [02:20, 42.26it/s][A
bce: 0.0001, loss: 0.1200 ||: : 6777it [02:20, 51.14it/s][A
bce: 0.0015, lo

bce: 0.0003, loss: 0.1038 ||: : 8280it [02:57, 15.49it/s][A
bce: 0.0018, loss: 0.1037 ||: : 8293it [02:57, 20.99it/s][A
bce: 0.0002, loss: 0.1035 ||: : 8305it [02:57, 27.56it/s][A
bce: 0.0649, loss: 0.1034 ||: : 8318it [02:58, 35.87it/s][A
bce: 0.0203, loss: 0.1033 ||: : 8330it [02:58, 45.22it/s][A
bce: 0.0015, loss: 0.1031 ||: : 8343it [02:58, 55.98it/s][A
bce: 0.0002, loss: 0.1031 ||: : 8356it [02:58, 67.10it/s][A
bce: 0.0003, loss: 0.1029 ||: : 8368it [02:58, 76.95it/s][A
bce: 0.0016, loss: 0.1028 ||: : 8381it [02:58, 87.30it/s][A
bce: 0.0068, loss: 0.1027 ||: : 8394it [02:58, 90.31it/s][A
bce: 0.0117, loss: 0.1026 ||: : 8406it [02:58, 97.06it/s][A
bce: 0.0001, loss: 0.1024 ||: : 8418it [02:58, 102.84it/s][A
bce: 0.0008, loss: 0.1023 ||: : 8431it [02:59, 107.44it/s][A
bce: 0.0057, loss: 0.1021 ||: : 8443it [02:59, 109.39it/s][A
bce: 0.0027, loss: 0.1020 ||: : 8456it [02:59, 113.98it/s][A
bce: 0.0058, loss: 0.1019 ||: : 8469it [02:59, 117.76it/s][A
bce: 0.0008, loss: 

bce: 0.0170, loss: 0.0894 ||: : 9958it [03:36, 103.50it/s][A
bce: 0.0000, loss: 0.0893 ||: : 9971it [03:36, 109.74it/s][A
bce: 0.0022, loss: 0.0892 ||: : 9984it [03:36, 113.98it/s][A
bce: 0.0067, loss: 0.0891 ||: : 9997it [03:36, 117.60it/s][A
bce: 0.0200, loss: 0.0890 ||: : 10010it [03:36, 116.70it/s][A
bce: 0.0000, loss: 0.0889 ||: : 10023it [03:36, 118.30it/s][A
bce: 0.0014, loss: 0.0888 ||: : 10036it [03:37, 118.88it/s][A
bce: 0.0065, loss: 0.0887 ||: : 10049it [03:37, 120.55it/s][A
bce: 0.0022, loss: 0.0886 ||: : 10062it [03:37, 122.61it/s][A
bce: 0.0107, loss: 0.0885 ||: : 10075it [03:37, 122.02it/s][A
bce: 0.0000, loss: 0.0884 ||: : 10088it [03:37, 121.84it/s][A
bce: 0.0114, loss: 0.0883 ||: : 10101it [03:37, 121.10it/s][A
bce: 0.0002, loss: 0.0882 ||: : 10114it [03:42,  8.37it/s] [A
bce: 0.0001, loss: 0.0881 ||: : 10127it [03:42, 11.62it/s][A
bce: 0.0002, loss: 0.0880 ||: : 10138it [03:42, 15.85it/s][A
bce: 0.0008, loss: 0.0879 ||: : 10151it [03:42, 21.46it/s][A

bce: 0.0003, loss: 0.0792 ||: : 11547it [04:14, 86.97it/s][A
bce: 0.0004, loss: 0.0791 ||: : 11560it [04:14, 95.55it/s][A
bce: 0.0000, loss: 0.0790 ||: : 11571it [04:15, 87.45it/s][A
bce: 0.0000, loss: 0.0789 ||: : 11581it [04:15, 82.48it/s][A
bce: 0.0003, loss: 0.0789 ||: : 11590it [04:15, 79.71it/s][A
bce: 0.0190, loss: 0.0790 ||: : 11599it [04:15, 77.70it/s][A
bce: 0.0013, loss: 0.0789 ||: : 11608it [04:15, 74.55it/s][A
bce: 0.0000, loss: 0.0789 ||: : 11616it [04:15, 74.30it/s][A
bce: 0.0004, loss: 0.0788 ||: : 11624it [04:15, 73.79it/s][A
bce: 0.0002, loss: 0.0787 ||: : 11632it [04:15, 73.42it/s][A
bce: 0.0162, loss: 0.0787 ||: : 11640it [04:15, 73.51it/s][A
bce: 0.0001, loss: 0.0786 ||: : 11648it [04:16, 73.25it/s][A
bce: 0.0000, loss: 0.0786 ||: : 11656it [04:16, 73.50it/s][A
bce: 0.0056, loss: 0.0785 ||: : 11664it [04:16, 73.56it/s][A
bce: 0.0017, loss: 0.0785 ||: : 11672it [04:21,  5.26it/s][A
bce: 0.0032, loss: 0.0784 ||: : 11680it [04:21,  7.29it/s][A
bce: 0.0

bce: 0.0000, loss: 0.0716 ||: : 13058it [04:53, 79.23it/s][A
bce: 0.0003, loss: 0.0716 ||: : 13067it [04:53, 78.35it/s][A
bce: 0.0001, loss: 0.0716 ||: : 13077it [04:53, 83.55it/s][A
bce: 0.0004, loss: 0.0715 ||: : 13088it [04:53, 89.23it/s][A
bce: 0.0105, loss: 0.0714 ||: : 13098it [04:53, 81.51it/s][A
bce: 0.0009, loss: 0.0714 ||: : 13108it [04:54, 84.14it/s][A
bce: 0.0460, loss: 0.0713 ||: : 13117it [04:54, 73.66it/s][A
bce: 0.0000, loss: 0.0713 ||: : 13125it [04:54, 74.26it/s][A
bce: 0.0145, loss: 0.0713 ||: : 13133it [04:54, 70.51it/s][A
bce: 0.0114, loss: 0.0712 ||: : 13141it [04:54, 68.76it/s][A
bce: 0.0079, loss: 0.0712 ||: : 13149it [04:54, 70.38it/s][A
bce: 0.0000, loss: 0.0711 ||: : 13159it [04:54, 76.06it/s][A
bce: 0.0009, loss: 0.0711 ||: : 13167it [04:54, 75.59it/s][A
bce: 0.0004, loss: 0.0710 ||: : 13178it [04:54, 82.98it/s][A
bce: 0.0000, loss: 0.0710 ||: : 13187it [04:55, 82.38it/s][A
bce: 0.0019, loss: 0.0710 ||: : 13196it [04:55, 78.37it/s][A
bce: 0.0

bce: 0.0012, loss: 0.0672 ||: : 14042it [05:28, 66.51it/s][A
bce: 0.0000, loss: 0.0671 ||: : 14052it [05:29, 73.31it/s][A
bce: 0.0175, loss: 0.0671 ||: : 14060it [05:29, 73.22it/s][A
bce: 0.0002, loss: 0.0671 ||: : 14069it [05:29, 77.51it/s][A
bce: 0.0000, loss: 0.0670 ||: : 14078it [05:29, 79.84it/s][A
bce: 0.0001, loss: 0.0670 ||: : 14089it [05:29, 85.45it/s][A
bce: 0.0014, loss: 0.0669 ||: : 14098it [05:29, 84.31it/s][A
bce: 0.0002, loss: 0.0669 ||: : 14107it [05:29, 82.79it/s][A
bce: 0.0001, loss: 0.0669 ||: : 14118it [05:29, 87.96it/s][A
bce: 0.0001, loss: 0.0668 ||: : 14130it [05:29, 93.73it/s][A
bce: 0.0005, loss: 0.0668 ||: : 14141it [05:30, 95.55it/s][A
bce: 0.0000, loss: 0.0667 ||: : 14151it [05:30, 87.99it/s][A
bce: 0.0054, loss: 0.0667 ||: : 14161it [05:36,  5.14it/s][A
bce: 0.0000, loss: 0.0666 ||: : 14168it [05:36,  7.07it/s][A
bce: 0.0000, loss: 0.0666 ||: : 14175it [05:36,  9.65it/s][A
bce: 0.0425, loss: 0.0666 ||: : 14182it [05:36, 12.99it/s][A
bce: 0.0

In [None]:
predictor = SentenceTaggerPredictor(model, dataset_reader=ds)

vec = predictor.predict("Cartman")['vec']

In [None]:
vec

In [None]:
def get_closest_word(vec, model, vocab):
    score = vec.view(1, -1).expand_as(model.word_embedding.weight)*model.word_embedding.weight
    score = score.sum(dim=-1)
    norm = vec.view(1, -1).expand_as(model.word_embedding.weight).norm(dim=1)*model.word_embedding.weight.norm(dim=1)
    word_idx = (score / norm).argmax().numpy()
    print(word_idx)
    word = vocab.get_token_from_index(int(word_idx))
    return word

out = predictor.predict("cartman asshole girl")['vec']
print(out)
vec = torch.tensor(out[0]) - torch.tensor(out[1]) #+ torch.tensor(out[2])
print(out[0], vec)
#vec = (out[0] + out[1]).unsqueeze(0)
print(get_closest_word(vec, model, vocab))

In [None]:
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

tag_logits = predictor.predict("The dog ate the apple")['tag_logits']

tag_ids = np.argmax(tag_logits, axis=-1)



In [None]:
def print_lines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

In [None]:
path = '/home/jendrik/git/DLWorkshop/data/all-seasons.csv'
print_lines(path)

In [None]:
corpus = pd.read_csv(path)

In [None]:
corpus

In [None]:
context_size = 2
lmtzr = WordNetLemmatizer()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

ls = tokenizer.tokenize(corpus.iloc[0]['Line'])
print(ls)
for sentence in ls:
    print([lmtzr.lemmatize(word) for word in nltk.word_tokenize(sentence) if word.isalpha()])


In [None]:
sentences = []
vocab = {' ': 1}
corpus_len = 0
for i, row in corpus.iterrows():
    ls = tokenizer.tokenize(row['Line'])
    for sentence in ls:
        stemmed_sentence = [lmtzr.lemmatize(word.lower()) for word in nltk.word_tokenize(sentence) if word.isalpha()]
        if len(stemmed_sentence) > 2:
            corpus_len += len(stemmed_sentence)
            for word in stemmed_sentence:
                try:
                    vocab[word] += 1
                except KeyError:
                    vocab[word] = 1
            sentences.append(stemmed_sentence)


In [None]:
print(len(vocab.keys()))
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {v: k for k, v in word_to_ix.items()}
#vocab

In [None]:
drop_dict = {}
sample_dict = {}
for k, v in vocab.items():
    z = v/corpus_len
    drop_dict[word_to_ix[k]] = (np.sqrt(z*1e3) + 1)*1e-3/z
    sample_dict[word_to_ix[k]] = (v/corpus_len)**0.75
sample_dict_sum = sum(sample_dict.values())
for k in sample_dict:
    sample_dict[k] /= sample_dict_sum
    
    

In [None]:
class CBOWLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        out = self.embeddings(inputs).mean(1)#.view((inputs.shape[0], -1))
        #out = self.relu(self.linear1(embeds))
        return out
    
    def log_probs(self, out):
        out = self.linear(out)
        print(out.shape)
        log_probs = nn.functional.log_softmax(out, dim=1)
        return log_probs
    
class SkipGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        #self.linear1 = nn.Linear(embedding_dim, vec_size)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        return embeds
    
    def log_probs(self, out):
        out = self.linear(out)
        log_probs = nn.functional.log_softmax(out, dim=1)
        return log_probs
    
class NGramsLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, vec_size)
        self.linear2 = nn.Linear(vec_size, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        out = self.relu(self.linear1(embeds))
        return out
    
    def log_probs(self, out):
        out = self.linear2(out)
        print(out.shape)
        log_probs = nn.functional.log_softmax(out, dim=1)
        return log_probs
    
class SkipGramLanguageModelerSparse(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size, embedding_freq, negative_sampling_size):
        super().__init__()
        self.embedding_freq = torch.autograd.Variable(torch.Tensor(embedding_freq)**(0.75))
        self.negative_sampling_size = negative_sampling_size
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        #self.linear1 = nn.Linear(embedding_dim, vec_size)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        #return embedding
        return self.word_embedding(inputs) + self.context_embedding(inputs)
    
    def device(self):
        return next(self.parameters()).device
    
    def calculate_loss(self, word, context,alpha=0.1):
        #word.size() batch_size
        #context.size() batch_size*window_size
        word_pos = word.view(-1,1).expand_as(context)
        word_pos = self.word_embedding(word_pos)
        context = self.context_embedding(context)
        product_pos = (word_pos*context).sum(dim=-1).mean()
        target_pos = torch.autograd.Variable(torch.ones(product_pos.size()))
        loss_positive = nn.functional.binary_cross_entropy_with_logits(product_pos, target_pos)
        negative_context = self.embedding_freq.multinomial(
            self.negative_sampling_size*context.size(0)*context.size(1), replacement=True).to(self.device())
        negative_context = negative_context.view(-1,self.negative_sampling_size*context.size(1))
        word_neg = word.view(-1,1).expand_as(negative_context)
        word_neg = self.word_embedding(word_neg)
        negative_context = self.context_embedding(negative_context)
        product_neg = (word_neg*negative_context).sum(dim=-1).mean()
        target_neg = torch.autograd.Variable(torch.zeros(product_neg.size()))
        loss_negative = nn.functional.binary_cross_entropy_with_logits(product_neg,target_neg)
        loss = loss_positive + loss_negative
        return loss
    
    def log_probs(self, inp, out, neg):
        losses = []
        inp = self.embeddings(inp)
        out = self.embeddings(out)
        neg = self.embeddings(neg)
        score = torch.mul(inp, out)
        print(score.shape)
        score = torch.sum(score, dim=1)
        print(score.shape)
        score = nn.functional.logsigmoid(score)
        print(score.shape)
        neg_score = torch.bmm(neg, inp.unsqueeze(2)).squeeze()
        print(neg_score.shape)
        neg_score = torch.sum(neg_score, dim=0)
        print(neg_score.shape)
        neg_score = nn.functional.logsigmoid(-1 * neg_score)
        print(neg_score.shape)
        losses.append(sum(neg_score))
        losses.append(sum(score))
        input()
        return -1 * sum(losses)

In [None]:
def get_batch(sentences, context_size, word_to_idx, batch_size, mode):
    selected_sentences = np.random.choice(sentences, batch_size)
    inp_tokens = []
    out_tokens = []
    for sentence in selected_sentences:
        if mode == 'cbow':
            selected_word_idx = np.random.randint(context_size, len(sentence)-context_size)
        else:
            selected_word_idx = np.random.randint(context_size, len(sentence))
        context = [word_to_idx[word] for word in sentence[selected_word_idx-context_size:selected_word_idx]]
        while(len(context) < context_size):
            context.insert(0, word_to_idx[' '])
        if mode == 'cbow':
            post_context = [word_to_idx[word] for word in sentence[selected_word_idx+1:selected_word_idx+1+context_size]]
            while(len(post_context) < context_size):
                post_context.append(word_to_idx[' '])
            context.extend(post_context)
        inp_tokens.append(np.array(context))
        out_tokens.append(np.array(word_to_idx[sentence[selected_word_idx]]))
    return np.array(inp_tokens), np.array(out_tokens)

def get_batch_w2v(sentences, context_size, word_to_idx, batch_size, drop_dict, neg_samples):
    selected_sentences = np.random.choice(sentences, batch_size)
    vec = []
    pos = []
    neg = []
    out_tokens = []
    len_longest_sentence = 0
    for sentence in selected_sentences:
        selected_word_idx = np.random.randint(0, len(sentence))
        drop = drop_dict[word_to_idx[sentence[selected_word_idx]]] < np.random.rand()
        while drop:
            sentence = np.random.choice(sentences)
            selected_word_idx = np.random.randint(0, len(sentence))
            drop = drop_dict[word_to_idx[sentence[selected_word_idx]]] < np.random.rand()
        pos_words = sentence[0:selected_word_idx]
        pos_words.extend(sentence[selected_word_idx+1:])
        len_longest_sentence = len(pos_words) if len_longest_sentence < len(pos_words) else len_longest_sentence
        pos_indices = [word_to_idx[word] for word in pos_words]
        vec.append(selected_word_idx)
        pos.append(pos_indices)
    pos_padded = []
    for pos_indices in pos:
        padded = [word_to_idx[' '] for i in range(len_longest_sentence-len(pos_indices))]
        padded.extend(pos_indices)
        pos_padded.append(padded)
    return np.array(vec), np.array(pos_padded)

In [None]:
embedding_dims = 128
batch_size = 128
n_batches = 64#len(sentences)//batch_size+1
vec_dims = 128
neg_samples = 20
losses = []
loss_function = nn.NLLLoss()
model = SkipGramLanguageModelerSparse(len(vocab), embedding_dims, context_size, vec_dims, list(vocab.values()), 10)
optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(3):
    total_loss = 0
    for i in range(n_batches):
        vec, pos = get_batch_w2v(sentences, context_size, word_to_ix,
                                               batch_size, drop_dict, neg_samples=neg_samples)
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        vec = torch.from_numpy(vec).long().to(device)
        pos = torch.from_numpy(pos).long().to(device)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        loss = model.calculate_loss(vec, pos)

        

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}', end='\r')
    print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}')
    losses.append(total_loss/n_batches)
print(losses)  # The loss decreased every iteration over the training data!

In [None]:
embedding_dims = 128
batch_size = 128
n_batches = len(sentences)//batch_size+1
vec_dims = 128
losses = []
loss_function = nn.NLLLoss()
model = NGramsLanguageModeler(len(vocab), embedding_dims, context_size, vec_dims)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(20):
    total_loss = 0
    for i in range(n_batches):
        inp_tokens, out_tokens = get_batch(sentences, context_size, word_to_ix, batch_size, mode='ngrams')
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.from_numpy(inp_tokens).long().to(device)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        out = model(context_idxs)
        log_probs = model.log_probs(out)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.from_numpy(out_tokens).long().to(device))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}', end='\r')
    print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}')
    losses.append(total_loss/n_batches)
print(losses)  # The loss decreased every iteration over the training data!

In [None]:
print(f'Cuda available {torch.cuda.is_available()}')
embedding_dims = 128
batch_size = 128
n_batches = len(sentences)//batch_size+1
vec_dims = 128
losses = []
loss_function = nn.NLLLoss()
model = SkipGramLanguageModeler(len(vocab), embedding_dims, context_size, vec_dims)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(20):
    total_loss = 0
    for i in range(n_batches):
        out_tokens, inp_tokens = get_batch(sentences, context_size, word_to_ix, batch_size, mode='cbow')
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.from_numpy(inp_tokens).long().to(device)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        out = model(context_idxs)
        log_probs = model.log_probs(out)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = None
        for j in range(out_tokens.shape[1]):
            try:
                loss += loss_function(log_probs, torch.from_numpy(out_tokens[:, j]).long().to(device))
            except TypeError:
                loss = loss_function(log_probs, torch.from_numpy(out_tokens[:, j]).long().to(device))
        loss = loss/out_tokens.shape[1]
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}', end='\r')
    print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}')
    losses.append(total_loss/n_batches)
print(losses)  # The loss decreased every iteration over the training data!

In [None]:
embedding_dims = 128
context_size = 2
batch_size = 128
n_batches = len(sentences)//batch_size+1
vec_dims = 128
losses = []
loss_function = nn.NLLLoss()
model = CBOWLanguageModeler(len(vocab), embedding_dims, context_size, vec_dims)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(20):
    total_loss = 0
    for i in range(n_batches):
        inp_tokens, out_tokens = get_batch(sentences, context_size, word_to_ix, batch_size, mode='cbow')
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.from_numpy(inp_tokens).long().to(device)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        out = model(context_idxs)
        log_probs = model.log_probs(out)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.from_numpy(out_tokens).long().to(device))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    print(total_loss/n_batches)
    losses.append(total_loss/n_batches)
print(losses)  # The loss decreased every iteration over the training data!

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def cosine_distance(id0, id1, model):
    out = model(torch.from_numpy(np.array([id0, id1])).long().to(device))
    sk_sim = cosine_similarity(out.data.cpu().numpy())
    return (out[0] * out[1] / (out[0].norm() * out[1].norm())).sum(), sk_sim
    
print(cosine_distance(word_to_ix['nice'], word_to_ix['good'], model))
print(cosine_distance(word_to_ix['fantastic'], word_to_ix['good'], model))
print(cosine_distance(word_to_ix['terrible'], word_to_ix['good'], model))


In [None]:
cartman_id = word_to_ix['man']
asshole_id = word_to_ix['boy']
poor_id = word_to_ix['girl']
print([cartman_id, asshole_id, poor_id])
out = model(torch.from_numpy(np.array([cartman_id, asshole_id, poor_id])).long().to(device))
print(out.shape)
print(out[1] - out[2])
vec = out[0] - out[1] + out[2]
print(out[0], vec)
#vec = (out[0] + out[1]).unsqueeze(0)
print(vec.shape)
print(model.word_embedding.weight.shape)
score = vec.view(1, -1).expand_as(model.word_embedding.weight)*model.word_embedding.weight
score = score.sum(dim=-1)
norm = vec.view(1, -1).expand_as(model.word_embedding.weight).norm(dim=1)*model.word_embedding.weight.norm(dim=1)
print(score.shape)
print(norm.shape)
print(score)
print(norm)
word = (score / norm)
print(word.shape)
print(word)
word = word.argmax()
print(ix_to_word[int(word.cpu().numpy())])

In [None]:

corpus_name = "cornell_movie-dialogs_corpus"
corpus = os.path.join("../data", corpus_name)

def print_lines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

print_lines(os.path.join(corpus, "movie_lines.txt"))

In [None]:
# Splits each line of the file into a dictionary of fields
def load_lines(file_name, fields):
    lines = []
    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            line_dict = {}
            for i, field in enumerate(fields):
                line_dict[field] = values[i]
            lines.append(line_dict)
    return pd.DataFrame(lines)

In [None]:
header_lines = ["line_id", "character_id", "movie_id", "character", "text"]
header_conversations = ["character1_id", "character2_id", "movie_id", "utterance_id"]

# Load lines and process conversations
print("\nProcessing corpus...")
lines = load_lines(os.path.join(corpus, "movie_lines.txt"), header_lines)

In [None]:
lines

In [None]:
# Groups fields of lines from `loadLines` into conversations based on *movie_conversations.txt*
def load_conversations(file_name, lines, fields):
    conversations = []
    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            # Extract fields
            conversation = {}
            for i, field in enumerate(fields):
                conversation[field] = values[i]
            # Convert string to list (conversation["utteranceIDs"] == "['L598485', 'L598486', ...]")
            line_ids = eval(conversation["utteranceIDs"])
            # Reassemble lines
            conversation["lines"] = []
            for line_id in line_ids:
                conversation["lines"].append(lines.loc[lines['line_id'] == line_id])
            conversations.append(conversation)
    return pd.DataFrame(conversations)

In [None]:
load_conversations(os.path.join(corpus, "movie_conversations.txt"),
                                  lines, header_conversations)