In [None]:
# Fetch the NLTK resources used further down in this notebook: the WordNet
# corpus (lemmatization) and the Punkt models (sentence/word tokenization).
import nltk
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
import os
import pandas as pd

import torch 
from torch import nn
import multiprocessing as mp
from numpy import random
import numpy as np
import functools
from typing import Dict, List, Iterator, Union, Callable
# In AllenNLP each training example is represented as an Instance containing Fields
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField

from allennlp.data.iterators import BucketIterator

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.tokenizers import Token

from allennlp.models import Model
from allennlp.training.trainer import Trainer
from allennlp.data.vocabulary import Vocabulary

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

from allennlp.predictors import SentenceTaggerPredictor

import time
import re
from os.path import expanduser
import csv
import sys

# Raise the csv module's per-field size limit to sys.maxsize.
csv.field_size_limit(sys.maxsize)

# Seed torch's random number generator.
torch.manual_seed(42)

# Home directory of the current user; the data paths in later cells are built from it.
home = expanduser("~")

In [None]:
class SentenceDataset():
    """Lazy source of AllenNLP ``Instance`` objects for word2vec-style training.

    Sentences come from two places: in-memory pandas Series (``data_series``),
    which are tokenized with the configured spaCy splitters, and CSV files
    (``csvs``) with a ``sentence`` column whose values are already tokenized
    and joined with ``;``.

    Iterating the dataset yields (center word, context) instances;
    ``it_from_ser`` yields whole-sentence instances without a context.
    """

    # Sentences with fewer tokens than this are skipped.
    MIN_SENTENCE_TOKENS = 3
    # CSV sentences longer than this many characters are replaced by a placeholder.
    MAX_SENTENCE_CHARS = 2000
    # Number of worker processes used to split CSV chunks.
    POOL_WORKERS = 14
    # Number of CSV rows handed to a worker at once.
    CSV_CHUNK_SIZE = 50

    def __init__(self, data_series: List[pd.Series], csvs: List[str], 
                 token_indexers: Dict[str, TokenIndexer] = None, 
                 tokenizer: WordTokenizer = SpacyWordSplitter(),
                 sentence_splitter: SpacySentenceSplitter = SpacySentenceSplitter(rule_based=True),
                 context_size: int = 1) -> None:
        """Store the data sources and the tokenization configuration.

        Args:
            data_series: Series of raw text to tokenize.
            csvs: Paths of CSV files with a ``sentence`` column.
            token_indexers: Indexers for the created ``TextField``s; defaults
                to a single ``SingleIdTokenIndexer`` under the key ``"tokens"``.
            tokenizer: Object providing ``batch_split_words``.
            sentence_splitter: Object providing ``batch_split_sentences``.
            context_size: Number of context tokens taken on each side of the
                sampled center word.
        """
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.tokenizer = tokenizer
        self.sentence_splitter = sentence_splitter
        self.context_size = context_size
        self.data_series = data_series
        self.csvs = csvs
        # NOTE(review): queue and stop are not used by any method of this class.
        self.queue = mp.Queue(maxsize=10000)
        self.stop = True

    @staticmethod
    def text_to_instance(tokens: List[Token], token_indexers=None, context: List[Token] = None) -> Instance:
        """Wrap ``tokens`` (and optionally ``context``) into an ``Instance``.

        Args:
            tokens: Tokens stored in the ``"sentence"`` field.
            token_indexers: Indexers for the text fields. When omitted, the
                same default as in ``__init__`` is used, so the method can be
                called with tokens only.
            context: Optional tokens stored in the ``"context"`` field.

        Returns:
            An ``Instance`` with a ``"sentence"`` field and, if ``context`` is
            non-empty, a ``"context"`` field.
        """
        if token_indexers is None:
            token_indexers = {"tokens": SingleIdTokenIndexer()}
        fields = {"sentence": TextField(tokens, token_indexers)}

        if context:
            fields["context"] = TextField(context, token_indexers)

        return Instance(fields)

    @staticmethod
    def _require_generator(instances):
        """Return ``instances`` unchanged unless it is a list, which is rejected."""
        if isinstance(instances, list):
            # Imported here because the notebook's import cell does not provide it.
            from allennlp.common.checks import ConfigurationError
            raise ConfigurationError("For a lazy dataset reader, word_context() must return a generator")
        return instances

    def __iter__(self) -> Iterator[Instance]:
        """Yield (center word, context) instances, splitting Series text into sentences."""
        select_context_part = functools.partial(self.select_context_from_sentence, 
                                                context_size = self.context_size)
        return self._require_generator(self.iterate_data(select_context_part, True))

    @staticmethod       
    def select_context_from_sentence(sentence: List[Token], context_size: int):
        """Pick a random center word and its surrounding context window.

        Args:
            sentence: Non-empty list of tokens.
            context_size: Number of tokens taken on each side of the center.

        Returns:
            ``([center], context)`` where ``context`` always has
            ``2 * context_size`` tokens; missing neighbours at either end of
            the sentence are filled with empty ``Token()`` objects.

        Raises:
            ValueError: If ``sentence`` is empty (raised by ``randint``).
        """
        # `random` is numpy.random here, so the upper bound is exclusive.
        j = random.randint(0, len(sentence))
        # Clamp the start: a negative slice start would wrap around to the
        # end of the sentence and silently drop the real left neighbours.
        left = list(sentence[max(0, j - context_size):j])
        while len(left) < context_size:
            left.insert(0, Token())
        right = list(sentence[j + 1:j + context_size + 1])
        while len(right) < context_size:
            right.append(Token())
        return [sentence[j]], left + right

    def it_from_ser(self) -> Iterator[Instance]:
        """Yield whole-sentence instances (no context, no sentence splitting)."""
        return self._require_generator(self.iterate_data(None, False))

    @staticmethod
    def process_chunk(tokenizer, sentences: List[str], split_sentences: bool = True, sentence_splitter = None):
        """Tokenize a batch of texts into lists of tokens.

        Args:
            tokenizer: Object providing ``batch_split_words``.
            sentences: Texts to tokenize.
            split_sentences: If true, each text is first split into sentences
                with ``sentence_splitter`` and every sentence becomes its own
                token list.
            sentence_splitter: Object providing ``batch_split_sentences``;
                required when ``split_sentences`` is true.

        Returns:
            A flat list with one token list per (split) sentence.

        Raises:
            ValueError: If ``split_sentences`` is true but no splitter is given.
        """
        if split_sentences and sentence_splitter is None:
            raise ValueError("sentence_splitter is required when split_sentences is True")
        # Progress output; guarded so that an empty chunk does not raise IndexError.
        print(len(sentences))
        if len(sentences) > 0:
            print(sentences[0])
            print(sentences[-1])
        ret_list = []
        if split_sentences:
            for article in sentence_splitter.batch_split_sentences(sentences):
                ret_list.extend(tokenizer.batch_split_words(article))
        else:
            ret_list.extend(tokenizer.batch_split_words(sentences))
        print(len(ret_list))
        if ret_list:
            print(ret_list[0])
            print(ret_list[-1])
        return ret_list

    @staticmethod
    def split_on_sep(sentence, sep=';'):
        """Remove all ``.`` characters from ``sentence`` and split it on ``sep`` into tokens."""
        return [Token(x) for x in sentence.replace('.', '').split(sep)]

    @staticmethod
    def split_chunk(sentences):
        """Split the ``sentence`` column of a DataFrame chunk into token lists.

        Args:
            sentences: DataFrame with a ``sentence`` column.

        Returns:
            One token list per row. Rows that are missing (non-string), empty
            or longer than ``MAX_SENTENCE_CHARS`` characters are represented
            by the placeholder ``[Token('')]``.
        """
        ret_list = []
        for sentence in sentences['sentence'].values:
            unusable = (not isinstance(sentence, str) or not sentence
                        or len(sentence) > SentenceDataset.MAX_SENTENCE_CHARS)
            if unusable:
                ret_list.append([Token('')])
                continue
            ret_list.append(SentenceDataset.split_on_sep(sentence))
        return ret_list

    def _instances_from(self, tokenized_sentences, apply_to_sentence):
        """Turn token lists into instances, skipping sentences that are too short."""
        for sentence in tokenized_sentences:
            if len(sentence) < self.MIN_SENTENCE_TOKENS:
                continue
            if apply_to_sentence is not None:
                sentence, context = apply_to_sentence(sentence)
                yield SentenceDataset.text_to_instance(sentence, self.token_indexers, context)
            else:
                yield SentenceDataset.text_to_instance(sentence, self.token_indexers)

    def iterate_data(self, apply_to_sentence: Callable,
                     split_sentences: bool = True, ) -> Iterator[Union[Instance, Instance]]:
        """Generate instances from all Series first, then from all CSV files.

        Args:
            apply_to_sentence: Optional callable mapping a token list to
                ``(sentence_tokens, context_tokens)``; when ``None`` the whole
                sentence becomes the instance.
            split_sentences: Whether Series texts are split into sentences
                before word tokenization (not applied to the CSV files).

        Yields:
            One ``Instance`` per sentence with at least ``MIN_SENTENCE_TOKENS`` tokens.
        """
        for ser in self.data_series:
            sentences = ser[pd.notnull(ser)].values
            sentence_word_list = SentenceDataset.process_chunk(
                self.tokenizer, sentences, split_sentences, 
                self.sentence_splitter)
            yield from self._instances_from(sentence_word_list, apply_to_sentence)
        for f in self.csvs:
            # The context manager shuts the worker pool down once the file is
            # exhausted or the generator is closed early.
            with mp.Pool(self.POOL_WORKERS) as pool:
                reader = pd.read_csv(f, chunksize=self.CSV_CHUNK_SIZE, dtype=str,
                                     sep='\uffef', engine='python')
                for sentences in pool.imap(SentenceDataset.split_chunk, reader):
                    yield from self._instances_from(sentences, apply_to_sentence)
            


In [None]:
class SkipGramLanguageModelerSparse(Model):
    """Skip-gram model with negative sampling, packaged as an AllenNLP ``Model``.

    Two sparse embedding tables are kept: one for center words and one for
    context words. Negative context words are drawn from the token
    frequencies raised to the power 0.75.

    NOTE(review): a later cell of this notebook defines another class with
    the same name (an ``nn.Module`` variant) that shadows this one once run.
    """

    def __init__(self, vocab: Vocabulary, embedding_dim, context_size, 
                 embedding_freq, negative_sampling_size):
        """
        Args:
            vocab: Vocabulary; the size of its ``'tokens'`` namespace sets the
                number of embedding rows.
            embedding_dim: Dimension of both embedding tables.
            context_size: Not used by this class.
            embedding_freq: Per-index token counts used as sampling weights
                for negative examples.
            negative_sampling_size: Negative samples drawn per context token.
        """
        super().__init__(vocab)
        vocab_size = vocab.get_vocab_size('tokens')
        print(vocab_size)
        # Unnormalized sampling weights; multinomial() accepts unnormalized input.
        self.embedding_freq = torch.Tensor(embedding_freq)**(0.75)
        self.negative_sampling_size = negative_sampling_size
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        # NOTE(review): linear and relu are not used by forward() or log_probs().
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()
        self.last_loss = None

    def forward(self, sentence, context=None):
        """Embed the center word and, when a context is given, compute the loss.

        Args:
            sentence: Dict with key ``'tokens'`` holding center-word indices.
                # assumes shape (batch, 1) — TODO confirm against the iterator
            context: Optional dict with key ``'tokens'`` holding context-word
                indices. # assumes shape (batch, n_context) — TODO confirm

        Returns:
            Dict with ``'vec'`` and, when ``context`` is given, ``'loss'``
            (AllenNLP reads the loss from that key during training).
        """
        word = sentence['tokens']

        output = {}
        word_emb = self.word_embedding(word)
        output['vec'] = word_emb + self.context_embedding(word)

        if context is None:
            # Inference: only the embedding is returned.
            return output

        context = self.context_embedding(context['tokens'])
        word_pos = word_emb.expand_as(context)

        product_pos = (word_pos*context).sum(dim=-1).mean()
        # ones_like/zeros_like keep the targets on the logits' device; targets
        # created on the CPU break as soon as the model lives on the GPU.
        loss_positive = nn.functional.binary_cross_entropy_with_logits(
            product_pos, torch.ones_like(product_pos))

        negatives_per_row = self.negative_sampling_size*context.size(1)
        negative_context = self.embedding_freq.multinomial(
            negatives_per_row*context.size(0), replacement=True).to(self.device())
        negative_context = negative_context.view(-1, negatives_per_row)
        negative_context = self.context_embedding(negative_context)
        word_neg = word_emb.expand_as(negative_context)
        product_neg = (word_neg*negative_context).sum(dim=-1).mean()
        loss_negative = nn.functional.binary_cross_entropy_with_logits(
            product_neg, torch.zeros_like(product_neg))

        loss = loss_positive + loss_negative
        # Detached copy for reporting, so metrics do not keep the graph alive.
        self.last_loss = loss.detach()
        output['loss'] = loss
        output['vec'] = word_pos + context
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the most recent training loss as a plain float under ``"bce"``.

        Returns an empty dict while no loss has been computed yet.
        """
        if self.last_loss is None:
            return {}
        return {"bce": float(self.last_loss)}

    def device(self):
        """Return the device of the model's first parameter."""
        return next(self.parameters()).device

    def log_probs(self, inp, out, neg):
        """Negative-sampling negative log-likelihood for explicit index tensors.

        Args:
            inp: Center-word indices. # assumes shape (batch,) — TODO confirm
            out: Positive context indices, same shape as ``inp``.
            neg: Negative context indices. # assumes shape (batch, k) — TODO confirm

        Returns:
            Scalar tensor: minus the sum of ``logsigmoid`` of every positive
            score and of every negated negative score.
        """
        inp = self.word_embedding(inp)
        out = self.context_embedding(out)
        neg = self.context_embedding(neg)
        score = nn.functional.logsigmoid(torch.sum(torch.mul(inp, out), dim=1))
        # squeeze(2) removes only the bmm column axis, so batch size 1 or a
        # single negative sample keep their dimensions.
        neg_score = torch.bmm(neg, inp.unsqueeze(2)).squeeze(2)
        neg_score = nn.functional.logsigmoid(-1 * neg_score)
        return -1 * (score.sum() + neg_score.sum())
    
class SkipGramLanguageModelerSparseNoNeg(Model):
    """Skip-gram AllenNLP ``Model`` trained on positive pairs only (no negative sampling).

    Keeps one sparse embedding table for center words and one for context words.
    """

    def __init__(self, vocab: Vocabulary, embedding_dim, context_size):
        """
        Args:
            vocab: Vocabulary; the size of its ``'tokens'`` namespace sets the
                number of embedding rows.
            embedding_dim: Dimension of both embedding tables.
            context_size: Not used by this class.
        """
        super().__init__(vocab)
        vocab_size = vocab.get_vocab_size('tokens')
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        # NOTE(review): linear and relu are not used by forward() or log_probs().
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()
        self.last_loss = None

    def forward(self, sentence, context=None):
        """Embed the center word and, when a context is given, compute the loss.

        Args:
            sentence: Dict with key ``'tokens'`` holding center-word indices.
                # assumes shape (batch, 1) — TODO confirm against the iterator
            context: Optional dict with key ``'tokens'`` holding context-word
                indices. # assumes shape (batch, n_context) — TODO confirm

        Returns:
            Dict with ``'vec'`` and, when ``context`` is given, ``'loss'``
            (AllenNLP reads the loss from that key during training).
        """
        word = sentence['tokens']

        output = {}
        word_emb = self.word_embedding(word)
        output['vec'] = word_emb + self.context_embedding(word)

        if context is None:
            # Inference: only the embedding is returned.
            return output

        context = self.context_embedding(context['tokens'])
        word_pos = word_emb.expand_as(context)

        product_pos = (word_pos*context).sum(dim=-1).mean()
        # ones_like keeps the target on the logits' device; a target created
        # on the CPU breaks as soon as the model lives on the GPU.
        loss_positive = nn.functional.binary_cross_entropy_with_logits(
            product_pos, torch.ones_like(product_pos))
        # Detached copy for reporting, so metrics do not keep the graph alive.
        self.last_loss = loss_positive.detach()
        output['loss'] = loss_positive
        output['vec'] = word_pos + context
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the most recent training loss as a plain float under ``"bce"``.

        Returns an empty dict while no loss has been computed yet.
        """
        if self.last_loss is None:
            return {}
        return {"bce": float(self.last_loss)}

    def device(self):
        """Return the device of the model's first parameter."""
        return next(self.parameters()).device

    def log_probs(self, inp, out, neg):
        """Negative-sampling negative log-likelihood for explicit index tensors.

        Args:
            inp: Center-word indices. # assumes shape (batch,) — TODO confirm
            out: Positive context indices, same shape as ``inp``.
            neg: Negative context indices. # assumes shape (batch, k) — TODO confirm

        Returns:
            Scalar tensor: minus the sum of ``logsigmoid`` of every positive
            score and of every negated negative score.
        """
        inp = self.word_embedding(inp)
        out = self.context_embedding(out)
        neg = self.context_embedding(neg)
        score = nn.functional.logsigmoid(torch.sum(torch.mul(inp, out), dim=1))
        # squeeze(2) removes only the bmm column axis, so batch size 1 or a
        # single negative sample keep their dimensions.
        neg_score = torch.bmm(neg, inp.unsqueeze(2)).squeeze(2)
        neg_score = nn.functional.logsigmoid(-1 * neg_score)
        return -1 * (score.sum() + neg_score.sum())
    

In [4]:
# Tab-separated transcript file; its 'spoken' column is the in-memory text source.
path = home + '/data/workshop_data/southpark_full.csv'
# NOTE(review): error_bad_lines is a legacy pandas option — confirm it is still
# accepted by the installed pandas version.
corpus = pd.read_csv(path, sep='\t', dtype=str, error_bad_lines=False)
# Second source: a CSV file that SentenceDataset reads in chunks.
wiki_path = home + '/data/workshop_data/wiki_out.csv'
ds = SentenceDataset([corpus['spoken']], [wiki_path])
# One-shot generator of whole-sentence instances (no context field).
train_dataset = ds.it_from_ser()
# The dataset object itself is iterable and yields (word, context) instances.
word2vec_dataset = ds

In [None]:
# Peek at the first (word, context) instance produced by the dataset.
next(iter(ds))

In [5]:
# Build the vocabulary from the whole-sentence instance generator; this
# consumes train_dataset completely.
vocab = Vocabulary.from_instances(train_dataset)

0it [00:00, ?it/s]

66413
School day, school day, teacher's golden ru...
 Billy... What have I done?


11572it [00:11,  5.44s/it]

66413
[School, day, ,, school, day, ,, teacher, 's, golden, ru, ...]
[Billy, ..., What, have, I, done, ?]


61360it [02:40, 382.66it/s]


In [None]:
# Persist the vocabulary, then dump the raw token counts as `token,count`
# lines so that a pruned vocabulary can be rebuilt without re-reading the corpus.
vocab.save_to_files(home + '/data/workshop_data/vocab.store')
with open(home + '/data/workshop_data/vocab.dict', 'w') as f:
    # _retained_counter is a private Vocabulary attribute.
    for k, v in vocab._retained_counter['tokens'].items():
        # Tokens may themselves contain commas; readers split on the LAST comma.
        f.write(f'{k},{v}\n')

In [None]:
# Print summary statistics of the full vocabulary.
vocab.print_statistics()

In [None]:
# Rebuild a pruned vocabulary from the dumped `token,count` lines: tokens longer
# than 300 characters or seen fewer than 300 times are dropped.
# Further token filters ('|', '/', single characters, HTML/wiki markup fragments)
# were tried and are currently disabled.
add_dict = {}
with open(home + '/data/workshop_data/vocab.dict', 'r') as f:
    lines = f.read().split('\n')

for line in lines:
    if not line:
        # e.g. the empty string after the final newline
        continue
    # Tokens may contain commas, so the count is whatever follows the LAST comma.
    comma_index = line.rfind(',')
    if comma_index < 0:
        # Malformed line (for instance the remainder of a token containing a newline).
        print(line)
        continue
    token = line[:comma_index]
    if len(token) > 300:
        continue
    try:
        count = int(line[comma_index+1:])
    except ValueError:
        # Report unparsable lines; any other error is a real bug and should surface.
        print(line)
        continue
    if count < 300:
        continue
    add_dict[token] = count

vocab_new = Vocabulary(counter={'tokens': add_dict}, min_count={'tokens': 10}) 
print(len(add_dict))
print(vocab_new.get_vocab_size('tokens'))

In [None]:
# Print summary statistics of the pruned vocabulary.
vocab_new.print_statistics()

In [None]:
# Persist the pruned vocabulary and its `token,count` table.
vocab_new.save_to_files(home + '/data/workshop_data/vocab_new.store')
with open(home + '/data/workshop_data/vocab_new.dict', 'w') as f:
    for k, v in add_dict.items():
        # Tokens may contain commas; readers split on the LAST comma.
        f.write(f'{k},{v}\n')

In [None]:
# Reload the pruned `token,count` table written above and rebuild the vocabulary
# from it (alternative: Vocabulary.from_files('vocab_new.store')).
add_dict = {}
with open(home + '/data/workshop_data/vocab_new.dict', 'r') as f:
    lines = f.read().split('\n')

for line in lines:
    if not line:
        # e.g. the empty string after the final newline
        continue
    # Tokens may contain commas, so the count is whatever follows the LAST comma.
    comma_index = line.rfind(',')
    if comma_index < 0:
        # Without a separator there is no count; do not invent a token from it.
        print(line)
        continue
    try:
        add_dict[line[:comma_index]] = int(line[comma_index+1:])
    except ValueError:
        print(line)
vocab_new = Vocabulary(counter={'tokens': add_dict}) 

In [None]:
# Compare the number of counted tokens with the size of the vocabulary's
# 'tokens' namespace.
print(len(add_dict.values()))
print(vocab_new.get_vocab_size('tokens'))

In [None]:
# Hyper-parameters and model/optimizer construction for the AllenNLP skip-gram run.
embedding_dims = 128
batch_size = 10240

context_size = 3
n_batches = 64#len(sentences)//batch_size+1
vec_dims = 128
neg_samples = 10
# Token counts in add_dict order, used as negative-sampling weights.
dict_values = list(add_dict.values())
# Two leading zero counts shift the weights by two indices.
# presumably for the padding and unknown-token entries at the start of the
# vocabulary — verify against vocab_new's index mapping
dict_values.insert(0, 0)
dict_values.insert(0, 0)
losses = []
# NOTE(review): loss_function is not used by the AllenNLP trainer cell below.
loss_function = nn.NLLLoss()
model = SkipGramLanguageModelerSparse(
    vocab_new, embedding_dims, context_size, 
    dict_values, 
    neg_samples)
# The embeddings are created with sparse=True, hence the sparse optimizer.
optimizer = torch.optim.SparseAdam(model.parameters(), lr=1e-4)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# List the shape of every model parameter.
# NOTE(review): sys is already imported in the import cell and is not used here.
import sys
for param in model.parameters():
    print(param.shape)

In [None]:


# Batches are bucketed by the number of tokens in the "sentence" field.
iterator = BucketIterator(batch_size=batch_size, sorting_keys=[("sentence", "num_tokens")])

iterator.index_with(vocab_new)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=word2vec_dataset,
                  # validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=2,
                  # Follow the same CUDA check used when the model was moved to
                  # `device`; -1 selects the CPU instead of failing without a GPU.
                  cuda_device=0 if torch.cuda.is_available() else -1)

trainer.train()



In [None]:
# Reuse AllenNLP's sentence-tagger predictor to run the model on raw text;
# the SentenceDataset instance is passed in the dataset-reader role.
predictor = SentenceTaggerPredictor(model, dataset_reader=ds)

# 'vec' is the embedding entry of the model's output dict.
vec = predictor.predict("Cartman")['vec']

In [None]:
# Display the vector predicted in the previous cell.
vec

In [None]:
def get_closest_word(vec, model, vocab):
    """Return the vocabulary token whose word embedding is most cosine-similar to ``vec``.

    Args:
        vec: Tensor with ``embedding_dim`` elements (any shape that can be
            viewed as ``(1, embedding_dim)``).
        model: Object exposing ``word_embedding.weight`` of shape
            ``(vocab_size, embedding_dim)``.
        vocab: Object exposing ``get_token_from_index(int)``.

    Returns:
        The token at the index with the highest cosine similarity.
    """
    weights = model.word_embedding.weight.detach()
    # Align device and dtype so a CPU query works against a GPU model.
    query = vec.detach().to(device=weights.device, dtype=weights.dtype).view(1, -1)
    score = (query * weights).sum(dim=-1)
    # Clamp so that all-zero vectors give similarity 0 instead of NaN.
    norm = (query.norm(dim=1) * weights.norm(dim=1)).clamp(min=1e-12)
    # .item() works on any device, unlike .numpy() which needs a CPU tensor.
    word_idx = int((score / norm).argmax().item())
    return vocab.get_token_from_index(word_idx)

# Embed three words at once and look up the word closest to the difference of
# the first two vectors.
out = predictor.predict("cartman asshole girl")['vec']
print(out)
vec = torch.tensor(out[0]) - torch.tensor(out[1]) #+ torch.tensor(out[2])
print(out[0], vec)
#vec = (out[0] + out[1]).unsqueeze(0)
# Decode with vocab_new: the model's embedding rows were created from vocab_new,
# so indices must be mapped back through that vocabulary, not the full one.
print(get_closest_word(vec, model, vocab_new))

In [None]:
# NOTE(review): `reader` is not defined in any cell above, and the models
# defined in this notebook return 'vec'/'loss' rather than 'tag_logits'.
# This looks like a leftover from a tagging example — confirm and remove.
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

tag_logits = predictor.predict("The dog ate the apple")['tag_logits']

tag_ids = np.argmax(tag_logits, axis=-1)



In [None]:
def print_lines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes objects.

    The file is opened in binary mode, so each line is printed in its
    ``b'...'`` form, including the trailing newline byte.
    """
    with open(file, 'rb') as handle:
        head = handle.readlines()[:n]
    for raw_line in head:
        print(raw_line)

In [None]:
# Transcript CSV inside the workshop checkout. The path is built from the
# current user's home directory (as the other data paths in this notebook are)
# instead of a fixed user name.
# assumes the repository is checked out at ~/git/DLWorkshop — adjust if not
path = os.path.join(home, 'git', 'DLWorkshop', 'data', 'all-seasons.csv')
print_lines(path)

In [None]:
# Load the CSV at `path` with pandas' default parsing options.
corpus = pd.read_csv(path)

In [None]:
# Display the loaded DataFrame.
corpus

In [None]:
# WordNetLemmatizer is not provided by the notebook's import cell, so it is
# imported here.
from nltk.stem import WordNetLemmatizer

context_size = 2
lmtzr = WordNetLemmatizer()
# Punkt sentence tokenizer for English.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Try the preprocessing on the first line: split into sentences, then keep and
# lemmatize the purely alphabetic word tokens.
ls = tokenizer.tokenize(corpus.iloc[0]['Line'])
print(ls)
for sentence in ls:
    print([lmtzr.lemmatize(word) for word in nltk.word_tokenize(sentence) if word.isalpha()])


In [None]:
# Build the training sentences (lower-cased, lemmatized, alphabetic tokens only)
# and count every token. Sentences with two or fewer tokens are discarded.
sentences = []
vocab = {' ': 1}   # ' ' is the padding token used by the batch builders
corpus_len = 0
for i, row in corpus.iterrows():
    ls = tokenizer.tokenize(row['Line'])
    for sentence in ls:
        stemmed_sentence = [
            lmtzr.lemmatize(word.lower())
            for word in nltk.word_tokenize(sentence)
            if word.isalpha()
        ]
        if len(stemmed_sentence) <= 2:
            continue
        corpus_len += len(stemmed_sentence)
        for word in stemmed_sentence:
            vocab[word] = vocab.get(word, 0) + 1
        sentences.append(stemmed_sentence)


In [None]:
# Token <-> index lookup tables, following the insertion order of `vocab`.
print(len(vocab))
word_to_ix = dict(zip(vocab, range(len(vocab))))
ix_to_word = dict(enumerate(vocab))
#vocab
#vocab

In [None]:
# Per-index tables derived from the relative token frequency z:
#   drop_dict   -> (sqrt(z * 1e3) + 1) * 1e-3 / z, compared against a uniform
#                  random number when sub-sampling frequent words
#   sample_dict -> z ** 0.75, normalized to sum to 1
drop_dict = {}
sample_dict = {}
for token, count in vocab.items():
    idx = word_to_ix[token]
    z = count/corpus_len
    drop_dict[idx] = (np.sqrt(z*1e3) + 1)*1e-3/z
    sample_dict[idx] = z**0.75
sample_dict_sum = sum(sample_dict.values())
sample_dict = {idx: weight/sample_dict_sum for idx, weight in sample_dict.items()}
    
    

In [None]:
class CBOWLanguageModeler(nn.Module):
    """Continuous bag-of-words model: averages context embeddings and predicts a word."""

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size):
        """
        Args:
            vocab_size: Number of embedding rows and output classes.
            embedding_dim: Embedding dimension.
            context_size: Not used by this class.
            vec_size: Not used by this class.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        # NOTE(review): relu is not used by forward() or log_probs().
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Return the mean of the embeddings along dimension 1 of ``inputs``."""
        return self.embeddings(inputs).mean(1)

    def log_probs(self, out):
        """Project ``out`` onto the vocabulary and return log-softmax over dim 1."""
        return nn.functional.log_softmax(self.linear(out), dim=1)
    
class SkipGramLanguageModeler(nn.Module):
    """Skip-gram model with a full softmax output layer."""

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size):
        """
        Args:
            vocab_size: Number of embedding rows and output classes.
            embedding_dim: Embedding dimension.
            context_size: Not used by this class.
            vec_size: Not used by this class.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Look up the embeddings of ``inputs``."""
        return self.embeddings(inputs)

    def log_probs(self, out):
        """Project ``out`` onto the vocabulary and return log-softmax over dim 1."""
        logits = self.linear(out)
        return nn.functional.log_softmax(logits, dim=1)
    
class NGramsLanguageModeler(nn.Module):
    """N-gram language model: concatenated context embeddings -> hidden layer -> vocabulary."""

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size):
        """
        Args:
            vocab_size: Number of embedding rows and output classes.
            embedding_dim: Embedding dimension.
            context_size: Number of context tokens per example.
            vec_size: Size of the hidden representation.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, vec_size)
        self.linear2 = nn.Linear(vec_size, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Flatten each row's context embeddings and return the ReLU hidden vector."""
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        return self.relu(self.linear1(embeds))

    def log_probs(self, out):
        """Project the hidden vector onto the vocabulary and return log-softmax over dim 1."""
        return nn.functional.log_softmax(self.linear2(out), dim=1)
    
class SkipGramLanguageModelerSparse(nn.Module):
    """Skip-gram model with negative sampling on two sparse embedding tables.

    ``word_embedding`` holds the center-word vectors, ``context_embedding`` the
    context-word vectors. Negative context words are drawn from the given
    frequencies raised to the power 0.75.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, vec_size, embedding_freq, negative_sampling_size):
        """
        Args:
            vocab_size: Number of rows of both embedding tables.
            embedding_dim: Embedding dimension.
            context_size: Not used by this class.
            vec_size: Not used by this class.
            embedding_freq: Per-index token counts used as sampling weights
                for negative examples.
            negative_sampling_size: Negative samples drawn per context token.
        """
        super().__init__()
        # Unnormalized sampling weights; multinomial() accepts unnormalized input.
        self.embedding_freq = torch.Tensor(embedding_freq)**(0.75)
        self.negative_sampling_size = negative_sampling_size
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        # NOTE(review): linear and relu are not used by any method of this class.
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Return the sum of the word and context embeddings of ``inputs``."""
        return self.word_embedding(inputs) + self.context_embedding(inputs)

    def device(self):
        """Return the device of the model's first parameter."""
        return next(self.parameters()).device

    def calculate_loss(self, word, context,alpha=0.1):
        """Compute the positive-plus-negative binary cross-entropy loss.

        Args:
            word: Center-word indices, ``batch_size`` elements.
            context: Context-word indices of shape ``(batch_size, window_size)``.
            alpha: Not used.

        Returns:
            Scalar loss tensor.
        """
        word_pos = self.word_embedding(word.view(-1,1).expand_as(context))
        context = self.context_embedding(context)
        product_pos = (word_pos*context).sum(dim=-1).mean()
        # ones_like/zeros_like keep the targets on the logits' device; targets
        # created on the CPU break as soon as the model lives on the GPU.
        loss_positive = nn.functional.binary_cross_entropy_with_logits(
            product_pos, torch.ones_like(product_pos))

        negatives_per_row = self.negative_sampling_size*context.size(1)
        negative_context = self.embedding_freq.multinomial(
            negatives_per_row*context.size(0), replacement=True).to(self.device())
        negative_context = negative_context.view(-1, negatives_per_row)
        word_neg = self.word_embedding(word.view(-1,1).expand_as(negative_context))
        negative_context = self.context_embedding(negative_context)
        product_neg = (word_neg*negative_context).sum(dim=-1).mean()
        loss_negative = nn.functional.binary_cross_entropy_with_logits(
            product_neg, torch.zeros_like(product_neg))
        return loss_positive + loss_negative

    def log_probs(self, inp, out, neg):
        """Negative-sampling negative log-likelihood for explicit index tensors.

        Args:
            inp: Center-word indices of shape ``(batch,)``.
            out: Positive context indices of shape ``(batch,)``.
            neg: Negative context indices of shape ``(batch, k)``.

        Returns:
            Scalar tensor: minus the sum of ``logsigmoid`` of every positive
            score and of every negated negative score.
        """
        inp = self.word_embedding(inp)
        out = self.context_embedding(out)
        neg = self.context_embedding(neg)
        score = nn.functional.logsigmoid(torch.sum(torch.mul(inp, out), dim=1))
        # squeeze(2) removes only the bmm column axis, so batch size 1 or a
        # single negative sample keep their dimensions.
        neg_score = torch.bmm(neg, inp.unsqueeze(2)).squeeze(2)
        neg_score = nn.functional.logsigmoid(-1 * neg_score)
        return -1 * (score.sum() + neg_score.sum())

In [None]:
def get_batch(sentences, context_size, word_to_idx, batch_size, mode):
    """Sample a batch of (context, target) index arrays.

    Args:
        sentences: List of non-empty token lists.
        context_size: Number of context tokens on each used side.
        word_to_idx: Token -> index mapping; must contain the padding token ``' '``.
        batch_size: Number of examples to sample (with replacement).
        mode: ``'cbow'`` uses ``context_size`` tokens before AND after the
            target; any other value uses only the preceding tokens.

    Returns:
        ``(inputs, targets)``: ``inputs`` has shape
        ``(batch_size, 2 * context_size)`` for ``'cbow'`` and
        ``(batch_size, context_size)`` otherwise; ``targets`` has shape
        ``(batch_size,)``. Missing neighbours are filled with the padding index.
    """
    pad_idx = word_to_idx[' ']
    inp_tokens = []
    out_tokens = []
    # Sample sentence positions rather than the sentences themselves:
    # np.random.choice cannot handle a list of (ragged) lists.
    for pick in np.random.randint(0, len(sentences), batch_size):
        sentence = sentences[pick]
        low = context_size
        high = len(sentence) - context_size if mode == 'cbow' else len(sentence)
        if high <= low:
            # Sentence too short for a full window: allow every position and
            # rely on padding instead of failing on an empty range.
            low, high = 0, len(sentence)
        selected_word_idx = np.random.randint(low, high)
        # Clamp the start; a negative slice start would wrap around.
        start = max(0, selected_word_idx - context_size)
        context = [word_to_idx[word] for word in sentence[start:selected_word_idx]]
        context = [pad_idx] * (context_size - len(context)) + context
        if mode == 'cbow':
            post_context = [word_to_idx[word]
                            for word in sentence[selected_word_idx+1:selected_word_idx+1+context_size]]
            post_context += [pad_idx] * (context_size - len(post_context))
            context.extend(post_context)
        inp_tokens.append(np.array(context))
        out_tokens.append(np.array(word_to_idx[sentence[selected_word_idx]]))
    return np.array(inp_tokens), np.array(out_tokens)

def get_batch_w2v(sentences, context_size, word_to_idx, batch_size, drop_dict, neg_samples):
    """Sample center words with sub-sampling and return them with their sentence context.

    Args:
        sentences: List of non-empty token lists.
        context_size: Not used; the whole rest of the sentence is the context.
        word_to_idx: Token -> index mapping; must contain the padding token ``' '``.
        batch_size: Number of examples.
        drop_dict: Index -> threshold; a candidate is re-drawn while its
            threshold is smaller than a fresh uniform random number.
        neg_samples: Not used.

    Returns:
        ``(centers, contexts)``: ``centers`` holds the vocabulary index of each
        sampled center word, shape ``(batch_size,)``; ``contexts`` holds the
        indices of all other words of the same sentence, left-padded with the
        padding index to the longest context in the batch.
    """
    pad_idx = word_to_idx[' ']

    def draw():
        # Sample a sentence by position: np.random.choice cannot handle a
        # list of (ragged) lists.
        sentence = sentences[np.random.randint(0, len(sentences))]
        return sentence, np.random.randint(0, len(sentence))

    vec = []
    pos = []
    for _ in range(batch_size):
        sentence, selected_word_idx = draw()
        while drop_dict[word_to_idx[sentence[selected_word_idx]]] < np.random.rand():
            sentence, selected_word_idx = draw()
        pos_words = list(sentence[:selected_word_idx]) + list(sentence[selected_word_idx+1:])
        # The center is stored as its vocabulary index, not as its position
        # inside the sentence, since it is used for the embedding lookup.
        vec.append(word_to_idx[sentence[selected_word_idx]])
        pos.append([word_to_idx[word] for word in pos_words])
    len_longest_sentence = max((len(pos_indices) for pos_indices in pos), default=0)
    pos_padded = [[pad_idx] * (len_longest_sentence - len(pos_indices)) + pos_indices
                  for pos_indices in pos]
    return np.array(vec), np.array(pos_padded)

In [None]:
# Train the sparse skip-gram model (negative sampling) on randomly drawn batches.
embedding_dims = 128
batch_size = 128
n_batches = 64#len(sentences)//batch_size+1
vec_dims = 128
neg_samples = 20
losses = []
# NOTE(review): loss_function is not used in this cell; the model computes its own loss.
loss_function = nn.NLLLoss()
# NOTE(review): the model is built with the literal 10 negative samples, not
# with neg_samples (20) defined above — confirm which value is intended.
model = SkipGramLanguageModelerSparse(len(vocab), embedding_dims, context_size, vec_dims, list(vocab.values()), 10)
optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(3):
    total_loss = 0
    for i in range(n_batches):
        vec, pos = get_batch_w2v(sentences, context_size, word_to_ix,
                                               batch_size, drop_dict, neg_samples=neg_samples)
        # Step 1. Wrap the sampled index arrays in tensors on the target device.
        vec = torch.from_numpy(vec).long().to(device)
        pos = torch.from_numpy(pos).long().to(device)

        # Step 2. Torch accumulates gradients, so clear those of the previous batch.
        model.zero_grad()

        # Step 3. Forward pass: the model returns the loss directly.
        loss = model.calculate_loss(vec, pos)

        

        # Step 4. Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        # Running mean loss, rewritten in place via the carriage return.
        print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}', end='\r')
    print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}')
    losses.append(total_loss/n_batches)
print(losses)  # mean loss per epoch

In [None]:
# Train the n-gram language model: predict a word from the context_size
# preceding words.
embedding_dims = 128
batch_size = 128
n_batches = len(sentences)//batch_size+1
vec_dims = 128
losses = []
loss_function = nn.NLLLoss()
model = NGramsLanguageModeler(len(vocab), embedding_dims, context_size, vec_dims)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(20):
    total_loss = 0
    for i in range(n_batches):
        # Any mode other than 'cbow' makes get_batch use the preceding words only.
        inp_tokens, out_tokens = get_batch(sentences, context_size, word_to_ix, batch_size, mode='ngrams')
        # Step 1. Wrap the context index array in a tensor on the target device.
        context_idxs = torch.from_numpy(inp_tokens).long().to(device)

        # Step 2. Torch accumulates gradients, so clear those of the previous batch.
        model.zero_grad()

        # Step 3. Forward pass: hidden vector, then log probabilities over the vocabulary.
        out = model(context_idxs)
        log_probs = model.log_probs(out)

        # Step 4. Negative log-likelihood of the target words.
        loss = loss_function(log_probs, torch.from_numpy(out_tokens).long().to(device))

        # Step 5. Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        # Running mean loss, rewritten in place via the carriage return.
        print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}', end='\r')
    print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}')
    losses.append(total_loss/n_batches)
print(losses)  # mean loss per epoch

In [None]:
# Train the softmax skip-gram model: predict every context word from the center word.
print(f'Cuda available {torch.cuda.is_available()}')
embedding_dims = 128
batch_size = 128
n_batches = len(sentences)//batch_size+1
vec_dims = 128
losses = []
loss_function = nn.NLLLoss()
model = SkipGramLanguageModeler(len(vocab), embedding_dims, context_size, vec_dims)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(20):
    total_loss = 0
    for i in range(n_batches):
        # get_batch returns (context, center) in 'cbow' mode; skip-gram swaps
        # the roles: the center word is the input, the context words the targets.
        out_tokens, inp_tokens = get_batch(sentences, context_size, word_to_ix, batch_size, mode='cbow')
        # Step 1. Wrap the center-word index array in a tensor on the target device.
        context_idxs = torch.from_numpy(inp_tokens).long().to(device)

        # Step 2. Torch accumulates gradients, so clear those of the previous batch.
        model.zero_grad()

        # Step 3. Forward pass: embedding, then log probabilities over the vocabulary.
        out = model(context_idxs)
        log_probs = model.log_probs(out)

        # Step 4. Average the negative log-likelihood over all context positions.
        n_targets = out_tokens.shape[1]
        loss = sum(
            loss_function(log_probs, torch.from_numpy(out_tokens[:, j]).long().to(device))
            for j in range(n_targets)
        ) / n_targets

        # Step 5. Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        # Running mean loss, rewritten in place via the carriage return.
        print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}', end='\r')
    print(f'Epoch: {epoch}, Batch: {i}/{n_batches} loss: {total_loss/(i+1)}')
    losses.append(total_loss/n_batches)
print(losses)  # mean loss per epoch

In [None]:
# Train the CBOW model: predict the center word from the averaged embeddings
# of context_size words on each side.
embedding_dims = 128
context_size = 2
batch_size = 128
n_batches = len(sentences)//batch_size+1
vec_dims = 128
losses = []
loss_function = nn.NLLLoss()
model = CBOWLanguageModeler(len(vocab), embedding_dims, context_size, vec_dims)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(20):
    total_loss = 0
    for i in range(n_batches):
        inp_tokens, out_tokens = get_batch(sentences, context_size, word_to_ix, batch_size, mode='cbow')
        # Step 1. Wrap the context index array in a tensor on the target device.
        context_idxs = torch.from_numpy(inp_tokens).long().to(device)

        # Step 2. Torch accumulates gradients, so clear those of the previous batch.
        model.zero_grad()

        # Step 3. Forward pass: mean context embedding, then log probabilities
        # over the vocabulary.
        out = model(context_idxs)
        log_probs = model.log_probs(out)

        # Step 4. Negative log-likelihood of the center words.
        loss = loss_function(log_probs, torch.from_numpy(out_tokens).long().to(device))

        # Step 5. Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    print(total_loss/n_batches)
    losses.append(total_loss/n_batches)
print(losses)  # mean loss per epoch

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def cosine_distance(id0, id1, model):
    """Compare the model outputs for two word ids.

    Despite the name, both returned values are cosine *similarities*
    (larger means more alike), not distances.

    Args:
        id0: Vocabulary index of the first word.
        id1: Vocabulary index of the second word.
        model: Callable applied to a 1-D int64 tensor holding the two ids;
            the tensor is placed on the notebook-level ``device``.

    Returns:
        A ``(manual, sk_sim)`` tuple: ``manual`` is the similarity of
        ``out[0]`` and ``out[1]`` computed directly with torch, ``sk_sim`` is
        what sklearn's ``cosine_similarity`` returns for the whole output
        (presumably a 2x2 pairwise matrix when the model yields one vector
        per id -- TODO confirm).
    """
    id_pair = torch.from_numpy(np.array([id0, id1])).long().to(device)
    out = model(id_pair)

    # Reference value from sklearn, computed on a detached CPU copy.
    sk_sim = cosine_similarity(out.data.cpu().numpy())

    # Same quantity by hand: element-wise product scaled by the product of
    # the two norms, then summed.
    first, second = out[0], out[1]
    norm_product = first.norm() * second.norm()
    manual = (first * second / norm_product).sum()
    return manual, sk_sim
    
# Compare 'good' against three other words; every call prints the
# (manual, sklearn) pair returned by cosine_distance.
for other_word in ('nice', 'fantastic', 'terrible'):
    print(cosine_distance(word_to_ix[other_word], word_to_ix['good'], model))


In [None]:
# Word-analogy probe: take the model outputs for 'man', 'boy' and 'girl',
# form man - boy + girl, and look up the vocabulary entry whose embedding row
# has the highest cosine similarity with that vector.
# NOTE(review): the three id names below are leftovers that do not match the
# words they index ('man', 'boy', 'girl'); they are kept unchanged because
# they are notebook-level globals.
cartman_id, asshole_id, poor_id = (word_to_ix[w] for w in ('man', 'boy', 'girl'))
query_ids = [cartman_id, asshole_id, poor_id]
print(query_ids)

out = model(torch.from_numpy(np.array(query_ids)).long().to(device))
print(out.shape)
print(out[1] - out[2])

# The analogy vector itself.
vec = out[0] - out[1] + out[2]
print(out[0], vec)
print(vec.shape)
print(model.word_embedding.weight.shape)

# Cosine similarity of `vec` with every row of the embedding matrix:
# row-wise dot product divided by the product of the two norms.
# NOTE(review): `vec` comes from model(...) while the candidates are raw
# embedding rows -- confirm both live in the same vector space.
embedding_weight = model.word_embedding.weight
vec_tiled = vec.view(1, -1).expand_as(embedding_weight)
score = (vec_tiled * embedding_weight).sum(dim=-1)
norm = vec_tiled.norm(dim=1) * embedding_weight.norm(dim=1)
print(score.shape)
print(norm.shape)
print(score)
print(norm)

word = (score / norm)
print(word.shape)
print(word)

# Index of the best-scoring row, mapped back to its word.
word = word.argmax()
print(ix_to_word[word.item()])

In [None]:

# Location of the Cornell Movie-Dialogs corpus, relative to the notebook's
# working directory.
data_root = "../data"
corpus_name = "cornell_movie-dialogs_corpus"
corpus = os.path.join(data_root, corpus_name)

def print_lines(file, n=10):
    """Print the first ``n`` raw lines of a file.

    The file is opened in binary mode, so each line is printed as a ``bytes``
    repr (line terminator included) and no decoding errors can occur.

    Args:
        file: Path of the file to preview.
        n: Number of leading lines to print. ``None`` prints every line; a
            negative value keeps list-slice semantics (all but the last
            ``|n|`` lines).
    """
    from itertools import islice  # local import keeps the cell self-contained

    with open(file, 'rb') as datafile:
        if n is not None and n < 0:
            # A negative bound can only be resolved once the total number of
            # lines is known, so this path has to read the whole file.
            preview = datafile.readlines()[:n]
        else:
            # Read just the requested lines instead of loading the entire
            # (potentially very large) file into memory.
            preview = list(islice(datafile, n))
    for line in preview:
        print(line)

# Preview the first lines of the raw movie_lines.txt file.
movie_lines_path = os.path.join(corpus, "movie_lines.txt")
print_lines(movie_lines_path)

In [None]:
# Splits each line of the file into a dictionary of fields
def load_lines(file_name, fields):
    """Parse a " +++$+++ "-delimited corpus file into a DataFrame.

    Args:
        file_name: Path of the file to read (decoded as ISO-8859-1).
        fields: Column names, one per field, in file order.

    Returns:
        pd.DataFrame with one row per non-blank line and one string column
        per entry of ``fields``. An empty file yields an empty frame that
        still has those columns.

    Raises:
        IndexError: If a non-blank line has fewer fields than ``fields``.
    """
    records = []
    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # Drop the line terminator so it does not end up glued to the
            # last field (e.g. a "text" value ending in "\n").
            line = line.rstrip("\n")
            if not line:
                # Blank lines (such as a trailing one at EOF) carry no record.
                continue
            values = line.split(" +++$+++ ")
            records.append({field: values[i] for i, field in enumerate(fields)})
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=list(fields))

In [None]:
# Column names for the two corpus files, in the order the fields appear.
header_lines = [
    "line_id",
    "character_id",
    "movie_id",
    "character",
    "text",
]
header_conversations = [
    "character1_id",
    "character2_id",
    "movie_id",
    "utterance_id",
]

# Parse movie_lines.txt into the `lines` DataFrame.
print("\nProcessing corpus...")
movie_lines_file = os.path.join(corpus, "movie_lines.txt")
lines = load_lines(movie_lines_file, header_lines)

In [None]:
# Show the DataFrame returned by load_lines as this cell's output.
lines

In [None]:
# Groups fields of lines from `load_lines` into conversations based on *movie_conversations.txt*
def load_conversations(file_name, lines, fields):
    """Parse the conversations file and attach the matching rows of ``lines``.

    Args:
        file_name: Path of a " +++$+++ "-delimited file (decoded as
            ISO-8859-1) whose utterance field is a Python list literal of
            line ids, e.g. "['L598485', 'L598486']".
        lines: DataFrame with a ``line_id`` column.
        fields: Column names, one per field, in file order. The field holding
            the list of ids is ``"utteranceIDs"`` or ``"utterance_id"`` when
            one of them is present, otherwise the last entry of ``fields``.

    Returns:
        pd.DataFrame with one row per conversation: the parsed fields plus a
        ``lines`` column holding a list with one slice of ``lines`` per
        utterance id (an empty slice when the id is unknown).

    Raises:
        IndexError: If a non-blank line has fewer fields than ``fields``.
        ValueError, SyntaxError: If the utterance field is not a valid
            Python literal.
    """
    import ast  # local import: safe replacement for eval() on file content

    # Resolve which column carries the ids instead of hard-coding a key that
    # may not be part of `fields` (which would raise KeyError on every row).
    known_names = [name for name in ("utteranceIDs", "utterance_id") if name in fields]
    ids_field = known_names[0] if known_names else fields[-1]

    # Map every line id to its row position(s) once, so each lookup below is
    # a dictionary access rather than a scan over the whole DataFrame.
    positions = {}
    for pos, known_id in enumerate(lines['line_id']):
        positions.setdefault(known_id, []).append(pos)

    conversations = []
    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            # Remove the line terminator so it is not stored in the last field.
            line = line.rstrip("\n")
            if not line:
                continue
            values = line.split(" +++$+++ ")
            # Extract fields
            conversation = {field: values[i] for i, field in enumerate(fields)}
            # Convert string to list (conversation[ids_field] == "['L598485', 'L598486', ...]").
            # NOTE: this used to be eval(); literal_eval accepts the same list
            # literal but cannot execute arbitrary code read from the file.
            line_ids = ast.literal_eval(conversation[ids_field])
            # Reassemble lines: same rows (and index labels) that
            # `lines.loc[lines['line_id'] == line_id]` would select.
            conversation["lines"] = [
                lines.iloc[positions.get(line_id, [])] for line_id in line_ids
            ]
            conversations.append(conversation)
    return pd.DataFrame(conversations)

In [None]:
# Build the conversations table; as the cell's last expression it is also
# displayed as the cell output.
conversations_file = os.path.join(corpus, "movie_conversations.txt")
load_conversations(conversations_file, lines, header_conversations)