In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [118]:
import numpy as np
past_values = np.load('dataset_train/past_values_da.npy')
vocabulary = np.arange(24)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,
       11,  2,  2,  0,  1,  1])

In [121]:
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.

    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]

    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

In [122]:
def get_batches(context_tuple_list, batch_size=100):
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context, batch_negative = [], [], []
    for i in range(len(context_tuple_list)):
        batch_target.append(context_tuple_list[i][0])
        batch_context.append(context_tuple_list[i][1])
        batch_negative.append([w for w in context_tuple_list[i][2]])
        if (i+1) % batch_size == 0 or i == len(context_tuple_list)-1:
            tensor_target = autograd.Variable(torch.from_numpy(np.array(batch_target)).long())
            tensor_context = autograd.Variable(torch.from_numpy(np.array(batch_context)).long())
            tensor_negative = autograd.Variable(torch.from_numpy(np.array(batch_negative)).long())
            batches.append((tensor_target, tensor_context, tensor_negative))
            batch_target, batch_context, batch_negative = [], [], []
    return batches

In [123]:
from numpy.random import multinomial
from collections import Counter
import random, math
import re
import itertools
def sample_negative(sample_size):
    sample_probability = {}
    word_counts = dict(Counter(list(itertools.chain.from_iterable(past_values))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                 word_list.append(words[index])
        yield word_list

In [124]:
import numpy as np

context_tuple_list = []

negative_samples = sample_negative(8)

for text in past_values:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                context_tuple_list.append((word, text[j], next(negative_samples)))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

There are 626980 pairs of target and context words


In [125]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class Word2Vec(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        emb_product = torch.mul(emb_target, emb_context)
        emb_product = torch.sum(emb_product, dim=1)
        out = torch.sum(F.logsigmoid(emb_product))
        emb_negative = self.embeddings_context(negative_example)
        emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
        emb_product = torch.sum(emb_product, dim=1)
        out += torch.sum(F.logsigmoid(-emb_product))
        return -out

In [126]:
import time

vocabulary_size = len(vocabulary)

loss_function = nn.CrossEntropyLoss()
net = Word2Vec(embedding_size=4, vocab_size=vocabulary_size)
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping(patience=5, min_percent_gain=0.01)

while True:
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for i in range(len(context_tuple_batches)):
        net.zero_grad()
        target_tensor, context_tensor, negative_tensor = context_tuple_batches[i]
        loss = net(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.data)
    print("Loss: ", np.mean(losses))
    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break

Loss:  1704.3027
Loss gain: 0.19%
Loss:  1703.8567
Loss gain: 0.15%
Loss:  1703.4968
Loss gain: 0.12%
Loss:  1703.1843
Loss gain: 0.1%
Loss:  1702.9463
Loss gain: 0.08%
Loss:  1702.6953
Loss gain: 0.07%
Loss:  1702.5277
Loss gain: 0.06%
Loss:  1702.3368
Loss gain: 0.05%
Loss:  1702.1917
Loss gain: 0.04%
Loss:  1702.0294
Loss gain: 0.04%
Loss:  1701.9031
Loss gain: 0.04%
Loss:  1701.7683
Loss gain: 0.03%
Loss:  1701.6635
Loss gain: 0.03%
Loss:  1701.5215
Loss gain: 0.03%
Loss:  1701.3796
Loss gain: 0.03%
Loss:  1701.2863
Loss gain: 0.03%
Loss:  1701.1938
Loss gain: 0.03%
Loss:  1701.0496
Loss gain: 0.03%
Loss:  1700.9504
Loss gain: 0.03%
Loss:  1700.8422
Loss gain: 0.03%
Loss:  1700.7478
Loss gain: 0.03%
Loss:  1700.6373
Loss gain: 0.02%
Loss:  1700.5271
Loss gain: 0.02%
Loss:  1700.422
Loss gain: 0.02%
Loss:  1700.3387
Loss gain: 0.02%
Loss:  1700.2301
Loss gain: 0.02%
Loss:  1700.0984
Loss gain: 0.03%
Loss:  1700.0088
Loss gain: 0.02%
Loss:  1699.9076
Loss gain: 0.03%
Loss:  1699.8213

In [132]:
torch.save(net.state_dict(), 'word2vec.pth')