# Part 2: Text embedding

We form text embeddings using the PV-DBOW method: for a given paragraph, the model learns to tell the words that actually occur in it (positives) apart from randomly sampled words (negatives).
A GPU is needed to do anything worthwhile.

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("classic")
import operator
import functools
import itertools
import dataclasses
import json
import re
import collections
import pickle
import scipy.sparse as sp
from tqdm.autonotebook import tqdm
import torch
import opt_einsum as oe
import torch.nn as nn
@dataclasses.dataclass
class ScriptParams():
    """Paths and hyper-parameters shared by the whole script.

    Every attribute is annotated so that ``dataclass`` registers it as a real
    field. Without annotations the decorator generates a constructor with no
    parameters, so a value could never be overridden per instance.
    ``ScriptParams()`` and class-level access (``ScriptParams.batch_size``)
    keep returning the same defaults as before.
    """
    # Path prefix of the pre-processed text data; ".npz" is appended on load.
    second_pass_p: str = "../input/textdata/abstracts_p2.txt"
    batch_size: int = 16384  # really large batch size to fully use the GPU.
    n_neg: int = 128  # number of negative samples per input.
    embedding_dim: int = 64  # pretty low-dimensional embedding to avoid overfitting.
@dataclasses.dataclass
class TextData():
    """Container for the pre-processed abstracts loaded from disk.

    The field names match the arrays unpacked from the ``.npz`` archive, and
    the field order is part of the positional constructor signature.
    """
    # One entry per document; its length is used as the number of documents.
    authors: np.ndarray
    # Word ids, indexed by flat position across all documents.
    # NOTE(review): presumably all abstracts concatenated -- confirm upstream.
    abstracts: np.ndarray
    # Looked up with searchsorted to map a flat word position to a document.
    # NOTE(review): presumably sorted start positions -- confirm upstream.
    offsets: np.ndarray
    # One entry per word; its length is used as the vocabulary size.
    vocabulary: np.ndarray
    # Per-word counts/weights, normalised into sampling probabilities.
    frequencies: np.ndarray

In [81]:
class positive_dataset(torch.utils.data.dataset.Dataset):
    """Map-style dataset of (document, positive word, word weight) samples.

    ``dataset[idx]`` returns ``(doc_idx, word_id, neg_log_p)``:

    * ``doc_idx``   -- index of the document containing position ``idx``,
    * ``word_id``   -- the word id stored at that position, as ``int64``,
    * ``neg_log_p`` -- ``-log`` of the word's empirical unigram probability.

    ``idx`` may be a single position or an array of positions; every
    operation in ``__getitem__`` is vectorised, so a whole batch of indices
    can be fetched in one call.
    """

    def __init__(self, data: "TextData"):
        """Store the word stream and pre-compute the per-word weights.

        data: object exposing ``abstracts``, ``frequencies``, ``authors`` and
              ``offsets`` arrays.
        """
        # Zero-argument super(): the previous one-argument form created an
        # unbound super object, so the parent initialiser was never invoked.
        super().__init__()
        self.words = data.abstracts
        freq = data.frequencies
        # Normalise to probabilities, then take the negative log.
        self.p = -np.log(freq / freq.sum())
        self.docs = data.authors
        self.offsets = data.offsets

    def __len__(self):
        """Number of word positions in the corpus."""
        return len(self.words)

    def __getitem__(self, idx):
        """Return ``(doc_idx, word_id, neg_log_p)`` for position(s) ``idx``."""
        w = self.words[idx].astype(np.int64)
        # The document containing a position is the last offset <= idx.
        # NOTE(review): assumes self.offsets is sorted ascending -- confirm.
        a = np.searchsorted(self.offsets, idx, side="right") - 1
        p = self.p[w]
        return (a, w, p)
    
class BatchRandomSampler(torch.utils.data.Sampler[int]):
    """Sampler that yields whole batches of shuffled indices.

    Each iteration draws a fresh random permutation of ``range(indices)`` and
    yields it in consecutive chunks of ``batch_size``. A trailing chunk that
    is smaller than ``batch_size`` is dropped, so every yielded batch has
    exactly ``batch_size`` entries.
    """

    def __init__(self, indices: int, batch_size=None) -> None:
        """
        indices: total number of items to sample from (an int, not a sequence).
        batch_size: items per batch; defaults to ``ScriptParams().batch_size``.
        """
        self.indices = indices
        # Resolve the default lazily so an explicit batch_size needs no config.
        self.batch_size = ScriptParams().batch_size if batch_size is None else batch_size

    def __iter__(self):
        permut = np.random.permutation(self.indices)  # reshuffled on every pass
        bs = self.batch_size
        # The stop bound excludes the final, incomplete batch.
        return (permut[i:i + bs] for i in range(0, self.indices - bs + 1, bs))

    def __len__(self):
        """Number of full batches produced per pass."""
        # self.indices is an int; calling len() on it raised TypeError.
        return self.indices // self.batch_size
    

def make_sampler(dataset):
    """Build the DataLoader that feeds positive samples to training.

    Automatic batching is switched off (``batch_size=None``): the custom
    sampler already yields a whole batch of indices at a time, and the
    dataset is indexed once per batch instead of once per item, which avoids
    per-sample overhead.
    """
    batch_sampler = BatchRandomSampler(len(dataset.words))
    loader = torch.utils.data.dataloader.DataLoader(
        dataset,
        sampler=batch_sampler,
        batch_size=None,
        pin_memory=True,
        num_workers=4,
        prefetch_factor=2,
    )
    return loader

class negative_dataset(torch.utils.data.dataset.IterableDataset):
    """Endless stream of negative-sample batches.

    Every item is a pair ``(word_ids, neg_log_p)``, both shaped
    ``(batch_size, n_neg)``. Word ids are drawn with replacement in proportion
    to ``data.frequencies``; ``neg_log_p`` is ``-log`` of each drawn word's
    normalised frequency. Tensors live on CUDA when available, else on CPU.
    """

    def __init__(self, data: "TextData", batch_size=None, n_neg=None):
        """
        data: object exposing a ``frequencies`` NumPy array (one entry per word).
        batch_size: rows per batch; defaults to ``ScriptParams().batch_size``.
        n_neg: negatives per row; defaults to ``ScriptParams().n_neg``.
        """
        # Zero-argument super(): the previous one-argument form created an
        # unbound super object, so the parent initialiser was never invoked.
        super().__init__()
        if batch_size is None or n_neg is None:
            params = ScriptParams()
            batch_size = params.batch_size if batch_size is None else batch_size
            n_neg = params.n_neg if n_neg is None else n_neg
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Unnormalised sampling weights; torch.multinomial accepts them as is.
        self.f = torch.from_numpy(data.frequencies).to(torch.float).to(device)
        # Out-of-place on purpose: from_numpy shares memory with the NumPy
        # array when no dtype/device copy happens, so an in-place division
        # would silently overwrite the caller's frequencies.
        self.p = - torch.log(self.f / self.f.sum())  # negative log-proba
        self.n_per_batch = batch_size * n_neg
        self.voc_size = len(self.p)
        self.device = device
        self.voc = torch.arange(0, len(self.p), device=device)
        self.shape = (batch_size, n_neg)

    def __iter__(self):
        """Yield ``(word_ids, neg_log_p)`` batches forever.

        The stream never ends; the consumer is expected to bound it (e.g. by
        zipping it with a finite loader).
        """
        while True:
            # One flat draw for the whole batch, reshaped afterwards.
            idx = torch.multinomial(self.f, self.n_per_batch, replacement=True)
            yield idx.reshape(self.shape), self.p[idx].reshape(self.shape)

In [82]:
def blackout(logits):
    """Blackout-inspired loss over a batch of logits.

    Column 0 of ``logits`` is taken to be the true (positive) class. The
    result is the batch mean of ``logsumexp(row) - row[0]``.
    """
    true_logit = logits[:, 0]
    log_partition = torch.logsumexp(logits, dim=1)
    per_sample = log_partition - true_logit
    return per_sample.mean()

In [83]:
class Doc2Vec(nn.Module):
    """Document-to-vector model based on PV-DBOW and the Blackout loss.

    Holds one embedding table for documents and one for words. A logit is
    the dot product of a document embedding with a word embedding, shifted
    by a per-word weight supplied by the caller.
    """

    def __init__(self, n_doc, n_voc, n_emb):
        """
        inputs:
        -----
        n_doc: number of documents to embed
        n_voc: number of words to embed
        n_emb: dimension of the embeddings
        """
        super().__init__()
        self.doc_emb = nn.Embedding(num_embeddings=n_doc, embedding_dim=n_emb)
        self.voc_emb = nn.Embedding(num_embeddings=n_voc, embedding_dim=n_emb)
        # The contraction is compiled once for fixed operand shapes taken
        # from ScriptParams: (bs, h) documents and (bs, n_neg + 1, h) words.
        params = ScriptParams()
        doc_shape = (params.batch_size, n_emb)
        ctx_shape = (params.batch_size, params.n_neg + 1, n_emb)
        self.einsum = oe.contract_expression("ij,ikj->ik", doc_shape, ctx_shape)

    def forward(self, docs, ctx, log_p):
        """Compute weighted logits for a batch of documents and contexts.

        docs: (bs,)
        ctx: (bs, n_ctx)
        log_p: (bs, n_ctx)
        returns: (bs, n_ctx)
        """
        doc_vecs = self.doc_emb(docs)  # bs, h
        word_vecs = self.voc_emb(ctx)  # bs, n_ctx, h
        dots = self.einsum(doc_vecs, word_vecs, backend='torch')
        # Re-weighting according to the formula from Blackout.
        return dots + log_p

In [84]:
def make_model(abstract_data: TextData, lr=0.01):
    """Instantiate the Doc2Vec model and a plain SGD optimizer for it.

    The table sizes come from the data (number of authors entries, number of
    vocabulary entries); the embedding width comes from ScriptParams.

    returns: (model, optimizer)
    """
    model = Doc2Vec(
        n_doc=len(abstract_data.authors),
        n_voc=len(abstract_data.vocabulary),
        n_emb=ScriptParams().embedding_dim,
    )
    return model, torch.optim.SGD(model.parameters(), lr)

In [85]:
def train(model, optimizer, inputs, negatives, n_epoch):
    """Run ``n_epoch`` passes of training.

    model: network called as ``model(docs, ctx, log_p)``.
    optimizer: optimizer stepping the model's parameters.
    inputs: iterable of ``(doc, pos, p_pos)`` positive batches.
    negatives: iterable of ``(neg, p_neg)`` negative batches. Only the
               positive tensors are moved to the device here, so the
               negatives must already be on it.
    n_epoch: number of passes over ``inputs``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    for _ in range(n_epoch):
        # zip ends the epoch as soon as the positive loader is exhausted.
        progress = tqdm(zip(inputs, negatives))
        for positive_batch, negative_batch in progress:
            doc, pos, p_pos = (t.to(device) for t in positive_batch)
            neg, p_neg = negative_batch
            optimizer.zero_grad()
            # Positive word goes in column 0, where the loss expects it.
            ctx = torch.cat([pos.reshape((-1, 1)), neg], dim=1)
            weights = torch.cat([p_pos.reshape((-1, 1)), p_neg], dim=1)
            loss = blackout(model(doc, ctx, weights))
            loss.backward()
            optimizer.step()
            progress.set_postfix_str(f"loss :{loss.item():.4f}", refresh=False)

In [86]:
# Load the previously computed text data.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use it on archives from a trusted source.
with np.load(ScriptParams().second_pass_p+".npz", allow_pickle=True) as data:
    abstracts_data = TextData(**data)
    

model, optimizer = make_model(abstracts_data)
positives = positive_dataset(abstracts_data)
#positives= make_dataset_gpu(abstracts_data)
inputs = make_sampler(positives)
#inputs = make_sampler_gpu(positives)
negatives = negative_dataset(abstracts_data)

del abstracts_data # drop this name; the datasets keep references to the arrays they use.

In [87]:
# Train for 5 epochs, then checkpoint the weights.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 5)
torch.save(model.state_dict(), "epoch_5.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [89]:
for g in optimizer.param_groups:
    g['lr'] = 0.05 # make the learning rate a bit bigger since training seemed to be slow

In [90]:
# Train for 10 more epochs, then checkpoint the weights.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_15.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [92]:
for g in optimizer.param_groups:
    g['lr'] = 0.1 # make the learning rate a bit bigger since training seemed to be slow

In [93]:
# Train for 10 more epochs, then checkpoint the weights.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_25.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [94]:
for g in optimizer.param_groups:
    g['lr'] = 0.2 # make the learning rate a bit bigger since training seemed to be slow

In [95]:
# Train for 10 more epochs, then checkpoint the weights.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_35.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [96]:
for g in optimizer.param_groups:
    g['lr'] = 0.5 # make the learning rate a bit bigger since training seemed to be slow

In [97]:
# Train for 5 more epochs, then checkpoint the weights.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 5)
torch.save(model.state_dict(), "epoch_40.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [98]:
for g in optimizer.param_groups:
    g['lr'] = 0.8 # make the learning rate a bit bigger since training seemed to be slow

In [99]:
# Train for 10 more epochs, then checkpoint the weights.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_50.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [100]:
for g in optimizer.param_groups:
    g['lr'] = 1.0 # make the learning rate a bit bigger since training seemed to be slow

In [101]:
# Loss seems to stabilise; we will stop the training for now.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_60.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [102]:
for g in optimizer.param_groups:
    g['lr'] = 1.5 # make the learning rate a bit bigger since training seemed to be slow

In [103]:
# Loss seems to stabilise; we will stop the training for now.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_70.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [104]:
# Loss seems to stabilise; we will stop the training for now.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_80.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [105]:
# Loss seems to stabilise; we will stop the training for now.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_90.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [106]:
# Loss seems to stabilise; we will stop the training for now.
# Should take about 2 minutes per epoch on Kaggle's GPUs.
train(model, optimizer, inputs, negatives, 10)
torch.save(model.state_dict(), "epoch_100.pt")

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

|          | 0/? [00:00<?, ?it/s]

In [107]:
# Copy the learned document-embedding table to host memory as a NumPy array.
doc_embs = model.doc_emb.weight.cpu().detach().numpy()

In [108]:
# Persist the document embeddings to disk.
np.save("abstract_embeddings.npy", doc_embs)