In [1]:
!pip install sentence_transformers
from google.colab import drive
import os
import gzip
import json
import pandas as pd
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 28.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 51.3 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 65.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.4 MB/s 
Building wheels for collected pa

##**RobertaTraining**


We takes the previously created sentence embeddings, clusters and vocabulary fine and performs fine tuning of Roberta model to generate document vectors.
We run 2 epochs for training on 25000 rows of dataset 

In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
from tqdm.auto import tqdm
import re
from scipy.cluster.vq import *
import torch
import torch.nn as nn
import torch.optim as optim
import math
import json
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import AdamW


In [3]:
def textToSentences(t):
  try:
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', t)
  except:
    sentences = [""]
  return sentences
def getSentencesEmbedding(transformer, data):
  sentences = [textToSentences(t) for t in data]
  dataset_embeddings = []
  for i in range(len(sentences)):
    embeddings = transformer.encode(sentences[i])
    dataset_embeddings.append(embeddings)

  return dataset_embeddings

In [4]:
import numpy as np
embeddings=np.load('/content/drive/MyDrive/Data/sentence_embeddings_25000.npy',allow_pickle=True)
codebook=np.load('/content/drive/MyDrive/Data/codebook_25000.npy',allow_pickle=True)
json_file_path='/content/drive/MyDrive/Data/vocabs_25000.json'
with open(json_file_path, 'r') as j:
     vocabs = json.loads(j.read())

In [5]:
print(type(embeddings),embeddings.shape)
print(len(vocabs))

<class 'numpy.ndarray'> (25000, 768)
104


In [6]:
def create_tokens(document_embeddings, vocabs, cluster_centers, max_length= -1, padding = True, truncation = True): 
  docs_tokens = []
  for d in document_embeddings:
    sentences_cluster_ids = vq([d],cluster_centers)
    sentences_token_ids = []
    attention_mask = []
    for cluster_id in sentences_cluster_ids[0]: 
      sentences_token_ids.append(vocabs[str(cluster_id)])

    if truncation and max_length != -1 :
      if len(sentences_token_ids) + 2 > max_length:
        sentences_token_ids = sentences_token_ids[:(max_length-2)]
        # padding = False

    input_ids = [vocabs[' ']] + sentences_token_ids + [vocabs[' ']]
    attention_mask.extend([1] * len(input_ids))

    if padding:
      padding_len = max_length - len(input_ids)
      input_ids.extend([vocabs[' ']] * padding_len)
      attention_mask.extend([0] * padding_len)

    docs_tokens.append({'input_ids':input_ids, 'attention_mask':attention_mask})
  
  return docs_tokens

In [7]:
from scipy.cluster.vq import *
import torch
from pprint import pprint

batch = create_tokens(embeddings, vocabs, codebook, max_length=512, padding=True, truncation=True)
for x in batch:
  if len(x['input_ids'])<512:
    print(len(x['input_ids']))
labels = torch.tensor([x['input_ids'] for x in batch])
mask = torch.tensor([x['attention_mask'] for x in batch])
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()
# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)
# make a random mask of 15%
mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 1) * (input_ids != 2)
# add the masked selections to input_ids
for i in range(input_ids.shape[0]):
  selection = torch.flatten(mask_arr[i].nonzero()).tolist()
  # mask input_ids
  input_ids[i, selection] = 4  # our custom 

In [8]:
#Dataloader class 
class Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
      # store encodings internally
      self.encodings = encodings

  def __len__(self):
      # get sample length
      return self.encodings['input_ids'].shape[0]

  def __getitem__(self, i):
      return {key: tensor[i] for key, tensor in self.encodings.items()}

In [9]:
encodings = {'input_ids': input_ids, 'attention_mask': mask, 'labels': labels}
data = Dataset(encodings)
loader = torch.utils.data.DataLoader(data, batch_size=16, shuffle=True)

In [10]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import AdamW
vocabulary_size = 105
config = RobertaConfig(
    vocab_size= vocabulary_size,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    model_max_length=512,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    position_vocab_size=514,
    type_vocab_size=1
)
model = RobertaForMaskedLM(config)

In [11]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device='cpu'
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(105, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [12]:
model.train()
# Adam optimizer initialized
optim = AdamW(model.parameters(), lr=1e-4)
epochs = 2

for epoch in range(epochs):
    batch_loops = tqdm(loader, leave=True)
    for batch in batch_loops:
        # set gradient to zero
        optim.zero_grad()
        # get input_ids and attention mask
        inputIds = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(inputIds, attention_mask=attention_mask,labels=labels)
        # Calculate loss
        loss = outputs.loss
        print("Loss: ",loss)
        # back propogation
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        batch_loops.set_description(f'Epoch {epoch}')
        batch_loops.set_postfix(loss=loss.item())



  0%|          | 0/1563 [00:00<?, ?it/s]

Loss:  tensor(4.8757, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.3386, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0613, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0333, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0269, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0249, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0245, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0240, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0240, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0236, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0236, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0237, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0232, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0235, device='cuda:0', grad_fn=<NllLossBackward0>)
Loss:  tensor(0.0232, device='cuda:0', grad_fn=<NllLossBackwar

In [13]:
model.save_pretrained('/content/drive/MyDrive/Data/docberta_dummy_25000')