<a href="https://colab.research.google.com/github/jrakhshanda/Bayesian-Methods/blob/master/roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install tokenizers
#!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev
#!export XLA_USE_BF16=1

In [4]:
import os
import string
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import *
from transformers import AdamW, get_linear_schedule_with_warmup
import tokenizers

In [5]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Configuration

In [6]:
from tokenizers import ByteLevelBPETokenizer
class config:
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 8
    EPOCHS = 5
    PATH = '/content/drive/MyDrive/RoBERTa_files'
    TRAINING_FILE = pd.read_csv(PATH + '/TRAINING_FILE.csv')
    TEST_FILE =  pd.read_csv(PATH + '/TEST_FILE.csv')
    MAX_LEN = 141
    TOKENIZER = ByteLevelBPETokenizer(f"{PATH}/vocab.json",
                                      f"{PATH}/merges.txt",
                                      lowercase=True, add_prefix_space=True)

# Processing of Data

In [7]:
def process_data(text, selected_text, sentiment, tokenizer=config.TOKENIZER, max_len=config.MAX_LEN):
    # roberta requires the text to have a prefix space at the beginning
    text = " " + " ".join(str(text).split(" "))
    selected_text = " " + " ".join(str(selected_text).split(" "))

    # getting initial and final index of selected_text within the text
    len_selected = len(selected_text) - 1
    idx1 = idx2 = None
    for idx, letter in enumerate(selected_text):
        if (text[idx] == selected_text[1]) and (" " + text[idx: idx+len_selected] == selected_text):
            idx1 = idx
            idx2 = idx1 + len_selected - 1
            break
    
    # making character targets
    if idx1!=None and idx2!=None:
        char_targets = [0] * len(text)
        for i in range(idx1, idx2+1):
            char_targets[i] = 1
    else:
        char_targets = [1] * len(text)

    # encoding using pretrained tokenizer
    tok_text = tokenizer.encode(text)
    ids = tok_text.ids
    mask = tok_text.attention_mask
    token_type_ids = tok_text.type_ids

    # getting indexes of tokens containing character in selected_text
    target_idx = []
    for i, (offset1, offset2) in enumerate(tok_text.offsets):
        if sum(char_targets[offset1: offset2])>0:
            target_idx.append(i)

    # we just need the indexes of the start and end tokens as we are using 
    # nn. CrossEntropy as loss
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    # token ids of sentiment as present in our vocab hard coded here
    sentiment_ids = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }

    # adding special tokens
    ids = [0] + [sentiment_ids[sentiment]] + [2] + [2] + ids + [2]
    mask = [1] * len(ids)
    token_type_ids = [0] * len(ids)
    offsets = [(0, 0)] * 4 + tok_text.offsets
    targets_start += 4
    targets_end += 4

    # padding
    padding_len = max_len - len(ids)
    if padding_len>0:
        ids = ids + [1] * padding_len
        mask = mask + [0] * padding_len
        token_type_ids = token_type_ids + [0] * padding_len
        offsets = offsets + [(0, 0)] * padding_len

    return {'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets_start': torch.tensor(targets_start, dtype=torch.long),
            'targets_end': torch.tensor(targets_end, dtype=torch.long),
            'text': text,
            'selected_text': selected_text,
            'sentiment': sentiment,
            'offsets': torch.tensor(offsets, dtype=torch.long),
            'padding_len':padding_len
            }

In [8]:
class TextDataset(Dataset):
    def __init__(self, text, sentiment, selected_text):
        self.text = text
        self.sentiment = sentiment
        self.selected_text = selected_text

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # processing data
        data = process_data(
            self.text[item], 
            self.selected_text[item], 
            self.sentiment[item]
        )
        # returning tensors
        return data

In [11]:
#import pdb
#pdb.set_trace()
df = config.TRAINING_FILE.reset_index(drop=True)
if __name__== "__main__":
  dset = TextDataset(text = df.text.values,
                      selected_text =df.selected_text.values,sentiment = df.sentiment.values)
#  print(dset[5])

Now we’ll create an iterator for our dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory.

In [13]:
dfx = config.TRAINING_FILE
# spliting into training and validation set
train, valid = model_selection.train_test_split(dfx,
                                                      test_size=0.1,
                                                      random_state=42,
                                                      stratify=dfx.sentiment.values)

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

# using TextDataset function as coded above
train_dataset = TextDataset(text=train.text.values,
                            sentiment=train.sentiment.values,
                            selected_text=train.selected_text.values)

valid_dataset = TextDataset(text=valid.text.values,
                            sentiment=valid.sentiment.values,
                            selected_text=valid.selected_text.values)

# making pytorch dataloaders
train_data_loader = DataLoader(train_dataset,
                               batch_size=config.TRAIN_BATCH_SIZE)

valid_data_loader = DataLoader(valid_dataset,batch_size=config.VALID_BATCH_SIZE)

In [17]:
train_data_loader

<torch.utils.data.dataloader.DataLoader at 0x7f782899a4a8>

## Model Implementation

In [21]:
class TextModel(BertPreTrainedModel):
    def __init__(self,conf):
        super(TextModel, self).__init__(conf)

        self.roberta = RobertaModel.from_pretrained(f"{config.PATH}/pytorch_model.bin",
                                                    config = conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        # this is to initialize the weights of the matrix that would convert 
        # (batch_size, max_len, 2*768) to (batch_size, max_len, 1) with std=0.02 
    
    def forward(self, ids, mask, token_type_ids):
        _, _, output = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # out dim = (12, batch_size, max_len, 768)
        # 12 denotes the 12 hidden layers of roberta

        output = torch.cat((output[-1], output[-2]), dim=-1)
        # output dim = (batch_size, max_len, 2*768)
        output = self.drop_out(output)
        logits = self.l0(output)
        # logits dim -> (batch_size, max_len, 2)

        start_logits, end_logits = logits.split(1, dim=-1)
        # start_logits and end_logits dim -> (batch_size, max_len, 1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        # start_logits and end_logits dim -> (batch_size, max_len)

        return start_logits, end_logits

In [18]:
conf = RobertaConfig.from_pretrained(f"{config.PATH}/config.json")
conf.output_hidden_states = True
model = RobertaModel.from_pretrained(f"{config.PATH}/pytorch_model.bin",config = conf)
# Tell pytorch to run this model on the GPU.
model.cuda()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [None]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())
print('The RoBERTa model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [20]:
help(model)

Help on RobertaModel in module transformers.models.roberta.modeling_roberta object:

class RobertaModel(RobertaPreTrainedModel)
 |  The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.
 |  
 |  This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
 |  methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
 |  pruning heads etc.)
 |  
 |  This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
 |  subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
 |  general usage and behavior.
 |  
 |  Parameters:
 |      config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
 |          model. Initializing with a config file does not load the weights associated with the model, only t

## Temporary testing of code

In [17]:
temp_data =  dset
temp_dataLoader = DataLoader(temp_data,batch_size=config.TRAIN_BATCH_SIZE,num_workers=4)

  cpuset_checked))


In [32]:
#_, _, output = model(input_ids = dset[1]['ids'],
#           attention_mask = dset[1]['mask'],
 #           token_type_ids = dset[1]['token_type_ids']
 #       )
help(model)

Help on RobertaModel in module transformers.models.roberta.modeling_roberta object:

class RobertaModel(RobertaPreTrainedModel)
 |  The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.
 |  
 |  This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
 |  methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
 |  pruning heads etc.)
 |  
 |  This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
 |  subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
 |  general usage and behavior.
 |  
 |  Parameters:
 |      config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
 |          model. Initializing with a config file does not load the weights associated with the model, only t

In [30]:
dset[1]

{'ids': tensor([    0,  2430,     2,     2,    98,  3036,  5074,   939,    40,  2649,
            47,   259,    11, 15610,  1597,  2977, 16506,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,  

In [29]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    for d in tqdm(data_loader):
        # getting data
        ids = d['ids']
        token_type_ids = d['token_type_ids']
        mask = d['mask']
        targets_start = d['targets_start']
        targets_end = d['targets_end']

        # putting them into gpu
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.float)
        targets_end = targets_end.to(device, dtype=torch.float)

        # zeroing gradients
        optimizer.zero_grad()
        # getting outputs
        o1, o2 = model(ids,
                       mask=mask,
                       token_type_ids=token_type_ids)
        # calulating loss
        loss = loss_fn(o1, o2, targets_start, targets_end)
        # calculating gradients
        loss.backward()
        # updating model parameters
        optimizer.step()
        # stepping learning rate scheduler
        scheduler.step()

In [30]:
def eval_fn(data_loader, model, device, tokenizer=config.TOKENIZER):
    model.eval()
    # below array will store the respective data
    all_ids = []
    start_idx = []
    end_idx = []
    orig_selected = []
    padding_len = []

    for d in data_loader:
        # getting data
        ids = d['ids']
        token_type_ids = d['token_type_ids']
        mask = d['mask']
        selected_text = d['orig_selected']
        pad_len = d['padding_len']

        # putting them in gpu
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)

        # getting output
        o1, o2 = model(ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        # adding to array to use latter
        # also removing stuff from gpu
        all_ids.append(ids.cpu().detach().numpy())
        start_idx.append(torch.sigmoid(o1).cpu().detach().numpy())
        end_idx.append(torch.sigmoid(o2).cpu().detach().numpy())
        orig_selected.extend(selected_text)
        padding_len.extend(pad_len)

    # fixing dimensions
    start_idx = np.vstack(start_idx)
    end_idx = np.vstack(end_idx)
    all_ids = np.vstack(all_ids)

    # to store jaccard score to print mean of it latter
    jaccards = []

    # getting predicted text and calculating jaccard
    for i in range(0, len(start_idx)):
        start_logits = start_idx[i][4: -padding_len[i]-1]
        end_logits = end_idx[i][4: -padding_len[i]-1]
        this_id = all_ids[i][4: -padding_len[i]-1]

        idx1 = idx2 = None
        max_sum = 0
        for ii, s in enumerate(start_logits):
            for jj, e in enumerate(end_logits):
                if  s+e > max_sum:
                    max_sum = s+e
                    idx1 = ii
                    idx2 = jj

        this_id = this_id[idx1: idx2+1]
        predicted_text = tokenizer.decode(this_id, skip_special_tokens=True)
        predicted_text = predicted_text.strip()
        sel_text = orig_selected[i].strip()

        jaccards.append(jaccard(predicted_text, sel_text))

    # returning mean jaccard
    return np.mean(jaccards)

In [31]:
# jaccard function as mentioned in evaluation section of the contest
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [32]:
# loss function. Play around with it and see what works best
def loss_fn(o1, o2, t1, t2):
    l1 = nn.CrossEntropyLoss()(o1, t1.long())
    l2 = nn.CrossEntropyLoss()(o2, t2.long())
    return l1 + l2

In [34]:
def run():
    # reading train.csv
    dfx = config.TRAINING_FILE
    # spliting into training and validation set
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # using TextDataset function as coded above
    train_dataset = TextDataset(
        text=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    valid_dataset = TextDataset(
        text=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    # making pytorch dataloaders
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    # making a instance of the model and putting it into gpu
    device = torch.device("cuda")
    conf = RobertaConfig.from_pretrained(f"{config.PATH}/config.json")
    conf.output_hidden_states = True
    model = TextModel(conf)
    model.to(device)
    
    # explicitly going through model parameters and removing weight decay
    # from a few layers 
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    # Coding out the optimizer and scheduler
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    # saving model when we have best jaccard
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        jaccard = eval_fn(valid_data_loader, model, device)
        print(f"Jaccard Score = {jaccard}")
        if jaccard > best_jaccard:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_jaccard = jaccard

In [35]:
run()

  0%|          | 0/773 [00:00<?, ?it/s]


TypeError: ignored

In [38]:
conf = RobertaConfig.from_pretrained(f"{config.PATH}/config.json")
conf.output_hidden_states = True
model = TextModel(conf)
model = nn.DataParallel(model)
#model.load_state_dict(torch.load(f"{config.PATH}/pytorch_model.bin/pytorch_model.bin"))
model.eval()

RuntimeError: ignored