<a href="https://colab.research.google.com/github/jrakhshanda/Text-Mining/blob/main/roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers
!pip install tokenizers
#!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev
#!export XLA_USE_BF16=1

In [5]:
import os
import string
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import *
from transformers import AdamW, get_linear_schedule_with_warmup
import tokenizers

In [6]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Configuration

In [146]:
from tokenizers import ByteLevelBPETokenizer
class config:
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 8
    EPOCHS = 7
    PATH = '/content/drive/MyDrive/RoBERTa_files'
    TRAINING_FILE = pd.read_csv('/content/drive/MyDrive/BERT_files/df_train.csv')
    TEST_FILE =  pd.read_csv('/content/drive/MyDrive/BERT_files/df_train.csv')
    MAX_LEN = 141
    TOKENIZER = ByteLevelBPETokenizer(f"{PATH}/vocab.json",
                                      f"{PATH}/merges.txt",
                                      lowercase=True, add_prefix_space=True)

# Processing of Data

In [164]:
def process_data(text, selected_text, sentiment, tokenizer=config.TOKENIZER, max_len=config.MAX_LEN):
    # roberta requires the text to have a prefix space at the beginning
    text = " " + " ".join(str(text).split(" "))
    selected_text = " " + " ".join(str(selected_text).split(" "))

    # getting initial and final index of selected_text within the text
    len_selected = len(selected_text) - 1
    idx1 = idx2 = None
    for idx, letter in enumerate(selected_text):
        if (text[idx] == selected_text[1]) and (" " + text[idx: idx+len_selected] == selected_text):
            idx1 = idx
            idx2 = idx1 + len_selected - 1
            break
    
    # making character targets
    if idx1!=None and idx2!=None:
        char_targets = [0] * len(text)
        for i in range(idx1, idx2+1):
            char_targets[i] = 1
    else:
        char_targets = [1] * len(text)

    # encoding using pretrained tokenizer
    tok_text = tokenizer.encode(text)
    ids = tok_text.ids
    attention_mask = tok_text.attention_mask
    type_ids = tok_text.type_ids

    # getting indexes of tokens containing character in selected_text
    target_idx = []
    for i, (offset1, offset2) in enumerate(tok_text.offsets):
        if sum(char_targets[offset1: offset2])>0:
            target_idx.append(i)

    # we just need the indexes of the start and end tokens as we are using 
    # nn. CrossEntropy as loss
    start_target = target_idx[0]
    end_target = target_idx[-1]

    # token ids of sentiment as present in our vocab hard coded here
    sentiment_ids = {
        'positive':1313,                    # tokenizer.encode('positive').ids
        'negative':2430,                    # tokenizer.encode('negative').ids
        'neutral':7974                     # tokenizer.encode('neutral').ids
    }

    # adding special tokens
    ids = [0] + [sentiment_ids[sentiment]] + [2] + [2] + ids + [2]
    attention_mask = [1] * len(ids)
    type_ids = [0] * len(ids)
    offsets = [(0, 0)] * 4 + tok_text.offsets
    start_target += 4
    end_target += 4

    # padding
    padding_len = max_len - len(ids)
    if padding_len>0:
        ids = ids + [1] * padding_len
        attention_mask = attention_mask + [0] * padding_len
        type_ids = type_ids + [0] * padding_len
        offsets = offsets + [(0, 0)] * padding_len

    return {
        'ids': torch.tensor(ids,dtype=torch.long),
        'attention_mask': torch.tensor(attention_mask,dtype=torch.long),
        'token_type_ids':torch.tensor(type_ids,dtype=torch.long),
        'targets_start': torch.tensor(start_target,dtype=torch.long),
        'targets_end':  torch.tensor(end_target,dtype=torch.long),
        'offsets': torch.tensor(offsets,dtype=torch.long),
        'padding_len': padding_len,
        'text': text,
        'selected_text': selected_text,
        'sentiment': sentiment
    }

In [165]:
class TextDataset(Dataset):
    def __init__(self, text, sentiment, selected_text):
        self.text = text
        self.sentiment = sentiment
        self.selected_text = selected_text

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # processing data
        data = process_data(
            self.text[item], 
            self.selected_text[item], 
            self.sentiment[item]
        )
        # returning tensors
        return data

In [166]:
#import pdb
#pdb.set_trace()
df = config.TRAINING_FILE.reset_index(drop=True)
if __name__== "__main__":
  dset = TextDataset(text = df.text.values,
                      selected_text =df.selected_text.values,sentiment = df.sentiment.values)
  print(dset[5])

{'ids': tensor([    0,  7974,     2,     2, 36778, 10242,  3923,   275,   910, 15574,
         7900,  6872,     2,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,

Now we’ll create an iterator for our dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory.

In [117]:
dfx = config.TRAINING_FILE
dfx.reset_index(drop=True)
# spliting into training and validation set
train, valid = model_selection.train_test_split(dfx,
                                                      test_size=0.1,
                                                      random_state=42,
                                                      stratify=dfx.sentiment.values)

# using TextDataset function as coded above
train_dataset = TextDataset(text=train.text.values,
                            sentiment=train.sentiment.values,
                            selected_text=train.selected_text.values)

valid_dataset = TextDataset(text=valid.text.values,
                            sentiment=valid.sentiment.values,
                            selected_text=valid.selected_text.values)

# making pytorch dataloaders
train_data_loader = DataLoader(train_dataset,
                               batch_size=config.TRAIN_BATCH_SIZE)

valid_data_loader = DataLoader(valid_dataset,batch_size=config.VALID_BATCH_SIZE)

## Model Implementation

In [154]:
class TextModel(BertPreTrainedModel):
    def __init__(self,conf):
        super(TextModel, self).__init__(conf)

        self.roberta = RobertaModel.from_pretrained("roberta-base",config = conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        # this is to initialize the weights of the matrix that would convert 
        # (batch_size, max_len, 2*768) to (batch_size, max_len, 1) with std=0.02 
    
    def forward(self, ids, mask, token_type_ids):
        _, _, output = self.roberta(
            ids,
            attention_mask = attention_mask,
            token_type_ids=token_type_ids
        )
        # out dim = (12, batch_size, max_len, 768)
        # 12 denotes the 12 hidden layers of roberta

        output = torch.cat((output[-1], output[-2]), dim=-1)
        # output dim = (batch_size, max_len, 2*768)
        output = self.drop_out(output)
        logits = self.l0(output)
        # logits dim -> (batch_size, max_len, 2)

        start_logits, end_logits = logits.split(1, dim=-1)
        # start_logits and end_logits dim -> (batch_size, max_len, 1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        # start_logits and end_logits dim -> (batch_size, max_len)

        return start_logits, end_logits

In [155]:
conf = RobertaConfig.from_pretrained(f"{config.PATH}/config.json")
conf.output_hidden_states = True
model = TextModel(conf)
model.to(device)

TextModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [156]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())
print('The RoBERTa model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The RoBERTa model has 201 different named parameters.

==== Embedding Layer ====

roberta.embeddings.word_embeddings.weight               (50265, 768)
roberta.embeddings.position_embeddings.weight             (514, 768)
roberta.embeddings.token_type_embeddings.weight             (1, 768)
roberta.embeddings.LayerNorm.weight                           (768,)
roberta.embeddings.LayerNorm.bias                             (768,)

==== First Transformer ====

roberta.encoder.layer.0.attention.self.query.weight       (768, 768)
roberta.encoder.layer.0.attention.self.query.bias             (768,)
roberta.encoder.layer.0.attention.self.key.weight         (768, 768)
roberta.encoder.layer.0.attention.self.key.bias               (768,)
roberta.encoder.layer.0.attention.self.value.weight       (768, 768)
roberta.encoder.layer.0.attention.self.value.bias             (768,)
roberta.encoder.layer.0.attention.output.dense.weight     (768, 768)
roberta.encoder.layer.0.attention.output.dense.bias         

In [157]:
# loss function. Play around with it and see what works best
def loss_fn(output_start, output_end, targets_start, targets_end,device):
  loss = nn.CrossEntropyLoss().to(device)
  l1 = loss(output_start,targets_start)
  l2 = loss(output_end,targets_end)
  return l1 + l2

In [158]:
# jaccard function as mentioned in evaluation section of the contest
def jaccard_metric(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [159]:
import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [160]:
def train_fn(data_loader, model, optimizer, device, scheduler):
  total_loss = 0
  model.train()

  for bi, batch in enumerate(data_loader):
    # getting data
    ids = batch['ids'].to(device, dtype=torch.long)
    token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
    attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
    targets_start = batch['targets_start'].to(device, dtype=torch.float)
    targets_end = batch['targets_end'].to(device, dtype=torch.float)

    # zeroing gradients
    optimizer.zero_grad()
    # getting outputs
    output_start, output_end = model(ids,
                                     attention_mask = attention_mask,
                                     token_type_ids=token_type_ids)
    # calulating loss
    loss = loss_fn(output_start, output_end, targets_start, targets_end)
    total_loss += loss.item()
    # calculating gradients
    loss.backward()
    # updating model parameters
    optimizer.step()
    # stepping learning rate scheduler
    scheduler.step()
  
  avg_train_loss = total_loss / len(data_loader)
  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
  return avg_train_loss

In [161]:
def eval_fn(data_loader, model, device, tokenizer=config.TOKENIZER):
  total_loss = 0
  model.eval()
  # below array will store the respective data
  all_ids = []
  start_idx = []
  end_idx = []
  selected_text = []
  padding_len = []

  for bi, batch in enumerate(data_loader):
    # getting data
    ids = batch['ids'].to(device, dtype=torch.long)
    token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
    attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
    targets_start = batch['targets_start'].to(device, dtype=torch.float)
    targets_end = batch['targets_end'].to(device, dtype=torch.float)
    selected_text = batch['selected_text']
    pad_len = batch['padding_len']

    # getting output
    output_start, output_end = model(ids,
                                     attention_mask = attention_mask,
                                     token_type_ids=token_type_ids)
    
    
    loss = loss_fn(output_start, output_end, targets_start, targets_end)
    total_loss += loss.item()

    # adding to array to use latter
    # also removing stuff from gpu
    all_ids.append(ids.cpu().detach().numpy())
    start_idx.append(torch.sigmoid(output_start).cpu().detach().numpy())
    end_idx.append(torch.sigmoid(output_end).cpu().detach().numpy())
    selected_text.extend(selected_text)
    padding_len.extend(pad_len)

    # fixing dimensions
    start_idx = np.vstack(start_idx)
    end_idx = np.vstack(end_idx)
    all_ids = np.vstack(all_ids)

    # to store jaccard score to print mean of it latter
    jaccards = []

    # getting predicted text and calculating jaccard
    for i in range(0, len(start_idx)):
        start_logits = start_idx[i][4: -padding_len[i]-1]
        end_logits = end_idx[i][4: -padding_len[i]-1]
        this_id = all_ids[i][4: -padding_len[i]-1]

        idx1 = idx2 = None
        max_sum = 0
        for ii, s in enumerate(start_logits):
            for jj, e in enumerate(end_logits):
                if  s+e > max_sum:
                    max_sum = s+e
                    idx1 = ii
                    idx2 = jj

        this_id = this_id[idx1: idx2+1]
        predicted_text = tokenizer.decode(this_id, skip_special_tokens=True)
        predicted_text = predicted_text.strip()
        sel_text = selected_text[i].strip()

        jaccards.append(jaccard_metric(predicted_text, sel_text))

  avg_valid_loss = np.mean(total_loss)      
  print("  Average validation loss: {0:.2f}".format(avg_valid_loss))
  print("  Average jaccard similarity: {0:.2f}".format(np.mean(jaccards)))
  print("  validation took: {:}".format(format_time(time.time() - t0)))

  return np.mean(jaccards), avg_valid_loss

In [162]:
def run():
    # reading train.csv
    dfx = config.TRAINING_FILE
    dfx.reset_index(drop=True)
    # spliting into training and validation set
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values)

    # using TextDataset function as coded above
    train_dataset = TextDataset(
        text=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    valid_dataset = TextDataset(
        text=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    # making pytorch dataloaders
    train_data_loader = DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    # making a instance of the model and putting it into gpu
    conf = RobertaConfig.from_pretrained(f"{config.PATH}/config.json")
    conf.output_hidden_states = True
    model = TextModel(conf)
    model.to(device)
    
    # explicitly going through model parameters and removing weight decay
    # from a few layers 
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]

    # Coding out the optimizer and scheduler
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    # saving model when we have best jaccard
    train_loss = []
    valid_loss = []
    best_jaccard = 0
    for epoch in range(config.EPOCHS):
      # ========================================
      #               Training
      # ========================================
    
      # Perform one full pass over the training set.
      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch+1, config.EPOCHS))
      print('Training...')
      t0 = time.time()
      avg_train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
      train_loss.append(avg_train_loss)

      # ========================================
      #               Validation
      # ========================================
      print("")
      print("Running Validation...")
      t0 = time.time()
        
      jaccard, avg_valid_loss = eval_fn(valid_data_loader, model, device)
      valid_loss.append(avg_valid_loss)
      if epoc == 5:
        torch.save(model.state_dict(), config.PATH+'/roberta-model1.pth')
      
      if epoc == 7:
        torch.save(model.state_dict(), config.PATH+'/roberta-model2.pth')

      if jaccard > best_jaccard:
        torch.save(model.state_dict(), config.PATH+'/jaccs.pth')
        best_jaccard = jaccard
 
    return train_loss, valid_loss

In [163]:
train_loss, valid_loss = run()


Training...


TypeError: ignored

In [None]:
conf = RobertaConfig.from_pretrained(f"{config.PATH}/config.json")
conf.output_hidden_states = True
model = TextModel(conf)
model = nn.DataParallel(model)
#model.load_state_dict(torch.load(f"{config.PATH}/pytorch_model.bin/pytorch_model.bin"))
model.eval()

RuntimeError: ignored