#### Imports





In [None]:
%%capture
!pip install datasets
!pip install -U transformers
!apt install git-lfs

In [None]:
import json
import torch
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, AutoTokenizer, BertTokenizerFast, BertForQuestionAnswering, AutoModelForQuestionAnswering
import transformers
from transformers import pipeline
from datasets import load_dataset, load_metric
from transformers import default_data_collator
from torch.utils.data import DataLoader
from transformers import get_scheduler
from transformers import AdamW
from tqdm.auto import tqdm
from transformers import AutoTokenizer


####  Data Loading.

In [None]:
# Download the cluster-wise CSVs (<cluster>_train.csv / <cluster>_test.csv) from two
# shared Google Drive folders into the current working directory (`-O .`).
# To silence the gdown log, put `%%capture` on the first line of this cell.
! gdown https://drive.google.com/drive/folders/1gebbrob3ssLaVivSE9NQNVjfC4VQ_w5T?usp=share_link -O . --folder #first batch
! gdown https://drive.google.com/drive/folders/1TTcl6weFkOy8EPb3DVKteZK0KF9guALj?usp=share_link -O . --folder #second batch

Retrieving folder list
Processing file 1LXFdB38nK0EJBCYuDkk9ilFQdyhgno1K 0_test.csv
Processing file 1-iyVF0xsZxWEO3fqwjOdQKUw1qmZ6DJM 0_train.csv
Processing file 1wqgOw5Seoio2AjwFfFPD9_Yn83-hDwbe 1_test.csv
Processing file 17k0_r9Y7xbyVoHUYRnMWxgGAKeBlD0qe 1_train.csv
Processing file 1sADLTPSdS5w67SIQ96zvIELTLYS6qWOo 2_test.csv
Processing file 1gyFqNov0B9C-XXlYzX78gYCG74q8plun 2_train.csv
Processing file 1nSUBKXYtKyGi6wSP0TxGd02UlHpT8fmQ 3_test.csv
Processing file 1CdyNEbjtJ4gLuw92-bHRIHT9J6c4JfVm 3_train.csv
Processing file 173JKDoq-dHN_C-Q092HhwgR-i3q_QZFy 4_test.csv
Processing file 1bhJtlTOJHrv_GfNXus-iG9xdOFosxj3W 4_train.csv
Processing file 1_IShR1DUXCx0Vpat2Br288gHBHVyepoF 5_test.csv
Processing file 1xRJFpE_5zukcAHGO3pfza61FB8tPm1CE 5_train.csv
Processing file 1-kEW8i_qtDS9PCHkjKmdHIH8UM1oJhLE 6_test.csv
Processing file 11nivvefLM5DD92SM2zxlfL-IN8IAl2Zf 6_train.csv
Processing file 1owBfZC3nrX4l5CZMKKCZ75DeRlywXcf9 7_test.csv
Processing file 1WGo5ukU6cwISDFE910sL4zwbYZpGolwN 7_tra

In [None]:
def loaddataset(cluster = 0):
  """Load the train/test CSV pair of one cluster as a Hugging Face dataset.

  The files are expected at ``<dir>/<cluster>_train.csv`` and
  ``<dir>/<cluster>_test.csv``. Directory ``1/`` is tried first; if loading
  from it fails, directory ``2/`` is tried instead.

  Args:
    cluster: Cluster identifier used as the file-name prefix.

  Returns:
    The object returned by ``load_dataset("csv", ...)`` with a ``train`` and a
    ``test`` split.

  Raises:
    Exception: Whatever ``load_dataset`` raises for directory ``2/`` when both
      attempts fail (the failure for ``1/`` is attached as its context).
  """
  def files_under(base_url):
    # Same naming scheme in both download folders.
    return {"train": base_url + "{}_train.csv".format(cluster), "test": base_url + "{}_test.csv".format(cluster)}

  try:
    return load_dataset("csv", data_files=files_under('1/'))
  except Exception:
    # Only ordinary errors trigger the fallback; KeyboardInterrupt / SystemExit
    # must still be able to stop the notebook instead of starting a second load.
    return load_dataset("csv", data_files=files_under('2/'))

In [None]:
# List the working directory to confirm that the download produced the `1` and `2`
# folders that loaddataset() reads from.
!ls

1  2  sample_data



#### Finetuning


##### Device Setup


In [None]:
# Notebook-wide settings shared by the cells below.

# Checkpoint name handed to `from_pretrained` in setup_device().
model_name = "PremalMatalia/electra-base-best-squad2"

# Tokenisation window: maximum length of one feature (question + context) and
# the overlap allowed between two consecutive parts of a context that has to
# be split.
max_length = 384
doc_stride = 128

batch_size = 8
data_collator = default_data_collator

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
def setup_device(model_name):
  """Load a question-answering checkpoint and its tokenizer, and move the model to ``device``.

  Args:
    model_name: Checkpoint name or path passed to both ``from_pretrained`` calls.

  Returns:
    A ``(model, tokenizer, pad_on_right)`` tuple, where ``pad_on_right`` is
    True when the tokenizer's ``padding_side`` equals ``"right"``.

  Raises:
    AssertionError: If the loaded tokenizer is not a ``PreTrainedTokenizerFast``.
  """
  fast_tokenizer = AutoTokenizer.from_pretrained(model_name)
  # The preprocessing step asks the tokenizer for offset mappings, so a fast
  # tokenizer is checked for before the model weights are loaded.
  assert isinstance(fast_tokenizer, transformers.PreTrainedTokenizerFast), "Fast Tokenizer Needed"

  qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
  qa_model.to(device)

  pads_right = "right" == fast_tokenizer.padding_side
  return qa_model, fast_tokenizer, pads_right



##### Preprocessing for FineTuning


In [None]:
def preprocess_for_training(dataset, tokenizer, pad_on_right):
    """Tokenize a batch of QA rows and label every feature with answer token positions.

    Question and paragraph are encoded as a pair. The tokenizer is asked to
    return overflowing tokens, so one row may yield several features; each
    feature is labelled with the start/end token index of the answer, or with
    the index of the CLS token when the row has no answer or the answer lies
    outside that feature. ``max_length`` and ``doc_stride`` are read from the
    module-level settings.

    Args:
        dataset: Batch mapping with the columns ``"Question"``, ``"Paragraph"``,
            ``"Answer_text"`` and ``"Answer_start"``. The ``"Question"`` column
            is overwritten in place with left-stripped text.
            ``"Answer_start"`` values are bracketed strings: the characters
            between the first and last one are parsed as the integer start
            offset, and nothing between them means "no answer".
        tokenizer: Tokenizer called on the column pair; must provide
            ``cls_token_id`` and an output exposing ``sequence_ids``.
        pad_on_right: True when the question goes first and the paragraph
            second; False for the reverse order.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping`` and with ``start_positions`` and
        ``end_positions`` added (one entry per feature).
    """
    dataset["Question"] = [q.lstrip() for q in dataset["Question"]]

    # The context (paragraph) is always the side that may be truncated; which
    # argument slot it occupies depends on the tokenizer's padding side.
    if pad_on_right:
        first_column, second_column = "Question", "Paragraph"
        truncation, context_sequence_id = "only_second", 1
    else:
        first_column, second_column = "Paragraph", "Question"
        truncation, context_sequence_id = "only_first", 0

    tokenized_examples = tokenizer(
        dataset[first_column],
        dataset[second_column],
        truncation=truncation,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Maps every produced feature back to the row it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Character span (start, end) of every token inside its source text.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    answer_texts = dataset["Answer_text"]
    answer_starts = dataset["Answer_start"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]

        # Strip the surrounding brackets of the stored start offset.
        raw_start = answer_starts[sample_index][1:-1]
        if raw_start == '':
            # No answer for this row: point both labels at the CLS token.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Start/end character index of the answer in the paragraph.
        start_char = int(raw_start)
        # NOTE(review): the "- 4" presumably removes the two brackets and two
        # quotes of a stringified one-element list such as "['cat']" -- confirm
        # against the CSVs (it would be off for multi-answer or escaped text).
        end_char = start_char + len(answer_texts[sample_index]) - 4

        # First and last token of the context inside this feature.
        token_start_index = 0
        while sequence_ids[token_start_index] != context_sequence_id:
            token_start_index += 1
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != context_sequence_id:
            token_end_index -= 1

        answer_inside = (offsets[token_start_index][0] <= start_char
                         and offsets[token_end_index][1] >= end_char)
        if not answer_inside:
            # The answer is (partly) outside this feature: label it with CLS.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Move both indices to the two ends of the answer.
        # Note: we could go past the last offset if the answer is the last word (edge case).
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples


##### Finetuning

In [None]:
def grouped_LLRD_implementation(model, learning_rate):
    """Build an AdamW optimizer with grouped layer-wise learning-rate decay.

    Every named parameter of ``model`` gets its own parameter group:

    * embeddings and encoder layers 0-3: ``learning_rate``
    * encoder layers 4-7: ``learning_rate * 1.75``
    * encoder layers 8-11: ``learning_rate * 3.5``
    * everything outside ``electra.embeddings`` / ``electra.encoder`` (the
      task head): ``learning_rate * 3.6``, slightly above the top layers

    Parameters whose name contains ``bias`` or ``LayerNorm`` weights/biases
    get no weight decay; all others use 0.01.

    Args:
        model: Object exposing ``named_parameters()``; backbone parameter
            names are expected to start with ``electra.``.
        learning_rate: Base learning rate of the lowest group.

    Returns:
        A ``torch.optim.AdamW`` instance over the per-parameter groups.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    backbone_prefixes = ("electra.embeddings", "electra.encoder")
    # Trailing dots keep e.g. "layer.4." from also matching a "layer.40." name.
    middle_layers = ("layer.4.", "layer.5.", "layer.6.", "layer.7.")
    top_layers = ("layer.8.", "layer.9.", "layer.10.", "layer.11.")

    opt_parameters = []
    for name, params in model.named_parameters():
        weight_decay = 0.0 if any(tag in name for tag in no_decay) else 0.01

        if name.startswith(backbone_prefixes):
            if any(tag in name for tag in top_layers):
                lr = learning_rate * 3.5
            elif any(tag in name for tag in middle_layers):
                lr = learning_rate * 1.75
            else:
                # Embeddings and the lowest encoder layers.
                lr = learning_rate
        else:
            # Task head. The previous filter looked for "regressor" /
            # "roberta_model.pooler" names, which left any other head
            # (e.g. a QA output layer) out of the optimizer, i.e. untrained.
            lr = learning_rate * 3.6

        opt_parameters.append({"params": params,
                               "weight_decay": weight_decay,
                               "lr": lr})

    # transformers.AdamW is deprecated in favour of the PyTorch implementation;
    # eps=1e-6 keeps the default the transformers class used.
    return torch.optim.AdamW(opt_parameters, lr=learning_rate, eps=1e-6)

In [None]:
def tune_loop(model, tokenizer, pad_on_right, outfile, cluster = 0, learning_rate = 5e-6, epochs =3, use_lr_schedule = False):
    """Fine-tune ``model`` on the training split of one cluster and save the result.

    Args:
        model: Question-answering model; trained in place. It is expected to
            already live on the module-level ``device``.
        tokenizer: Tokenizer used for preprocessing; saved next to the model.
        pad_on_right: Padding-side flag forwarded to ``preprocess_for_training``.
        outfile: Directory passed to ``save_pretrained`` for model and tokenizer.
        cluster: Cluster identifier forwarded to ``loaddataset``.
        learning_rate: Base learning rate forwarded to
            ``grouped_LLRD_implementation``.
        epochs: Number of passes over the training split.
        use_lr_schedule: When True, a linear schedule without warm-up is stepped
            after every optimizer step. The default (False) keeps the learning
            rates constant, which is what this loop did before the flag existed.

    Returns:
        The fine-tuned ``model``.
    """
    dataset = loaddataset(cluster)
    # LLRD-optimised AdamW.
    optimizer = grouped_LLRD_implementation(model, learning_rate)

    # Only the training split is consumed below, so only it is tokenised.
    train_split = dataset["train"]
    tokenized_train = train_split.map(
        lambda rows: preprocess_for_training(rows, tokenizer, pad_on_right),
        batched=True,
        remove_columns=train_split.column_names,
    )
    train_dataloader = DataLoader(
        tokenized_train, shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )

    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = None
    if use_lr_schedule:
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps,
        )

    model.train()
    # The context manager closes the bar even if training is interrupted.
    with tqdm(total=num_training_steps) as progress_bar:
        for epoch in range(epochs):
            progress_bar.set_description(f'Epoch {epoch+1}/{epochs}')
            epoch_losses = []
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                loss = model(**batch).loss
                loss.backward()
                optimizer.step()
                if lr_scheduler is not None:
                    lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                epoch_losses.append(loss.item())
            # An empty training split yields no losses; skip the average then.
            if epoch_losses:
                print(f'Loss: {sum(epoch_losses)/len(epoch_losses)} after Epoch {epoch+1}')

    model.save_pretrained(outfile)
    tokenizer.save_pretrained(outfile)
    return model


In [None]:
# Load the checkpoint named by `model_name` together with its tokenizer and padding-side flag.
model, tokenizer, pad_on_right = setup_device(model_name)