# One-timers

In [None]:
!nvidia-smi
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!mkdir dataset

!cp /content/drive/MyDrive/Research/triples/data/*.csv dataset
!cp -r /content/drive/MyDrive/Research/triples/HuggingFace dataset
!ls dataset
!pip3 install -q transformers tensorboard_logger seqeval sentencepiece tokenizers sentence_transformers

Tue Jul 27 13:00:20 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports

In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 
import time, torch, random, glob, re, gc, datetime, tokenizers, pdb

import numpy as np
import transformers
import pandas as pd
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt

from tokenizers import *
from transformers import *
from functools import partial
from pathlib import Path
from tqdm.notebook import tqdm
from torch.nn import functional as F
from itertools import cycle, chain
from torch.utils.data import Dataset, DataLoader, IterableDataset, TensorDataset
# from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, RepeatedKFold, KFold
from ast import literal_eval as eval

# from transformers import RobertaForSequenceClassification, RobertaConfig, RobertaTokenizer, RobertaForTokenClassification

In [None]:
import sys
# Put the project folder on the mounted Drive at the front of the import path so that
# the `utils` import below is looked up there first.
DRIVE_DIR="/content/drive/My Drive/Research/triples/"
sys.path.insert(0, DRIVE_DIR)
# Project-local helpers; k_fold() calls seed_everything(seed) and count_params(model).
from utils import seed_everything, count_params

# Globals and Config

In [None]:
class Config:
    """Experiment settings, read as class attributes (the class itself is passed around as ``config``)."""

    random_state=2021
    k=5  # number of cross-validation folds

    device="cuda"
    selected_folds=list(range(k))  # folds that k_fold() actually trains
    # selected_folds=[3,4]
    seed = 2021

    # Prefix of log entries and checkpoint file names; checkpoint_name() reads it.
    # NOTE(review): "NER" is taken from the commented-out checkpoint paths below — confirm.
    task="NER"
    model="bert-base-cased"  # key into TRANSFORMERS and name passed to from_pretrained()

    # checkpoints=[f"/content/drive/MyDrive/Research/triples/2021-07-24/NER_bert-base-cased_ fold - {i+1}_.pt" for i in range(k)]
    # Weights to start fine-tuning from, read by k_fold() as `checkpoints`:
    # empty -> no checkpoint is loaded; one path -> shared by every fold; several -> one per fold.
    checkpoints=[]
    checkpoint=checkpoints  # original (singular) spelling, kept as an alias of the same list
    pretrained=True  # True: Transformer loads weights with from_pretrained(); False: builds the model from a config
    lowercase = False

    num_labels=4
    batch_size = 32
    batch_size_val = int(batch_size * 1.5)
    weight_decay =0.001

    epochs = 15
    lr =5e-5
    warmup_prop = 0.1  # fraction of the total optimisation steps used for warm-up
    # True: freeze the main BERT body and train only the classifier head; False: train the whole model.
    freeze_main=False
    max_len=128
    # 1-based epoch numbers after which fit() writes an extra checkpoint.
    save_every_epoch=list(range(5, epochs, 3))
    # Substrings matched against parameter names when freeze_main is enabled.
    model_names=["roberta", "bert", "albert", "transformer", "distilbert"]


# Root folder on the mounted Drive; get_checkpoint_dir() creates one dated sub-folder beneath it.
CP_DIR=Path("/content/drive/MyDrive/Research/triples")
# Passed as `num_workers` to the DataLoaders built in fit().
NUM_WORKERS = 2

In [None]:
# Local folder holding the HuggingFace assets copied into ./dataset by the setup cell.
TRANSFORMERS_DIR = Path("dataset/HuggingFace/")

# Registry of supported backbones, keyed by model name. Each entry provides:
#   "model_config": (model class, config class) pair
#   "tokenizer":    tokenizer class
TRANSFORMERS = dict([
    ("roberta-base", dict(
        model_config=(RobertaModel, RobertaConfig),
        tokenizer=RobertaTokenizer,
    )),
    ("bert-base-cased", dict(
        model_config=(BertModel, BertConfig),
        tokenizer=BertWordPieceTokenizer,
    )),
    ("bert-base-uncased", dict(
        model_config=(BertModel, BertConfig),
        tokenizer=BertWordPieceTokenizer,
    )),
    ("albert-base-v2", dict(
        model_config=(AlbertModel, AlbertConfig),
        tokenizer=AlbertTokenizer,
    )),
    ("gpt2", dict(
        model_config=(GPT2Model, GPT2Config),
        tokenizer=GPT2Tokenizer,
    )),
    ("distilbert-base-cased", dict(
        model_config=(DistilBertModel, DistilBertConfig),
        tokenizer=DistilBertTokenizer,
    )),
])

# Functions and Helpers

In [None]:
def get_checkpoint_dir():
    """Return today's checkpoint folder (``CP_DIR/<YYYY-MM-DD>``), creating it if it is missing.

    Returns:
        pathlib.Path: the dated folder under ``CP_DIR``.
    """
    checkpoint_dir = CP_DIR / str(datetime.date.today())
    # exist_ok=True replaces the check-then-create sequence, which could race with another
    # process creating the same folder between the check and the call.
    os.makedirs(checkpoint_dir, exist_ok=True)
    return checkpoint_dir

def checkpoint_name():
    """Build the ``<task>_<model>`` prefix from ``Config.task`` and ``Config.model`` (both must be strings)."""
    return "_".join((Config.task, Config.model))

def save_model_weights(model, filename, verbose=1, cp_folder=""):
    """Write ``model.state_dict()`` to ``cp_folder/filename`` with ``torch.save``.

    Args:
        model: object exposing ``state_dict()``.
        filename: name of the file to write.
        verbose: when truthy, print the destination before saving.
        cp_folder: folder joined in front of ``filename`` ("" means no folder prefix).
    """
    destination = os.path.join(cp_folder, filename)
    if verbose:
        print(f"\n -> Saving weights to {destination}\n")
    torch.save(model.state_dict(), destination)


def save_log(list_, logdir):
    """Append the strings in ``list_`` to the file ``logdir``, newline-joined and newline-terminated.

    Append mode creates the file when it does not exist yet, so one mode covers both
    the first write and every later one.
    """
    payload = "\n".join(list_) + "\n"
    with open(logdir, "a") as log_file:
        log_file.write(payload)
    

# def get_scores(truths, preds):
#     f1=f1_score(truths, preds)
#     recall=recall_score(truths, preds)
#     precision=precision_score(truths, preds)
#     accuracy=accuracy_score(truths, preds)
    
#     return f1, precision, recall, accuracy

def load(model, with_checkpoint=None):
    """Build a ``Transformer`` for the model name ``model``, optionally restoring saved weights.

    Args:
        model: model name handed to ``Transformer``.
        with_checkpoint: optional path of a saved state dict; it is read onto the CPU
            and loaded into the new network.

    Returns:
        The constructed ``Transformer`` instance.
    """
    network = Transformer(model)
    if not with_checkpoint:
        return network

    state_dict = torch.load(with_checkpoint, map_location="cpu")
    network.load_state_dict(state_dict)
    print("Checkpoint loaded!", end="\r")
    return network

In [None]:
def modify_label(text, label_):
    """Return the span of ``text`` running from the label's first word to its last word.

    Fallback for labels that do not occur verbatim in ``text``: the words in between
    are taken from ``text`` instead of from the label.

    Args:
        text: string to search in.
        label_: space-separated label; leading spaces are preserved in the result.

    Returns:
        ``<leading spaces of label_> + text[first_word_start:last_word_end]``, or
        ``label_`` unchanged when it has no words or a boundary word is not in ``text``.
    """
    # Drop empty pieces: a label with a leading space would otherwise yield "" as its
    # first "word", which str.find() locates at position 0 of every text.
    words = [word for word in label_.split(" ") if word]
    if not words:
        return label_

    loc1 = text.find(words[0])
    if loc1 < 0:
        return label_

    # Look for the last word at or after the first one so the span cannot end before it starts.
    last_word = words[-1]
    last_start = text.find(last_word, loc1)
    if last_start < 0:
        return label_
    loc2 = last_start + len(last_word)

    leading_spaces = label_[: len(label_) - len(label_.lstrip(" "))]
    return leading_spaces + text[loc1:loc2]


def locate_label_string(text, label_, fill_value=1):
    """Find the label in ``text`` and mark the characters it covers.

    The label is expected to start with a single space (``preprocess`` prepends one);
    the match is done on the label without that first character.

    Args:
        text: string to search in.
        label_: label to locate; when it does not occur verbatim in ``text`` it is
            first rewritten by ``modify_label``.
        fill_value: value written into the character mask over the matched span.

    Returns:
        tuple: ``(idx_start, idx_end, char_targets)`` where ``text[idx_start:idx_end]``
        is the matched label and ``char_targets`` is a ``len(text)`` numpy array that is
        ``fill_value`` on the span and 0 elsewhere.

    Raises:
        ValueError: if the label is too short, lacks the leading space, or cannot be
            found in ``text``.
    """
    label = label_ if label_ in text else modify_label(text, label_)

    if len(label) < 2 or not label.startswith(" "):
        raise ValueError(
            f'label "{label}" must be a leading space followed by at least one character'
        )

    needle = label[1:]
    # First occurrence of the label body; equivalent to scanning candidate start positions.
    idx_start = text.find(needle)
    if idx_start < 0:
        raise ValueError(f'could not locate "{label}" in "{text}"')
    idx_end = idx_start + len(needle)

    char_targets = np.zeros(len(text))
    char_targets[idx_start:idx_end] = fill_value

    return idx_start, idx_end, char_targets

def locate_label_tokens(offsets, char_targets):
    """Find the tokens that overlap the marked label characters.

    Args:
        offsets: sequence of ``(start, end)`` character offsets, one per token.
        char_targets: per-character mask; a token counts as part of the label when the
            mask values inside its offsets sum to more than 0.

    Returns:
        tuple: ``(first, last)`` indices into ``offsets`` of the first and last matching
        token (both inclusive).

    Raises:
        ValueError: if no token overlaps a marked character.
    """
    target_idx = [
        idx
        for idx, (offset1, offset2) in enumerate(offsets)
        if sum(char_targets[offset1:offset2]) > 0
    ]

    if not target_idx:
        raise ValueError("no token overlaps the labelled characters")

    return target_idx[0], target_idx[-1]

def preprocess(text, entities, tokenizer, tokens, max_len=128):
    """Preprocessing required for the input to transformer and to reconstruct output from it.

    Args:
        text: raw sentence; whitespace is collapsed and one space is prepended.
        entities: iterable of label strings; each gets one space prepended before lookup.
        tokenizer: object whose ``encode(text)`` result exposes ``ids`` and ``offsets``.
            The first and last entries of both are dropped here — presumably the special
            tokens added by the tokenizer; confirm for the tokenizer in use.
        tokens: dict with the ids of the ``"cls"``, ``"sep"`` and ``"pad"`` tokens.
        max_len: length the returned ``input_ids`` / ``offsets`` are truncated or padded to.

    Returns:
        dict with keys ``"input_ids"``, ``"label_location_in_tokens"`` (inclusive
        ``(start, end)`` token indices, already shifted by one for the leading cls token),
        ``"text"``, ``"label"`` (the space-prefixed entities) and ``"offsets"``.

    Note:
        Label locations are computed on the untruncated token list, so for long texts
        they can point past ``max_len``.
    """
    text = " " + " ".join(str(text).split())
    entities = [" " + entity for entity in entities]
    label_locations = [
        locate_label_string(text, entity, fill_value=1) for entity in entities
    ]

    tokenized = tokenizer.encode(text)
    input_ids_text = tokenized.ids[1:-1]
    offsets = tokenized.offsets[1:-1]

    label_location_in_tokens = []
    for _, _, char_targets in label_locations:
        target_start, target_end = locate_label_tokens(offsets, char_targets)
        # +1 accounts for the cls token placed in front of the text tokens below.
        label_location_in_tokens.append((target_start + 1, target_end + 1))

    input_ids = (
        [tokens["cls"]]
        + input_ids_text[:max_len - 2]
        + [tokens["sep"]]
    )
    text_offsets = [(0, 0)] + offsets[:max_len - 2] + [(0, 0)]

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([tokens["pad"]] * padding_length)
        text_offsets = text_offsets + ([(0, 0)] * padding_length)

    return {
        "input_ids": input_ids,
        "label_location_in_tokens": label_location_in_tokens,
        "text": text,
        "label": entities,
        "offsets": text_offsets,
    }

# Dataset

In [None]:
class SentenceDataset(Dataset):
    """Map-style dataset turning rows of ``df`` into token ids and one-hot token labels.

    Args:
        df: frame with a ``"sentence"`` column (text) and a ``"triple"`` column holding
            the string form of a list of entity strings.
        tokenizer: tokenizer forwarded to ``preprocess``.
        tokens: special-token id dict forwarded to ``preprocess``.
        max_len: sequence length forwarded to ``preprocess``.
        num_labels: number of classes of the one-hot labels (class 0 = no entity,
            class ``i + 1`` = the ``i``-th entity of the row).
    """

    def __init__(self, df, tokenizer, tokens, max_len=128, num_labels=4):
        self.tokenizer = tokenizer
        self.tokens = tokens
        self.max_len = max_len
        self.num_labels = num_labels

        self.texts = df["sentence"]
        # `eval` is rebound to ast.literal_eval by this notebook's imports.
        self.labels = df["triple"].map(eval)

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _token_labels(locations, seq_len):
        """Return a ``seq_len`` list of class ids from inclusive ``(start, end)`` token spans.

        The end index is inclusive, so the last token of a multi-token entity is labelled
        too. Positions outside ``[0, seq_len)`` (entities cut off by truncation) are
        skipped so the list keeps exactly ``seq_len`` entries. Later spans overwrite
        earlier ones where they overlap.
        """
        labels = [0] * seq_len
        for entity_idx, (start, end) in enumerate(locations):
            for position in range(max(start, 0), min(end + 1, seq_len)):
                labels[position] = entity_idx + 1
        return labels

    def __getitem__(self, idx):
        data = preprocess(
            self.texts.iloc[idx],
            self.labels.iloc[idx],
            tokenizer=self.tokenizer,
            tokens=self.tokens,
            max_len=self.max_len,
        )
        label = self._token_labels(data["label_location_in_tokens"], len(data["input_ids"]))

        return {
            "input_ids": torch.tensor(data["input_ids"], dtype=torch.long),
            "label": torch.nn.functional.one_hot(torch.tensor(label), num_classes=self.num_labels).float(),
            "text": data['text'],
            "offsets": data['offsets']
        }

# Model

In [None]:
class Transformer(nn.Module):
    """Backbone from the ``TRANSFORMERS`` registry plus a per-token classification head.

    Args:
        model: key of ``TRANSFORMERS``; also the name passed to ``from_pretrained``.
        maxlen: accepted for compatibility; not used by the module.
    """

    def __init__(self, model, maxlen=128):
        super().__init__()
        self.name = model
        model_type, config_type = TRANSFORMERS[model]['model_config']
        if Config.pretrained:
            self.transformer = model_type.from_pretrained(
                model, output_hidden_states=True, num_labels=Config.num_labels
            )
        else:
            # Registry entries may omit "config"; fall back to the configuration
            # published under the model name instead of failing with a KeyError.
            config_file = TRANSFORMERS[model].get('config')
            if config_file:
                config = config_type.from_json_file(config_file)
            else:
                config = config_type.from_pretrained(model)
            config.num_labels = Config.num_labels
            config.output_hidden_states = True
            self.transformer = model_type(config)

        # Read the width from the config rather than from `pooler.dense`, which not
        # every registered backbone has.
        self.nb_features = self.transformer.config.hidden_size
        self.pad_idx = 1 if "roberta" in self.name else 0
        self.logits = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features),
            nn.Tanh(),
            nn.Linear(self.nb_features, Config.num_labels),
        )

    def forward(self, input_ids, attention_mask=None):
        """Return sigmoid scores, one ``Config.num_labels`` vector per input token.

        Args:
            input_ids: token id tensor.
            attention_mask: optional mask; when omitted it is derived from
                ``input_ids != self.pad_idx``.
        """
        if attention_mask is None:
            attention_mask = (input_ids != self.pad_idx).long()

        # With output_hidden_states=True the last output element holds the per-layer
        # hidden states; its last entry is the final layer.
        hidden_states = self.transformer(
            input_ids,
            attention_mask=attention_mask,
        )[-1]

        features = hidden_states[-1]
        return torch.sigmoid(self.logits(features))

# Fitting

In [None]:
def fit(model,train_dataset,val_dataset, fold, epochs,batch_size, weight_decay=0,warmup_prop=0.0,lr=5e-4):
    """Batchwise training and validation iterations.

    Args:
        model: network called as ``model(input_ids=...)``; its output is compared with
            the batch ``"label"`` tensor using ``nn.BCELoss``.
        train_dataset: dataset yielding dicts with ``"input_ids"`` and ``"label"``.
        val_dataset: same structure, used for the per-epoch validation loss.
        fold: zero-based fold index (used in log and checkpoint names).
        epochs: number of epochs.
        batch_size: batch size of both loaders.
        weight_decay: decay for every parameter except biases and LayerNorm parameters.
        warmup_prop: fraction of all optimisation steps used for linear warm-up.
        lr: peak learning rate.

    Returns:
        list: always empty — no predictions are collected; kept for existing callers.

    Reads the globals ``Config``, ``NUM_WORKERS`` and ``CHECKPOINT_KEYWORD``.
    """
    # Shuffle the training data each epoch; the validation order does not matter.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=NUM_WORKERS)

    # One parameter group per tensor so weight decay can be switched off individually.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    opt_params = [
        {
            "params": [param],
            "weight_decay": 0 if any(nd in name for nd in no_decay) else weight_decay,
            "lr": lr,
        }
        for name, param in model.named_parameters()
    ]

    optimizer = AdamW(opt_params, lr=lr, betas=(0.5, 0.999))

    n_steps = epochs * len(train_loader)
    num_warmup_steps = int(warmup_prop * n_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, n_steps)
    # scheduler=ReduceLROnPlateau(optimizer, mode='min', factor=0.03, patience=2, threshold=0.001, threshold_mode='rel', cooldown=0, min_lr=1e-6, eps=1e-08, verbose=False)

    loss_function = nn.BCELoss()
    preds = []

    save_log(["\n",str(datetime.datetime.now()).split(".")[0],"\n", checkpoint_name()+f"_fold_{fold+1}"], logdir=get_checkpoint_dir()/f"log_{CHECKPOINT_KEYWORD}.txt")
    with tqdm(total=epochs, desc="Epoch {}/{}".format(1, epochs), unit="sections", position=0,leave=True) as pbar:
        for epoch in range(epochs):
            pbar.set_description("Epoch {}/{}".format(epoch + 1, epochs))
            model.train()
            start_time = time.time()
            optimizer.zero_grad()
            avg_loss = 0

            with tqdm(total=len(train_loader), desc="training iterations", unit="batch", position=1, leave=True) as pbar2:
                for data in train_loader:
                    input_ids = data['input_ids'].to(Config.device)
                    labels = data['label'].to(Config.device)

                    logits = model(input_ids=input_ids)
                    loss = loss_function(logits, labels)
                    avg_loss += loss.item() / len(train_loader)

                    loss.backward()
                    # Clip after backward(): before it the gradients are still the
                    # zeroed ones from the previous step and clipping does nothing.
                    nn.utils.clip_grad_norm_(model.parameters(), 10.0)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    pbar2.update()

            model.eval()
            avg_val_loss = 0.
            with torch.no_grad():
                with tqdm(total=len(val_loader), desc="validation iterations", unit="batch", position=2, leave=True) as pbar3:
                    for data in val_loader:
                        input_ids = data['input_ids'].to(Config.device)
                        labels = data['label'].to(Config.device)
                        logits = model(input_ids=input_ids)
                        loss = loss_function(logits, labels)
                        avg_val_loss += loss.item() / len(val_loader)
                        pbar3.update()

            dt = time.time() - start_time
            # get_last_lr() reports the rate set by the latest scheduler.step(); a separate
            # name keeps the `lr` argument intact.
            current_lr = scheduler.get_last_lr()[0]
            # lr = optimizer.param_groups[0]['lr']
            # scheduler.step(avg_val_loss)

            if epoch+1 in Config.save_every_epoch:
                save_model_weights(model, f'{checkpoint_name()}_fold-{fold+1}_epoch-{epoch+1}_{CHECKPOINT_KEYWORD}.pt', cp_folder=get_checkpoint_dir())

            log_lr=f"Epoch {epoch + 1}/{epochs} \t lr={current_lr:.1e} \t t={dt:.0f}s \t \n"
            print(log_lr)
            log_score=f"loss={avg_loss:.3f}\t val_loss={avg_val_loss:.3f}  \n"
            print(log_score)
            save_log([log_lr, log_score], logdir=get_checkpoint_dir()/f"log_{CHECKPOINT_KEYWORD}.txt")
            pbar.update()

    # Release the loaders (and their worker processes) before clearing the CUDA cache.
    # Only names that are always bound are deleted, so empty loaders or epochs=0 cannot
    # trigger a NameError here.
    del train_loader, val_loader
    if Config.device != "cpu":
        torch.cuda.empty_cache()
    gc.collect()

    return preds


# K-fold

In [None]:
def k_fold(df, save=True, config=None):
    """K-fold training.

    Splits ``df`` into ``config.k`` folds and, for every fold listed in
    ``config.selected_folds``, builds a fresh ``Transformer``, optionally loads a
    starting checkpoint and/or freezes the backbone, trains it with ``fit`` and
    optionally saves the final weights.

    Args:
        df: frame with the columns ``SentenceDataset`` reads.
        save: when true, save the weights of each trained fold.
        config: settings object; defaults to the global ``Config`` class.

    Returns:
        None.
    """
    if config is None:
        config = Config

    kf = KFold(n_splits=config.k)
    folds = list(kf.split(X=list(range(len(df)))))

    # NOTE(review): an *uncased* vocab file is loaded with lowercase=False while
    # Config.model is "bert-base-cased" — confirm the vocab matches the model in use.
    tokenizer = BertWordPieceTokenizer(
        "dataset/HuggingFace/Bert/bert_base_uncased_vocab.txt",
        lowercase=False
    )
    tokens = {
        'cls': tokenizer.token_to_id('[CLS]'),
        'sep': tokenizer.token_to_id('[SEP]'),
        'pad': tokenizer.token_to_id('[PAD]'),
    }

    # Accept both spellings of the setting: `checkpoints` and `checkpoint`.
    checkpoints = getattr(config, "checkpoints", None) or getattr(config, "checkpoint", None) or []

    seed_everything(config.seed)
    for fold, (train_idx, val_idx) in enumerate(folds):
        if fold not in config.selected_folds:
            continue

        model = Transformer(config.model).to(config.device)

        # if not model.transformer.config.pad_token_id:
        #     model.transformer.config.pad_token_id=tokenizer.eos_token_id

        if checkpoints:
            # A single entry is shared by all folds; otherwise entry `fold` is used.
            if len(checkpoints) == 1:
                print("Loading common checkpoint")
                checkpoint_path = checkpoints[0]
            else:
                print("Loading fold checkpoint")
                checkpoint_path = checkpoints[fold]

            model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
            print("loaded checkpoint")
        else:
            print("No checkpoint provided..!")

        if config.freeze_main:
            # Freeze every parameter whose name contains one of config.model_names.
            frozen = 0
            for name, param in model.named_parameters():
                if any(model_name in name for model_name in config.model_names):
                    param.requires_grad = False
                    frozen += 1
            print(f"{frozen} layers frozen!")

        model.zero_grad()
        print(f"\n-------------   Fold {fold + 1} / {len(folds)}  -------------\n")
        train_dataset = SentenceDataset(df=df.iloc[train_idx], tokenizer=tokenizer, tokens=tokens, max_len=config.max_len)
        val_dataset = SentenceDataset(df=df.iloc[val_idx], tokenizer=tokenizer, tokens=tokens, max_len=config.max_len)
        n_parameters = count_params(model)

        print(f">>{len(train_dataset)} training texts<<")
        print(f">>{len(val_dataset)} validation texts<<")
        print(f">>{n_parameters} trainable parameters<<\n")

        fit(model, train_dataset, val_dataset, epochs=config.epochs, batch_size=config.batch_size, weight_decay=config.weight_decay, lr=config.lr, warmup_prop=config.warmup_prop, fold=fold)

        if save:
            save_model_weights(model, f'{checkpoint_name()}_ fold - {fold+1}_{CHECKPOINT_KEYWORD}.pt', cp_folder=get_checkpoint_dir())

        # Free this fold's model and datasets before the next fold allocates its own.
        del model, train_dataset, val_dataset
        if config.device != "cpu":
            torch.cuda.empty_cache()
        gc.collect()

# Train

In [None]:
# Training table copied into ./dataset by the setup cell; SentenceDataset reads its
# "sentence" and "triple" columns.
df=pd.read_csv("dataset/squad_train.csv")
len(df)

11598

In [None]:
import pdb
# Tag embedded in this run's log and checkpoint file names; fit() and k_fold() read it as a global.
CHECKPOINT_KEYWORD="cls_only"
# k_fold() has no return statement, so `folds` is None once training finishes.
folds=k_fold(df,save=True, config=Config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading fold checkpoint
loaded checkpoint
199 layers frozen!

-------------   Fold 1 / 5  -------------

>>9278 training texts<<
>>2320 validation texts<<
>>593668 trainable parameters<<



HBox(children=(FloatProgress(value=0.0, description='Epoch 1/15', max=15.0, style=ProgressStyle(description_wi…

HBox(children=(FloatProgress(value=0.0, description='training iterations', max=290.0, style=ProgressStyle(desc…