In [None]:
# ====================================================
# CFG: Configuration settings
# ====================================================
class CFG:
    # Plain namespace of run settings; read as class attributes (CFG.xxx)
    # by the rest of the notebook.

    # --- experiment bookkeeping ---
    competition = 'FB3'
    wandb = False
    _wandb_kernel = 'nakama'
    debug = False            # when True, the statement after this class shortens the run
    train = True
    seed = 42
    print_freq = 20
    num_workers = 4          # DataLoader worker processes

    # --- backbone / data ---
    model = "microsoft/deberta-v3-base"
    gradient_checkpointing = True
    max_len = 512            # overwritten later from the measured token counts
    target_cols = ['content', 'wording']

    # --- optimisation ---
    apex = True              # toggles torch.cuda.amp autocast / GradScaler in train_fn
    epochs = 4
    batch_size = 8
    gradient_accumulation_steps = 1
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.01
    max_grad_norm = 1000

    # --- learning-rate schedule ---
    scheduler = 'cosine' # ['linear', 'cosine']
    batch_scheduler = True   # step the scheduler after every optimizer update
    num_cycles = 0.5
    num_warmup_steps = 0

    # --- cross-validation ---
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

# Debug mode: cut the run down to two epochs on the first fold only.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0]

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip install -q transformers')
os.system('pip install -q tokenizers')
os.system('pip install -q sentencepiece')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

tokenizers.__version__: 0.13.3
transformers.__version__: 4.31.0
env: TOKENIZERS_PARALLELISM=true


device(type='cuda')

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run;
    # switch it off so the deterministic flag above is actually effective.
    torch.backends.cudnn.benchmark = False

# Take the seed from the config so CFG.seed is the single source of truth.
seed_everything(seed=CFG.seed)

In [None]:

# Read the train and test tables from CSV.
# NOTE(review): these are absolute Colab paths; adjust when running elsewhere.
train, test = map(pd.read_csv, ("/content/train_df.csv", "/content/test_df.csv"))
train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,summary_length,fixed_summary_text,prompt_question,prompt_title,prompt_text,prompt_length,input
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,64,The third wave was an experimental see how peo...,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,The Third Wave[SEP]Summarize how the Third Wav...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,54,They would rub it up with soda to make the sme...,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1076,Excerpt from The Jungle[SEP]Summarize the vari...
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,269,"In Egypt, there were many occupations and soci...","In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,Egyptian Social Structure[SEP]In complete sent...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,28,The highest class was Pharaohs these people we...,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,625,Egyptian Social Structure[SEP]In complete sent...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,232,The Third Wave developed rapidly because the ...,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,660,The Third Wave[SEP]Summarize how the Third Wav...


In [None]:
# Load the tokenizer that matches CFG.model and attach it to CFG so that
# tokenize_data() can reach it through the config object.
CFG.tokenizer = tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-base', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
# Tokenize every training `text` (without special tokens) and size CFG.max_len
# to the longest one plus room for the two special tokens.
# NOTE(review): lengths are measured on the `text` column, while TrainDataset
# tokenizes the `input` column - confirm that max_len is meant to be derived
# from `text`.
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tk0
]
CFG.max_len = max(lengths) + 2 # cls & sep
CFG.max_len

  0%|          | 0/7165 [00:00<?, ?it/s]

822

In [None]:
#Dataset

def tokenize_data(cfg,data):
  """Tokenize one text and return its fields as ``torch.long`` tensors.

  Args:
    cfg: Config object exposing ``tokenizer`` and ``max_len``.
    data: The raw text to encode.

  Returns:
    The mapping returned by ``cfg.tokenizer.encode_plus`` with every value
    converted to a ``torch.long`` tensor. The text is truncated and padded
    to ``cfg.max_len``.
  """
  # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
  tokenized_data = cfg.tokenizer.encode_plus(
      data,
      max_length=cfg.max_len,
      padding='max_length',
      truncation=True,
  )
  # Convert in place so the tokenizer's own mapping type is what gets returned.
  for k,v in tokenized_data.items():
    tokenized_data[k]=torch.tensor(v, dtype=torch.long)
  return tokenized_data

class TrainDataset(Dataset):
  # Map-style dataset pairing each row's `input` string with its target values.

  def __init__(self,cfg,df):
    # cfg must expose `target_cols`; it is forwarded to tokenize_data later.
    self.cfg=cfg
    self.labels=df[cfg.target_cols].values
    self.texts=df['input'].values

  def __len__(self):
    # One sample per row of the source frame.
    return len(self.texts)

  def __getitem__(self,item):
    # Tokenize lazily, one sample at a time; targets become a float tensor.
    encoded = tokenize_data(self.cfg, self.texts[item])
    target = torch.tensor(self.labels[item], dtype=torch.float)
    return encoded,target

class TestDataset(Dataset):
  """Inference dataset that yields tokenized model inputs without labels.

  TrainDataset feeds the model the `input` column, so inference should see
  the same field. When the frame has an `input` column it is used; otherwise
  the dataset falls back to `text`.
  """

  def __init__(self,cfg,df):
    # Prefer the column the model was trained on to avoid train/inference skew.
    text_col = 'input' if 'input' in df.columns else 'text'
    self.texts=df[text_col].values
    self.cfg=cfg

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,item):
    # Tokenize lazily, one sample at a time.
    inputs = tokenize_data(self.cfg, self.texts[item])
    return inputs



In [None]:
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    The RMSE is computed directly with NumPy instead of
    ``mean_squared_error(..., squared=False)``, whose ``squared`` argument is
    deprecated in recent scikit-learn releases.

    Args:
        y_trues: 2-D array-like of ground-truth values, one column per target.
        y_preds: 2-D array-like of predictions with the same shape.

    Returns:
        Tuple ``(mcrmse_score, scores)``: the mean of the per-column RMSEs and
        the list of per-column RMSEs.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.shape != y_preds.shape:
        raise ValueError(
            f"shape mismatch: y_trues {y_trues.shape} vs y_preds {y_preds.shape}"
        )
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        squared_error = (y_trues[:, i] - y_preds[:, i]) ** 2
        scores.append(np.sqrt(squared_error.mean()))  # RMSE of column i
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, scores)`` exactly as produced by ``MCRMSE``."""
    return MCRMSE(y_trues, y_preds)

In [None]:
class MeanPooling(nn.Module):
  # Attention-mask-weighted average of hidden states over dimension 1.

  def __init__(self):
    super().__init__()

  def forward(self,last_hidden_state,attention_mask):
    # Give the mask a trailing axis and stretch it to the hidden-state shape
    # so it can zero out the masked positions element-wise.
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    summed = torch.sum(last_hidden_state * mask, 1)
    # Clamp the divisor so a fully masked row yields zeros instead of NaN.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts

def collate(inputs):
    """Trim every tensor in the batch to the longest attended sequence.

    The largest per-row sum of ``inputs["attention_mask"]`` gives the number
    of columns to keep; every value in ``inputs`` is sliced to that width in
    place and the same mapping is returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

We use the DeBERTa model, which is built on a basic transformer encoder like BERT but adds some further features, such as:

---


1. Enhanced MLM: instead of masking random tokens, this model masks a span of tokens; this method is called the span boundary objective (SBO). It masks consecutive spans of tokens, which helps the model capture dependencies across multiple tokens and better understand long-range relationships.
2. Intra-sentence and inter-sentence relationships: apart from the usual intra-sentence (within a single sentence) relations, this model also takes into consideration the relation between two different sentences (inter-sentence learning), which allows the model to understand document-level semantics and capture global context effectively.
3. Contrastive bidirectional training
4. Cross-layer parameter sharing: by sharing parameters, DeBERTa reduces the total number of parameters, making it a more computationally scalable and memory-efficient model.

In [None]:
class CustomModel(nn.Module):
    """Transformer backbone + mean pooling + linear regression head.

    Args:
        cfg: Config object exposing ``model`` (checkpoint name) and
            ``gradient_checkpointing``. When it also exposes a non-empty
            ``target_cols`` the head has one output per target; otherwise the
            head keeps the previous fixed width of 2.
        config_path: Accepted for interface compatibility; not used here.
        pretrained: Accepted for interface compatibility; not used here - the
            backbone is always created with ``AutoModel.from_pretrained``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(cfg.model, config=self.config)

        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.pool = MeanPooling()
        # Size the head from the configured targets instead of a literal 2 so
        # that changing CFG.target_cols cannot silently mismatch the labels.
        num_targets = len(getattr(cfg, 'target_cols', None) or []) or 2
        self.fc = nn.Linear(self.config.hidden_size, num_targets)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise a linear layer: normal weights (std from config), zero bias."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def feature(self, inputs):
        """Run the backbone on ``inputs`` and mean-pool its first output.

        ``inputs`` is unpacked as keyword arguments to the backbone and must
        contain ``attention_mask``, which is reused as the pooling mask.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        """Return the regression head's output for a batch of tokenized inputs."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [None]:
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one pass over ``train_loader``.

    Uses ``torch.cuda.amp`` autocast / GradScaler when ``CFG.apex`` is true and
    accumulates gradients over ``CFG.gradient_accumulation_steps`` batches.

    Args:
        train_loader: Iterable yielding ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of tensors accepted by ``collate``.
        model: Module called as ``model(inputs)``.
        criterion: Loss callable ``criterion(y_preds, labels)``.
        optimizer: Optimizer updated once per accumulation window.
        epoch: Current epoch index (not used inside this function).
        scheduler: LR scheduler, stepped after each optimizer update when
            ``CFG.batch_scheduler`` is true.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values (before the accumulation scaling).
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    accumulation_steps = CFG.gradient_accumulation_steps
    losses = []
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        # Log the true batch loss; the division below only rescales gradients.
        losses.append(loss.item())
        if accumulation_steps > 1:
            loss = loss / accumulation_steps
        scaler.scale(loss).backward()
        if (step + 1) % accumulation_steps == 0:
            # Gradients are still multiplied by the loss scale at this point.
            # Unscale them first so max_grad_norm applies to the real gradient
            # norm, and clip once per optimizer step rather than per batch.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()

    return np.mean(losses)


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Args:
        valid_loader: Iterable yielding ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of tensors accepted by ``collate``.
        model: Module called as ``model(inputs)``; switched to eval mode.
        criterion: Loss callable ``criterion(y_preds, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, predictions)``: the mean of the per-batch losses
        and the batch predictions concatenated into one NumPy array.
    """
    losses = []
    model.eval()
    preds = []
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        # No gradient accumulation happens during evaluation, so the loss is
        # reported as-is instead of being divided by the accumulation steps.
        losses.append(loss.item())
        preds.append(y_preds.to('cpu').numpy())

    predictions = np.concatenate(preds)
    return np.mean(losses), predictions

def test_fn(test_loader, model, criterion, device):
    """Run inference over ``test_loader`` and collect the predictions.

    Args:
        test_loader: Iterable yielding ``inputs`` mappings of tensors accepted
            by ``collate`` (no labels).
        model: Module called as ``model(inputs)``; switched to eval mode.
        criterion: Unused; kept so existing callers do not need to change.
        device: Device the batch tensors are moved to.

    Returns:
        Batch predictions concatenated into one NumPy array.
    """
    model.eval()
    preds = []
    for inputs in test_loader:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [None]:
# ====================================================
# train loop
# ====================================================
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable across runs.
train_df, valid_df = train_test_split(train, random_state=42, test_size=0.2)

def train_loop(train_df,valid_df):
    """Fine-tune a ``CustomModel`` and checkpoint the best epoch.

    After every epoch the validation MCRMSE is computed; whenever it improves,
    the model weights are written to ``deberta_model.pth``. The predictions
    returned are those of that best epoch, so they always correspond to the
    checkpoint on disk.

    Args:
        train_df: Frame with an ``input`` column and the ``CFG.target_cols``.
        valid_df: Frame with the same columns, used for validation.

    Returns:
        Validation predictions of the best-scoring epoch, or ``None`` when
        ``CFG.epochs`` is 0.

    Raises:
        ValueError: If ``CFG.scheduler`` is neither ``'linear'`` nor ``'cosine'``.
    """
    train_folds = train_df
    valid_folds = valid_df

    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build AdamW parameter groups.

        Backbone weights get ``weight_decay``; backbone biases / LayerNorm
        parameters and everything outside ``model.model`` get no decay.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the warmup scheduler selected by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            return get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        if cfg.scheduler == 'cosine':
            return get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")

    # The scheduler is stepped once per optimizer update, so count updates:
    # batches actually produced by the loader (drop_last=True) divided by the
    # accumulation window, times the number of epochs.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")

    best_score = np.inf
    best_predictions = None

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
        print("training loss: ",avg_loss)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        print("Validation loss: ",avg_val_loss)

        # scoring
        score, scores = get_score(valid_labels, predictions)
        print("MCRMSE: ",score," per-target: ",scores)

        # Checkpoint only when the validation score improves. The first epoch
        # is always saved so a checkpoint exists even if the score is NaN.
        if best_predictions is None or score < best_score:
            best_score = score
            best_predictions = predictions
            torch.save(model.state_dict(), 'deberta_model.pth')

        elapsed = time.time() - start_time
        print("Elapsed time: ",elapsed)
    return best_predictions

# Fine-tune on the training split and keep the returned validation predictions.
predictions = train_loop(train_df=train_df, valid_df=valid_df)

training loss:  0.16365786648599961
Validation loss:  0.12313322756025526
Predictions:  [[ 0.6879239   0.66051334]
 [-0.26996315  0.15082607]
 [-0.39092517  0.1830393 ]
 ...
 [ 0.12343305  0.2360103 ]
 [ 1.2101552   0.6674766 ]
 [-1.1247251  -1.6724172 ]]
Elapsed time:  286.45826530456543
training loss:  0.102467445089539
Validation loss:  0.11088406518101693
Predictions:  [[ 0.6310376   0.70644265]
 [-0.3177273   0.19957699]
 [-0.4435555   0.13019091]
 ...
 [-0.05403788  0.07642724]
 [ 1.552013    0.7875407 ]
 [-1.0308473  -1.6175041 ]]
Elapsed time:  288.0299093723297
training loss:  0.08508366966026955
Validation loss:  0.10495249610394239
Predictions:  [[ 0.5628776   0.63266444]
 [-0.3007487   0.14491886]
 [-0.4578308   0.14224549]
 ...
 [-0.06799011  0.10578106]
 [ 1.28895     0.621654  ]
 [-1.1178125  -1.6625019 ]]
Elapsed time:  284.81840658187866
training loss:  0.07154476911079284
Validation loss:  0.10845416163404782
Predictions:  [[ 0.5798184   0.7174115 ]
 [-0.28474978  0.2

In [None]:
# Rebuild the architecture and restore the fine-tuned weights from disk.
final_model=CustomModel(CFG, config_path=None, pretrained=True)
# map_location='cpu' lets the checkpoint load on machines without the GPU it
# was saved from; the model is moved to `device` before inference.
final_model.load_state_dict(torch.load('deberta_model.pth', map_location='cpu'))
final_model.eval()

CustomModel(
  (model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dro

In [None]:
# Batch the test rows in their original order and predict with the restored model.
final_model = final_model.to(device)
dataset = TestDataset(CFG, test)
dataloader = DataLoader(
    dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)
# test_fn accepts a criterion argument but does not use it.
criterion = nn.SmoothL1Loss(reduction='mean')
final_predictions = test_fn(dataloader, final_model, criterion, device)

In [None]:
# Show how many test rows were scored, followed by the raw predictions.
(len(final_predictions), final_predictions)

(4,
 array([[-1.4205468, -1.3065829],
        [-1.4560364, -1.3117912],
        [-1.4212203, -1.2900599],
        [-1.4609294, -1.2826453]], dtype=float32))

In [None]:
# Predictions as a frame, one column per target.
new_data_df = pd.DataFrame(final_predictions, columns=['content', 'wording'])

# Take the id column(s) from the sample submission, dropping its placeholder targets.
submission = (
    pd.read_csv("/content/drive/MyDrive/Data/EvaluateSummaries/sample_submission.csv")
    .drop(columns=['content', 'wording'])
)

# concat(axis=1) aligns on the index and pads mismatches with NaN, so check
# the row counts up front and give both frames the same fresh index.
assert len(submission) == len(new_data_df), (
    f"sample submission has {len(submission)} rows but there are "
    f"{len(new_data_df)} predictions"
)
combined_df = pd.concat(
    [submission.reset_index(drop=True), new_data_df.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Round both targets to two decimals, then write the submission file.
for col in ('wording', 'content'):
    combined_df[col] = np.round(combined_df[col], 2)
# NOTE(review): the file is CSV-formatted but saved with a `.df` extension -
# confirm that this name is what downstream consumers expect.
combined_df.to_csv("/content/drive/MyDrive/Data/EvaluateSummaries/my_submission.df", index=False)
combined_df.head()

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.42,-1.31
1,111111eeeeee,-1.46,-1.31
2,222222cccccc,-1.42,-1.29
3,333333dddddd,-1.46,-1.28
