In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
import transformers
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_cosine_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
import pandas as pd
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import mean_squared_error
import random
import time
from torch.utils import checkpoint
import math
import gc
from typing import Dict, List, Tuple
import codecs
import warnings
import torch.nn.functional as F
from dataclasses import dataclass, field, asdict
import wandb
from tqdm import tqdm
transformers.logging.set_verbosity_error()
warnings.filterwarnings("ignore")

%env TOKENIZERS_PARALLELISM=true

# declare the two GPUs
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

# avoids some issues when using more than one worker
os.environ["TOKENIZERS_PARALLELISM"] = "false"

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


env: TOKENIZERS_PARALLELISM=true


In [2]:
@dataclass
class cfg:
    train_summary_file: str = field(default="/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv", metadata={"help": "train file path"})
    train_prompt_file: str = field(default="/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv", metadata={"help": "train file path"})
    batch_size: int = field(default=16, metadata={"help": "batch size"})
    epochs: int = field(default=4, metadata={"help": "number of epochs"})
    lr: float = field(default=2e-5, metadata={"help": "learning rate"})
    max_len: int = field(default=512, metadata={"help": "max length of input"})
    model_name: str = field(default="microsoft/deberta-v3-base", metadata={"help": "model name"})
    hidden_dropout_prob: float = field(default=0.0, metadata={"help": "hidden dropout probability"})
    layer_norm_eps: float = field(default=1e-7, metadata={"help": "layer norm eps"})
    gradient_accumulation_steps: int = field(default=1, metadata={"help": "gradient accumulation steps"})
    gradient_checkpointing_enable: bool = field(default=False, metadata={"help": "gradient checkpointing"})
    warmup_ratio: float = field(default=0.1, metadata={"help": "warmup ratio"})
    max_grad_norm: float = field(default=10.0, metadata={"help": "max grad norm"})
    target_columns: List[str] = field(default = ('content', 'wording'), metadata={"help": "target columns"})
    num_classes: int = field(default=2, metadata={"help": "number of classes"})
    seed: int = field(default=42, metadata={"help": "seed"})
    device: str = field(default="cuda" if torch.cuda.is_available() else "cpu", metadata={"help": "device"})
    multi_gpu: bool = field(default=torch.cuda.device_count() > 1, metadata={"help": "multi gpu"})
    use_wandb: bool = field(default=True, metadata={"help": "use wandb for logging"})
    project_name: str = field(default="commonlit-kaggle", metadata={"help": "wandb project name"})

In [3]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
def seed_everything(seed=cfg.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
#Utiliy functions 
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def compute_mcrmse(preds, labels):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [6]:
class Collate:
    """Data collator for training and improving efficiency"""
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, batch):
        
        batch_len = max([len(sample["ids"]) for sample in batch])
        
        output = dict()
        output["ids"] = [sample["ids"] for sample in batch]
        output["mask"] = [sample["mask"] for sample in batch]
        output["targets"] = [sample["targets"] for sample in batch]
        
        if self.tokenizer.padding_side == "right":
            output["ids"] = [s + [self.tokenizer.pad_token_id] * (batch_len - len(s)) for s in output["ids"]]
            output["mask"] = [s + [0] * (batch_len - len(s)) for s in output["mask"]]
        else:
            output["ids"] = [[self.tokenizer.pad_token_id] * (batch_len - len(s)) + s for s in output["ids"]]
            output["mask"] = [[0] * (batch_len - len(s)) + s for s in output["mask"]]
            
            
        output["ids"] = torch.tensor(output["ids"], dtype = torch.long)
        output["mask"] = torch.tensor(output["mask"], dtype = torch.long)
        output["targets"] = torch.tensor(output["targets"], dtype = torch.float32)
        
        return output

In [7]:
class Dataset(torch.utils.data.Dataset):
    """Pytorch dataset class for tokenizing the text and targets"""
    def __init__(self, texts, targets, tokenizer):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        
        text = self.texts[idx]
        targets = self.targets[idx]
        encoding = self.tokenizer(text, add_special_tokens = True, max_length = cfg.max_len, padding = False, truncation = 'longest_first') 
        
        return {
            "ids": encoding["input_ids"], 
            "mask": encoding["attention_mask"],
            "targets": targets
        }

In [8]:
class Model(nn.Module):
    """Model class"""
    def __init__(self, model_name):
        super().__init__()

        self.model_name = model_name
        config = AutoConfig.from_pretrained(model_name)

        config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": cfg.hidden_dropout_prob,
                "attention_probs_dropout_prob" : cfg.hidden_dropout_prob,
                "layer_norm_eps": cfg.layer_norm_eps,
                "add_pooling_layer": False,
                "num_labels": cfg.num_classes,
            }
        )
        
        self.config = config
        
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        if cfg.gradient_checkpointing_enable:
            self.transformer.gradient_checkpointing_enable()
        
        self.output = nn.Linear(config.hidden_size, cfg.num_classes)
    
    def forward(self, ids, mask, targets = None):
        transformer_out = self.transformer(input_ids = ids, attention_mask = mask)
        logits = self.output(transformer_out.last_hidden_state[:,0,:])
        return logits

In [9]:
def criterion(inputs, targets):
    return nn.MSELoss()(inputs, targets)

def get_optimizer_scheduler(model, num_train_steps):
    """get optimizer and scheduler"""
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_params = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.001,
            "lr" : cfg.lr
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
            "lr" : cfg.lr
        }
    ]
    optimizer = torch.optim.AdamW(optimizer_params, lr=cfg.lr)
    scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(num_train_steps * cfg.warmup_ratio),
            num_training_steps=num_train_steps,
            last_epoch=-1,
    )
    return optimizer, scheduler

def train(epoch, model, train_loader, valid_loader, optimizer, scheduler, device, scaler):
    """training pass"""
    model.train()
    losses = AverageMeter()

    for batch_idx, (batch) in tqdm(enumerate(train_loader), total = len(train_loader)):
        for k, v in batch.items():
            batch[k] = v.to(device)
        
        with autocast():
            outputs = model(**batch)
            loss = criterion(outputs, batch["targets"])
        
        if cfg.gradient_accumulation_steps > 1:
            loss = loss / cfg.gradient_accumulation_steps
        
        losses.update(loss.item() * cfg.gradient_accumulation_steps , cfg.batch_size)
        scaler.scale(loss).backward()

        if (batch_idx + 1) % cfg.gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

        if cfg.use_wandb:
            wandb.log({
                "train/loss": losses.val,
                "train/lr": scheduler.get_last_lr()[0],
                "train/step": epoch * len(train_loader) + batch_idx,

            })
    
    return losses.avg

@torch.no_grad()
def evaluate(epoch, model, valid_loader, device):
    """evaluate pass"""
    model.eval()
    all_targets = []
    all_outputs = []
    losses = AverageMeter()

    for batch_idx, (batch) in tqdm(enumerate(valid_loader), total = len(valid_loader)):
        for k, v in batch.items():
            batch[k] = v.to(device)
        
        outputs = model(**batch)
        loss = criterion(outputs, batch["targets"])
        losses.update(loss.item(), cfg.batch_size)
        all_targets.extend(batch["targets"].detach().cpu().numpy())
        all_outputs.extend(outputs.cpu().numpy())
    
    all_targets = np.vstack(all_targets)
    all_outputs = np.vstack(all_outputs)
    score = compute_mcrmse(all_outputs, all_targets)
    return score, losses.avg

In [10]:
def main(fold, seed):
    """Main loop"""
    # Seed everything
    seed_everything(seed=seed)
    if cfg.use_wandb:
        run = wandb.init(project=cfg.project_name, 
                         config=asdict(cfg()), 
                         group = cfg.model_name, 
                         reinit=True)
        wandb.define_metric("train/step")
        wandb.define_metric("valid/step")
        # define which metrics will be plotted against it
        wandb.define_metric("train/*", step_metric="train/step")
        wandb.define_metric("valid/*", step_metric="valid/step")
    
    pdf = pd.read_csv(cfg.train_prompt_file)
    sdf = pd.read_csv(cfg.train_summary_file)
    df = pdf.merge(sdf, on="prompt_id")

    # 4 prompt ids, 4 folds
    id2fold = {
        "39c16e": 0,
        "814d6b": 1,
        "3b9047": 2,
        "ebad26": 3,
    }
    df["fold"] = df["prompt_id"].map(id2fold)

    train_df = df[df["fold"] != fold].reset_index(drop=True)
    valid_df = df[df["fold"] == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    sep_token = tokenizer.sep_token

    # Preparing the train texts and targets
    train_texts = train_df["text"].to_list()
    valid_texts = valid_df["text"].to_list()
    train_targets = train_df[list(cfg.target_columns)].values.tolist()
    valid_targets = valid_df[list(cfg.target_columns)].values.tolist()

    # Preparing the datasets and dataloaders
    collate_fn = Collate(tokenizer)
    train_ds = Dataset(train_texts, train_targets, tokenizer)
    valid_ds = Dataset(valid_texts, valid_targets, tokenizer)

    train_loader = torch.utils.data.DataLoader(
        train_ds, 
        batch_size = cfg.batch_size, 
        shuffle = True, 
        collate_fn = collate_fn)

    valid_loader = torch.utils.data.DataLoader(
        valid_ds,
        batch_size = cfg.batch_size,
        shuffle = False,
        collate_fn = collate_fn)
    
    # Preparing the model
    model = Model(cfg.model_name)
    model = model.to(cfg.device)
    if cfg.use_wandb:
        wandb.watch(model)
    
    if cfg.multi_gpu:
        model = nn.DataParallel(model)
    
    num_train_steps = int(len(train_ds) / cfg.batch_size / cfg.gradient_accumulation_steps * cfg.epochs)
    optimizer, scheduler = get_optimizer_scheduler(model.module, num_train_steps)

    scaler = GradScaler()
    # Training loop
    best_score = 1
    for epoch in range(cfg.epochs):
        train_loss = train(epoch, model, train_loader, valid_loader, optimizer, scheduler, cfg.device, scaler)
        valid_score, valid_loss = evaluate(epoch, model, valid_loader, cfg.device)
        if cfg.use_wandb:
            wandb.log({"valid/train_loss_avg": train_loss, 
                       "valid/valid_loss_avg": valid_loss, 
                       "valid/mcrmse": valid_score["mcrmse"],
                       "valid/content_rmse": valid_score["content_rmse"],
                       "valid/wording_rmse": valid_score["wording_rmse"], 
                       "valid/step": epoch})
        
        if valid_score["mcrmse"] < best_score:
            best_score = valid_score["mcrmse"]
            if cfg.multi_gpu:
                torch.save(model.module.state_dict(), f"{cfg.model_name.split(os.path.sep)[-1]}_fold{fold}_seed{cfg.seed}.bin")
            else:
                torch.save(model.state_dict(), f"{cfg.model_name.split(os.path.sep)[-1]}_fold{fold}_seed{cfg.seed}.bin")
    
    if cfg.use_wandb:
        run.finish()

In [None]:
main(0, cfg.seed)

[34m[1mwandb[0m: Currently logged in as: [33mjashdalvi99[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

 11%|█         | 35/320 [00:25<02:20,  2.03it/s]