In [3]:
import torch
import torch.nn as nn
import numpy as np
import os
import transformers
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_cosine_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
import pandas as pd
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import mean_squared_error
import random
import time
from torch.utils import checkpoint
import math
import gc
from typing import Dict, List, Tuple
import codecs
import warnings
import torch.nn.functional as F
from dataclasses import dataclass, field, asdict
import wandb
from tqdm import tqdm
import gc
from joblib import Parallel, delayed
import joblib
from functools import partial
transformers.logging.set_verbosity_error()
warnings.filterwarnings("ignore")

# Restrict this process to the two GPUs.
# NOTE(review): this only takes effect if it runs before the first CUDA call
# in the process - keep it ahead of any torch.cuda usage.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

# Tokenizer-level parallelism is disabled; it avoids some issues when using
# more than one worker. The variable is set exactly once so the effective
# value is unambiguous.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

env: TOKENIZERS_PARALLELISM=true


In [None]:
@dataclass
class cfg:
    """Inference configuration.

    The rest of this notebook reads the values straight off the class
    (e.g. ``cfg.batch_size``); every field has a default, so no instance
    needs to be created.
    """
    # Input files: all three come from the same competition dataset so that the
    # sample submission carries the `student_id`, `content` and `wording`
    # columns that the submission cell indexes and fills.
    test_summary_file: str = field(default="/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv", metadata={"help": "test file path"})
    test_prompt_file: str = field(default="/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv", metadata={"help": "test file path"})
    sample_submission_file: str = field(default="/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv", metadata={"help": "sample submission file path"})
    # DataLoader batch size used for inference.
    batch_size: int = field(default=16, metadata={"help": "batch size"})
    # Applied to both hidden and attention dropout when the model config is built.
    hidden_dropout_prob: float = field(default=0.0, metadata={"help": "hidden dropout probability"})
    layer_norm_eps: float = field(default=1e-7, metadata={"help": "layer norm eps"})
    # Stored as a tuple because dataclasses reject mutable (list) defaults.
    target_columns: Tuple[str, ...] = field(default = ('content', 'wording'), metadata={"help": "target columns"})
    # Width of the regression head; one output per target column.
    num_classes: int = field(default=2, metadata={"help": "number of classes"})
    # Used both for tokenization jobs and for DataLoader workers.
    num_workers: int = field(default=4, metadata={"help": "number of workers"})
    # Evaluated once, when this class body is executed.
    device: str = field(default="cuda" if torch.cuda.is_available() else "cpu", metadata={"help": "device"})
    multi_gpu: bool = field(default=torch.cuda.device_count() > 1, metadata={"help": "multi gpu"})

In [None]:
class Model(nn.Module):
    """Transformer backbone followed by a linear head on the first token.

    The backbone is created from its configuration only (``from_config``), so
    callers are expected to load trained weights with ``load_state_dict``.
    """
    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name

        # Settings from `cfg` layered on top of the published model config.
        overrides = dict(
            output_hidden_states=True,
            hidden_dropout_prob=cfg.hidden_dropout_prob,
            attention_probs_dropout_prob=cfg.hidden_dropout_prob,
            layer_norm_eps=cfg.layer_norm_eps,
            add_pooling_layer=False,
            num_labels=cfg.num_classes,
        )
        backbone_config = AutoConfig.from_pretrained(model_name)
        backbone_config.update(overrides)
        self.config = backbone_config

        # Attribute names `transformer` and `output` define the state-dict keys.
        self.transformer = AutoModel.from_config(backbone_config)
        self.output = nn.Linear(backbone_config.hidden_size, cfg.num_classes)

    def forward(self, ids, mask, targets = None):
        """Return head outputs of shape (batch, cfg.num_classes).

        ``targets`` is accepted but not used.
        """
        hidden_states = self.transformer(input_ids=ids, attention_mask=mask).last_hidden_state
        first_token = hidden_states[:, 0, :]
        return self.output(first_token)

In [None]:
# Model groups to ensemble. Each group provides:
#   'arch'           - zero-argument factory that builds an untrained Model
#   'tokenizer_path' - the loaded tokenizer object itself (not a path, despite
#                      the key name, which is kept because later cells read it)
#   'checkpoints'    - state-dict files, one per fold
#   'max_length'     - token truncation length
# The architecture must match the checkpoints: every checkpoint here is a
# deberta-v3-large fold, so the backbone is built from the large config;
# building the base model would make load_state_dict fail on mismatched shapes.
model_grps = {
    'deberta_v3_512':{
        'arch': partial( Model, model_name="microsoft/deberta-v3-large"),
        'tokenizer_path': AutoTokenizer.from_pretrained("microsoft/deberta-v3-large"),
        'checkpoints':[
            "../input/colab-models-download-deberta-large/deberta-v3-large/deberta-v3-large_fold_0.pth",
            "../input/colab-models-download-deberta-large/deberta-v3-large/deberta-v3-large_fold_1.pth",
            "../input/colab-models-download-deberta-large/deberta-v3-large/deberta-v3-large_fold_2.pth",
            "../input/colab-models-download-deberta-large/deberta-v3-large/deberta-v3-large_fold_3.pth",
            "../input/colab-models-download-deberta-large/deberta-v3-large/deberta-v3-large_fold_4.pth"
        ],
        'max_length' : 512,
    }
}

In [None]:
def inference_one_epoch(model, dataloader, device):
    """Run the model over every batch of ``dataloader`` without gradients.

    Args:
        model: module called as ``model(**batch)``; switched to eval mode and
            moved to ``device`` here.
        dataloader: yields dicts of tensors whose keys match the model's
            forward arguments.
        device: target device for the model and the batch tensors.

    Returns:
        numpy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    # Wrap for multi-GPU execution only when the config asks for it.
    runner = nn.DataParallel(model) if cfg.multi_gpu else model

    collected = []
    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, batch in progress:
        # Move the tensors in place, key by key.
        for name in batch:
            batch[name] = batch[name].to(device)

        with torch.no_grad():
            outputs = runner(**batch)

        collected.append(outputs.detach().cpu().numpy())

    return np.concatenate(collected, axis = 0)

In [None]:
def _prepare_training_data_helper(tokenizer, df, max_len):
    """Tokenize the ``text`` column of ``df`` row by row.

    Returns a list with one dict per row holding ``student_id``, ``input_ids``
    and ``attention_mask``. Sequences are truncated to ``max_len`` and left
    unpadded.
    """
    def _encode(row):
        encoded = tokenizer(
            row['text'],
            add_special_tokens = True,
            max_length = max_len,
            padding = False,
            truncation = 'longest_first',
        )
        return {
            "student_id": row["student_id"],
            "input_ids": encoded['input_ids'],
            "attention_mask": encoded['attention_mask'],
        }

    return [_encode(row) for _, row in tqdm(df.iterrows(), total=len(df))]


def prepare_training_data(df, tokenizer, num_jobs, max_len):
    """Tokenize ``df`` in parallel and return the samples in row order.

    Args:
        df: frame with the columns read by ``_prepare_training_data_helper``
            (``text`` and ``student_id``).
        tokenizer: tokenizer forwarded to every worker.
        num_jobs: number of contiguous chunks to split the rows into, and the
            upper bound on worker processes.
        max_len: truncation length forwarded to the tokenizer.

    Returns:
        List of sample dicts, chunk results concatenated in the original row
        order. An empty frame yields an empty list without starting workers.
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    position_chunks = np.array_split(np.arange(len(df)), num_jobs)
    # Drop empty chunks (fewer rows than jobs) so no worker gets an empty frame.
    df_splits = [df.iloc[positions] for positions in position_chunks if len(positions)]

    if not df_splits:
        return []

    results = Parallel(n_jobs=min(num_jobs, len(df_splits)), backend="multiprocessing")(
        delayed(_prepare_training_data_helper)(tokenizer, split, max_len) for split in df_splits
    )

    return [sample for chunk_samples in results for sample in chunk_samples]


class Dataset:
    """Indexable view over pre-tokenized samples.

    Each item exposes only the model inputs, renamed to ``ids`` and ``mask``.
    """
    def __init__(self, samples,  tokenizer):
        self.tokenizer = tokenizer
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        record = self.samples[idx]
        return dict(ids=record["input_ids"], mask=record["attention_mask"])


class Collate:
    """Batch collator that pads every sequence to the longest one in the batch.

    Padding goes on the side given by ``tokenizer.padding_side``; ids are
    filled with ``tokenizer.pad_token_id`` and masks with 0.
    """
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        id_rows = [item["ids"] for item in batch]
        mask_rows = [item["mask"] for item in batch]

        # Longest sequence in this batch sets the padded width.
        longest = max(len(row) for row in id_rows)
        pad_id = self.tokenizer.pad_token_id
        pad_on_right = self.tokenizer.padding_side == "right"

        def _pad(row, fill):
            filler = (longest - len(row)) * [fill]
            return row + filler if pad_on_right else filler + row

        return {
            "ids": torch.tensor([_pad(row, pad_id) for row in id_rows], dtype=torch.long),
            "mask": torch.tensor([_pad(row, 0) for row in mask_rows], dtype=torch.long),
        }

In [None]:
# Load the test summaries and prompts, then attach each summary to its
# prompt through the shared prompt_id column (prompt columns come first).
sdf = pd.read_csv(cfg.test_summary_file)
pdf = pd.read_csv(cfg.test_prompt_file)
df = pd.merge(pdf, sdf, on="prompt_id")

In [None]:
# One entry per model group: (student_ids, checkpoint-averaged predictions),
# both in the length-sorted sample order used for inference.
final_predictions = []

for model_type,model in model_grps.items():
    
    print(f'Model Type : {model_type}')
    
    # Per-checkpoint prediction arrays for this model group.
    predictions = []
    
    print('[INFO] Preparing Data...')
    
    # 'tokenizer_path' holds the loaded tokenizer object, not a path.
    tokenizer = model['tokenizer_path']
    samples = prepare_training_data(df, tokenizer, num_jobs=cfg.num_workers, max_len=model['max_length'])
    # Sort by token count so each batch holds similar lengths; Collate pads to
    # the longest sequence of the batch, so this keeps padding small.
    samples = list(sorted(samples, key=lambda d: len(d["input_ids"])))
    dataset = Dataset(samples, tokenizer)
    collate_fn = Collate(tokenizer)
    
    # shuffle=False and drop_last=False keep one prediction row per sample, in
    # the same order as `samples`; the student_id list built below relies on it.
    test_loader = torch.utils.data.DataLoader(dataset, 
                         batch_size=cfg.batch_size,
                         shuffle=False,
                         collate_fn = collate_fn,
                         num_workers = cfg.num_workers,
                         pin_memory = True,
                         drop_last = False,
                        )
    
    print('[INFO] Inferring....')
    

    for chkpoint in model['checkpoints']:
        # Fresh, untrained network for every fold checkpoint.
        net =  model['arch']()

        # Weights are loaded on CPU; inference_one_epoch moves the model to cfg.device.
        # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
        state = torch.load(chkpoint,
                           map_location = torch.device('cpu'))
        net.load_state_dict(state)
        prediction = inference_one_epoch(net, test_loader, cfg.device)
        predictions.append(prediction)
        # Release the model and cached GPU memory before the next checkpoint.
        del net, state, prediction
        gc.collect()
        torch.cuda.empty_cache()
    
    # Average across checkpoints and keep the ids in the same (sorted) order.
    final_predictions.append( ( [x["student_id"] for x in samples], np.mean(predictions, axis=0) ) )
    
    del dataset, samples, test_loader
    gc.collect()
    
    print('[INFO] Inference Complete.\n\n')
    
    predictions.clear()
    del tokenizer;

In [None]:
# Start from the sample submission, indexed by student so that predictions
# (which are in length-sorted order) can be written by label.
sample_submission = pd.read_csv(cfg.sample_submission_file).set_index('student_id')
target_columns = list(cfg.target_columns)

# The first model group overwrites the placeholder values; every later group
# is added on top, giving a per-student sum across groups.
for group_idx, (ids, preds) in enumerate(final_predictions):
    for col_idx, col in enumerate(target_columns):
        if group_idx == 0:
            sample_submission.loc[ids, col] = preds[:, col_idx]
        else:
            sample_submission.loc[ids, col] += preds[:, col_idx]

# Turn the sum into a mean over model groups and write the submission file.
sample_submission = sample_submission.reset_index()
sample_submission[target_columns] /= len(final_predictions)
sample_submission.to_csv('submission.csv', index=False)
sample_submission