In [1]:
DEBUG = False

In [2]:
EXP = 'PL14'

In [3]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl

import pandas as pd
import numpy as np
import random
import re
import itertools
import argparse

from torch.utils.data import Dataset
import spacy
import ast
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import pickle
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
import warnings
from torch.optim import Adam, SGD, AdamW

import wandb
from pytorch_lightning.loggers import WandbLogger

In [4]:
pl.seed_everything(42, workers=True)

Global seed set to 42


42

In [5]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=1024, stage='train', rand_prob=0.1, lowup_proba=0.0, swap_proba=0.0):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.mask_token = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
        self.stage = stage
        self.rand_prob = rand_prob
        self.lowup_proba = lowup_proba
        self.swap_proba = swap_proba

        self.essay_id = df['essay_id'].values
        self.input_ids = df['input_ids'].values
        self.attention_mask = df['attention_mask'].values
        self.offset_mapping = df['offset_mapping'].values
        self.token_class_labels = df['token_class_labels'].values
        self.token_scores_labels = df['token_scores_labels'].values
        self.token_examples_mapping = df['token_examples_mapping'].values
        self.examples_scores = df['examples_scores'].values
        self.examples_classes = df['examples_classes'].values    
        
    def __getitem__(self, idx):
        essay_id = self.essay_id[idx]
        offset_mapping = self.offset_mapping[idx]

        token_examples_mapping = self.token_examples_mapping[idx]
        examples_scores = self.examples_scores[idx]
        examples_classes = self.examples_classes[idx]

        token_examples_mapping = torch.tensor(token_examples_mapping, dtype=torch.long)
        examples_scores = torch.tensor(examples_scores + [-1] * (40 - len(examples_scores)), dtype=torch.long)
        examples_classes = torch.tensor(examples_classes + [-1] * (40 - len(examples_classes)), dtype=torch.long)
        
        input_ids = self.input_ids[idx]
        attention_mask = self.attention_mask[idx]
        token_class_labels = self.token_class_labels[idx]
        token_scores_labels = self.token_scores_labels[idx]

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
        token_class_labels = torch.tensor(token_class_labels, dtype=torch.long)
        token_scores_labels = torch.tensor(token_scores_labels, dtype=torch.long)

        if self.stage == 'train':
            ix = torch.rand(size=(self.max_len,)) < self.rand_prob
            input_ids[ix] = self.mask_token
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_class_labels": token_class_labels,
            "token_scores_labels": token_scores_labels,
            "token_examples_mapping": token_examples_mapping,
            "examples_scores": examples_scores,
            "examples_classes": examples_classes
        }

    def __len__(self):
        return len(self.df)

In [6]:
emb_dim = 64
betas = (0.9, 0.999)
eps = 1e-6

In [7]:
class MyModule(pl.LightningModule):
    def __init__(self, lr, model_checkpoint, num_classes, num_classes_class, emb_dim, betas, eps):
        super().__init__()
        self.lr = lr
        self.num_classes = num_classes
        self.num_classes_class = num_classes_class
        self.emb_dim = emb_dim
        self.name = model_checkpoint
        self.pad_idx = 1 if "roberta" in self.name else 0
        config = AutoConfig.from_pretrained(model_checkpoint, output_hidden_states=True)
        self.transformer = AutoModel.from_pretrained(model_checkpoint, config=config)
        self.nb_features = config.hidden_size
        self.logits = nn.Linear(self.nb_features, num_classes)
        self.example_logits = nn.Linear(self.nb_features + self.emb_dim, num_classes)
        self.class_logits = nn.Linear(self.nb_features, num_classes_class)  
        transformers.logging.set_verbosity_error()
        self.embedding = nn.Embedding(num_classes_class, emb_dim, max_norm=True)
        self.betas = betas
        self.eps = eps
    
    def configure_optimizers(self):
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': self.lr, 'weight_decay': 0.01},
            {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': self.lr, 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_parameters, lr=self.lr, betas=self.betas, eps=self.eps)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=200,
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]
                
    def training_step(self, train_batch, batch_idx):
        input_ids, attention_mask, token_scores_labels, token_examples_mapping, \
        examples_scores, examples_classes, token_class_labels = \
            train_batch["input_ids"], train_batch["attention_mask"], train_batch["token_scores_labels"], \
            train_batch['token_examples_mapping'], train_batch['examples_scores'], \
            train_batch['examples_classes'], train_batch['token_class_labels']
        
        hidden_states = self.transformer(
            input_ids,
            attention_mask=attention_mask,
        )[-1]
        features = hidden_states[-1]
        logits = self.logits(features)
        class_logits = self.class_logits(features)
        loss = F.cross_entropy(logits.view(-1, self.num_classes), token_scores_labels.view(-1))
        class_loss = F.cross_entropy(class_logits.view(-1, self.num_classes_class), token_class_labels.view(-1))

        # Convert to examples loss
        bs, ml, nc1 = logits.shape
        
        batch_preds = []
        batch_targs = []
        
        for i in range(bs):
            example_preds = []
            example_targs = []
            num_examples = token_examples_mapping[i].max()
            assert examples_scores[i,num_examples] >= 0 # and examples_scores[i,num_examples+1] < 0 # truncation breaks this
            for j in range(num_examples + 1):
                indices = token_examples_mapping[i] == j
                fts = features[i][indices].mean(dim=0)
                class_idx = examples_classes[i,j]
                emb = self.embedding(class_idx)
                preds = self.example_logits(torch.cat([emb,fts]))
                example_preds.append(preds)
                example_targs.append(examples_scores[i,j].view(1))
                
            example_preds = torch.cat(example_preds, dim=0).view(-1, nc1)
            example_targs = torch.cat(example_targs, dim=0)
            batch_preds.append(example_preds)
            batch_targs.append(example_targs)
        
        batch_preds = torch.cat(batch_preds, dim=0).view(-1, nc1)
        batch_targs = torch.cat(batch_targs, dim=0)
        
        example_loss = F.cross_entropy(batch_preds, batch_targs)
        
#         if self.current_epoch == 0:
    
        total_loss = loss + class_loss + example_loss

        self.log('train_scores_loss', loss)
        self.log('train_classes_loss', class_loss)
        self.log('train_examples_loss', example_loss)
        self.log('train_total_loss', total_loss)

        return total_loss
        
    def validation_step(self, val_batch, batch_idx):
        input_ids, attention_mask, token_scores_labels, token_examples_mapping, \
        examples_scores, examples_classes, token_class_labels = \
            val_batch["input_ids"], val_batch["attention_mask"], val_batch["token_scores_labels"], \
            val_batch['token_examples_mapping'], val_batch['examples_scores'], val_batch['examples_classes'], \
            val_batch['token_class_labels']
        hidden_states = self.transformer(
            input_ids,
            attention_mask=attention_mask,
        )[-1]
        features = hidden_states[-1]
        logits = self.logits(features)
        class_logits = self.class_logits(features)
        y_pred = F.log_softmax(logits, dim=-1)                                                
        loss = F.cross_entropy(logits.view(-1, self.num_classes), token_scores_labels.view(-1))
        class_loss = F.cross_entropy(class_logits.view(-1, self.num_classes_class), token_class_labels.view(-1))
        self.log('val_loss', loss)
        self.log('val_class_loss', class_loss)
        return {"preds": y_pred,
                "logits": logits,
                "features": features,
                "val_losses": loss,
                "token_examples_mapping": token_examples_mapping,
                "examples_scores": examples_scores,
                "examples_classes": examples_classes}   
    
    def validation_epoch_end(self, validation_step_outputs):

        bs, ml, nc1 = validation_step_outputs[0]["preds"].shape
        ml2 = validation_step_outputs[0]["examples_scores"].shape[-1]
        all_preds = torch.cat([x["logits"] for x in validation_step_outputs], dim=0).view(-1, ml, nc1)
        all_features = torch.cat([x["features"] for x in validation_step_outputs], dim=0).view(-1, ml, self.nb_features)
        all_mappings = torch.cat([x["token_examples_mapping"] for x in validation_step_outputs], dim=0).view(-1, ml)
        all_scores = torch.cat([x["examples_scores"] for x in validation_step_outputs], dim=0).view(-1, ml2)
        all_classes = torch.cat([x["examples_classes"] for x in validation_step_outputs], dim=0).view(-1, ml2)

        num_texts = all_scores.shape[0]
        
        example_preds = []
        example_targs = []
        
        for i in range(num_texts):
            num_examples = all_mappings[i].max()
            assert all_scores[i,num_examples] >= 0 # and all_scores[i,num_examples+1] < 0 # truncation breaks this
            for j in range(num_examples + 1):
                indices = all_mappings[i] == j
                fts = all_features[i][indices].mean(dim=0)
                class_idx = all_classes[i,j]
                emb = self.embedding(class_idx)
                preds = self.example_logits(torch.cat([emb,fts]))
                example_preds.append(preds)
                example_targs.append(all_scores[i,j].view(1))              
                
        example_preds = torch.cat(example_preds, dim=0).view(-1, nc1)
        example_targs = torch.cat(example_targs, dim=0)
        
        example_loss = F.cross_entropy(example_preds, example_targs)
        self.log('example_loss', example_loss)
        print(example_loss)
        
    def predict_step(self, val_batch, batch_idx):
        input_ids, attention_mask, token_scores_labels, token_examples_mapping, examples_scores, examples_classes = \
            val_batch["input_ids"], val_batch["attention_mask"], val_batch["token_scores_labels"], \
            val_batch['token_examples_mapping'], val_batch['examples_scores'], val_batch['examples_classes']
        hidden_states = self.transformer(
            input_ids,
            attention_mask=attention_mask,
        )[-1]
        features = hidden_states[-1]
        logits = self.logits(features)
        y_pred = F.softmax(logits, dim=-1)  
        
        bs, ml, nc1 = logits.shape
        ml2 = 40
#         all_preds = torch.cat([x["preds"] for x in validation_step_outputs], dim=0).view(-1, ml, nc1)
#         all_mappings = torch.cat([x["token_examples_mapping"] for x in validation_step_outputs], dim=0).view(-1, ml)
#         all_scores = torch.cat([x["examples_scores"] for x in validation_step_outputs], dim=0).view(-1, ml2)
        
#         num_texts = all_scores.shape[0]
        
        batch_preds = []
        batch_targs = []
        
        for i in range(bs):
            example_preds = []
            example_targs = []
            num_examples = token_examples_mapping[i].max()
            assert examples_scores[i,num_examples] >= 0 # and examples_scores[i,num_examples+1] < 0 # truncation breaks this
            for j in range(num_examples + 1):               
                indices = token_examples_mapping[i] == j
                fts = features[i][indices].mean(dim=0)
                class_idx = examples_classes[i,j]
                emb = self.embedding(class_idx)
                preds = self.example_logits(torch.cat([emb,fts]))
                example_preds.append(preds)
                example_targs.append(examples_scores[i,j].view(1))   
                
            example_preds = torch.cat(example_preds, dim=0).view(-1, nc1)
            example_targs = torch.cat(example_targs, dim=0)
            batch_preds.append(example_preds)
            batch_targs.append(example_targs)
        
#         batch_preds = torch.cat(batch_preds, dim=0).view(-1, nc1)
#         batch_targs = torch.cat(batch_targs, dim=0)
        
        return batch_preds, batch_targs
        

In [8]:
import pickle
with open('processed-deberta-v3-large.pickle', 'rb') as handle:
    pdf = pickle.load(handle)

In [9]:
tags = ['debug'] if DEBUG else ['train']
if DEBUG: pdf = pdf.sample(n=100, random_state=42)

In [10]:
df_train = pdf[pdf.fold != 0].reset_index(drop=True)
df_valid = pdf[pdf.fold == 0].reset_index(drop=True)

In [11]:
project = 'fbck'
run = wandb.init(project=project, tags=tags)
run.log_code()

[34m[1mwandb[0m: Currently logged in as: [33mdarek[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
seed = 42
OUTPUT_DIR = '../output'
pl.seed_everything(seed)

Global seed set to 42


42

In [13]:
model_checkpoint = 'microsoft/deberta-v3-large'
max_length = 1024
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, max_length=max_length, padding='max_length')
bs = 1

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
randmask_proba = 0.15

train_dataset = MyDataset(
    df_train,
    tokenizer,
    max_len=max_length,
    stage='train',
    rand_prob=randmask_proba
)

valid_dataset = MyDataset(
    df_valid,
    tokenizer,
    max_len=max_length,
    stage='valid',
    rand_prob=randmask_proba
)

In [15]:
train_loader = DataLoader(train_dataset,
                          batch_size=bs,
                          shuffle=True,
                          num_workers=4, pin_memory=True, drop_last=True)

val_loader = DataLoader(valid_dataset,
                          batch_size=bs,
                          shuffle=False,
                          num_workers=4, pin_memory=True, drop_last=False)

In [16]:
warnings.filterwarnings("ignore")

In [17]:
lr = 2e-5
epochs = 2 if DEBUG else 2
num_classes = 3
num_classes_class = 8

In [18]:
model = MyModule(lr=lr,
                 model_checkpoint=model_checkpoint, 
                 num_classes=num_classes,
                 num_classes_class=num_classes_class,
                 emb_dim=emb_dim,
                 betas=betas,
                 eps=eps,
                )

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
wandb_logger = WandbLogger(project=project)

In [20]:
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor="example_loss",
    mode="min",
    dirpath=f"../output/{EXP}/",
    filename="feedback-{epoch:02d}-{example_loss:.2f}",
)

In [21]:
trainer = pl.Trainer(precision=16, 
                     accelerator="gpu", devices=1, max_epochs=epochs,
                     log_every_n_steps=100, logger=wandb_logger,
                     default_root_dir=f"../output/{EXP}",
                     callbacks=[checkpoint_callback],
                     accumulate_grad_batches=4
                     )

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
trainer.fit(model, train_loader, val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name           | Type           | Params
--------------------------------------------------
0 | transformer    | DebertaV2Model | 434 M 
1 | logits         | Linear         | 3.1 K 
2 | example_logits | Linear         | 3.3 K 
3 | class_logits   | Linear         | 8.2 K 
4 | embedding      | Embedding      | 512   
--------------------------------------------------
434 M     Trainable params
0         Non-trainable params
434 M     Total params
868.054   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

tensor(0.9578, device='cuda:0')


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

tensor(0.7160, device='cuda:0')


Validation: 0it [00:00, ?it/s]

tensor(0.6589, device='cuda:0')


In [23]:
wandb.finish()

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▁▁▁▁▁█████████
example_loss,█▁
train_classes_loss,█▅▆▇▆▂▂▅▂▂▂▂▁▄▁▁
train_examples_loss,█▄▆▂▄▄▅▅▁▃▃█▄▆▅▂
train_scores_loss,█▄▄▂▅▅▆▄▁▂▄█▃▆▄▁
train_total_loss,█▄▅▄▅▃▃▅▁▂▃▅▂▅▂▁
trainer/global_step,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▇▇██
val_class_loss,█▁
val_loss,█▁

0,1
epoch,1.0
example_loss,0.65892
train_classes_loss,0.46806
train_examples_loss,0.24554
train_scores_loss,0.12999
train_total_loss,0.84358
trainer/global_step,1675.0
val_class_loss,0.79698
val_loss,0.63676


In [24]:
ls ../output/PL14

'feedback-epoch=01-example_loss=0.66.ckpt'
'feedback-epoch=03-example_loss=0.66.ckpt'


In [25]:
# !rm -rf ../output/PL13

In [26]:
# or call with pretrained model
PATH = '../output/PL13/feedback-epoch=01-example_loss=0.65.ckpt'
model = MyModule.load_from_checkpoint(PATH, lr=lr,
                 model_checkpoint=model_checkpoint, 
                 num_classes=num_classes,
                 num_classes_class=num_classes_class,
                 emb_dim=emb_dim)
trainer = pl.Trainer(accelerator="gpu")
predictions = trainer.predict(model, dataloaders=val_loader)

TypeError: __init__() missing 2 required positional arguments: 'betas' and 'eps'

In [None]:
preds = torch.cat([p for b in predictions for p in b[0]])
targs = torch.cat([p for b in predictions for p in b[1]])

In [None]:
preds.shape, targs.shape

In [None]:
F.cross_entropy(preds, targs)

In [None]:
(preds.argmax(dim=-1) == targs).sum()/preds.shape[0]

In [None]:
# SOLVED: why aren't these preds deterministic???????????????????? >>> I had a random mask in valid loader!!!