## Imports

In [1]:
import warnings
import logging
import colorlog
from multiprocessing import cpu_count

from pathlib import Path
from functools import partial

import pandas as pd
import numpy as np

import hydra
from omegaconf import DictConfig, OmegaConf

from torch.utils.data import DataLoader

from fastcore.xtras import Path  # for ls

import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.data.data_collator import default_data_collator

from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torchmetrics import PearsonCorrCoef, MeanSquaredError
from composer.models.huggingface import HuggingFaceModel
from composer.loggers import WandBLogger
from composer import Trainer

## Function Definitions

In [2]:
path = Path("dataset")

In [3]:
# ignoring warnings
warnings.filterwarnings("ignore")

def process_df(df, sep_token):
    df["section"] = df.context.str[0]
    df["sectok"] = "[" + df.section + "]"
    sectoks = list(df.sectok.unique())
    df["input"] = (
        df.sectok
        + sep_token
        + df.context
        + sep_token
        + df.anchor.str.lower()
        + sep_token
        + df.target
    )
    
    return df, sectoks


def create_val_split(df: pd.DataFrame, val_prop: float = 0.2, seed: int = 42):
    anchors = df.anchor.unique()
    np.random.seed(seed)
    np.random.shuffle(anchors)
    val_sz = int(len(anchors) * val_prop)
    val_anchors = anchors[:val_sz]
    is_val = np.isin(df.anchor, val_anchors)
    idxs = np.arange(len(df))
    val_idxs = idxs[is_val]
    trn_idxs = idxs[~is_val]

    return trn_idxs, val_idxs

def tokenize_func(batch, tokenizer):
    return tokenizer(
        batch["input"],
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )


def tokenize_and_split(df, tokenize_func, train=True):
    inps = "anchor", "target", "context"
    dataset = datasets.Dataset.from_pandas(df)
    tok_dataset = dataset.map(
        tokenize_func,
        batched=True,
        batch_size=None,
        remove_columns=inps + ("id", "input", "section", "sectok")
    )
    if train:
        tok_dataset = tok_dataset.rename_columns({"score": "labels"})
        trn_idxs, val_idxs = create_val_split(df)
        tok_dataset = datasets.DatasetDict(
        {"train": tok_dataset.select(trn_idxs), "test": tok_dataset.select(val_idxs)}
    )
    
    return tok_dataset


def create_dataloaders(tok_ds, bs, train=True):
    if train:
        train_dl = DataLoader(
            tok_ds["train"],
            batch_size=bs,
            shuffle=True,
            collate_fn=default_data_collator,
        )
        val_dl = DataLoader(
            tok_ds["test"],
            batch_size=bs,
            shuffle=False,
            collate_fn=default_data_collator,
        )

        return train_dl, val_dl
    else:
        test_dl = DataLoader(
            tok_ds,
            batch_size=bs,
            shuffle=False,
            collate_fn=default_data_collator,
        )

        return test_dl


def predict(trainer, test_dl):
    preds = trainer.predict(test_dl)[0]["logits"].numpy().astype(float)
    preds = np.clip(preds, 0, 1)
    preds = preds.round(2)
    preds = preds.squeeze()

    return preds

In [4]:
def prepare_data(train_df, tokenizer, sep_token, bs):
    train_df, sectoks = process_df(train_df, sep_token)
    tokenizer.add_special_tokens({"additional_special_tokens": sectoks})
    tokenize = partial(tokenize_func, tokenizer=tokenizer)
    train_tok_ds = tokenize_and_split(train_df, tokenize)
    train_dl, val_dl = create_dataloaders(train_tok_ds, bs)
    
    return train_dl, val_dl

def prepare_model(checkpoint, num_labels, tokenizer):
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=num_labels
    )
    model.resize_token_embeddings(len(tokenizer))
    pears_corr = PearsonCorrCoef(num_outputs=num_labels)
    mse_metric = MeanSquaredError()
    composer_model = HuggingFaceModel(
        model=model,
        tokenizer=tokenizer,
        metrics=[pears_corr],
        eval_metrics=[mse_metric, pears_corr],
        use_logits=True,
    )
    
    return composer_model

def prepare_optimizer_and_scheduler(composer_model, lr, wd, epochs, train_dl):
    optimizer = AdamW(
        params=composer_model.parameters(),
        lr=lr,
        betas=(0.9, 0.98),
        eps=1e-6,
        weight_decay=wd,
    )
    scheduler = OneCycleLR(
        optimizer,
        max_lr=lr,
        steps_per_epoch=len(train_dl),
        epochs=epochs,
    )
    
    return optimizer, scheduler

def prepare_trainer(composer_model, optimizer, scheduler, train_dl, val_dl, epochs, run_name):
    trainer = Trainer(
        model=composer_model,
        train_dataloader=train_dl,
        eval_dataloader=val_dl,
        max_duration=f"{epochs}ep",
        optimizers=optimizer,
        schedulers=[scheduler],
        loggers=[WandBLogger(project="patent-phrase-to-phrase")],
        run_name=run_name,
        device="gpu",
        precision="amp_fp16",
        step_schedulers_every_batch=True,
        # seed=17,
    )
    
    return trainer

In [5]:
def train(train_df, checkpoint, run_name, bs=32, lr=8e-5, wd=0.01, epochs=4, num_labels=1, sep_token=" [s] "):
    # preparing data
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    train_dl, val_dl = prepare_data(train_df, tokenizer, sep_token, bs)
    
    # preparing model
    composer_model = prepare_model(checkpoint, num_labels, tokenizer)
    
    # preparing optimizer and scheduler
    optimizer, scheduler = prepare_optimizer_and_scheduler(composer_model, lr, wd, epochs, train_dl)
    
    # preparing trainer
    trainer = prepare_trainer(composer_model, optimizer, scheduler, train_dl, val_dl, epochs, run_name)
    
    # training
    trainer.fit()
    
    return trainer

## Main Function

In [None]:
# loading the dataset
train_df = pd.read_csv(path / "train.csv")
test_df = pd.read_csv(path / "test.csv")

In [None]:
train_df

In [None]:
sep_token = " [s] "

In [None]:
train_df, sectoks = process_df(train_df, sep_token)
eval_df, _ = process_df(test_df, sep_token)

In [None]:
train_df

In [None]:
sample_row = train_df.iloc[0]

In [None]:
sample_row.input

In [None]:
checkpoint = "microsoft/deberta-v3-small"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
tokenizer.add_special_tokens({"additional_special_tokens": sectoks})

In [None]:
tokenizer.all_special_tokens

In [None]:
tokenize = partial(tokenize_func, tokenizer=tokenizer)

In [None]:
train_tok_ds = tokenize_and_split(train_df, tokenize)
eval_tok_ds = tokenize_and_split(eval_df, tokenize, train=False)

In [None]:
train_tok_ds

In [None]:
eval_tok_ds

In [None]:
lr = 8e-5
bs = 64
epochs = 4
num_labels =1
wd = 0.01

In [None]:
train_dl, val_dl = create_dataloaders(train_tok_ds, bs)
test_dl = create_dataloaders(eval_tok_ds, bs, train=False)

In [None]:
# get a sample batch and print the first element
print("Sample batch")
batch = next(iter(val_dl))
print(batch["input_ids"][0])
print(batch["token_type_ids"][0])
print(batch["attention_mask"][0])
print(batch["labels"][0])

In [None]:
batch['input_ids'].shape

In [None]:
batch["token_type_ids"].shape

In [None]:
batch["attention_mask"].shape

In [None]:
batch['labels'].shape

In [None]:
# loading the model
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=num_labels
)

In [None]:
model.resize_token_embeddings(len(tokenizer))

In [None]:
tokenizer.model_input_names

In [None]:
pears_corr = PearsonCorrCoef(num_outputs=num_labels)
mse_metric = MeanSquaredError()

In [None]:
composer_model = HuggingFaceModel(
    model=model,
    tokenizer=tokenizer,
    metrics=[pears_corr],
    eval_metrics=[mse_metric, pears_corr],
    use_logits=True,
)

In [None]:
optimizer = AdamW(
    params=composer_model.parameters(),
    lr=lr,
    betas=(0.9, 0.98),
    eps=1e-6,
    weight_decay=wd,
)

one_cycle_lr = OneCycleLR(
    optimizer,
    max_lr=lr,
    steps_per_epoch=len(train_dl),
    epochs=epochs,
)

# Baseline

In [None]:
train_df = pd.read_csv(path / "train.csv")
checkpoint = "microsoft/deberta-v3-small"

trainer = train(train_df, checkpoint, run_name="baseline")

print(trainer.state.eval_metric_values)

# Experimentation Zone

## a) Different Sep Token

In [None]:
lr = 8e-5
bs = 64
epochs = 4
num_labels =1
wd = 0.01
checkpoint = "microsoft/deberta-v3-small"
# sep_token = " [s] "

In [None]:
train_df = pd.read_csv(path / "train.csv")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
train_dl, val_dl = prepare_data(train_df, tokenizer, sep_token=tokenizer.sep_token, bs=bs)
composer_model = prepare_model(checkpoint, num_labels, tokenizer)
optimizer, scheduler = prepare_optimizer_and_scheduler(composer_model, lr, wd, epochs, train_dl)
trainer = prepare_trainer(composer_model, optimizer, scheduler, train_dl, val_dl, epochs, run_name="tok_sep")

In [None]:
trainer.fit()

In [None]:
trainer.state.eval_metric_values

## b) Classification task instead of Regression

In [None]:
train_df = pd.read_csv(path / "train.csv")

In [None]:
train_df

In [None]:
score_to_class = {
    0: 0,
    0.25: 1,
    0.5: 2,
    0.75: 3,
    1: 4,
}

In [None]:
#apply the function to the dataframe
train_df["score"] = train_df["score"].apply(lambda x: score_to_class[x])

In [None]:
train_df.head()

In [None]:
lr = 8e-5
bs = 64
epochs = 4
num_labels = 5
wd = 0.01
checkpoint = "microsoft/deberta-v3-small"
sep_token = " [s] "

In [None]:
from torchmetrics import Accuracy
from composer.metrics import CrossEntropy

In [None]:
def prepare_model_classification(checkpoint, num_labels, tokenizer):
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=num_labels
    )
    model.resize_token_embeddings(len(tokenizer))
    pears_corr = PearsonCorrCoef(num_outputs=num_labels)
    cross_entropy = CrossEntropy()
    accuracy_metric = Accuracy(task='multiclass', num_classes=num_labels)
    composer_model = HuggingFaceModel(
        model=model,
        tokenizer=tokenizer,
        metrics=[cross_entropy, accuracy_metric],
        # eval_metrics=[mse_metric, pears_corr],
        use_logits=True,
    )
    
    return composer_model

In [None]:
composer_model

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
train_dl, val_dl = prepare_data(train_df, tokenizer, sep_token=sep_token, bs=bs)
composer_model = prepare_model_classification(checkpoint, num_labels, tokenizer)
optimizer, scheduler = prepare_optimizer_and_scheduler(composer_model, lr, wd, epochs, train_dl)
trainer = prepare_trainer(composer_model, optimizer, scheduler, train_dl, val_dl, epochs)

In [None]:
trainer.fit()

In [None]:
trainer.state.eval_metric_values

Could not get the main metric (Pearson Correlation) to work in a classification setting.

## c) Different Checkpoints

In [None]:
train_df = pd.read_csv(path / "train.csv")
checkpoint = "anferico/bert-for-patents"
trainer = train(train_df, checkpoint, lr=8e-6, run_name="bert-for-patents")

print(trainer.state.eval_metric_values)

In [None]:
train_df = pd.read_csv(path / "train.csv")
checkpoint = "AI-Growth-Lab/PatentSBERTa"
trainer = train(train_df, checkpoint, run_name="PatentSBERTa")

print(trainer.state.eval_metric_values)

## d) Cosine Scheduler

In [None]:
from composer.optim import CosineAnnealingWithWarmupScheduler

In [None]:
def prepare_optimizer_and_scheduler(composer_model, lr, wd, epochs, train_dl):
    optimizer = AdamW(
        params=composer_model.parameters(),
        lr=lr,
        betas=(0.9, 0.98),
        eps=1e-6,
        weight_decay=wd,
    )
    scheduler = CosineAnnealingWithWarmupScheduler(
        t_warmup='0.2dur'
    )
    
    return optimizer, scheduler

In [None]:
train_df = pd.read_csv(path / "train.csv")
checkpoint = "microsoft/deberta-v3-small"
trainer = train(train_df, checkpoint, run_name="cosine_scheduler")

print(trainer.state.eval_metric_values)

## e) Replacing the Context with the explanation

In [6]:
train_df = pd.read_csv(path / "train.csv")

In [7]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [8]:
sample_row = train_df.iloc[0]
sample_row

id               37d61fd2272659b1
anchor                  abatement
target     abatement of pollution
context                       A47
score                         0.5
Name: 0, dtype: object

In [9]:
titles = pd.read_csv(path / "titles.csv")

In [10]:
titles.head()

Unnamed: 0,code,title,section,class,subclass,group,main_group
0,A,HUMAN NECESSITIES,A,,,,
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...,A,1.0,,,
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...,A,1.0,B,,
3,A01B1/00,Hand tools (edge trimmers for lawns A01G3/06 ...,A,1.0,B,1.0,0.0
4,A01B1/02,Spades; Shovels {(hand-operated dredgers E02F3...,A,1.0,B,1.0,2.0


In [11]:
titles.loc[titles["code"] == sample_row["context"]].title.values[0]

'FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; COFFEE MILLS; SPICE MILLS; SUCTION CLEANERS IN GENERAL'

In [12]:
# merge the two dataframes matching the context to the code
train_df = train_df.merge(titles, left_on="context", right_on="code")

In [13]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,


Use the title instead of the context:

In [14]:
def process_df(df, sep_token):
    df["section"] = df.context.str[0]
    df["sectok"] = "[" + df.section + "]"
    sectoks = list(df.sectok.unique())
    df["input"] = (
        df.sectok
        + sep_token
        # + "context: "
        + df.title.str.lower()
        + sep_token
        + df.anchor.str.lower()
        + sep_token
        + df.target
    )
    
    return df, sectoks

In [15]:
dummy, _ = process_df(train_df, " [s] ")

In [16]:
dummy.iloc[0].input

'[A] [s] furniture; domestic articles or appliances; coffee mills; spice mills; suction cleaners in general [s] abatement [s] abatement of pollution'

In [17]:
dummy.iloc[100].input

'[A] [s] furniture; domestic articles or appliances; coffee mills; spice mills; suction cleaners in general [s] cervical support [s] gel pack'

In [18]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group,sectok,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,[A],[A] [s] furniture; domestic articles or applia...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,[A],[A] [s] furniture; domestic articles or applia...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,[A],[A] [s] furniture; domestic articles or applia...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,[A],[A] [s] furniture; domestic articles or applia...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,[A],[A] [s] furniture; domestic articles or applia...


In [19]:
checkpoint = "microsoft/deberta-v3-small"
trainer = train(train_df, checkpoint, run_name="context_title")

print(trainer.state.eval_metric_values)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

******************************
Config:
node_name: ubuntu_desktop
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 456369393

******************************


train          Epoch   0:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   0:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   1:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   1:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   2:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   2:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   3:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   3:    0%|| 0/235 [00:00<?, ?ba/s]         

{'MeanSquaredError': tensor(0.0268, device='cuda:0'), 'PearsonCorrCoef': tensor(0.7982, device='cuda:0')}


Run some improvements! Potential is there!

In [20]:
trainer.close()

0,1
loss/train/total,█▆▆▃▆▃▅▃▄▃▃▃▄▃▃▂▂▂▂▃▂▃▂▂▁▂▂▁▂▂▁▁▁▁▁▂▂▂▁▂
metrics/eval/MeanSquaredError,█▃▁▂
metrics/eval/PearsonCorrCoef,▁▆██
metrics/train/PearsonCorrCoef,▃▁▂▆▅▇▆▆▆▇▇▇▆▅▇▇███▇█▆████▇██▇█████████▇
time/batch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
time/batch_in_epoch,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█
time/epoch,▁▃▅▆█
time/sample,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
time/sample_in_epoch,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█
time/token,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
loss/train/total,0.00601
metrics/eval/MeanSquaredError,0.02683
metrics/eval/PearsonCorrCoef,0.79822
metrics/train/PearsonCorrCoef,0.94622
time/batch,3624.0
time/batch_in_epoch,0.0
time/epoch,4.0
time/sample,115868.0
time/sample_in_epoch,0.0
time/token,8689575.0


## f) Different arrangements of the input

In [21]:
def process_df(df, sep_token):
    df["section"] = df.context.str[0]
    df["sectok"] = "[" + df.section + "]"
    sectoks = list(df.sectok.unique())
    df["input"] = (
        df.anchor.str.lower()
        + sep_token
        + df.target
        + sep_token
        + df.title.str.lower()
    )
    # df["input"] = (
    #     df.sectok
    #     + sep_token
    #     # + "context: "
    #     + df.title.str.lower()
    #     + sep_token
    #     + df.anchor.str.lower()
    #     + sep_token
    #     + df.target
    # )
    
    return df, sectoks

In [22]:
train_df = pd.read_csv(path / "train.csv")
titles = pd.read_csv(path / "titles.csv")
train_df = train_df.merge(titles, left_on="context", right_on="code")

In [23]:
dummy, _ = process_df(train_df, " [s] ")

In [24]:
dummy.iloc[0].input

'abatement [s] abatement of pollution [s] furniture; domestic articles or appliances; coffee mills; spice mills; suction cleaners in general'

In [25]:
checkpoint = "microsoft/deberta-v3-small"
trainer = train(train_df, checkpoint, run_name="anchor_target_title")

print(trainer.state.eval_metric_values)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

******************************
Config:
node_name: ubuntu_desktop
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 2979262394

******************************


train          Epoch   0:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   0:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   1:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   1:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   2:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   2:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   3:    0%|| 0/906 [00:00<?, ?ba/s]         

eval           Epoch   3:    0%|| 0/235 [00:00<?, ?ba/s]         

{'MeanSquaredError': tensor(0.0273, device='cuda:0'), 'PearsonCorrCoef': tensor(0.7924, device='cuda:0')}


In [26]:
trainer.close()

0,1
loss/train/total,█▃▄▃▂▃▂▂▂▂▁▁▂▁▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁
metrics/eval/MeanSquaredError,██▁▃
metrics/eval/PearsonCorrCoef,▁▄██
metrics/train/PearsonCorrCoef,▁▃▄▄▅▄▆▇▇▇▇▇▇█▇▇▇▇▇█▇███▇███▇▇████▇█████
time/batch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
time/batch_in_epoch,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█
time/epoch,▁▃▅▆█
time/sample,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
time/sample_in_epoch,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█
time/token,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
loss/train/total,0.02186
metrics/eval/MeanSquaredError,0.02733
metrics/eval/PearsonCorrCoef,0.79245
metrics/train/PearsonCorrCoef,0.81018
time/batch,3624.0
time/batch_in_epoch,0.0
time/epoch,4.0
time/sample,115868.0
time/sample_in_epoch,0.0
time/token,8226131.0


## g) Updates from Kaggle 1st Place Solution

### CPC Codes

In [59]:
train_df = pd.read_csv(path / "train.csv")
cpc_path = Path('cpc-data')

In [60]:
cpc_path.ls()

(#2) [Path('cpc-data/CPCSchemeXML202105'),Path('cpc-data/CPCTitleList202202')]

In [62]:
import re

def get_cpc_texts():
    contexts = []
    pattern = '[A-Z]\d+'
    for file_name in (cpc_path/'CPCSchemeXML202105').ls():
        result = re.findall(pattern, file_name.name)
        if result:
            contexts.append(result)
    contexts = sorted(set(sum(contexts, [])))
    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(cpc_path/f'CPCTitleList202202/cpc-section-{cpc}_20220201.txt') as f:
            s = f.read()
        pattern = f'{cpc}\t\t.+'
        result = re.findall(pattern, s)
        cpc_result = result[0].lstrip(pattern)
        if cpc == 'C':
            cpc_result = 'C' + cpc_result
        for context in [c for c in contexts if c[0] == cpc]:
            pattern = f'{context}\t\t.+'
            result = re.findall(pattern, s)
            results[context] = cpc_result + ". " + result[0].lstrip(pattern)
    return results

In [63]:
cpc_texts = get_cpc_texts()

In [64]:
train_df['context_text'] = train_df['context'].map(cpc_texts)

In [65]:
sample_row = train_df.iloc[0]
sample_row.context, sample_row.context_text

('A47',
 'HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; COFFEE MILLS; SPICE MILLS; SUCTION CLEANERS IN GENERAL')

In [66]:
def process_df(df, sep_token):
    df["section"] = df.context.str[0]
    df["sectok"] = "[" + df.section + "]"
    sectoks = list(df.sectok.unique())
    df["input"] = (
        df.sectok
        + df.anchor.str.lower()
        + sep_token
        + df.target
        + sep_token
        + df.context_text.str.lower()
    )
    return df, sectoks

In [67]:
dummy, _ = process_df(train_df, "[SEP]")

In [68]:
dummy.iloc[0].input

'[A]abatement[SEP]abatement of pollution[SEP]human necessities. furniture; domestic articles or appliances; coffee mills; spice mills; suction cleaners in general'

In [69]:
# TODO: Find a way to fix the upload issue
# see: https://github.com/wandb/wandb/issues/4441#issuecomment-1504120929
class WandBLoggerNoUpload(WandBLogger):
    def can_upload_files(self) -> bool:
        """Whether the logger supports uploading files."""
        return False

def prepare_trainer(composer_model, optimizer, scheduler, train_dl, val_dl, epochs, run_name):
    trainer = Trainer(
        model=composer_model,
        train_dataloader=train_dl,
        eval_dataloader=val_dl,
        max_duration=f"{epochs}ep",
        optimizers=optimizer,
        schedulers=[scheduler],
        loggers=[WandBLoggerNoUpload(project="patent-phrase-to-phrase")],
        run_name=run_name,
        device="gpu",
        precision="amp_fp16",
        step_schedulers_every_batch=True,
        # seed=17,
    )
    
    return trainer

In [70]:
checkpoint = "microsoft/deberta-v3-small"
trainer = train(train_df, checkpoint, run_name="cpc_texts_sectock", sep_token="[SEP]")

print(trainer.state.eval_metric_values)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667060941666326, max=1.0)…

******************************
Config:
node_name: ubuntu_desktop
num_gpus_per_node: 1
num_nodes: 1
rank_zero_seed: 3298515164

******************************


train          Epoch   0:    0%|| 0/905 [00:00<?, ?ba/s]         

eval           Epoch   0:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   1:    0%|| 0/905 [00:00<?, ?ba/s]         

eval           Epoch   1:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   2:    0%|| 0/905 [00:00<?, ?ba/s]         

eval           Epoch   2:    0%|| 0/235 [00:00<?, ?ba/s]         

train          Epoch   3:    0%|| 0/905 [00:00<?, ?ba/s]         

eval           Epoch   3:    0%|| 0/235 [00:00<?, ?ba/s]         

{'MeanSquaredError': tensor(0.0227, device='cuda:0'), 'PearsonCorrCoef': tensor(0.8278, device='cuda:0')}


In [71]:
trainer.close()

0,1
loss/train/total,▇██▆▆▃▃▃▃▅▅▅▃▂▂▃▂▂▄▃▃▂▂▂▃▂▂▂▂▂▁▁▁▁▂▁▂▂▂▁
metrics/eval/MeanSquaredError,█▃▁▁
metrics/eval/PearsonCorrCoef,▁▆██
metrics/train/PearsonCorrCoef,▂▁▂▅▃▆▇▆▅▅▆▆█▇█▇▇▇▅▆▇██▆▆▇███████▇████▇█
time/batch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
time/batch_in_epoch,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█
time/epoch,▁▃▅▆█
time/sample,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
time/sample_in_epoch,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█
time/token,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
loss/train/total,0.01367
metrics/eval/MeanSquaredError,0.02265
metrics/eval/PearsonCorrCoef,0.82778
metrics/train/PearsonCorrCoef,0.90681
time/batch,3620.0
time/batch_in_epoch,0.0
time/epoch,4.0
time/sample,115828.0
time/sample_in_epoch,0.0
time/token,8453327.0


### Custom Model

In [56]:
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel

In [57]:
class CustomModel(nn.Module):
    def __init__(self, checkpoint, config_path=None, pretrained=True, num_labels=1, fc_dropout=0.2):
        super().__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(checkpoint, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(checkpoint, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, num_labels)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [58]:
checkpoint = "microsoft/deberta-v3-small"
model = CustomModel(checkpoint, pretrained=True, fc_dropout=0.2)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
