In [1]:
import warnings
from multiprocessing import cpu_count
from pathlib import Path

import pandas as pd
import torch
from composer import Trainer
from composer.metrics import CrossEntropy
from composer.models.huggingface import HuggingFaceModel
from datasets import Dataset
from fastcore.xtras import Path  # for ls
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from torch.utils.data import DataLoader
from torchmetrics import MeanSquaredError, PearsonCorrCoef
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers.data.data_collator import default_data_collator


In [10]:
# ignoring warnings
warnings.filterwarnings("ignore")

def process_df(df: pd.DataFrame, train: bool = True) -> "Dataset":
    """Build a Hugging Face ``Dataset`` with a combined ``input`` text column.

    The ``context``, ``target`` and ``anchor`` columns are concatenated into a
    single string of the form
    ``"TEXT1: <context>; TEXT2: <target>; ANC1: <anchor>"``.

    Args:
        df: Frame holding ``context``, ``target`` and ``anchor`` string
            columns (plus ``score`` when ``train`` is true).
        train: When true, the ``score`` column is renamed to ``labels``.

    Returns:
        The converted dataset. The caller's ``df`` is left unmodified.
    """
    # ``assign`` returns a new frame, so the caller's DataFrame never gains a
    # silently added ``input`` column (the original wrote into ``df``).
    with_input = df.assign(
        input=(
            "TEXT1: " + df["context"]
            + "; TEXT2: " + df["target"]
            + "; ANC1: " + df["anchor"]
        )
    )
    dataset = Dataset.from_pandas(with_input)
    if train:
        dataset = dataset.rename_columns({"score": "labels"})
    return dataset

In [3]:
# Read the train and test CSV files from the local "dataset" directory.
path = Path("dataset")
train_df = pd.read_csv(path.joinpath("train.csv"))
test_df = pd.read_csv(path.joinpath("test.csv"))

In [4]:
# Display the training frame (pandas abbreviates the middle rows).
train_df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [5]:
# Convert both frames to Datasets; the test frame skips the score -> labels
# rename because it is processed with train=False.
train_ds = process_df(df=train_df, train=True)
eval_ds = process_df(df=test_df, train=False)
print(train_ds)

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'labels', 'input'],
    num_rows: 36473
})


In [22]:
# Inspect the first processed training example, including the combined
# `input` string.
train_ds[0]

{'id': '37d61fd2272659b1',
 'anchor': 'abatement',
 'target': 'abatement of pollution',
 'context': 'A47',
 'labels': 0.5,
 'input': 'TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement'}

In [8]:
# Pretrained checkpoint identifier, used below for both the tokenizer and the
# model.
checkpoint = "microsoft/deberta-v3-small"

In [9]:
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
def tokenize_func(batch, tokenizer=tokenizer):
    """Tokenize the ``input`` text of a single example or of a batch.

    Args:
        batch: Mapping with an ``input`` entry (one string or a list of
            strings).
        tokenizer: Tokenizer to apply; defaults to the module-level
            ``tokenizer`` as it was bound when this function was defined.

    Returns:
        The tokenizer's encoding, padded within the call, truncated to at
        most 256 tokens and returned as PyTorch tensors.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": 256,
        "return_tensors": "pt",
    }
    texts = batch["input"]
    return tokenizer(texts, **encode_options)

In [12]:
# Smoke-test the tokenizer wrapper on a single training example.
tokenize_func(train_ds[0])

{'input_ids': tensor([[    1, 54453,   435,   294,   336,  5753,   346, 54453,   445,   294,
         47284,   265,  6435,   346, 23702,   435,   294, 47284,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
# Tokenize each split with batched mapping. batch_size=None is intended to
# hand the whole split to tokenize_func in one call, so padding=True pads
# every row of a split to a common length.
map_options = {"batched": True, "batch_size": None}
train_tok_ds = train_ds.map(tokenize_func, **map_options)
eval_tok_ds = eval_ds.map(tokenize_func, **map_options)

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [14]:
# Confirm the tokenizer output columns were added to the training dataset.
train_tok_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36473
})

In [15]:
# Hold out 20% of the tokenized training data as a validation split; the
# fixed seed keeps the shuffled split repeatable.
train_dds = train_tok_ds.train_test_split(
    test_size=0.2, shuffle=True, seed=42
)

In [16]:
# Show the resulting "train" / "test" splits and their row counts.
train_dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 29178
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7295
    })
})

In [20]:
print("Checking if the training dataset lengths are similar")
# Spot check only: compare the token counts of the first two training rows.
for row_idx in (0, 1):
    print(len(train_dds["train"][row_idx]["input_ids"]))

Checking if the training dataset lengths are similar
57
57


In [21]:
print("Checking if the test dataset lengths are similar")
# Spot check only: compare the token counts of the first two held-out rows.
for row_idx in (0, 1):
    print(len(train_dds["test"][row_idx]["input_ids"]))

Checking if the test dataset lengths are similar
57
57


In [23]:
# Build the PyTorch dataloaders; both share the batch size and collator, and
# only the training loader shuffles.
loader_options = {"batch_size": 64, "collate_fn": default_data_collator}
train_dl = DataLoader(train_dds["train"], shuffle=True, **loader_options)
val_dl = DataLoader(train_dds["test"], shuffle=False, **loader_options)

In [24]:
# Sanity check: both objects are torch DataLoader instances.
type(train_dl), type(val_dl)

(torch.utils.data.dataloader.DataLoader,
 torch.utils.data.dataloader.DataLoader)

In [25]:
# Pull one validation batch and print the first example of every field.
print("Sample batch")
batch = next(iter(val_dl))
for field in ("input_ids", "token_type_ids", "attention_mask", "labels"):
    print(batch[field][0])

Sample batch
tensor([    1, 54453,   435,   294,   336,  5718,   346, 54453,   445,   294,
         4823,   346, 23702,   435,   294,  6624,  3480,     2,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor(0.2500)


In [39]:
# Shape of the token-id tensor for the sample batch.
batch['input_ids'].shape

torch.Size([64, 57])

In [41]:
# Shape of the token-type-id tensor for the sample batch.
batch["token_type_ids"].shape

torch.Size([64, 57])

In [42]:
# Shape of the attention-mask tensor for the sample batch.
batch["attention_mask"].shape

torch.Size([64, 57])

In [40]:
# Shape of the labels tensor for the sample batch (one value per example).
batch['labels'].shape

torch.Size([64])

In [26]:
# Load the pretrained checkpoint with a sequence-classification head.
# num_labels=1 requests a single output per example, matching the scalar
# float `labels` column (presumably treated as regression - confirm).
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=1
)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [27]:
# List the input fields the tokenizer produces for the model.
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

In [28]:
# The head has a single output and the labels are continuous scores, so track
# regression metrics. Cross-entropy is a classification loss and is not
# meaningful for this target; mean squared error is reported instead,
# alongside Pearson correlation at evaluation time.
pears_corr = PearsonCorrCoef(num_outputs=1)
composer_model = HuggingFaceModel(
    model=model,
    tokenizer=tokenizer,
    metrics=[MeanSquaredError()],
    eval_metrics=[MeanSquaredError(), pears_corr],
    # Compute the metrics from the model's output logits.
    use_logits=True,
)

In [30]:
# AdamW over every parameter of the Composer-wrapped model.
adamw_hparams = {
    "lr": 8e-5,
    "betas": (0.9, 0.98),
    "eps": 1e-6,
    "weight_decay": 0.01,
}
optimizer = AdamW(composer_model.parameters(), **adamw_hparams)

In [31]:
# Linearly scale the learning rate from 1.0x down to 0x of its base value
# over 150 scheduler steps.
# NOTE(review): 150 does not correspond to anything else in this notebook
# (the training split has 29178 rows at batch size 64). Confirm how often the
# trainer steps this scheduler and whether 150 is the intended horizon.
linear_lr_decay = LinearLR(
    optimizer, start_factor=1.0, end_factor=0, total_iters=150
)

In [35]:
# Fall back to CPU with full precision when CUDA is absent; a hard-coded
# device="gpu" with fp16 autocast fails on a machine without a GPU.
use_gpu = torch.cuda.is_available()
trainer = Trainer(
    model=composer_model,
    train_dataloader=train_dl,
    eval_dataloader=val_dl,
    max_duration="1ep",  # a single epoch
    optimizers=optimizer,
    schedulers=[linear_lr_decay],
    device="gpu" if use_gpu else "cpu",
    precision="amp_fp16" if use_gpu else "fp32",
    seed=17,  # fixed seed so repeated runs are comparable
)

In [38]:
# Run training with the trainer configured in the previous cell.
trainer.fit()