In [1]:
##### hyperparameters #####

MODEL = 'microsoft/deberta-base'
MODEL_SAVE_PATH = 'deberta-base'
FAST_TOKENIZER = True
PADDING = True
TRUNCATION = True
BATCH_SIZE = 4
WARMUP = 600
WEIGHT_DECAY = 0.1
LEARNING_RATE = 1.478e-5
TRAIN_EPOCH = 5

In [2]:
from transformers import DebertaTokenizer, DebertaForSequenceClassification, DebertaTokenizerFast
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
import gc
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW
from dont_patronize_me import DontPatronizeMe

In [3]:
##### cleanup #####
print("Cleaning up...")
torch.cuda.empty_cache()
gc.collect()
print("Cleanup done!")


Cleaning up...
Cleanup done!


In [4]:
class PCLDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [5]:
def prepare_model(model, fast=True):

    print("Preparing model...")

    if fast:
        tokenizer = DebertaTokenizerFast.from_pretrained(model)
    else:
        tokenizer = DebertaTokenizer.from_pretrained(model)

    model = DebertaForSequenceClassification.from_pretrained(model)

    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    print("Model prepared!")

    return tokenizer, model


In [6]:
def prep_dataset_from_csv(tokenizer):

    # training data set comes from csv file, dev set from official dev set
    # test set from official test set
    print("Loading datasets...")

    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()

    # get dev set
    deids = pd.read_csv('dev_semeval_parids-labels.csv')
    deids.par_id = deids.par_id.astype(str)

    rows = [] # will contain par_id, label and text
    for idx in range(len(deids)):
        parid = deids.par_id[idx]
        # select row from original dataset
        text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
        label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'text':text,
            'label':label
        })

    dedf1 = pd.DataFrame(rows)

    # get train set
    trdf1 = pd.read_csv('augmented_data_upsampled_factor_10.csv', sep='\t')

    # add keyword to the training data set
    trdf1['text'] = trdf1[['keyword', 'text']].agg(' '.join, axis=1)

    # convert to int, error otherwise
    trdf1['label'] = pd.to_numeric(trdf1['label'], errors='coerce')

    # shuffle only training dataset
    trdf1 = trdf1.sample(frac=1).reset_index(drop=True)

    # get test set
    tedf1 = pd.read_csv('task4_test.tsv', sep='\t', names=['id1', 'id2', 'keyword', 'loc', 'text'])

    # add keyword to the test data set
    tedf1['text'] = tedf1[['keyword', 'text']].agg(' '.join, axis=1)

    # convert to numpy
    trdf1 = trdf1.to_numpy()
    dedf1 = dedf1.to_numpy()
    tedf1 = tedf1.to_numpy()

    # posts and labels: data currently organised as (par_id | text | label) for dev set
    #                                               (text | label | keyword) for augmented training set
    #                                               (id1 | id2 | keyword | loc | text) for test set

    trposts = [row[0] for row in trdf1]
    trlabels = [row[1] for row in trdf1]
    deposts = [row[1] for row in dedf1]
    delabels = [row[2] for row in dedf1]
    teposts = [row[4] for row in tedf1]
    telabels = []

    # perform encoding
    encodings_trn = tokenizer(trposts, padding=PADDING, truncation=TRUNCATION)
    encodings_dev = tokenizer(deposts, padding=PADDING, truncation=TRUNCATION)
    encodings_tst = tokenizer(teposts, padding=PADDING, truncation=TRUNCATION)

    # convert to Dataset
    dataset_trn = PCLDataset(encodings_trn, trlabels)
    dataset_dev = PCLDataset(encodings_dev, delabels)
    dataset_tst = PCLDataset(encodings_tst, telabels)

    print("Datasets loaded!")

    return dataset_trn, dataset_dev, dataset_tst

In [7]:
def prep_dataset(tokenizer):

    # both training and test data sets come from official data
    print("Loading datasets...")

    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()

    trids = pd.read_csv('train_semeval_parids-labels.csv')
    teids = pd.read_csv('dev_semeval_parids-labels.csv')
    trids.par_id = trids.par_id.astype(str)
    teids.par_id = teids.par_id.astype(str)

    rows = [] # will contain par_id, label and text
    for idx in range(len(trids)):
        parid = trids.par_id[idx]
        # select row from original dataset to retrieve `text` and binary label
        text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
        label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'text':text,
            'label':label
        })

    trdf1 = pd.DataFrame(rows)

    rows = [] # will contain par_id, label and text
    for idx in range(len(teids)):
        parid = teids.par_id[idx]
        # select row from original dataset
        text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
        label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'text':text,
            'label':label
        })

    tedf1 = pd.DataFrame(rows)

    if UPSAMPLE and UPSAMPLE_FACTOR > 1:
        upsampled_tr = trdf1
        for _ in range(UPSAMPLE_FACTOR - 1):
            upsampled_tr = upsampled_tr.append(trdf1.loc[trdf1['label'] == 1])
        trdf1 = upsampled_tr

    # shuffle only training dataset
    trdf1 = trdf1.sample(frac=1).reset_index(drop=True)

    # convert to numpy
    trdf1 = trdf1.to_numpy()
    tedf1 = tedf1.to_numpy()

    # posts and labels: data currently organised as (par_id | text | label)
    trposts = [row[1] for row in trdf1]
    trlabels = [row[2] for row in trdf1]
    teposts = [row[1] for row in tedf1]
    telabels = [row[2] for row in tedf1]

    # perform encoding
    encodings_trn = tokenizer(trposts, padding=PADDING, truncation=TRUNCATION)
    encodings_tst = tokenizer(teposts, padding=PADDING, truncation=TRUNCATION)

    # convert to Dataset
    dataset_trn = PCLDataset(encodings_trn, trlabels)
    dataset_tst = PCLDataset(encodings_tst, telabels)

    print("Datasets loaded!")

    return dataset_trn, dataset_tst

In [8]:
def prep_device():

    if torch.cuda.is_available():
        device = "cuda:0"
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = "cpu"

    return device


In [9]:
tokenizer, model = prepare_model(MODEL, fast=FAST_TOKENIZER)
dataset_trn, dataset_dev, dataset_tst = prep_dataset_from_csv(tokenizer)

Preparing model...


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

Model prepared!
Loading datasets...
Datasets loaded!


In [10]:
device = prep_device()
model.to(device)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2080 Ti


DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0): DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermed

In [11]:
loader_trn = DataLoader(dataset_trn, batch_size=BATCH_SIZE, shuffle=True)
loader_dev = DataLoader(dataset_dev, batch_size=BATCH_SIZE, shuffle=False)
loader_tst = DataLoader(dataset_tst, batch_size=BATCH_SIZE, shuffle=False)
optim = AdamW(model.parameters(), lr=LEARNING_RATE)



In [12]:
# progress bar
num_training_steps = TRAIN_EPOCH * len(loader_trn)
progress_bar = tqdm(range(num_training_steps))

# start training
for epoch in range(TRAIN_EPOCH):

    model.train()

    for batch in loader_trn:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        progress_bar.update(1)

    ##### evaluating model #####
    print("Evaluating model...")
    print("Epoch", epoch)

    model.eval()

    with torch.no_grad():
        pred_labels_full = []
        true_labels_full = []
        for batch in loader_dev:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = F.softmax(outputs.logits, dim=-1)
            predictions = torch.argmax(predictions, dim=-1)

            pred_labels = [i.item() for i in predictions]
            true_labels = [i.item() for i in labels]
            pred_labels_full.extend(pred_labels)
            true_labels_full.extend(true_labels)
        # print metrics
        print("Confusion Matrix")
        print(confusion_matrix(true_labels_full, pred_labels_full))
        print("F1 Score")
        print(f1_score(true_labels_full, pred_labels_full))
    
    print('Saving model...')
    model.save_pretrained(MODEL_SAVE_PATH + '.epoch-' + str(epoch))
    tokenizer.save_pretrained(MODEL_SAVE_PATH + '.epoch-' + str(epoch))
    print('Model saved!')

print("Finished!")

  0%|          | 0/34905 [00:00<?, ?it/s]

Evaluating model...
Epoch 0
Confusion Matrix
[[1852   43]
 [ 123   76]]
F1 Score
0.4779874213836478
Saving model...
Model saved!
Evaluating model...
Epoch 1
Confusion Matrix
[[1877   18]
 [ 150   49]]
F1 Score
0.3684210526315789
Saving model...
Model saved!
Evaluating model...
Epoch 2
Confusion Matrix
[[1822   73]
 [ 100   99]]
F1 Score
0.5336927223719677
Saving model...
Model saved!
Evaluating model...
Epoch 3
Confusion Matrix
[[1854   41]
 [ 119   80]]
F1 Score
0.5
Saving model...
Model saved!
Evaluating model...
Epoch 4
Confusion Matrix
[[1862   33]
 [ 149   50]]
F1 Score
0.35460992907801414
Saving model...
Model saved!
Finished!
