In [None]:
##### hyperparameters #####

# --- model / tokenizer ---
ROBERTA_MODEL = 'roberta-base'
FAST_TOKENIZER = True  # True selects RobertaTokenizerFast, False selects RobertaTokenizer

# --- tokenization ---
PADDING = True
TRUNCATION = True

# --- class balancing (read by prep_dataset only) ---
UPSAMPLE = False
# only required when UPSAMPLE = True; 1 does not upsample, 2 doubles minority class, etc.
UPSAMPLE_FACTOR = 10

# --- optimisation ---
BATCH_SIZE = 4
LEARNING_RATE = 1e-5
TRAIN_EPOCH = 5


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaTokenizerFast
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score
import gc
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW
from dont_patronize_me import DontPatronizeMe


In [None]:
##### cleanup #####
print("Cleaning up...")
# Collect unreachable Python objects first so that any GPU tensors they hold are
# returned to PyTorch's caching allocator, then release the cached blocks.
# Running empty_cache() before the collection would leave that memory cached.
gc.collect()
torch.cuda.empty_cache()
print("Cleanup done!")


In [None]:
class PCLDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence (as
    accessed through `.items()` and integer indexing); `labels` is an
    indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field at position `idx`, as a tensor,
        # plus the matching label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


In [None]:
def prepare_model(roberta_model, fast=True):
    """Load a RoBERTa tokenizer and sequence-classification model.

    Args:
        roberta_model: checkpoint identifier handed to `from_pretrained`.
        fast: when true use `RobertaTokenizerFast`, otherwise `RobertaTokenizer`.

    Returns:
        A `(tokenizer, model)` tuple.
    """
    print("Preparing model...")

    tokenizer_cls = RobertaTokenizerFast if fast else RobertaTokenizer
    tokenizer = tokenizer_cls.from_pretrained(roberta_model)
    model = RobertaForSequenceClassification.from_pretrained(roberta_model)

    # Use the end-of-sequence token as the padding token, on both the tokenizer
    # and the model config.
    # NOTE(review): RoBERTa checkpoints usually ship a dedicated pad token --
    # confirm this override is intended.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

    print("Model prepared!")
    return tokenizer, model


In [None]:
def prep_dataset_from_csv(tokenizer):
    """Build train/test datasets: train from the augmented CSV, test from the dev split.

    The test set is assembled by looking up every `par_id` listed in
    'dev_semeval_parids-labels.csv' in the task-1 frame loaded through
    `DontPatronizeMe`. The training set is read from 'augmented_data.csv'
    (tab-separated, no header, columns `label` and `text`) and shuffled.

    Training rows whose label cannot be parsed as a number are dropped and the
    remaining labels are cast to int, so every training label is an integer.

    Args:
        tokenizer: callable applied to a list of texts with the module-level
            PADDING / TRUNCATION settings.

    Returns:
        `(dataset_trn, dataset_tst)` as `PCLDataset` instances.
    """
    # training data set comes from csv file, test set from official dev set
    print("Loading datasets...")

    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()
    source_df = dpm.train_task1_df

    # get test set
    teids = pd.read_csv('dev_semeval_parids-labels.csv')
    teids.par_id = teids.par_id.astype(str)

    rows = []  # will contain par_id, label and text
    for parid in teids.par_id:
        # select the row from the original dataset once and read both fields from it
        match = source_df.loc[source_df.par_id == parid]
        rows.append({
            'par_id': parid,
            'text': match.text.values[0],
            'label': match.label.values[0]
        })

    tedf1 = pd.DataFrame(rows)

    # get train set
    trdf1 = pd.read_csv('augmented_data.csv', sep='\t', names=['label', 'text'])
    # Unparseable labels become NaN, which also turns the whole column into floats.
    trdf1['label'] = pd.to_numeric(trdf1['label'], errors='coerce')
    n_bad = int(trdf1['label'].isna().sum())
    if n_bad:
        print("Dropped %d training row(s) with a non-numeric label." % n_bad)
    # Keep only rows with a usable label and restore an integer dtype.
    trdf1 = trdf1.dropna(subset=['label']).astype({'label': int})

    # shuffle only training dataset
    trdf1 = trdf1.sample(frac=1).reset_index(drop=True)

    # convert to numpy
    trdf1 = trdf1.to_numpy()
    tedf1 = tedf1.to_numpy()

    # posts and labels: data currently organised as (par_id | text | label) for test set
    #                                               (label | text) for augmented training set
    trposts = [row[1] for row in trdf1]
    trlabels = [row[0] for row in trdf1]
    teposts = [row[1] for row in tedf1]
    telabels = [row[2] for row in tedf1]

    # perform encoding
    encodings_trn = tokenizer(trposts, padding=PADDING, truncation=TRUNCATION)
    encodings_tst = tokenizer(teposts, padding=PADDING, truncation=TRUNCATION)

    # convert to Dataset
    dataset_trn = PCLDataset(encodings_trn, trlabels)
    dataset_tst = PCLDataset(encodings_tst, telabels)

    print("Datasets loaded!")

    return dataset_trn, dataset_tst

In [None]:
def _rows_for_par_ids(source_df, par_ids):
    """Return a (par_id | text | label) frame with one row per id in `par_ids`.

    For each id the first matching row of `source_df` supplies `text` and `label`.
    """
    rows = []
    for parid in par_ids:
        # filter once per id and read both fields from the same selection
        match = source_df.loc[source_df.par_id == parid]
        rows.append({
            'par_id': parid,
            'text': match.text.values[0],
            'label': match.label.values[0]
        })
    return pd.DataFrame(rows)


def prep_dataset(tokenizer):
    """Build train/test datasets from the official train and dev splits.

    The ids in 'train_semeval_parids-labels.csv' and
    'dev_semeval_parids-labels.csv' are looked up in the task-1 frame loaded
    through `DontPatronizeMe`. When UPSAMPLE is set and UPSAMPLE_FACTOR > 1,
    the training rows with label 1 are repeated so they appear
    UPSAMPLE_FACTOR times in total. Only the training set is shuffled.

    Args:
        tokenizer: callable applied to a list of texts with the module-level
            PADDING / TRUNCATION settings.

    Returns:
        `(dataset_trn, dataset_tst)` as `PCLDataset` instances.
    """
    # both training and test data sets come from official data
    print("Loading datasets...")

    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()

    trids = pd.read_csv('train_semeval_parids-labels.csv')
    teids = pd.read_csv('dev_semeval_parids-labels.csv')
    trids.par_id = trids.par_id.astype(str)
    teids.par_id = teids.par_id.astype(str)

    trdf1 = _rows_for_par_ids(dpm.train_task1_df, trids.par_id)
    tedf1 = _rows_for_par_ids(dpm.train_task1_df, teids.par_id)

    if UPSAMPLE and UPSAMPLE_FACTOR > 1:
        # DataFrame.append was removed in pandas 2.0; concatenate the original
        # frame with (UPSAMPLE_FACTOR - 1) extra copies of the label-1 rows.
        minority = trdf1.loc[trdf1['label'] == 1]
        trdf1 = pd.concat([trdf1] + [minority] * (UPSAMPLE_FACTOR - 1))

    # shuffle only training dataset
    trdf1 = trdf1.sample(frac=1).reset_index(drop=True)

    # convert to numpy
    trdf1 = trdf1.to_numpy()
    tedf1 = tedf1.to_numpy()

    # posts and labels: data currently organised as (par_id | text | label)
    trposts = [row[1] for row in trdf1]
    trlabels = [row[2] for row in trdf1]
    teposts = [row[1] for row in tedf1]
    telabels = [row[2] for row in tedf1]

    # perform encoding
    encodings_trn = tokenizer(trposts, padding=PADDING, truncation=TRUNCATION)
    encodings_tst = tokenizer(teposts, padding=PADDING, truncation=TRUNCATION)

    # convert to Dataset
    dataset_trn = PCLDataset(encodings_trn, trlabels)
    dataset_tst = PCLDataset(encodings_tst, telabels)

    print("Datasets loaded!")

    return dataset_trn, dataset_tst

In [None]:
def prep_device():
    """Select the torch device to run on and report the choice.

    Returns:
        `torch.device("cuda")` when CUDA is available, else `torch.device("cpu")`.
    """
    # Guard clause: without CUDA there is nothing further to report.
    if not torch.cuda.is_available():
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

    gpu = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    return gpu


In [None]:
# Build the tokenizer/model pair first: dataset preparation needs the tokenizer.
tokenizer, model = prepare_model(ROBERTA_MODEL, fast=FAST_TOKENIZER)
# Train and test sets both come from the official data.
dataset_trn, dataset_tst = prep_dataset(tokenizer=tokenizer)

In [None]:
# Choose the GPU when one is available, otherwise the CPU.
device = prep_device()
# Move the model to the chosen device. As the last expression of the cell,
# the value returned by `.to()` is what the notebook echoes as cell output.
model.to(device)

In [None]:
# Training batches are reshuffled every epoch.
loader_trn = DataLoader(dataset_trn, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation only aggregates predictions over the whole set, so it does not
# need shuffling; a fixed order keeps evaluation passes comparable.
loader_tst = DataLoader(dataset_tst, batch_size=BATCH_SIZE, shuffle=False)
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values the transformers
# optimizer used by default, since torch.optim.AdamW defaults differ.
optim = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)


In [None]:
# progress bar: one tick per optimisation step over all epochs
num_training_steps = TRAIN_EPOCH * len(loader_trn)
progress_bar = tqdm(range(num_training_steps))

# start training
for epoch in range(TRAIN_EPOCH):

    model.train()

    for batch in loader_trn:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        progress_bar.update(1)

    ##### evaluating model #####
    print("Evaluating model...")
    print("Epoch", epoch)

    model.eval()

    pred_labels_full = []
    true_labels_full = []
    with torch.no_grad():
        for batch in loader_tst:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # argmax over the raw logits: softmax is monotonic, so it would not
            # change which class scores highest
            predictions = torch.argmax(outputs.logits, dim=-1)

            # one bulk conversion per batch instead of one .item() call per element;
            # the true labels are read straight from the batch, no device copy needed
            pred_labels_full.extend(predictions.tolist())
            true_labels_full.extend(batch['labels'].tolist())

    # print metrics
    print("Confusion Matrix")
    print(confusion_matrix(true_labels_full, pred_labels_full))
    print("F1 Score")
    print(f1_score(true_labels_full, pred_labels_full))

progress_bar.close()
print("Finished!")