# Main imports and code

In [55]:
# !pip install simpletransformers
# !pip install tensorboardx
# !pip install numpy requests nlpaug

In [56]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from urllib import request
from sklearn.metrics import confusion_matrix, f1_score
import pandas as pd
import numpy as np
import logging
import torch
from collections import Counter
from ast import literal_eval
import nlpaug.augmenter.word as naw
import wandb


In [57]:
# prepare logger
# Root logging is configured at INFO; the "transformers" logger is raised to
# WARNING so that only warnings and errors from that logger are shown.
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
# The result is passed as `use_cuda` when the classification models are built.
cuda_available = torch.cuda.is_available()

print('Cuda available?', cuda_available)

Cuda available? True


# Fetch Don't Patronize Me! data manager module

In [58]:
# Download the Don't Patronize Me! data-manager module into the working
# directory so that a later cell can import it.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Copy the payload as raw bytes: decoding it and re-writing it in text mode
# would re-encode the source with the platform's default encoding, which is
# not guaranteed to be UTF-8.
with request.urlopen(module_url) as f, open(module_name, 'wb') as outf:
    outf.write(f.read())

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [59]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
    """Write predictions to a text file, one comma-separated row per line.

    Args:
        p: iterable of rows; every row is itself an iterable whose items are
            converted with ``str`` and joined with ``,``.
        outf_path: path of the file to create (or overwrite).
    """
    with open(outf_path, 'w') as handle:
        handle.writelines(','.join(map(str, row)) + '\n' for row in p)

In [60]:
from dont_patronize_me import DontPatronizeMe

In [61]:
# Create the data manager. Both constructor arguments point at the current
# directory (presumably the training-data directory and the test-set location —
# TODO confirm against dont_patronize_me.py).
dpm = DontPatronizeMe('.', '.')

In [62]:
# Load the Task 1 data; later cells read it from `dpm.train_task1_df`.
dpm.load_task1()

# Load paragraph IDs

In [63]:
# Paragraph-id / label listings for the train split (trids) and the dev split (teids).
trids, teids = (
    pd.read_csv(split_file)
    for split_file in ('train_semeval_parids-labels.csv', 'dev_semeval_parids-labels.csv')
)

In [64]:
trids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [65]:
# Store paragraph ids as strings in both id frames; they are later compared
# against `dpm.train_task1_df.par_id`.
for ids_frame in (trids, teids):
    ids_frame['par_id'] = ids_frame['par_id'].astype(str)

# Rebuild training set (Task 1)

In [66]:
# Collect par_id, text, binary label and keyword for every training paragraph id.
# The task-1 frame is indexed by par_id once, instead of running three
# full-frame boolean scans per id. Keeping only the first row per par_id
# mirrors the original `.values[0]` lookup.
task1_by_parid = (
    dpm.train_task1_df
    .drop_duplicates(subset='par_id', keep='first')
    .set_index('par_id')
)
# `rows` stays a list of dicts (keys: par_id, text, label, keyword), in the
# order of `trids`. An id missing from the task-1 frame raises KeyError.
rows = (
    task1_by_parid
    .loc[trids.par_id.tolist(), ['text', 'label', 'keyword']]
    .rename_axis('par_id')
    .reset_index()
    .to_dict('records')
)

In [67]:
len(rows)

8375

In [68]:
# Training frame: one row per id in `trids`, columns par_id, text, label, keyword.
trdf1 = pd.DataFrame(rows)

# Rebuild test set (Task 1)

In [69]:
# Collect par_id, text, binary label and keyword for every id in `teids`.
# The rows are looked up in `dpm.train_task1_df`, the same frame used for the
# training ids. The frame is indexed by par_id once, instead of running three
# full-frame boolean scans per id; keeping only the first row per par_id
# mirrors the original `.values[0]` lookup.
task1_by_parid = (
    dpm.train_task1_df
    .drop_duplicates(subset='par_id', keep='first')
    .set_index('par_id')
)
# `rows` stays a list of dicts (keys: par_id, text, label, keyword), in the
# order of `teids`. An id missing from the task-1 frame raises KeyError.
rows = (
    task1_by_parid
    .loc[teids.par_id.tolist(), ['text', 'label', 'keyword']]
    .rename_axis('par_id')
    .reset_index()
    .to_dict('records')
)

In [70]:
len(rows)

2094

In [71]:
# Held-out frame: one row per id in `teids`, columns par_id, text, label, keyword.
tedf1 = pd.DataFrame(rows)

# Custom Transformer for Task 1

In [72]:
##### hyperparameters #####

# Candidate checkpoints as {"type": ..., "name": ...} dicts; both values are
# handed to ClassificationModel when the model is built.
MODELS = [
    {"type": model_type, "name": model_name}
    for model_type, model_name in (
        ("roberta", "roberta-base"),
        ("bert", "bert-base-uncased"),
        ("albert", "albert-base-v2"),
        ("roberta", "distilroberta-base"),
        ("xlnet", "xlnet-base-cased"),
        ("deberta", "microsoft/deberta-base"),
        ("deberta", "microsoft/deberta-large"),
        ("debertav2", "microsoft/deberta-v3-base"),
        ("electra", "google/electra-base-discriminator"),
        ("squeezebert", "squeezebert/squeezebert-uncased"),
    )
]

# Index 6 selects microsoft/deberta-large.
MODEL = MODELS[6]

# When True (together with TESTING), the training frame is replaced by the
# contents of 'augmented_data_not_upsampled.csv' in a later cell.
FROM_AUGMENTED_DATA = False

# DO NOT USE: THEY ARE FROM A DIFFERENT AUG PACKAGE
# NUM_AUG = 2 # Number of Extra Augmented Texts
# ALPHA_SR = 0.1 # Synonym Replacement Rate
# ALPHA_RI = 0.1 # Random Insertion Rate
# ALPHA_RS = 0.1 # Random Swap Rate

# When True, each text is prefixed with its keyword (training and test frames).
ADD_KEYWORD = False

# When True, rows with label 1 are duplicated in the training frame.
UPSAMPLE = True
UPSAMPLE_FACTOR = 10 # only required when UPSAMPLE = True; 1 does not upsample, 2 doubles minority class, etc.

# True: predict on the unlabelled 'task4_test.tsv' and write the submission file.
# False: score predictions against the labelled `tedf1` frame instead.
TESTING = True

# True: create and run the Weights & Biases hyperparameter sweep.
TUNING = False

# Optimiser settings forwarded to ClassificationArgs (train_batch_size,
# warmup_steps, weight_decay, learning_rate, num_train_epochs respectively).
BATCH_SIZE = 8
WARMUP = 600
WEIGHT_DECAY = 0.1
LEARNING_RATE = 1.478e-5
TRAIN_EPOCH = 3

In [73]:
trdf1

Unnamed: 0,par_id,text,label,keyword
0,4341,"The scheme saw an estimated 150,000 children f...",1,poor-families
1,4136,Durban 's homeless communities reconciliation ...,1,homeless
2,10352,The next immediate problem that cropped up was...,1,poor-families
3,8279,Far more important than the implications for t...,1,vulnerable
4,1164,To strengthen child-sensitive social protectio...,1,poor-families
...,...,...,...,...
8370,8380,Rescue teams search for survivors on the rubbl...,0,refugee
8371,8381,The launch of ' Happy Birthday ' took place la...,0,hopeless
8372,8382,"The unrest has left at least 20,000 people dea...",0,homeless
8373,8383,You have to see it from my perspective . I may...,0,hopeless


In [74]:
print(trdf1['label'].value_counts())

0    7581
1     794
Name: label, dtype: int64


In [75]:
##### data augmentation #####

# internal dev set from the training set
# NOT USED FOR FINAL TEST RUN

# class_0 = trdf1.loc[trdf1['label'] == 0]
# class_1 = trdf1.loc[trdf1['label'] == 1]
# class_0_trn = class_0[:int(len(class_0) * 0.9)]
# class_0_dev = class_0[int(len(class_0) * 0.9):]
# class_1_trn = class_1[:int(len(class_1) * 0.9)]
# class_1_dev = class_1[int(len(class_1) * 0.9):]

# trdf1 = class_0_trn.append(class_1_trn)
# dedf1 = class_0_dev.append(class_1_dev)


In [76]:
print(trdf1['label'].value_counts())

0    7581
1     794
Name: label, dtype: int64


In [77]:
# perform data augmentation
# This branch is currently a no-op: the augmentation code that used to run
# here is kept only as commented-out reference below.
if FROM_AUGMENTED_DATA and not TESTING:
    pass

    # USED FOR ACTUAL DATA AUGMENTATION BUT SKIP FOR SUBSEQUENT RUNS
    # BECAUSE IT TAKES TOO MUCH TIME
    # INSTEAD READ FROM CSV FILE


    # print(trdf1)

    # data_to_augment = trdf1.to_numpy()

    # aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")
    # # OTHER DATA AUGMENTATION METHODS
    # # aug = naw.SynonymAug(aug_src='wordnet')
    # # aug = naw.RandomWordAug()
    # # aug = naw.BackTranslationAug(from_model_name='facebook/wmt19-en-de', to_model_name='facebook/wmt19-de-en')

    # augmented_data = [[aug.augment(text[1])] for text in data_to_augment]
    # data_labels = [[text[2]] for text in data_to_augment]
    # data_indices = [[text[0]] for text in data_to_augment]
    # data_keywords = [[text[3]] for text in data_to_augment]

    # full_augmented = np.hstack((data_indices, augmented_data, data_labels, data_keywords))

    # full_augmented = np.vstack((data_to_augment, full_augmented))

    # trdf1 = pd.DataFrame({'text': full_augmented[:, 1], 'label': full_augmented[:, 2], 'keyword': full_augmented[:, 3]})

    # print(trdf1)

In [78]:
# trdf1.to_csv('augmented_data_not_upsampled.csv', sep='\t', index=False)

In [79]:
# Replace the rebuilt training frame with the augmented (not yet upsampled)
# data saved to a tab-separated file.
if FROM_AUGMENTED_DATA and TESTING:
    trdf1 = pd.read_csv('augmented_data_not_upsampled.csv', sep='\t')

In [80]:
trdf1

Unnamed: 0,text,label,keyword
0,"We 're living in times of absolute insanity , ...",0,hopeless
1,"In Libya today , there are countless number of...",0,migrant
2,"""White House press secretary Sean Spicer said ...",0,immigrant
3,Council customers only signs would be displaye...,0,disabled
4,""""""" Just like we received migrants fleeing El ...",0,refugee
...,...,...,...
15067,The predator Trudel knew exactly to manipulate...,1,poor-families
15068,The team will soon pack the items for Christma...,1,homeless
15069,"""2015 benefit drives that feed the hungry, and...",1,homeless
15070,""""""" The boxers were from wealthy families, had...",1,poor-families


In [81]:
print(trdf1['label'].value_counts())

0    13644
1     1428
Name: label, dtype: int64


In [82]:
# perform data upsampling
if UPSAMPLE and UPSAMPLE_FACTOR > 1:
    # Append (UPSAMPLE_FACTOR - 1) extra copies of the label-1 rows with a
    # single concat, rather than re-copying the growing frame on every loop
    # pass. The resulting frame is identical to the looped version.
    # NOTE: re-running this cell upsamples the already upsampled frame again.
    minority_rows = trdf1.loc[trdf1['label'] == 1]
    upsampled_tr = pd.concat(
        [trdf1] + [minority_rows] * (UPSAMPLE_FACTOR - 1),
        ignore_index=True,
    )
    trdf1 = upsampled_tr


In [83]:
# trdf1.to_csv('augmented_data_upsampled_factor_10.csv', sep='\t', index=False)

In [84]:
# # perform data augmentation
# if FROM_AUGMENTED_DATA:
#     # THIS DATA AUGMENTATION METHOD IS NOT VERY GOOD

#     # !git clone https://github.com/jasonwei20/eda_nlp

#     formatted_data = trdf1[['label', 'text']].copy()
#     print(formatted_data['label'].value_counts())
#     formatted_data.to_csv('data_to_augment.csv', sep='\t', index=False)

#     !python ./eda_nlp/code/augment.py --input=data_to_augment.csv --output=augmented_data.csv --num_aug=NUM_AUG --alpha_sr=ALPHA_SR --alpha_ri=ALPHA_RI --alpha_rs=ALPHA_RS

#     trdf1 = pd.read_csv('augmented_data.csv', sep='\t', names=['label', 'text'])
#     print(len(trdf1))
#     print(trdf1)
#     print(trdf1['label'].value_counts())

#     # cast labels to int
#     trdf1['label'] = pd.to_numeric(trdf1['label'], errors='coerce')
#     print(trdf1['label'].value_counts())
#     # remove nan values
#     trdf1 = trdf1.dropna(subset=['label'])
#     print(trdf1['label'].value_counts())
#     print(trdf1)


In [85]:
# trdf1 = pd.read_csv('augmented_data_upsampled_factor_10.csv', sep='\t')

In [86]:
print(trdf1['label'].value_counts())

1    14280
0    13644
Name: label, dtype: int64


In [87]:
# shuffle only training dataset
# A fixed random_state makes the shuffled row order identical on every run;
# without it each kernel restart produced a different training frame.
trdf1 = trdf1.sample(frac=1, random_state=42).reset_index(drop=True)

In [88]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the column assignments made on `training_set1` in the following cells are
# also visible through `trdf1`.
training_set1 = trdf1

In [89]:
# add keyword to the training data set
# Each text becomes "<keyword> <text>". The column is overwritten in place, so
# running this cell twice prefixes the keyword twice.
if ADD_KEYWORD:
    training_set1['text'] = training_set1[['keyword', 'text']].agg(' '.join, axis=1)

In [90]:
# add keyword to the test data set
# Same "<keyword> <text>" prefixing as for the training frame; also overwrites
# the column in place, so it is not safe to run twice.
if ADD_KEYWORD:
    tedf1['text'] = tedf1[['keyword', 'text']].agg(' '.join, axis=1)

In [91]:
# convert labels to int
# errors='coerce' turns any value that cannot be parsed into NaN instead of
# raising; no check for NaN labels follows in this notebook.
training_set1['label'] = pd.to_numeric(training_set1['label'], errors='coerce')

In [92]:
training_set1

Unnamed: 0,text,label,keyword
0,"in-need Liz Manne, longtime Boston film market...",1,in-need
1,"in-need Destitute, or clutching a smart-phone?...",1,in-need
2,refugee JUBA South Sudan ( Xinhua CA --The UN ...,0,refugee
3,hopeless This seems how cannot think one below...,0,hopeless
4,"homeless ""Nona O'Gara , a supervisor at the ce...",1,homeless
...,...,...,...
27919,in-need If you are a Ghanaian tertiary student...,0,in-need
27920,"hopeless About the same time , she gave an int...",1,hopeless
27921,vulnerable The recovering economy will underpi...,0,vulnerable
27922,poor-families Although policy change is needed...,1,poor-families


In [93]:
print("Training on", MODEL['type'], MODEL['name'])

Training on deberta microsoft/deberta-large


In [94]:
# Training configuration built from the constants in the hyperparameter cell;
# nothing is saved or cached and any existing output directory may be overwritten.
# NOTE(review): the stored output of this cell reports 5 epochs while
# TRAIN_EPOCH is 3 in the configuration cell — the saved output looks stale;
# re-run to confirm.
task1_model_args = ClassificationArgs(num_train_epochs=TRAIN_EPOCH,
                                      train_batch_size=BATCH_SIZE,
                                      warmup_steps=WARMUP,
                                      learning_rate=LEARNING_RATE,
                                      weight_decay=WEIGHT_DECAY,
                                      reprocess_input_data=True,
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)

# Binary classifier (num_labels=2) for the checkpoint selected in MODEL.
task1_model = ClassificationModel(MODEL["type"],
                                  MODEL["name"],
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)

# train model
# Only the 'text' and 'label' columns are passed to the trainer.
task1_model.train_model(training_set1[['text', 'label']])

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['classifier.weight', 'classi

  0%|          | 0/27924 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/3491 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/3491 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/3491 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/3491 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/3491 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of deberta model complete. Saved to outputs/.


(17455, 0.05973857617920824)

In [95]:
# In TESTING mode `tedf1` is replaced by the submission test set. The column
# names are supplied here via `names`, and there is no 'label' column, which
# is why the scoring cells are guarded by `if not TESTING`.
if TESTING:
    # get test set
    tedf1 = pd.read_csv('task4_test.tsv', sep='\t', names=['id1', 'id2', 'keyword', 'loc', 'text'])

    if ADD_KEYWORD:
        # add keyword to the test data set
        tedf1['text'] = tedf1[['keyword', 'text']].agg(' '.join, axis=1)

In [96]:
tedf1

Unnamed: 0,id1,id2,keyword,loc,text
0,t_0,@@7258997,vulnerable,us,"vulnerable In the meantime , conservatives are..."
1,t_1,@@16397324,women,pk,women In most poor households with no educatio...
2,t_2,@@16257812,migrant,ca,migrant The real question is not whether immig...
3,t_3,@@3509652,migrant,gb,"migrant In total , the country 's immigrant po..."
4,t_4,@@477506,vulnerable,ca,"vulnerable Members of the church , which is pa..."
...,...,...,...,...,...
3827,t_3893,@@20319448,migrant,jm,migrant In a letter dated Thursday to European...
3828,t_3894,@@9990672,poor-families,au,poor-families They discovered that poor famili...
3829,t_3895,@@37984,migrant,ca,"migrant She married at 19 , to Milan ( Emil ) ..."
3830,t_3896,@@9691377,immigrant,us,immigrant The United Kingdom is n't going to d...


In [97]:
# run predictions
# Only the first returned value (the predicted labels) is kept; the second
# return value is discarded.
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/3832 [00:00<?, ?it/s]

  0%|          | 0/479 [00:00<?, ?it/s]

In [98]:
# Gold label distribution; only possible when `tedf1` is the labelled frame.
if not TESTING:
    print(tedf1['label'].value_counts())

In [99]:
Counter(preds_task1)

Counter({0: 3666, 1: 166})

In [100]:
# Score the predictions against the gold labels (labelled frame only).
if not TESTING:
    expected = np.array(tedf1.label.tolist())
    received = preds_task1

    # confusion matrix + f1 score
    # scikit-learn's signature is (y_true, y_pred). With the gold labels first,
    # the rows of the matrix are true classes and the columns are predicted
    # classes; the previous (pred, true) order printed the transposed matrix.
    conf = confusion_matrix(expected, received)
    f1 = f1_score(expected, received)

    print("Confusion Matrix")
    print(conf)

    print("F1 Score")
    print(f1)

In [101]:
# automated hyperparameter optimisation

# Sweep definition: Bayesian search that maximises the logged "f1_score"
# metric over the number of epochs (1 or 3) and the learning rate (1e-5..2e-5).
sweep_config = {
    "method": "bayes", # grid, random
    "metric": {"name": "f1_score", "goal": "maximize"},
    "parameters": {
        "num_train_epochs": {"values": [1, 3]},
        "learning_rate": {"min": 1e-5, "max": 2e-5},
    },
}

# The sweep is only registered when tuning; `sweep_id` is otherwise undefined.
if TUNING:
    sweep_id = wandb.sweep(sweep_config, project="F1 Sweep 4")

In [102]:
# automated hyperparameter optimisation
def train():
    """Run one sweep trial: train a fresh model, score it on `tedf1`, log F1.

    Reads the notebook globals MODEL, task1_model_args, cuda_available,
    training_set1 and tedf1. `tedf1` must contain a 'label' column, i.e. it
    must be the labelled frame and not the 'task4_test.tsv' frame that is
    loaded when TESTING is True.

    The F1 score is logged to Weights & Biases under the key "f1_score",
    which is the metric name the sweep configuration optimises.
    """
    wandb.init()

    # wandb.config is handed to the model as `sweep_config`, alongside the
    # shared base arguments.
    task1_model = ClassificationModel(MODEL["type"],
                                      MODEL["name"],
                                      args=task1_model_args,
                                      sweep_config=wandb.config,
                                      num_labels=2,
                                      use_cuda=cuda_available)

    # train model
    task1_model.train_model(training_set1[['text', 'label']], eval_df=tedf1[['text', 'label']])

    task1_model.eval_model(eval_df=tedf1[['text', 'label']])

    preds_task1, _ = task1_model.predict(tedf1.text.tolist())

    expected = np.array(tedf1.label.tolist())
    received = preds_task1

    # confusion matrix + f1 score
    # scikit-learn's signature is (y_true, y_pred): gold labels go first so
    # that matrix rows are true classes and columns are predicted classes.
    conf = confusion_matrix(expected, received)
    f1 = f1_score(expected, received)

    print("Confusion Matrix")
    print(conf)

    print("F1 Score")
    print(f1)

    wandb.log({"f1_score": f1})

    wandb.join()

# Start the sweep agent with `train` as the trial function. `sweep_id` only
# exists when the sweep-configuration cell was also run with TUNING = True.
if TUNING:
    wandb.agent(sweep_id, train)

In [103]:
# codalab submission
# Writes one predicted label per line to task1.txt, previews the first ten
# lines and zips the file. The `!` lines are IPython shell escapes and need
# `cat`, `head` and `zip` to be available on the host.
if TESTING:
    labels2file([[k] for k in preds_task1], 'task1.txt')
    !cat task1.txt | head -n 10
    !zip task1.txt.zip task1.txt

0
0
0
0
0
0
0
0
0
0
updating: task1.txt (deflated 96%)


# RoBERTa Baseline for Task 1

In [104]:
# Baseline training set: every label-1 row plus at most twice as many label-0
# rows, taken from the front of `trdf1` in its current row order.
pcldf = trdf1.loc[trdf1['label'] == 1]
npos = len(pcldf)

negatives_kept = trdf1.loc[trdf1['label'] == 0].iloc[:2 * npos]
training_set1 = pd.concat([pcldf, negatives_kept])

In [105]:
training_set1

Unnamed: 0,text,label,keyword
0,"in-need Liz Manne, longtime Boston film market...",1,in-need
1,"in-need Destitute, or clutching a smart-phone?...",1,in-need
4,"homeless ""Nona O'Gara , a supervisor at the ce...",1,homeless
5,hopeless When Prophet Elijah the Tishbite was ...,1,hopeless
6,immigrant The remote southern borders of Chile...,1,immigrant
...,...,...,...
27913,disabled Apart from Pakistan and hosts England...,0,disabled
27914,in-need Recommendations are increasing targete...,0,in-need
27919,in-need If you are a Ghanaian tertiary student...,0,in-need
27921,vulnerable The recovering economy will underpi...,0,vulnerable


In [106]:

# Baseline configuration: one epoch, everything else left at library defaults.
# NOTE: this rebinds `task1_model_args` and `task1_model`, so the model trained
# earlier in the notebook is no longer reachable through those names. The
# training and prediction calls for the baseline are commented out below, so
# this cell only instantiates an untrained roberta-base classifier.
task1_model_args = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta", 
                                  'roberta-base', 
                                  args = task1_model_args, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
# task1_model.train_model(training_set1[['text', 'label']])
# run predictions
# preds_task1, _ = task1_model.predict(tedf1.text.tolist())

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [107]:
# Counter(preds_task1)

In [108]:
# expected = np.array(tedf1.label.tolist())
# received = preds_task1

# # confusion matrix + f1 score
# conf = confusion_matrix(received, expected)
# f1 = f1_score(received, expected)

# print("Confusion Matrix")
# print(conf)

# print("F1 Score")
# print(f1)