<a href="https://colab.research.google.com/github/khaliso/thesis/blob/main/bert_classifier/classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's install and import some stuff first:

In [None]:
# Dependency installation is disabled: the shell lines below are wrapped in a
# string literal, so running this cell installs nothing.
'''!pip install transformers
!pip install datasets
!pip install torch torchvision torchaudio
!pip install keras
!pip install tensorflow'''

In [None]:
import transformers
import torch
import keras
from datasets import load_dataset, concatenate_datasets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from functions import *

from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer

### Preprocessing the data:

This task used the dataset presented in: 

Pérez-Almendros, C., Anke, L. E., & Schockaert, S. (2020, December). Don’t Patronize Me! An Annotated Dataset with Patronizing and Condescending Language towards Vulnerable Communities. In Proceedings of the 28th International Conference on Computational Linguistics (pp. 5891-5902).

To obtain the dataset, see https://docs.google.com/forms/d/e/1FAIpQLSe5KyzXgpnEOjS-Y6Gb8TTKiWxh4_qLuPL-NGiqKCyF41ALlg/viewform

In [None]:
# Read the raw annotations (tab-separated file).
dpm = pd.read_csv("dontpatronizeme_pcl.tsv", sep="\t")

# Binarise the annotation: labels 0 and 1 are non-patronizing (-> 0),
# labels 2-4 are patronizing (-> 1).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the dpm["label"] selection: the in-place call on
# a selected column is deprecated and leaves `dpm` unchanged once pandas
# Copy-on-Write is active.
# Order matters: 1 -> 0 has to happen before 2-4 are mapped to 1.
dpm["label"] = dpm["label"].replace(to_replace=1, value=0)
dpm["label"] = dpm["label"].replace(to_replace=[2, 3, 4], value=1)

In [None]:
# Only the text and its label are needed; the additional columns are discarded.
# Rows with a missing value are removed and the target column is renamed to "labels".
dpm = (
    dpm[["text", "label"]]
    .dropna()
    .rename(columns={"label": "labels"})
)

In [None]:
# Stratified 80:20 split with a fixed seed. The frame being split has to be `dpm`,
# the same frame the `stratify` labels are taken from.
train, test = train_test_split(dpm, test_size=0.2, random_state=42, stratify=dpm["labels"])

# Relative class frequencies in the training part.
train["labels"].value_counts(normalize=True)

The dataset is highly unbalanced. We will undersample the negative class to the size of the positive class for our initial classifier.

In [None]:
# Split the training part by class: 0 = non-patronizing, 1 = patronizing.
train_np = train[train["labels"] == 0]
train_pcl = train[train["labels"] == 1]

# Draw as many negative samples as there are positive ones. The draw is seeded
# so that the CSV written below is the same on every run.
train_np_undersampled = train_np.sample(n=train_pcl.shape[0], random_state=42)

train_balanced = pd.concat([train_pcl, train_np_undersampled])
# Shuffle the rows so the two classes are not stored as two contiguous runs.
train_balanced = train_balanced.sample(frac=1, random_state=42)

train_balanced.to_csv("train_data_undersampled.csv")
train.to_csv("train_data.csv") # we will need this later
test.to_csv("val_data.csv")

### Loading the undersampled data as five 80:20 splits for cross-validation

#### Code is based on the Hugging Face Trainer documentation

In [None]:
# Load the undersampled training data as per-fold datasets (helper presumably
# comes from the star import of `functions`).
# NOTE(review): the result is unpacked here as (validation, training), while the
# later call `load_and_tokenize_trainingset("train_data.csv")` unpacks
# (training, validation) and spells the helper name differently - confirm the
# real name and return order in functions.py.
vals_ds_bin, trains_ds_bin = load_and_tokenize_training_set("train_data_undersampled.csv")

Let's get to training! We can do five-fold cv to be sure about metric reliability

In [None]:
# Five training runs, one per fold. Every fold starts from a freshly loaded
# "bert-base-cased" checkpoint; its evaluation result is stored under the fold index.
metrics = {}

for fold in range(5):
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=trains_ds_bin[fold],
        eval_dataset=vals_ds_bin[fold],
        compute_metrics=compute_metrics,
    )
    trainer.train()
    metrics[fold] = trainer.evaluate()

Let's look at the metrics on our validation sets for the different folds

In [None]:
# One row per fold, one column per metric; describe() summarises across the folds.
metrics_df = pd.DataFrame(metrics).T
metrics_df.describe()

In [None]:
#trainer.save_model("semeval_task4/model")

### Testing Model Performance on our held back validation set

In [None]:
# Load the held-back split written to "val_data.csv" earlier. This rebinds `test`,
# which previously held the pandas frame returned by the train/test split.
# NOTE(review): other cells call `load_testset` (no underscore) - confirm which
# name functions.py actually provides.
test = load_test_set("val_data.csv")

In [None]:
# Optional: restore a previously saved model instead of reusing the trainer
# that is still in the kernel.
#model = AutoModelForSequenceClassification.from_pretrained("semeval_task4/model", local_files_only=True)
#trainer = Trainer(model=model, args=training_args)

# `trainer` is whatever the last executed training cell left behind; when the
# notebook is run top to bottom that is the trainer of the final fold.
y_pred = trainer.predict(test["train"])
compute_test_metrics(y_pred, 'binary')

We can look at the confusion matrix:

In [None]:
# Reference labels versus the predicted class (argmax over the last axis of the
# model outputs).
gold_labels = y_pred.label_ids
predicted_labels = y_pred.predictions.argmax(axis=-1)
cm = confusion_matrix(gold_labels, predicted_labels)
show_confusion_matrix(cm)

### Classifying synthetic data with pre-classifier. We will only use those samples that are classified as PCL by our classifier

This step can be skipped in favor of using the already predicted datasets.

In [None]:
# Optional: restore the saved pre-classifier instead of reusing the trainer in the kernel.
#model = AutoModelForSequenceClassification.from_pretrained("semeval_task4/model", local_files_only=True)
#trainer = Trainer(model=model, args=training_args)

# Exactly one of the two loads below should be active, matching the file that the
# next cell reads when it attaches the predictions.
# NOTE(review): this cell calls `load_testset`, other cells call `load_test_set` -
# confirm which name functions.py provides. The next cell reads the CSV without
# the "synthetic_data/" prefix - confirm both paths refer to the same file.
test = load_testset("synthetic_data/synthetic_npt_data.csv") # generated as non-patronizing
#test = load_testset("synthetic_data/synthetic_data.csv") # generated as patronizing
y_pred = trainer.predict(test["train"])

In [None]:
# Attach the model predictions to the synthetic samples they were computed for.
# The CSV is read from the same location that was handed to the loader for
# prediction ("synthetic_data/..."), so that row i of the frame belongs to
# prediction i.
synth = pd.read_csv("synthetic_data/synthetic_npt_data.csv")
synth["labels"] = y_pred.predictions.argmax(-1)
synth.to_csv("predicted/synthetic_nonpatronizing_with_predictions_new.csv")

# Variant for the samples that were generated as patronizing. Kept as real
# comments; a bare string literal at the end of the cell would be echoed as output.
# synth = pd.read_csv("synthetic_data/synthetic_data.csv")
# synth["labels"] = y_pred.predictions.argmax(-1)
# synth.to_csv("predicted/synthetic_patronizing_with_predictions_new.csv")

### Discarding all samples where prediction and intention do not match

In [None]:
# Samples generated as non-patronizing: keep only those predicted as class 0.
# The frame is deliberately not called `np`, which would shadow the numpy alias
# imported at the top of the notebook for every cell that runs afterwards.
npt = pd.read_csv("predicted/synthetic_nonpatronizing_with_predictions.csv", index_col=[0])
np_correct = npt[npt["labels"] == 0]
np_correct.to_csv("synthetic_npt_data_predicted.csv")

# Samples generated as patronizing: keep only those predicted as class 1.
pcl = pd.read_csv("predicted/synthetic_patronizing_with_predictions.csv", index_col=[0])
pcl_correct = pcl[pcl["labels"] == 1]
pcl_correct.to_csv("synthetic_pcl_predicted.csv")

### Using prepared synthetic_dpm dataset to train the new classifier

In [None]:
# Synthetic samples that were generated as non-patronizing and also predicted as class 0.
synthetic = load_testset("synthetic_npt_data_predicted.csv")
#vals_ds_bin, trains_ds_bin = load_testset("synthetic_pcl_predicted.csv")
# Folds of the full (not undersampled) training data.
# NOTE(review): unpacked here as (training, validation), whereas the earlier call
# `load_and_tokenize_training_set("train_data_undersampled.csv")` unpacks
# (validation, training) and spells the helper name differently - confirm the
# real name and return order in functions.py.
trains_ds_bin, vals_ds_bin = load_and_tokenize_trainingset("train_data.csv")

In [None]:
# Same five-fold procedure as before, except that the synthetic samples are
# prepended to the training part of every fold. The validation part is left as is.
metrics_synth = {}

for fold in range(5):
    trains_ds_bin_enh = concatenate_datasets([synthetic["train"], trains_ds_bin[fold]])
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=trains_ds_bin_enh,
        eval_dataset=vals_ds_bin[fold],
        compute_metrics=compute_metrics,
    )
    trainer.train()
    metrics_synth[fold] = trainer.evaluate()

In [None]:
# Fold-by-metric table for the enhanced runs, followed by its summary statistics.
metrics_synth_df = pd.DataFrame(metrics_synth).T
metrics_synth_df.describe()

In [None]:
# Persist the model held by the current `trainer`. When the notebook is run top to
# bottom this is the model trained in the last fold only.
trainer.save_model("dpmEnhanced/model")
# Alternative target directory, currently disabled.
#trainer.save_model("dpmEnhancedPos/model")

### And evaluate performance on held-out test set

In [None]:
# Reload the held-back split from "val_data.csv"; `test` is rebound to the loaded dataset.
# NOTE(review): other cells call `load_testset` (no underscore) - confirm which
# name functions.py actually provides.
test = load_test_set("val_data.csv")

In [None]:
# Optional: restore the saved enhanced model instead of reusing the trainer in the kernel.
#model = AutoModelForSequenceClassification.from_pretrained("dpmEnhanced/model", local_files_only=True)
#trainer = Trainer(model=model, args=training_args)

# Predict on the held-back split with the current `trainer` and report the metrics.
y_pred = trainer.predict(test["train"])
compute_test_metrics(y_pred, 'binary')

In [None]:
# Confusion matrix of the reference labels against the arg-max class of each prediction.
predicted_classes = y_pred.predictions.argmax(axis=-1)
cm = confusion_matrix(y_pred.label_ids, predicted_classes)
show_confusion_matrix(cm)

## We can now classify the test data
To use the trainer.predict() method, we have created a new column "labels" in the .csv and filled it with 1




In [None]:
# Optional: restore the saved enhanced model instead of reusing the trainer in the kernel.
#model = AutoModelForSequenceClassification.from_pretrained("dpmEnhanced/model", local_files_only=True)
#trainer = Trainer(model=model, args=training_args)

# "test_dataset.csv" carries a placeholder "labels" column filled with 1 (see the
# note above this cell), so only the predictions of this run are meaningful.
test = load_test_set("test_dataset.csv")
y_pred = trainer.predict(test["train"])

Get predictions and save them to a txt-file. We need one prediction per line. The resulting file can be zipped and submitted to https://competitions.codalab.org/competitions/34344#learn_the_details

In [None]:
# Predicted class per test sample, written one value per line: no header row
# and no index column.
preds = y_pred.predictions.argmax(axis=-1)
res = pd.DataFrame(preds)
res.to_csv("task1.txt", index=False, header=False)