# Transformer models with augmented data for EDOS

In this notebook, we explore different approaches to the [EDOS task](https://codalab.lisn.upsaclay.fr/competitions/7124).

We explore different transformer models. 
In the next cell, we choose the model and the task.

In [1]:
# Checkpoints handled by the tokenizer/model loading cell further below.
models = ['bert-base-uncased', 'bert-base-cased', 
          'distilbert-base-uncased', 'distilbert-base-cased', 
          'roberta-base', 
          'xlnet-base-cased',
          'gpt2', 't5-small']

# Select the checkpoint by name instead of by position, so that editing the
# list above cannot silently change which model gets trained.
model_name = 'xlnet-base-cased'
if model_name not in models:
    raise ValueError("Unknown model {!r}; choose one of {}".format(model_name, models))

# EDOS subtask. Each task uses a different label column later on:
# 'a' -> label_sexist, 'b' -> label_category, 'c' -> label_vector.
VALID_TASKS = ('a', 'b', 'c')
TASK = "a" # a, b or c
TASK = TASK.lower()
if TASK not in VALID_TASKS:
    # Fail here rather than with a KeyError several cells later.
    raise ValueError("Unknown task {!r}; choose one of {}".format(TASK, VALID_TASKS))

# When True, the augmented texts are appended to the training split.
USE_DATA_AUGMENTED = True

print(model_name, TASK, USE_DATA_AUGMENTED)


xlnet-base-cased a True


First, we must make sure that we are using a GPU:

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Tue Feb 21 13:14:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

We will need to install some libraries:

In [3]:
!pip install datasets transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


We set a seed to ensure reproducibility:

In [4]:
from transformers import set_seed

# A fixed seed keeps repeated runs comparable.
SEED = 42
set_seed(SEED)

## Data


In [5]:
from google.colab import drive
# Make Google Drive available under /content/drive (Colab only).
drive.mount('/content/drive')

import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets

PATH = "/content/drive/My Drive/Colab Notebooks/proyectos/edos/"
PATH_DATA = "/content/drive/My Drive/Colab Notebooks/data/edos/"

df = pd.read_csv(PATH_DATA+"edos_labelled.csv")

# The CSV carries its own partition in the `split` column.
df_train = df.loc[df['split'] == 'train']  # for training
df_dev = df.loc[df['split'] == 'dev']      # for validation
df_test = df.loc[df['split'] == 'test']    # for final test

# Wrap the three dataframes in one DatasetDict. The helper columns `split`
# and `__index_level_0__` are not needed from here on.
dataset = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'val': Dataset.from_pandas(df_dev),
    'test': Dataset.from_pandas(df_test),
})
dataset = dataset.remove_columns(['split', '__index_level_0__'])

if USE_DATA_AUGMENTED:
    aug_dataset = load_dataset("csv", data_files=PATH_DATA+"edos_augmented.csv")
    # The augmented file has two alternative text columns (`text_aug` and
    # `text_nlpaug`). Each one becomes its own dataset whose `text` column
    # holds the augmented version.
    data_eda = (aug_dataset
                .remove_columns(['text', 'text_nlpaug'])
                .rename_column('text_aug', 'text'))
    data_nlpaug = (aug_dataset
                   .remove_columns(['text', 'text_aug'])
                   .rename_column('text_nlpaug', 'text'))
    # Only the training split is enlarged; val and test stay untouched.
    train_parts = [dataset['train'], data_eda['train'], data_nlpaug['train']]
    dataset["train"] = concatenate_datasets(train_parts)

dataset


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['rewire_id', 'text', 'label_sexist', 'label_category', 'label_vector'],
        num_rows: 42000
    })
    val: Dataset({
        features: ['rewire_id', 'text', 'label_sexist', 'label_category', 'label_vector'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['rewire_id', 'text', 'label_sexist', 'label_category', 'label_vector'],
        num_rows: 4000
    })
})

In [6]:
# Tasks b and c are trained on the texts labelled as sexist only.
if TASK != 'a':
    dataset = dataset.filter(lambda example: example["label_sexist"] == 'sexist')

# Each task reads its target from a different column.
label_task = {'a': 'label_sexist', 'b': 'label_category', 'c': 'label_vector'}
target_column = label_task[TASK]

# Keep only the text and the target column of the selected task,
# and expose the target under the generic name `label`.
columns_to_remove = [name for name in dataset['train'].features
                     if name not in ('text', target_column)]
dataset = dataset.remove_columns(columns_to_remove).rename_column(target_column, 'label')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 42000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
})

### Encoding the labels


In [7]:
from sklearn.preprocessing import LabelEncoder

# The label vocabulary is learned from the training split only.
le = LabelEncoder()
le.fit(dataset['train']['label'])
LABELS = list(le.classes_)
NUM_LABELS = len(LABELS)

# Integer-encode the string labels of every split with the same encoder.
y_train, y_val, y_test = (
    le.transform(dataset[split_name]['label'])
    for split_name in ('train', 'val', 'test')
)

# Sanity check: show a few original labels next to their encoded values.
print(dataset['test']['label'][:5])
print(y_test[:5])


['not sexist', 'sexist', 'not sexist', 'sexist', 'sexist']
[0 1 0 1 1]


## Tokenizer

In [8]:
# Lower-case the input only for checkpoints whose name says 'uncased'.
do_lower_case = 'uncased' in model_name

# The architecture is chosen by substring match on the checkpoint name.
# Branch order matters: 'roberta-...' and 'distilbert-...' both contain
# 'bert', so they must be tested before the plain 'bert' branch.
if 'roberta' in model_name:

    from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
    tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS).to('cuda')

elif 'distilbert' in model_name:

    from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name,  do_lower_case=do_lower_case)
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS).to('cuda')

elif 'bert' in model_name:

    from transformers import BertTokenizerFast, BertForSequenceClassification
    tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=do_lower_case)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS).to('cuda')

elif 'xlnet' in model_name:

    from transformers import XLNetTokenizerFast, XLNetForSequenceClassification
    tokenizer = XLNetTokenizerFast.from_pretrained(model_name, do_lower_case=do_lower_case)
    model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS).to('cuda')

elif 'gpt2' in model_name:

    from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name, do_lower_case=do_lower_case)
    model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS).to('cuda')
    # GPT-2 has no padding token by default, but this notebook pads every
    # batch. Reuse the end-of-sequence token and tell the model about it so
    # that it can locate the last real token of each padded sequence.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

else:

    # Generic fallback. The rest of the notebook (Trainer, `.to('cuda')`,
    # return_tensors="pt") is PyTorch, so the PyTorch Auto class is required
    # here; the TensorFlow `TFAuto...` class has no `.to()` method.
    # NOTE(review): whether a sequence-classification head exists for a given
    # checkpoint (e.g. 't5-small') depends on the installed transformers version.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS).to('cuda')

print(model_name , 'tokenizers and model loaded')


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

xlnet-base-cased tokenizers and model loaded


### Maximum length

In [9]:
# Length, in tokens, of the longest text in the training split.
longest_train_text = max(len(tokenizer(text).tokens()) for text in dataset['train']['text'])
print(longest_train_text)

# The sequence length used for truncation/padding is capped at 512 tokens.
MAX_LENGTH = min(longest_train_text, 512)
print('MAX_LENGTH:', MAX_LENGTH)


149
MAX_LENGTH: 149


We define a function to preprocess the texts and apply it to the dataset. The tokenization adds new features to the dataset: `input_ids` and `attention_mask` (plus `token_type_ids` for some tokenizers, such as the XLNet one used here).

In [10]:
def tokenize_func(example):
    """Tokenize the ``text`` field of ``example`` with the global tokenizer.

    Texts longer than ``MAX_LENGTH`` tokens are truncated and padding is
    enabled. The tokenizer output is returned unchanged, so that
    ``Dataset.map`` can add its fields as new columns.
    """
    tokenizer_kwargs = dict(truncation=True, padding=True, max_length=MAX_LENGTH)
    return tokenizer(example["text"], **tokenizer_kwargs)

encoded_dataset = (
    dataset
    .map(tokenize_func, batched=True)
    # drop the raw text and the string label; the model only needs the encodings
    .remove_columns(['text', 'label'])
)

# Attach the integer-encoded labels to the splits used during training.
for split_name, encoded_labels in (('train', y_train), ('val', y_val)):
    encoded_dataset[split_name] = encoded_dataset[split_name].add_column('label', encoded_labels)

# The test split is not handed to the Trainer; it is evaluated separately
# at the end of the notebook from the raw texts.
del encoded_dataset['test']
encoded_dataset

  0%|          | 0/42 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 42000
    })
    val: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 2000
    })
})

## Model

### Defining metrics and arguments for training the model

In [11]:
# define metrics
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: pair ``(predictions, labels)``. The predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    # precision_recall_fscore_support returns (precision, recall, f1, support)
    macro_scores = precision_recall_fscore_support(gold, predicted, average='macro')
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': macro_scores[2],
        'precision': macro_scores[0],
        'recall': macro_scores[1],
    }

### Training arguments

from transformers import TrainingArguments
args = TrainingArguments(
    output_dir='./outputs/',         # checkpoints are written below this directory
    # overwrite_output_dir = True,  # If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir` points to a checkpoint directory.
    logging_dir='./logs',            # directory for storing logs

    num_train_epochs=3,              # lower to 1 for a quick trial run
    evaluation_strategy = "epoch",   # evaluate on the validation split at the end of every epoch
                                     # (alternative: "steps" together with logging_steps / save_steps)
    save_strategy = "epoch",         # save a checkpoint at the same points, as required by load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    load_best_model_at_end=True,     # after training, reload the checkpoint with the best metric below
    metric_for_best_model="f1",      # key returned by compute_metrics

    # The following arguments are used to push the model to the Hugging Face Hub.
    # push_to_hub = True, #  Whether or not to push the model to the Hub every time the model is saved. If this is activated,
            # `output_dir` will begin a git directory synced with the repo (determined by `hub_model_id`) and the content
            # will be pushed each time a save is triggered (depending on your `save_strategy`)
    # hub_model_id = model_name+'_edos_{}'.format(TASK),
    # hub_token = os.environ['HF_TOKEN'],  # never hard-code an access token in the notebook; read it from the environment

)

### Trainer

In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['val'],    # evaluated at the end of every epoch
    tokenizer=tokenizer,                    # the tokenizer is saved together with each checkpoint
    compute_metrics=compute_metrics,        # metrics reported on the validation split
)

# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
print('Training: ', model_name)
trainer.train()

***** Running training *****
  Num examples = 42000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7875
  Number of trainable parameters = 117310466
You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training:  xlnet-base-cased


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2641,0.347297,0.8585,0.814388,0.804799,0.826205
2,0.1615,0.621945,0.8725,0.821772,0.831867,0.813098
3,0.0899,0.824591,0.868,0.816976,0.823838,0.810825


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to ./outputs/checkpoint-2625
Configuration saved in ./outputs/checkpoint-2625/config.json
Model weights saved in ./outputs/checkpoint-2625/pytorch_model.bin
tokenizer config file saved in ./outputs/checkpoint-2625/tokenizer_config.json
Special tokens file saved in ./outputs/checkpoint-2625/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to ./outputs/checkpoint-5250
Configuration saved in ./outputs/checkpoint-5250/config.json
Model weights saved in ./outputs/checkpoint-5250/pytorch_model.bin
tokenizer config file saved in ./outputs/checkpoint-5250/tokenizer_config.json
Special tokens file saved in ./outputs/checkpoint-5250/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to ./outputs/checkpoint-7875
Configuration saved in ./outputs/checkpoint-7875/config.

TrainOutput(global_step=7875, training_loss=0.2043728540814112, metrics={'train_runtime': 1327.7942, 'train_samples_per_second': 94.894, 'train_steps_per_second': 5.931, 'total_flos': 9951865245729792.0, 'train_loss': 0.2043728540814112, 'epoch': 3.0})

### Save the model


In [13]:
import os

# Set to True to store the fine-tuned model and its tokenizer on Drive.
SAVE_MODEL = False
if SAVE_MODEL:
    models_dir = PATH+'models/'
    # create the target directory on first use
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # <models_dir><model_name>[_aug]_<task>
    aug_suffix = "_aug" if USE_DATA_AUGMENTED else ""
    model_path = models_dir+model_name + aug_suffix + "_{}".format(TASK)

    # save the tokenizer first, then the model
    tokenizer.save_pretrained(model_path)
    trainer.save_model(model_path)

### Evaluation on the validation set
We evaluate the best model on the validation split:

In [14]:
# Evaluate the model currently held by the trainer on the validation set.
# The training arguments set `load_best_model_at_end=True` with
# `metric_for_best_model="f1"`, so this should be the checkpoint with the best
# validation F1 rather than the one from the last epoch.
result = trainer.evaluate()
result

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'eval_loss': 0.6219448447227478,
 'eval_accuracy': 0.8725,
 'eval_f1': 0.8217724831391527,
 'eval_precision': 0.8318668781358445,
 'eval_recall': 0.8130983250974444,
 'eval_runtime': 4.2377,
 'eval_samples_per_second': 471.959,
 'eval_steps_per_second': 29.497,
 'epoch': 3.0}

## Final evaluation

We now use the model to predict the labels of the texts in the test split. These predictions are compared against the gold labels of the test split to obtain the final metrics.


In [15]:
def get_prediction(text):
    """Predict the class index of a single text with the fine-tuned model.

    Args:
        text: one input text (a single string).

    Returns:
        int: index of the most probable class; use ``LABELS[index]`` to get
        the label name.
    """
    import torch  # local import: needed only to switch off gradient tracking

    # tokenize the text and move the tensors to the device the model lives on
    inputs = tokenizer(text, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors="pt").to(model.device)
    # Inference only: make sure dropout is disabled (do not rely on the
    # Trainer having left the model in eval mode) and skip building the
    # autograd graph, which would waste time and GPU memory on every call.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # turn the logits (first element of the output) into probabilities
    probs = outputs[0].softmax(1)
    # argmax over the class dimension; `.item()` converts the one-element
    # tensor into a Python int (and raises if more than one text was passed,
    # instead of silently returning an index into the flattened batch)
    return probs.argmax(dim=1).item()

In [16]:
# Predict the label name of every test text, one text at a time.
test_texts = dataset['test']['text']
y_pred = [LABELS[get_prediction(text)] for text in test_texts]
# NOTE: from here on `y_test` holds the label names (strings) of the test
# split, replacing the integer-encoded array computed earlier.
y_test = dataset['test']['label']

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 on the test split.
report_text = classification_report(y_test, y_pred, target_names=LABELS)
print(report_text)

# Confusion matrix: rows are the true classes, columns the predicted ones.
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(test_confusion)


              precision    recall  f1-score   support

  not sexist       0.91      0.93      0.92      3030
      sexist       0.76      0.71      0.73       970

    accuracy                           0.88      4000
   macro avg       0.84      0.82      0.83      4000
weighted avg       0.87      0.88      0.87      4000

[[2819  211]
 [ 285  685]]


In [18]:
import os

# Directory on Drive where the result tables are stored; created on first use.
output_dir = PATH+'results/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Classification report on the test split as a table (one row per class/average).
report_dict = classification_report(y_true=y_test, y_pred=y_pred, target_names=LABELS, output_dict=True)
clsf_report = pd.DataFrame(report_dict).transpose()

# File name: <output_dir><model_name>[_aug]_<task>.csv
path_results = output_dir+model_name + ('_aug' if USE_DATA_AUGMENTED else '')
results_csv = path_results+'_{}.csv'.format(TASK)
clsf_report.to_csv(results_csv, index= True)
print(results_csv, ' was saved!')


/content/drive/My Drive/Colab Notebooks/proyectos/edos/results/xlnet-base-cased_aug_a.csv  was saved!
