In [None]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install datasets
!pip install transformers
!pip install nlpaug



In [None]:
import random
import re

import nlpaug.augmenter.char as nac
import numpy as np
import pandas as pd
from datasets import Dataset
from nlpaug.util import Action
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset paths used
# later in the notebook live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load and preprocess the dataset
def load_and_preprocess_data(filepath):
    """Read a tab-separated file, clean its text column and attach class labels.

    Placeholder tokens of the form ``<MENTION_n>`` and ``<URL>`` are removed
    from every value of the ``text`` column, and a ``labels`` column is added
    by applying ``label_col`` to each row.

    Args:
        filepath: Path of the TSV file. It must provide a ``text`` column plus
            the columns that ``label_col`` reads.

    Returns:
        The loaded DataFrame with ``text`` cleaned and ``labels`` added.
    """
    placeholder_pattern = re.compile(r'<MENTION_\d+>|<URL>')

    frame = pd.read_csv(filepath, sep='\t')
    frame['text'] = frame['text'].apply(
        lambda raw_text: placeholder_pattern.sub('', raw_text)
    )
    # Row-wise application: label_col receives one row (a Series) at a time.
    frame['labels'] = frame.apply(label_col, axis=1)
    return frame

def label_col(row):
    """Collapse a row's two binary annotations into a single class id.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            keys ``'misogynous'`` and ``'aggressiveness'``.

    Returns:
        0 when misogynous == 0 and aggressiveness == 0,
        1 when misogynous == 1 and aggressiveness == 0,
        2 when misogynous == 1 and aggressiveness == 1.

    Raises:
        ValueError: For any other combination (e.g. misogynous == 0 with
            aggressiveness == 1, or missing values). Previously such rows fell
            through every branch and implicitly returned ``None``, which
            silently produced NaN labels further down the pipeline.
    """
    misogynous = row['misogynous']
    aggressive = row['aggressiveness']

    if misogynous == 0 and aggressive == 0:
        return 0
    if misogynous == 1 and aggressive == 0:
        return 1
    if misogynous == 1 and aggressive == 1:
        return 2

    # Fail loudly instead of returning None for an unmapped combination.
    raise ValueError(
        f"Unsupported annotation combination: misogynous={misogynous!r}, "
        f"aggressiveness={aggressive!r}"
    )

In [None]:
def random_deletion(sentence):
    """Run ``sentence`` through a ``RandomCharAug`` configured with ``Action.DELETE``.

    The augmenter's output is concatenated with ``''.join`` and returned as a
    single string.
    """
    deleter = nac.RandomCharAug(action=Action.DELETE)
    pieces = deleter.augment(sentence)
    return ''.join(pieces)

def random_insertion(sentence):
    """Run ``sentence`` through a ``RandomCharAug`` configured with ``Action.INSERT``.

    The augmenter's output is concatenated with ``''.join`` and returned as a
    single string.
    """
    inserter = nac.RandomCharAug(action=Action.INSERT)
    pieces = inserter.augment(sentence)
    return ''.join(pieces)

def random_swap(sentence):
    """Run ``sentence`` through a ``RandomCharAug`` configured with ``Action.SWAP``.

    The augmenter's output is concatenated with ``''.join`` and returned as a
    single string.
    """
    swapper = nac.RandomCharAug(action=Action.SWAP)
    pieces = swapper.augment(sentence)
    return ''.join(pieces)

In [None]:
def augment_data(data, instances_to_augment=None, seed=42):
    """Oversample under-represented classes with character-level noise.

    For every class to be topped up, rows are drawn with replacement, the text
    of each drawn row is passed through one of ``random_deletion``,
    ``random_insertion`` or ``random_swap`` (picked at random), and the new
    rows are appended to ``data``. Class counts are printed before and after.

    Args:
        data: DataFrame with a ``text`` column and a ``labels`` column.
        instances_to_augment: Optional ``{class_label: n_new_rows}`` mapping.
            When omitted, each class is topped up to the size of the largest
            class. The previously hard-coded ``{1: 1884, 2: 793}`` is exactly
            that deficit for class counts of 2362 / 478 / 1569.
        seed: Seed for Python's ``random`` module and for the pandas row
            sampling. Earlier only ``random`` was seeded, so the rows drawn by
            ``DataFrame.sample`` differed from run to run.

    Returns:
        A new DataFrame holding the rows of ``data`` followed by the augmented
        rows, re-indexed from 0. Augmented rows carry only ``text`` and
        ``labels``; other columns of ``data`` are left missing for them.
    """
    print("Before data augmentation:")
    class_counts = data.labels.value_counts()
    print(class_counts)

    random.seed(seed)
    augmentation_methods = [random_deletion, random_insertion, random_swap]

    if instances_to_augment is None:
        # Balance by default: add to each class whatever it lacks relative to
        # the largest one.
        largest = class_counts.max()
        instances_to_augment = {
            label: int(largest - count)
            for label, count in class_counts.items()
            if count < largest
        }

    augmented_data = []
    for class_label, num_instances in instances_to_augment.items():
        class_texts = data.loc[data['labels'] == class_label, 'text']
        # random_state pins which rows are drawn; replace=True allows drawing
        # more rows than the class contains.
        sampled_texts = class_texts.sample(
            num_instances, replace=True, random_state=seed
        )

        for text in sampled_texts:
            random_augmentation = random.choice(augmentation_methods)
            augmented_data.append(
                {'text': random_augmentation(text), 'labels': class_label}
            )

    augmented_df = pd.DataFrame(augmented_data)
    data_augmented = pd.concat([data, augmented_df], ignore_index=True)

    print("\nAfter data augmentation:")
    print(data_augmented.labels.value_counts())

    return data_augmented

In [None]:
import random

In [None]:
# Load the training TSV, then top up the smaller classes with augmented rows.
# NOTE(review): augmentation is applied to the whole frame here, before the
# train/validation split performed later in the notebook, so augmented variants
# of a row can end up on both sides of that split. Consider splitting first and
# augmenting only the training part.
data = load_and_preprocess_data('/content/drive/My Drive/NLP/AMI2020/trainingset/AMI2020_training_raw_anon.tsv')
data = augment_data(data)

Before data augmentation:
0    2362
2    1569
1     478
Name: labels, dtype: int64

After data augmentation:
2    2362
1    2362
0    2362
Name: labels, dtype: int64


In [None]:
# Hold out 6% of the rows for validation, stratified on the label values;
# random_state fixes the split. `data` at this point is the frame returned by
# augment_data, i.e. it already contains the augmented rows.
train_data, val_data = train_test_split(data, test_size=0.06, random_state=17, stratify=data.labels.values)

# Convert the DataFrame to Hugging Face's Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [None]:
# Checkpoint name shared by the tokenizer loaded here and the classifier
# loaded later in the notebook.
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Every sequence is truncated to, and padded up to, 512 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

# Tokenize both splits in batches, then expose only input_ids, attention_mask
# and labels, formatted as torch tensors.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/6660 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [None]:
!pip install transformers[torch]



In [None]:
!pip install accelerate -U



In [None]:
# Training configuration: 5 epochs, batch size 8 per device for both training
# and evaluation, with logging and evaluation every 10 steps.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    # No save_steps is given, so the library's default checkpoint interval applies.
    save_strategy="steps",
    # NOTE(review): the validation set is evaluated every 10 steps, which is
    # frequent for a run that the recorded output shows taking 4165 steps —
    # confirm this is intended.
    eval_steps=10,
    load_best_model_at_end=True,
)

In [None]:
# Size the classification head from the number of distinct label values in
# `data`, then load the checkpoint named by MODEL with that many labels.
num_labels = len(set(data["labels"]))
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.den

In [None]:
# Wire the model, the training arguments and the two tokenized splits into a
# Trainer. No compute_metrics callback is passed; the recorded evaluation log
# shows training and validation loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run fine-tuning with the configuration above (long-running cell: the recorded
# run reports a train_runtime of about 8870 seconds).
trainer.train()



Step,Training Loss,Validation Loss
10,1.0802,1.098355
20,1.0867,1.087128
30,1.0451,1.026417
40,0.961,0.895092
50,0.8828,0.761113
60,0.7693,0.671895
70,0.6519,0.620605
80,0.8369,0.620529
90,0.6582,0.649222
100,0.6539,0.676145


TrainOutput(global_step=4165, training_loss=0.3277222875990913, metrics={'train_runtime': 8869.6868, 'train_samples_per_second': 3.754, 'train_steps_per_second': 0.47, 'total_flos': 8761676810342400.0, 'train_loss': 0.3277222875990913, 'epoch': 5.0})

In [None]:
# Persist the trainer's current model under ./results/best_model.
# Presumably this is the best checkpoint, since load_best_model_at_end=True is
# set in the training arguments — confirm.
trainer.save_model("./results/best_model")

In [None]:
# Predict on the validation split, take the arg-max over the last axis of the
# raw predictions as the predicted class, and print a per-class
# precision/recall/F1 report.
val_preds_raw, val_labels, _ = trainer.predict(val_dataset)
val_preds = np.argmax(val_preds_raw, axis=-1)
print(classification_report(val_labels, val_preds, digits=3))

              precision    recall  f1-score   support

           0      0.900     0.887     0.894       142
           1      0.799     0.866     0.831       142
           2      0.780     0.725     0.752       142

    accuracy                          0.826       426
   macro avg      0.826     0.826     0.826       426
weighted avg      0.826     0.826     0.826       426



In [None]:
# Load the gold-labelled test TSV through the same preprocessing as the
# training data (no augmentation), tokenize it, and expose the same three
# columns as torch tensors.
test_data = load_and_preprocess_data('/content/drive/My Drive/NLP/AMI2020/testset/AMI2020_test_raw_gold_anon.tsv')
test_dataset = Dataset.from_pandas(test_data)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Predict on the test set and report the weighted F1 score.
# NOTE(review): the recorded test score (about 0.49) is far below the recorded
# validation scores (about 0.83). The validation split was drawn after
# augmentation, which may inflate the validation numbers — confirm before
# relying on them.
test_preds_raw, test_labels, _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_raw, axis=-1)
weighted_f1_test = f1_score(test_labels, test_preds, average='weighted')
print(f'Weighted F1 Score for the test set: {weighted_f1_test}')

Weighted F1 Score for the test set: 0.49229741973928015
