In [1]:
!pip install --upgrade pip
!pip install sentencepiece
!pip install datasets
!pip install transformers
!pip install nlpaug



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import Dataset
from sklearn.metrics import f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import re
import nlpaug.augmenter.char as nac
from nlpaug.util.action import Action

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load and preprocess the dataset
def load_and_preprocess_data(filepath):
    data = pd.read_csv(filepath, sep='\t')
    data['text'] = data['text'].apply(lambda x: re.sub(r'<MENTION_\d+>|<URL>', '', x))
    data['labels'] = data.apply(lambda row: label_col(row), axis=1)
    return data

def label_col(row):
    if row['misogynous'] == 0:
        return 0
    elif row['misogynous'] == 1:
        return 1

In [5]:
def random_deletion(sentence):
    aug = nac.RandomCharAug(action=Action.DELETE)
    return ''.join(aug.augment(sentence))

def random_insertion(sentence):
    aug = nac.RandomCharAug(action=Action.INSERT)
    return ''.join(aug.augment(sentence))

def random_swap(sentence):
    aug = nac.RandomCharAug(action=Action.SWAP)
    return ''.join(aug.augment(sentence))

In [6]:
def augment_data(data):
    print("Before data augmentation:")
    print(data.labels.value_counts())

    random.seed(42)
    augmentation_methods = [random_deletion, random_insertion, random_swap]

    instances_to_augment = {0: 0, 1: 315}

    augmented_data = []
    for class_label, num_instances in instances_to_augment.items():
        class_data = data[data['labels'] == class_label].sample(num_instances, replace=True)

        for _, row in class_data.iterrows():
            random_augmentation = random.choice(augmentation_methods)
            augmented_text = random_augmentation(row['text'])
            augmented_data.append({'text': augmented_text, 'labels': row['labels']})

    augmented_df = pd.DataFrame(augmented_data)
    data_augmented = pd.concat([data, augmented_df], ignore_index=True)

    print("\nAfter data augmentation:")
    print(data_augmented.labels.value_counts())

    return data_augmented

In [7]:
import random

In [8]:
data = load_and_preprocess_data('/content/drive/My Drive/NLP/AMI2020/trainingset/AMI2020_training_raw_anon.tsv')
data = augment_data(data)

Before data augmentation:
0    2362
1    2047
Name: labels, dtype: int64

After data augmentation:
1    2362
0    2362
Name: labels, dtype: int64


In [9]:
# Load and split the dataset
data = load_and_preprocess_data('/content/drive/My Drive/NLP/AMI2020/trainingset/AMI2020_training_raw_anon.tsv')
train_data, val_data = train_test_split(data, test_size=0.06, random_state=17, stratify=data.labels.values)

# Convert the DataFrame to Hugging Face's Dataset format
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [10]:
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [11]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/4144 [00:00<?, ? examples/s]

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

In [12]:
pip install transformers[torch]



In [13]:
!pip install accelerate -U



In [14]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=10,
    load_best_model_at_end=True,
)

In [15]:
num_labels = len(set(data["labels"]))
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.den

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [17]:
trainer.train()



Step,Training Loss,Validation Loss
10,0.6981,0.689791
20,0.6797,0.682093
30,0.6646,0.654863
40,0.6255,0.569666
50,0.5312,0.411656
60,0.4768,0.354448
70,0.3709,0.404899
80,0.418,0.351581
90,0.2994,0.359551
100,0.3572,0.411751


TrainOutput(global_step=2590, training_loss=0.23200737491293189, metrics={'train_runtime': 4099.5608, 'train_samples_per_second': 5.054, 'train_steps_per_second': 0.632, 'total_flos': 5451661067059200.0, 'train_loss': 0.23200737491293189, 'epoch': 5.0})

In [18]:
trainer.save_model("./results/best_model")

In [19]:
val_preds_raw, val_labels, _ = trainer.predict(val_dataset)
val_preds = np.argmax(val_preds_raw, axis=-1)
print(classification_report(val_labels, val_preds, digits=3))

              precision    recall  f1-score   support

           0      0.915     0.908     0.912       142
           1      0.895     0.902     0.899       123

    accuracy                          0.906       265
   macro avg      0.905     0.905     0.905       265
weighted avg      0.906     0.906     0.906       265



In [20]:
test_data = load_and_preprocess_data('/content/drive/My Drive/NLP/AMI2020/testset/AMI2020_test_raw_gold_anon.tsv')
test_dataset = Dataset.from_pandas(test_data)
test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [21]:
test_preds_raw, test_labels, _ = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_raw, axis=-1)
weighted_f1_test = f1_score(test_labels, test_preds, average='weighted')
print(f'Weighted F1 Score for the test set: {weighted_f1_test}')

Weighted F1 Score for the test set: 0.7975372063156305


In [24]:
!pip install torch



In [25]:
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [26]:
MAX_LEN = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
def test_model():
    df = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/testset/AMI2020_test_raw_gold_anon.tsv', delimiter='\t', header=0, names=['id', 'sentence', 'label', 'aggressiveness'])
    sentences = df.sentence.values
    labels = df.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(
            sent,
            add_special_tokens=True,
        )
        input_ids.append(encoded_sent)

    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=1)

    attention_masks = [[float(i > 1) for i in seq] for seq in input_ids]

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    batch_size = 16

    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    model.eval()

    predictions, true_labels = [], []

    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend(logits)
        true_labels.extend(label_ids)

    pred_flat = np.argmax(predictions, axis=1).flatten()

    def eval_accuracy(a, b):
        true_pred = [j for i, j in zip(a, b) if i == j]
        accuracy = len(true_pred) / len(a)
        return accuracy

    print(eval_accuracy(true_labels, pred_flat))

In [27]:
test_model()

Predicting labels for 1,000 test sentences...
0.732
