In [1]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
!pip install transformers
!pip install bert-tensorflow
from transformers import AdamW, AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load data
data = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/trainingset/AMI2020_training_raw_anon.tsv', sep='\t')


In [5]:
import re

In [6]:
# Remove <MENTION_N> and <URL>
data['text'] = data['text'].apply(lambda x: re.sub(r'<MENTION_\d+>|<URL>', '', x))

In [7]:
# Function to create label column
def label_col(row):
    if row['misogynous'] == 0 and row['aggressiveness'] == 0:
        return 0
    elif row['misogynous'] == 1 and row['aggressiveness'] == 0:
        return 1
    elif row['misogynous'] == 1 and row['aggressiveness'] == 1:
        return 2

# Creating the labels
data['labels'] = data.apply(label_col, axis=1)
label_dict = {0: [0,0], 1: [1,0], 2: [1,1]}

In [8]:
print("Before data augmentation:")
print(data.labels.value_counts())

Before data augmentation:
0    2362
2    1569
1     478
Name: labels, dtype: int64


In [9]:
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [10]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from nlpaug.util import Action
import random

In [11]:
def random_deletion(sentence):
    aug = nac.RandomCharAug(action=Action.DELETE)
    return aug.augment(sentence)

def random_insertion(sentence):
    aug = nac.RandomCharAug(action=Action.INSERT)
    return aug.augment(sentence)

def random_swap(sentence):
    aug = nac.RandomCharAug(action=Action.SWAP)
    return aug.augment(sentence)

In [12]:
random.seed(42)
augmentation_methods = [random_deletion, random_insertion, random_swap]

In [13]:
instances_to_augment = {1: 1884, 2: 793}

In [14]:
augmented_data = []
for class_label, num_instances in instances_to_augment.items():
    class_data = data[data['labels'] == class_label].sample(num_instances, replace=True)

    for _, row in class_data.iterrows():
        random_augmentation = random.choice(augmentation_methods)
        augmented_text = random_augmentation(row['text'])
        augmented_data.append({'text': augmented_text, 'labels': row['labels']})

augmented_df = pd.DataFrame(augmented_data)
data_augmented = pd.concat([data, augmented_df], ignore_index=True)

In [15]:
print("\nAfter data augmentation:")
print(data_augmented.labels.value_counts())

data = data_augmented


After data augmentation:
2    2362
1    2362
0    2362
Name: labels, dtype: int64


In [16]:
# Splitting the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data.labels.values,
    test_size=0.06,
    random_state=17,
    stratify=data.labels.values
)

In [17]:
# Assigning data types
data['data_type'] = ['not_set'] * data.shape[0]
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'

In [18]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", do_lower_case=True)

def encode_data(dataset, max_length=256):
    input_texts = [str(text) for text in dataset.text.values]
    return tokenizer.batch_encode_plus(
        input_texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

encoded_data_train = encode_data(data[data.data_type == 'train'])
encoded_data_val = encode_data(data[data.data_type == 'val'])

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

In [19]:
# Preparing the datasets
def create_dataset(encoded_data, labels):
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    torch_labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, torch_labels)

dataset_train = create_dataset(encoded_data_train, data[data.data_type == 'train'].labels.values)
dataset_val = create_dataset(encoded_data_val, data[data.data_type == 'val'].labels.values)
print("Train dataset length: {}\nValidation dataset length: {}".format(len(dataset_train), len(dataset_val)))

Train dataset length: 6660
Validation dataset length: 426


In [20]:
# Creating data loaders
batch_size = 16

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=batch_size)

In [21]:
# Initializing the model
model = BertForSequenceClassification.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", num_labels=3)

# Setting up the optimizer and scheduler
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8
)
epochs = 8
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

Downloading pytorch_model.bin:   0%|          | 0.00/740M [00:00<?, ?B/s]

Some weights of the model checkpoint at m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [22]:
import random

# Setting the random seed
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Setting the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [23]:
# Function to calculate F1 score
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

# Function to calculate accuracy per class
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label])}/{len(y_true)}\n')

In [24]:
# Evaluation function
def evaluate(dataloader_val):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [25]:
# Training loop
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    print('-' * 10)

    model.train()

    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Training', leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    torch.save(model.state_dict(), f'/content/drive/My Drive/NLP/AMI2020/finetuned_BERT_epoch_{epoch}.model')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, val_preds, val_true = evaluate(dataloader_val)

    val_f1 = f1_score_func(val_preds, val_true)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write('\n')

Epoch 1/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.5366241974438981
Validation loss: 0.4882253698176808
F1 Score (Weighted): 0.7986153908783482


Epoch 2/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.37407454770567605
Validation loss: 0.5105659663677216
F1 Score (Weighted): 0.8161464484961525


Epoch 3/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.2558691268094438
Validation loss: 0.5657373727471741
F1 Score (Weighted): 0.7921274736874888


Epoch 4/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.17534846975858942
Validation loss: 0.634133967122546
F1 Score (Weighted): 0.8254647803400171


Epoch 5/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.1181110858493451
Validation loss: 0.7922548692397497
F1 Score (Weighted): 0.8342631888731179


Epoch 6/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.08057924998869419
Validation loss: 0.8590414661017282
F1 Score (Weighted): 0.8438867352365909


Epoch 7/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.06221298352249605
Validation loss: 0.9292900622132476
F1 Score (Weighted): 0.8462179899506856


Epoch 8/8
----------


Training:   0%|          | 0/417 [00:00<?, ?it/s]

Training loss: 0.04920718172979396
Validation loss: 0.9333379419899925
F1 Score (Weighted): 0.8437678398751064




In [26]:
data_test = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/testset/AMI2020_test_raw_gold_anon.tsv', sep='\t')

# Encode validation dataset using the tokenizer
encoded_data_test = tokenizer.batch_encode_plus(
    list(data_test.text.values),
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [27]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

dataset_test = TensorDataset(input_ids_test, attention_masks_test)

In [28]:
dataloader_test = DataLoader(dataset_test)

In [29]:
# Evaluates the model using the validation set
def predict(dataset_test):
    predictions = []

    for row in dataset_test:
      row = tuple(r.to(device) for r in row)
      inputs = {'input_ids': row[0],
        'attention_mask': row[1]
        }

      with torch.no_grad():
          outputs = model(**inputs)

      logits = outputs[0]
      logits = logits.detach().cpu().numpy()

      predictions.append(logits)


    return predictions

# Predict values for test dataset
predictions = predict(dataloader_test)

In [30]:
results = []
for i, prediction in enumerate(predictions):
  predicted = np.argmax(prediction, axis=1)[0]
  results.append(predicted)

In [31]:
label_dict = {0: [0,0],
              1: [1,0],
              2: [1,1]}

misogynous, aggressiveness = [], []
for prediction in results:
  misogynous.append(label_dict[prediction][0])
  aggressiveness.append(label_dict[prediction][1])

In [32]:
data_test = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/testset/AMI2020_test_raw_gold_anon.tsv', sep='\t')
data_test['labels'] = data_test.apply(lambda row: label_col(row), axis=1)
true_labels_test = data_test.labels.values

# Convert the predictions list to a numpy array
predictions_array = np.concatenate(predictions, axis=0)

# Calculate the weighted F1 score for the test set
weighted_f1_test = f1_score_func(predictions_array, true_labels_test)
print(f'Weighted F1 Score for the test set: {weighted_f1_test}')

Weighted F1 Score for the test set: 0.5798430001796134
