In [None]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
!pip install transformers
!pip install bert-tensorflow
from transformers import AdamW, AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
# Mount Google Drive (Colab only) so that the '/content/drive/...' paths
# read and written by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load data
# Tab-separated training file; the cells below read its 'text',
# 'misogynous' and 'aggressiveness' columns.
data = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/trainingset/AMI2020_training_raw_anon.tsv', sep='\t')


In [None]:
import re

In [None]:
# Strip the anonymisation placeholders (<MENTION_n> and <URL>) from every text
data['text'] = data['text'].map(
    lambda tweet: re.sub(r'<MENTION_\d+>|<URL>', '', tweet)
)

In [None]:
# Function to create label column
def label_col(row):
    """Collapse the two binary annotations of a row into one class id.

    Mapping of (misogynous, aggressiveness) to the returned label:
        (0, 0) -> 0
        (1, 0) -> 1
        (1, 1) -> 2

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            'misogynous' and 'aggressiveness' fields.

    Returns:
        int: The class id, one of 0, 1 or 2.

    Raises:
        ValueError: If the pair is not one of the three combinations above.
            Previously such rows fell through and returned None, which
            silently turned into NaN labels further down the pipeline.
    """
    misogynous = row['misogynous']
    aggressiveness = row['aggressiveness']

    if misogynous == 0 and aggressiveness == 0:
        return 0
    if misogynous == 1 and aggressiveness == 0:
        return 1
    if misogynous == 1 and aggressiveness == 1:
        return 2

    raise ValueError(
        'Unsupported annotation pair: misogynous={!r}, aggressiveness={!r}'.format(
            misogynous, aggressiveness
        )
    )

# Class id -> [misogynous, aggressiveness] pair that the id stands for
label_dict = {
    0: [0, 0],
    1: [1, 0],
    2: [1, 1],
}

# Derive the single class-id column from the two annotation columns
data['labels'] = data.apply(label_col, axis=1)

In [None]:
# Stratified hold-out split performed on the row index: 6% of the rows are
# reserved for validation, with class proportions preserved in both parts.
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data['labels'].values,
    stratify=data['labels'].values,
    test_size=0.06,
    random_state=17,
)

In [None]:
# Tag every row with the split it was assigned to ('not_set' until tagged)
data['data_type'] = 'not_set'
data.loc[X_val, 'data_type'] = 'val'
data.loc[X_train, 'data_type'] = 'train'

In [None]:
# Tokenization
# Load the tokenizer that belongs to the checkpoint fine-tuned below (the same
# model id is passed to BertForSequenceClassification); input is lower-cased.
tokenizer = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", do_lower_case=True)

def encode_data(dataset, max_length=256):
    """Tokenize the ``text`` column of ``dataset`` into PyTorch tensors.

    Args:
        dataset: Object exposing a ``text`` attribute with a ``.values``
            array of strings (a DataFrame in this notebook).
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        The tokenizer's batch encoding, containing 'input_ids' and
        'attention_mask' tensors padded to the longest sequence in the batch.
    """
    texts = list(dataset.text.values)
    encode_options = dict(
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer.batch_encode_plus(texts, **encode_options)

# Encode each split separately, selecting rows by their 'data_type' tag
encoded_data_train = encode_data(data.loc[data['data_type'] == 'train'])
encoded_data_val = encode_data(data.loc[data['data_type'] == 'val'])

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

In [None]:
# Preparing the datasets
def create_dataset(encoded_data, labels):
    """Bundle encoded inputs and their labels into a ``TensorDataset``.

    Args:
        encoded_data: Mapping with 'input_ids' and 'attention_mask' tensors.
        labels: Sequence of class ids, converted with ``torch.tensor``.

    Returns:
        TensorDataset whose items are (input_ids, attention_mask, label).
    """
    return TensorDataset(
        encoded_data['input_ids'],
        encoded_data['attention_mask'],
        torch.tensor(labels),
    )

# Build one TensorDataset per split; labels are taken from the same row
# selection that was encoded, so inputs and labels stay aligned.
dataset_train = create_dataset(
    encoded_data_train,
    data.loc[data['data_type'] == 'train', 'labels'].values,
)
dataset_val = create_dataset(
    encoded_data_val,
    data.loc[data['data_type'] == 'val', 'labels'].values,
)
print("Train dataset length: {}\nValidation dataset length: {}".format(len(dataset_train), len(dataset_val)))

Train dataset length: 4144
Validation dataset length: 265


In [None]:
# Creating data loaders
batch_size = 16

# Training batches are drawn in a fresh random order every epoch.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size)

# Validation data is only scored, never learned from, so shuffling buys
# nothing: iterate in a fixed order. This keeps evaluation deterministic and
# avoids drawing from the global RNG between training epochs.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size)

In [None]:
# Initializing the model
# Three output classes, matching the ids produced by label_col (0, 1, 2).
model = BertForSequenceClassification.from_pretrained(
    "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0",
    num_labels=3,
)

# Setting up the optimizer and scheduler
epochs = 8
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# Linear decay of the learning rate over all optimisation steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(dataloader_train),
)

Downloading pytorch_model.bin:   0%|          | 0.00/740M [00:00<?, ?B/s]

Some weights of the model checkpoint at m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [None]:
import random

# Setting the random seed
# Seed every RNG the training loop may draw from (Python, NumPy, torch CPU
# and all CUDA devices).
# NOTE(review): the model is instantiated in an earlier cell, i.e. before
# these seeds are set, so any randomly initialised weights are presumably not
# covered by this seed - confirm and consider seeding before model creation.
seed_val = 17
torch.cuda.manual_seed_all(seed_val)
torch.manual_seed(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

# Setting the device
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [None]:
# Function to calculate F1 score
def f1_score_func(preds, labels):
    """Weighted F1 between argmax predictions and the true class ids.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the
            predicted class is the argmax along axis 1.
        labels: Array of true class ids (flattened before scoring).

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

# Function to calculate accuracy per class
def accuracy_per_class(preds, labels, class_names=None):
    """Print, for each class present in ``labels``, the correct/total count.

    The previous implementation tried to invert ``label_dict``; its values
    are lists (unhashable), so the inversion raised TypeError, and the
    inverse would have been keyed by annotation pairs rather than by the
    integer class ids used for the lookup. ``label_dict`` is already keyed
    by class id, so it is used directly.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the
            predicted class is the argmax along axis 1.
        labels: Array-like of true integer class ids.
        class_names: Optional mapping from class id to the value shown after
            'Class:'. Defaults to the module-level ``label_dict``.

    Returns:
        None. Results are written to stdout.
    """
    if class_names is None:
        class_names = label_dict

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()

    for label in np.unique(labels_flat):
        class_mask = labels_flat == label
        n_correct = int(np.sum(preds_flat[class_mask] == label))
        n_total = int(np.sum(class_mask))
        # int() so the lookup works regardless of the NumPy scalar type
        print(f'Class: {class_names[int(label)]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [None]:
# Evaluation function
def evaluate(dataloader_val):
    """Score the global ``model`` on every batch of ``dataloader_val``.

    Args:
        dataloader_val: Iterable of batches whose first three items are the
            input ids, the attention mask and the labels.

    Returns:
        Tuple ``(loss_val_avg, predictions, true_vals)``: the loss averaged
        over the number of batches, the logits of all samples stacked along
        axis 0, and the matching true labels stacked the same way.
    """
    model.eval()

    batch_losses = []
    logits_per_batch = []
    labels_per_batch = []

    for batch in dataloader_val:
        batch = tuple(tensor.to(device) for tensor in batch)
        labels = batch[2]

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            labels=labels)

        # With labels supplied, index 0 is the loss and index 1 the logits
        batch_losses.append(outputs[0].item())
        logits_per_batch.append(outputs[1].detach().cpu().numpy())
        labels_per_batch.append(labels.cpu().numpy())

    loss_val_avg = sum(batch_losses) / len(dataloader_val)

    predictions = np.concatenate(logits_per_batch, axis=0)
    true_vals = np.concatenate(labels_per_batch, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# Training loop
# For every epoch: one pass over the training batches, a checkpoint written
# to Drive, then loss and weighted F1 on the validation set.
# NOTE(review): in the recorded run the validation loss rises every epoch
# while the training loss falls; consider keeping the best checkpoint rather
# than the last one.
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    print('-' * 10)

    model.train()

    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Training', leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as is. It was previously divided by len(batch),
        # which is the number of tensors in the batch tuple (3), not the
        # number of samples, so the displayed value was off by that factor.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    torch.save(model.state_dict(), f'/content/drive/My Drive/NLP/AMI2020/finetuned_BERT_epoch_{epoch}.model')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, val_preds, val_true = evaluate(dataloader_val)

    val_f1 = f1_score_func(val_preds, val_true)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write('\n')

Epoch 1/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.3847348855729269
Validation loss: 0.4453894837814219
F1 Score (Weighted): 0.8010428647581788
Epoch 2/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.29292363911493174
Validation loss: 0.500819722080932
F1 Score (Weighted): 0.8269712949381672
Epoch 3/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.2201423984143207
Validation loss: 0.5272946478251148
F1 Score (Weighted): 0.831588565324935
Epoch 4/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.16412941092970462
Validation loss: 0.594731108230703
F1 Score (Weighted): 0.8158554504549226
Epoch 5/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.13244261997428267
Validation loss: 0.6235330920009052
F1 Score (Weighted): 0.8314573245100522
Epoch 6/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.11053570465842787
Validation loss: 0.7037388445699916
F1 Score (Weighted): 0.8164590402023263
Epoch 7/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.09453089079328787
Validation loss: 0.7201446193708655
F1 Score (Weighted): 0.8150061850541607
Epoch 8/8
----------


Training:   0%|          | 0/259 [00:00<?, ?it/s]




Training loss: 0.08262101871197015
Validation loss: 0.7624742850825629
F1 Score (Weighted): 0.8150061850541607


In [None]:
data_test = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/testset/AMI2020_test_raw_gold_anon.tsv', sep='\t')

# Apply the same placeholder clean-up that the training text received, so the
# model sees test inputs preprocessed the way it was trained.
data_test['text'] = data_test['text'].apply(lambda x: re.sub(r'<MENTION_\d+>|<URL>', '', x))

# Encode the test set with the helper used for the train/validation splits:
# explicit truncation at 256 tokens and padding to the longest sequence
# (replaces the deprecated pad_to_max_length flag; padded positions are
# masked out by the attention mask either way).
encoded_data_test = encode_data(data_test)



In [None]:
# Pull the token ids and attention masks out of the encoded test set
input_ids_test, attention_masks_test = (
    encoded_data_test[key] for key in ('input_ids', 'attention_mask')
)

# No label tensor here: each item is an (input_ids, attention_mask) pair
dataset_test = TensorDataset(input_ids_test, attention_masks_test)

In [None]:
# No sampler or batch_size is given, so the DataLoader defaults apply
# (per the PyTorch docs: batch_size=1, no shuffling). The row order therefore
# matches data_test, which the later comparison against the gold labels needs.
dataloader_test = DataLoader(dataset_test)

In [None]:
# Runs the model over the test batches and collects the raw logits
def predict(dataset_test):
    """Return the logits of the global ``model`` for every batch.

    Args:
        dataset_test: Iterable of batches whose first two items are the
            input ids and the attention mask (in this notebook it is called
            with a DataLoader, despite the parameter name).

    Returns:
        List with one NumPy array of logits per batch, in iteration order.
    """
    # Make sure dropout is disabled. Previously this relied on evaluate()
    # having been the last call to change the model's mode.
    model.eval()

    predictions = []

    for row in dataset_test:
        row = tuple(r.to(device) for r in row)

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model(input_ids=row[0], attention_mask=row[1])

        # No labels are passed, so index 0 holds the logits
        predictions.append(outputs[0].detach().cpu().numpy())

    return predictions

# Predict values for test dataset
# `predictions` is a list with one array of logits per batch of dataloader_test.
predictions = predict(dataloader_test)

In [None]:
# One predicted class id per test row. Every row of every batch is kept:
# the previous version took only element [0] of each batch, which is correct
# only while the test DataLoader yields batches of size 1.
results = []
for batch_logits in predictions:
    results.extend(np.argmax(batch_logits, axis=1))

In [None]:
# Class id -> [misogynous, aggressiveness] pair
label_dict = {0: [0, 0], 1: [1, 0], 2: [1, 1]}

# Split every predicted class id back into its two binary annotations
misogynous = [label_dict[class_id][0] for class_id in results]
aggressiveness = [label_dict[class_id][1] for class_id in results]

In [None]:
# Reload the gold test file and derive the true class id of every row
data_test = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/testset/AMI2020_test_raw_gold_anon.tsv', sep='\t')
data_test['labels'] = data_test.apply(label_col, axis=1)
true_labels_test = data_test['labels'].values

# Stack the per-batch logits into a single (samples x classes) array
predictions_array = np.concatenate(predictions, axis=0)

# Weighted F1 of the argmax predictions against the gold labels
weighted_f1_test = f1_score_func(predictions_array, true_labels_test)
print(f'Weighted F1 Score for the test set: {weighted_f1_test}')

Weighted F1 Score for the test set: 0.6418865955939
