In [1]:
!pip install folium urllib3 transformers keras tensorflow


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m124.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [16]:
import os
import re
import pandas as pd
import torch
import numpy as np
import random
import time
import datetime
from google.colab import drive
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
source_folder = '/content/drive/My Drive/NLP/AMI2020/'
destination_folder = '/content/drive/My Drive/ami_umberto/'

MAX_LEN = 128
batch_size = 16
epochs = 8

In [18]:
def label_col(row):
    if row['misogynous'] == 0 and row['aggressiveness'] == 0:
        return 0
    elif row['misogynous'] == 1 and row['aggressiveness'] == 0:
        return 1
    elif row['misogynous'] == 1 and row['aggressiveness'] == 1:
        return 2

In [25]:
def load_and_preprocess_data(source_folder):
    df = pd.read_csv(source_folder + "trainingset/AMI2020_training_raw_anon.tsv", delimiter='\t', header=0, names=['id', 'text', 'misogynous', 'aggressiveness'])
    df['text'] = df['text'].apply(lambda x: re.sub(r'<MENTION_\d+>|<URL>', '', x))
    df['labels'] = df.apply(label_col, axis=1)

    sentences = df.text.values
    labels = df.labels.values

    tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")

    input_ids = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=1, truncating="post", padding="post")

    attention_masks = [[int(token_id > 1) for token_id in sent] for sent in input_ids]

    return input_ids, attention_masks, labels, tokenizer

In [20]:
def split_data(input_ids, attention_masks, labels):
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
    train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1)

    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)

    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)

    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    return train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks

In [21]:
def create_dataloaders(train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, batch_size):
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    return train_dataloader, validation_dataloader

In [22]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [23]:
def train_model(train_dataloader, validation_dataloader, epochs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = RobertaForSequenceClassification.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1", num_labels=3)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch_i in range(0, epochs):
        print(f'======== Epoch {epoch_i + 1} / {epochs} ========')

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_input_mask, b_labels = batch
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            model.zero_grad()

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            total_train_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print(f'Average training loss: {avg_train_loss}')
        print(f'Training epoch took: {training_time}')

        print('Running Validation...')

        t0 = time.time()
        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = batch
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs[0]
            logits = outputs[1]

            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        validation_time = format_time(time.time() - t0)

        print(f'Validation Loss: {avg_val_loss}')
        print(f'Validation Accuracy: {avg_val_accuracy}')
        print(f'Validation took: {validation_time}')

    return model

In [26]:
input_ids, attention_masks, labels, tokenizer = load_and_preprocess_data(source_folder)
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = split_data(input_ids, attention_masks, labels)
train_dataloader, validation_dataloader = create_dataloaders(train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, batch_size)
model = train_model(train_dataloader, validation_dataloader, epochs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/794k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

You are using a model of type camembert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at Musixmatch/umberto-commoncrawl-cased-v1 were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at Musixmatch/umberto-commoncrawl-cased-v1 and are new

Average training loss: 0.6649097208414347
Training epoch took: 0:01:21
Running Validation...
Validation Loss: 0.6634555943310261
Validation Accuracy: 0.7420634920634921
Validation took: 0:00:03
Average training loss: 0.44151590450576716
Training epoch took: 0:01:25
Running Validation...
Validation Loss: 0.42681045377893106
Validation Accuracy: 0.8353174603174603
Validation took: 0:00:03
Average training loss: 0.33647329675694626
Training epoch took: 0:01:25
Running Validation...
Validation Loss: 0.4247871046619756
Validation Accuracy: 0.8236607142857143
Validation took: 0:00:03
Average training loss: 0.2547282184429106
Training epoch took: 0:01:25
Running Validation...
Validation Loss: 0.45875960561845985
Validation Accuracy: 0.8482142857142857
Validation took: 0:00:03
Average training loss: 0.18150321719792462
Training epoch took: 0:01:25
Running Validation...
Validation Loss: 0.4878888276538679
Validation Accuracy: 0.8504464285714286
Validation took: 0:00:03
Average training loss: 0.

In [33]:
def test_model():
    df = pd.read_csv(source_folder + "testset/AMI2020_test_raw_gold_anon.tsv", delimiter='\t', header=0, names=['id', 'sentence', 'misogynous', 'aggressiveness'])
    df['sentence'] = df['sentence'].apply(lambda x: re.sub(r'<MENTION_\d+>|<URL>', '', x))
    df['labels'] = df.apply(label_col, axis=1)

    sentences = df.sentence.values
    labels = df.labels.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(
            sent,
            add_special_tokens=True,
        )
        input_ids.append(encoded_sent)

    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=1)

    attention_masks = [[float(i > 1) for i in seq] for seq in input_ids]

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    batch_size = 16

    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    model.eval()

    predictions, true_labels = [], []

    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend(logits)
        true_labels.extend(label_ids)

    pred_flat = np.argmax(predictions, axis=1).flatten()

    def f1_score_func(preds, labels):
        preds_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels
        return f1_score(labels_flat, preds_flat, average='weighted')

    print(f1_score_func(predictions, true_labels))

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
test_model()

Predicting labels for 1,000 test sentences...
0.6095406904285985
