In [1]:
!pip install folium urllib3 transformers keras tensorflow


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [2]:
import os
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
import random
import re
from google.colab import drive
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
source_folder = '/content/drive/My Drive/NLP/AMI2020/'
destination_folder = '/content/drive/My Drive/ami_umberto/'

MAX_LEN = 128
batch_size = 16
epochs = 8

In [5]:
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [6]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from nlpaug.util import Action

In [13]:
def load_and_preprocess_data(source_folder):

    def random_deletion(sentence):
        aug = nac.RandomCharAug(action=Action.DELETE)
        return aug.augment(sentence)

    def random_insertion(sentence):
        aug = nac.RandomCharAug(action=Action.INSERT)
        return aug.augment(sentence)

    def random_swap(sentence):
        aug = nac.RandomCharAug(action=Action.SWAP)
        return aug.augment(sentence)

    random.seed(42)
    augmentation_methods = [random_deletion, random_insertion, random_swap]
    tweets_to_augment = 315

    df = pd.read_csv(source_folder + "trainingset/AMI2020_training_raw_anon.tsv", delimiter='\t', header=0, names=['id', 'text', 'misoginous', 'aggressiveness'])
    df['text'] = df['text'].apply(lambda x: re.sub(r'<MENTION_\d+>|<URL>', '', x))
    df['labels'] = df['misoginous']

    misogynous_data = df[df['labels'] == 1].sample(tweets_to_augment)
    augmented_data = []

    for _, row in misogynous_data.iterrows():
        random_augmentation = random.choice(augmentation_methods)
        augmented_text = random_augmentation(row['text'])
        augmented_data.append({'text': augmented_text, 'labels': row['labels']})

    augmented_df = pd.DataFrame(augmented_data)
    df_augmented = pd.concat([df, augmented_df], ignore_index=True)
    df = df_augmented

    sentences = df.text.values
    labels = df.labels.values

    tokenizer = AutoTokenizer.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1")

    sentences = [str(sent) for sent in sentences]
    input_ids = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=1, truncating="post", padding="post")

    attention_masks = [[int(token_id > 1) for token_id in sent] for sent in input_ids]

    return input_ids, attention_masks, labels, tokenizer

In [8]:
# Split data into training and validation sets
def split_data(input_ids, attention_masks, labels):
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
    train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1)

    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)

    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)

    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    return train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks

In [9]:
# Create dataloaders
def create_dataloaders(train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, batch_size):
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    return train_dataloader, validation_dataloader

In [10]:
# Helper functions
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [11]:
# Train model
def train_model(train_dataloader, validation_dataloader, epochs):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = RobertaForSequenceClassification.from_pretrained("Musixmatch/umberto-commoncrawl-cased-v1", num_labels=2, output_attentions=False, output_hidden_states=False)
    model.cuda()

    optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    for epoch_i in range(0, epochs):
        # Training
        total_train_loss = 0
        model.train()
        t0 = time.time()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('Epoch {:}/{:}, Batch {:}/{:}, Elapsed: {:}.'.format(epoch_i + 1, epochs, step, len(train_dataloader), elapsed,))
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            total_train_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)
        print("Epoch {:}/{:}, Average training loss: {:.4f}, Training epoch time: {:}".format(epoch_i + 1, epochs, avg_train_loss, training_time))

        # Validation
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0
        t0 = time.time()

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs[0]
            logits = outputs[1]

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        validation_time = format_time(time.time() - t0)
        print("Epoch {:}/{:}, Average validation loss: {:.4f}, Average validation accuracy: {:.4f}, Validation epoch time: {:}".format(epoch_i + 1, epochs, avg_val_loss, avg_val_accuracy, validation_time))

    return model

In [14]:
input_ids, attention_masks, labels, tokenizer = load_and_preprocess_data(source_folder)
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = split_data(input_ids, attention_masks, labels)
train_dataloader, validation_dataloader = create_dataloaders(train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, batch_size)
model = train_model(train_dataloader, validation_dataloader, epochs)

You are using a model of type camembert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at Musixmatch/umberto-commoncrawl-cased-v1 were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at Musixmatch/umberto-commoncrawl-cased-v1 and are new

Epoch 1/8, Batch 40/266, Elapsed: 0:00:15.
Epoch 1/8, Batch 80/266, Elapsed: 0:00:28.
Epoch 1/8, Batch 120/266, Elapsed: 0:00:41.
Epoch 1/8, Batch 160/266, Elapsed: 0:00:55.
Epoch 1/8, Batch 200/266, Elapsed: 0:01:08.
Epoch 1/8, Batch 240/266, Elapsed: 0:01:22.
Epoch 1/8, Average training loss: 0.4158, Training epoch time: 0:01:31
Epoch 1/8, Average validation loss: 0.2547, Average validation accuracy: 0.9109, Validation epoch time: 0:00:03
Epoch 2/8, Batch 40/266, Elapsed: 0:00:14.
Epoch 2/8, Batch 80/266, Elapsed: 0:00:28.
Epoch 2/8, Batch 120/266, Elapsed: 0:00:41.
Epoch 2/8, Batch 160/266, Elapsed: 0:00:55.
Epoch 2/8, Batch 200/266, Elapsed: 0:01:09.
Epoch 2/8, Batch 240/266, Elapsed: 0:01:23.
Epoch 2/8, Average training loss: 0.2626, Training epoch time: 0:01:32
Epoch 2/8, Average validation loss: 0.1870, Average validation accuracy: 0.9317, Validation epoch time: 0:00:03
Epoch 3/8, Batch 40/266, Elapsed: 0:00:14.
Epoch 3/8, Batch 80/266, Elapsed: 0:00:28.
Epoch 3/8, Batch 120/266

In [15]:
def test_model():
    df = pd.read_csv(source_folder + "testset/AMI2020_test_raw_gold_anon.tsv", delimiter='\t', header=0, names=['id', 'sentence', 'label', 'aggressiveness'])
    sentences = df.sentence.values
    labels = df.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(
            sent,
            add_special_tokens=True,
        )
        input_ids.append(encoded_sent)

    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=1)

    attention_masks = [[float(i > 1) for i in seq] for seq in input_ids]

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    batch_size = 16

    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    model.eval()

    predictions, true_labels = [], []

    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend(logits)
        true_labels.extend(label_ids)

    pred_flat = np.argmax(predictions, axis=1).flatten()

    def eval_accuracy(a, b):
        true_pred = [j for i, j in zip(a, b) if i == j]
        accuracy = len(true_pred) / len(a)
        return accuracy

    print(eval_accuracy(true_labels, pred_flat))

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
test_model()

Predicting labels for 1,000 test sentences...
0.796
