In [1]:
!pip install transformers
!pip install bert-tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import re
import random
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from transformers import AdamW, AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data = pd.read_csv('/content/drive/My Drive/NLP/AMI2020/trainingset/AMI2020_training_raw_anon.tsv', sep='\t')


In [5]:
data['text'] = data['text'].apply(lambda x: re.sub(r'<MENTION_\d+>|<URL>', '', x))
data['labels'] = data['misogynous']

In [6]:
print("Before data augmentation:")
print(data.labels.value_counts())

Before data augmentation:
0    2362
1    2047
Name: labels, dtype: int64


In [7]:
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from nlpaug.util import Action

In [9]:
def random_deletion(sentence):
    aug = nac.RandomCharAug(action=Action.DELETE)
    return aug.augment(sentence)

def random_insertion(sentence):
    aug = nac.RandomCharAug(action=Action.INSERT)
    return aug.augment(sentence)

def random_swap(sentence):
    aug = nac.RandomCharAug(action=Action.SWAP)
    return aug.augment(sentence)

In [10]:
random.seed(42)
augmentation_methods = [random_deletion, random_insertion, random_swap]
tweets_to_augment = 315

misogynous_data = data[data['labels'] == 1].sample(tweets_to_augment)
augmented_data = []

for _, row in misogynous_data.iterrows():
    random_augmentation = random.choice(augmentation_methods)
    augmented_text = random_augmentation(row['text'])
    augmented_data.append({'text': augmented_text, 'labels': row['labels']})

augmented_df = pd.DataFrame(augmented_data)
data_augmented = pd.concat([data, augmented_df], ignore_index=True)

In [11]:
print("\nAfter data augmentation:")
print(data_augmented.labels.value_counts())


After data augmentation:
1    2362
0    2362
Name: labels, dtype: int64


In [12]:
data = data_augmented

In [14]:
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data.labels.values,
    test_size=0.06,
    random_state=17,
    stratify=data.labels.values
)

data['data_type'] = ['not_set'] * data.shape[0]
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'

In [15]:
tokenizer = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", do_lower_case=True)

In [17]:
def encode_data(dataset, max_length=256):
    input_texts = [str(text) for text in dataset.text.values]
    return tokenizer.batch_encode_plus(
        input_texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

encoded_data_train = encode_data(data[data.data_type == 'train'])
encoded_data_val = encode_data(data[data.data_type == 'val'])

In [18]:
def create_dataset(encoded_data, labels):
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    torch_labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, torch_labels)

dataset_train = create_dataset(encoded_data_train, data[data.data_type == 'train'].labels.values)
dataset_val = create_dataset(encoded_data_val, data[data.data_type == 'val'].labels.values)

batch_size = 16
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=RandomSampler(dataset_val), batch_size=batch_size)

In [19]:
model = BertForSequenceClassification.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0", num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/740M [00:00<?, ?B/s]

Some weights of the model checkpoint at m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

In [20]:
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
epochs = 8
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [21]:
def eval_accuracy(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

def evaluate(dataloader_val):
    model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [22]:
for epoch in tqdm(range(1, epochs + 1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch))})

    torch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_acc = eval_accuracy(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'Accuracy: {val_acc}')

  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.30304251463829185
Validation loss: 0.25665676987005603
Accuracy: 0.9014084507042254


Epoch 2:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.18698384096383674
Validation loss: 0.30403504158473676
Accuracy: 0.897887323943662


Epoch 3:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.1255082629584178
Validation loss: 0.3510912974209835
Accuracy: 0.9119718309859155


Epoch 4:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.08022595501191605
Validation loss: 0.4799884775663183
Accuracy: 0.9049295774647887


Epoch 5:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.04307534790278761
Validation loss: 0.5890381499225946
Accuracy: 0.9014084507042254


Epoch 6:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.023463309193254036
Validation loss: 0.6339062696230636
Accuracy: 0.9084507042253521


Epoch 7:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.015384686200318584
Validation loss: 0.6736062191622396
Accuracy: 0.9049295774647887


Epoch 8:   0%|          | 0/278 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.008285035901738954
Validation loss: 0.6738544659003511
Accuracy: 0.9084507042253521


In [23]:
source_folder = '/content/drive/My Drive/NLP/AMI2020/'
MAX_LEN = 128
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
def test_model():
    df = pd.read_csv(source_folder + "testset/AMI2020_test_raw_gold_anon.tsv", delimiter='\t', header=0, names=['id', 'sentence', 'label', 'aggressiveness'])
    sentences = df.sentence.values
    labels = df.label.values

    input_ids = []
    for sent in sentences:
        encoded_sent = tokenizer.encode(
            sent,
            add_special_tokens=True,
        )
        input_ids.append(encoded_sent)

    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post", value=1)

    attention_masks = [[float(i > 1) for i in seq] for seq in input_ids]

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    batch_size = 16

    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

    model.eval()

    predictions, true_labels = [], []

    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend(logits)
        true_labels.extend(label_ids)

    pred_flat = np.argmax(predictions, axis=1).flatten()

    def eval_accuracy(a, b):
        true_pred = [j for i, j in zip(a, b) if i == j]
        accuracy = len(true_pred) / len(a)
        return accuracy

    print(eval_accuracy(true_labels, pred_flat))

In [25]:
test_model()

Predicting labels for 1,000 test sentences...
0.808
