In [1]:
ON_COLAB = True
if ON_COLAB:
    import os

    !pip install -q datasets emoji accelerate sklearn
    !git clone https://github.com/linv24/da-stance-detection.git
    os.chdir(os.getcwd() + '/da-stance-detection')

fatal: destination path 'da-stance-detection' already exists and is not an empty directory.


In [53]:
import torch
import pandas as pd
import transformers
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset
from sklearn.metrics import f1_score
import numpy as np

from data import data_util

In [3]:
ds_csv = 'augment/data/sr_augmented.csv'

NUM_LABELS = 3
CHECKPOINT_OUTPUT_PATH = 'checkpoint/'

In [4]:
ds = load_dataset('csv', data_files=ds_csv, split='train')
split_ds = ds.train_test_split(test_size=0.2)
split_ds

DatasetDict({
    train: Dataset({
        features: ['Tweet', 'Target', 'Stance'],
        num_rows: 4673
    })
    test: Dataset({
        features: ['Tweet', 'Target', 'Stance'],
        num_rows: 1169
    })
})

In [5]:
tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')

In [6]:
def preprocess_data(batch_tweet):
    encoding = tokenizer(batch_tweet['Tweet'],
                         return_tensors='pt',
                         truncation=True,
                         max_length=128,
                         padding='max_length')
    encoding['label'] = batch_tweet['Stance']
    return encoding

In [7]:
encoded_split_ds = split_ds.map(preprocess_data,
                                batched=True)
encoded_split_ds

Map:   0%|          | 0/4673 [00:00<?, ? examples/s]

Map:   0%|          | 0/1169 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Tweet', 'Target', 'Stance', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 4673
    })
    test: Dataset({
        features: ['Tweet', 'Target', 'Stance', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1169
    })
})

In [8]:
tokenizer.decode(encoded_split_ds['train'][0]['input_ids'])

"<s> If it weren't for her marriage, Hillary would nameless. Thanks a lot, Bill. RT @ MangyLover # HIllaryClinton # WeDontTrustYouHillary # SemST </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>"

In [9]:
model = AutoModelForSequenceClassification.from_pretrained('vinai/bertweet-base',
                                                           num_labels=NUM_LABELS)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [94]:
def compute_metrics(eval_preds):
    preds = eval_preds.predictions
    preds = np.argmax(preds, axis=1)
    labels = eval_preds.label_ids

    scores = f1_score(preds, labels, average='macro')

    against_preds = []
    favor_preds = []
    for ix, label in enumerate(labels):
        if label == 0: # AGAINST
            against_preds.append(preds[ix])
        elif label == 1: # FAVOR
            favor_preds.append(preds[ix])

    f_against = f1_score([0] * len(against_preds), against_preds, average='macro')
    f_favor = f1_score([1] * len(favor_preds), favor_preds, average='macro')



    return {
        'f_against': f_against,
        'f_favor': f_favor,
        'f_avg': (f_against + f_favor) / 2
    }


In [95]:
# model hyperparameters

lr = 2e-5
batch_size = 32
num_epochs = 5
seed = 42

In [96]:
training_args = TrainingArguments(
    output_dir=CHECKPOINT_OUTPUT_PATH + ds_csv,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    seed=seed,
    load_best_model_at_end=True,
    # metric_for_best_model="f1",
    optim="adamw_torch",
)

In [97]:
trainer = Trainer(model=model, args=training_args,
                  train_dataset=encoded_split_ds['train'],
                  eval_dataset=encoded_split_ds['test'],
                  compute_metrics=compute_metrics
                  )

In [98]:
trainer.train()

Epoch,Training Loss,Validation Loss,F Against,F Favor,F Avg
1,0.0467,0.907355,0.31879,0.312899,0.315845
2,0.1095,0.561802,0.326507,0.310167,0.318337
3,0.0341,0.412782,0.32505,0.31758,0.321315
4,0.0219,0.414048,0.324464,0.3202,0.322332
5,0.0095,0.403963,0.327087,0.318895,0.322991


TrainOutput(global_step=735, training_loss=0.04432994021850378, metrics={'train_runtime': 192.765, 'train_samples_per_second': 121.21, 'train_steps_per_second': 3.813, 'total_flos': 1536911251303680.0, 'train_loss': 0.04432994021850378, 'epoch': 5.0})

In [102]:
# test_ds = load_dataset('csv', data_files='data/semeval-2016/test.csv', split='train')
test_ds = load_dataset('csv', data_files='data/original_test.csv', split='train')
test_ds

Dataset({
    features: ['Tweet', 'Target', 'Stance'],
    num_rows: 1956
})

In [122]:
test_ds[0]

{'Tweet': 'He who exalts himself shall      be humbled; and he who humbles himself shall be exalted.Matt 23:12.     #SemST',
 'Target': 'Atheism',
 'Stance': 'AGAINST'}

In [120]:
encoded_test_ds = test_ds.map(preprocess_data,
                                      batched=True)
encoded_test_ds

Dataset({
    features: ['Tweet', 'Target', 'Stance', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 1956
})

In [117]:
test_input_ids=encoded_test_ds['input_ids'].to(trainer.model.device)
test_token_type_ids=encoded_test_ds['token_type_ids'].to(trainer.model.device)
test_attention_mask=encoded_test_ds['attention_mask'].to(trainer.model.device)

In [121]:
out = trainer.evaluate(encoded_test_ds)
out

ValueError: ignored

In [None]:
def eval_pipeline(csv_files, test_csv, tokenizer):
    for ds_csv in csv_files:
        ds = load_dataset('csv', data_files=ds_csv, split='train')
        split_ds = ds.train_test_split(test_size=0.2)

        encoded_split_ds = split_ds.map(preprocess_data,
                                batched=True)

        model = AutoModelForSequenceClassification.from_pretrained('vinai/bertweet-base',
                                                           num_labels=NUM_LABELS)
        trainer = Trainer(model=model, args=training_args,
                  train_dataset=encoded_split_ds['train'],
                  eval_dataset=encoded_split_ds['test'],
                  compute_metrics=compute_metrics
                  )

        trainer.train()

        # evaluate
        test_ds = load_dataset('csv', data_files=test_csv, split='train')
        encoded_test_ds = test_ds.map(preprocess_data,
                                      batched=True)

        # out = trainer.model(**encoding)

