In [12]:
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

from data import data_util

In [13]:
# Root directory under which training checkpoints are written.
CHECKPOINT_OUTPUT_PATH = 'checkpoint/'
# Number of output classes requested from the classification head.
NUM_LABELS = 3

# CSV file that is loaded as the full dataset before splitting.
ds_csv = 'augment/data/sr_augmented.csv'

In [14]:
# Load the whole CSV as a single split; the train/test partition is made below.
ds = load_dataset('csv', data_files=ds_csv, split='train')

# Pin the shuffle seed so the 80/20 partition is identical on every
# Restart & Run All. Without it the held-out rows change between runs and
# evaluation scores are not comparable from one run to the next.
SPLIT_SEED = 42
split_ds = ds.train_test_split(test_size=0.2, seed=SPLIT_SEED)
split_ds

DatasetDict({
    train: Dataset({
        features: ['Tweet', 'Target', 'Stance'],
        num_rows: 4673
    })
    test: Dataset({
        features: ['Tweet', 'Target', 'Stance'],
        num_rows: 1169
    })
})

In [15]:
# Tokenizer matching the 'vinai/bertweet-base' checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    'vinai/bertweet-base'
)

In [16]:
def preprocess_data(batch_tweet, max_length=128):
    """Tokenize a batch of tweets and attach their stance labels.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Parameters
    ----------
    batch_tweet
        Mapping of column name to a list of values. The ``'Tweet'`` column
        is tokenized and the ``'Stance'`` column is copied to ``'label'``.
    max_length : int, optional
        Every sequence is truncated or padded to exactly this many tokens.
        Defaults to 128.

    Returns
    -------
    The tokenizer output for the batch with an extra ``'label'`` entry.
    """
    # No return_tensors here: Dataset.map stores results as Arrow columns, so
    # building framework tensors only to have them converted again is wasted
    # work. Plain Python lists produce the same stored dataset.
    encoding = tokenizer(batch_tweet['Tweet'],
                         truncation=True,
                         max_length=max_length,
                         padding='max_length')
    # The 'label' column carries the training target.
    encoding['label'] = batch_tweet['Stance']
    return encoding

In [17]:
# Run the tokenization over both splits, one batch of rows per call.
encoded_split_ds = split_ds.map(
    preprocess_data,
    batched=True,
)
encoded_split_ds

Map: 100%|██████████| 4673/4673 [00:01<00:00, 2997.90 examples/s]
Map: 100%|██████████| 1169/1169 [00:00<00:00, 1214.95 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Tweet', 'Target', 'Stance', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 4673
    })
    test: Dataset({
        features: ['Tweet', 'Target', 'Stance', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 1169
    })
})

In [18]:
# Sanity check: decode the first encoded training example back to text to
# inspect the special tokens and the padding added by preprocessing.
tokenizer.decode(encoded_split_ds['train'][0]['input_ids'])

'<s> @HillaryClinton the @DalaiLama speaks of women in leadership roles bringing about a more compassionate world. #potus #SemST </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [19]:
# Load the pretrained encoder with a classification head sized to NUM_LABELS.
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base',
    num_labels=NUM_LABELS,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
def compute_metrics(eval_preds):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Parameters
    ----------
    eval_preds
        Object exposing ``predictions`` (per-class scores with classes on the
        last axis, or a tuple whose first item holds those scores) and
        ``label_ids`` (integer class ids).

    Returns
    -------
    dict
        ``{"accuracy": float, "f1_macro": float}``. The ``"f1_macro"`` key
        matches the ``metric_for_best_model`` name used in the training
        arguments. Macro F1 is the unweighted mean of per-class F1 over
        every class that appears in the labels or in the predictions.
        Both values are ``0.0`` when there are no samples.
    """
    scores = eval_preds.predictions
    # Some models return extra outputs alongside the logits; the scores are
    # then the first element of the tuple.
    if isinstance(scores, tuple):
        scores = scores[0]

    predicted = np.argmax(np.asarray(scores), axis=-1).ravel()
    expected = np.asarray(eval_preds.label_ids).ravel()

    # Avoid NaN (mean of an empty array) on an empty evaluation set.
    if expected.size == 0:
        return {"accuracy": 0.0, "f1_macro": 0.0}

    accuracy = float(np.mean(predicted == expected))

    per_class_f1 = []
    for cls in np.union1d(expected, predicted):
        is_pred = predicted == cls
        is_true = expected == cls
        tp = int(np.sum(is_pred & is_true))
        fp = int(np.sum(is_pred & ~is_true))
        fn = int(np.sum(~is_pred & is_true))
        # The denominator is never zero: each class in the union occurs at
        # least once in the labels or the predictions.
        per_class_f1.append(2 * tp / (2 * tp + fp + fn))

    return {"accuracy": accuracy, "f1_macro": float(np.mean(per_class_f1))}


In [24]:
# Training hyperparameters (consumed by the TrainingArguments cell).

seed = 42          # random seed handed to the training arguments
num_epochs = 5     # number of passes over the training split
batch_size = 32    # per-device batch size for both training and evaluation
lr = 2e-5          # optimizer learning rate

In [26]:
training_args = TrainingArguments(
    # Where checkpoints go: the checkpoint root followed by the dataset path.
    output_dir=f"{CHECKPOINT_OUTPUT_PATH}{ds_csv}",
    seed=seed,
    # Optimisation settings.
    optim="adamw_torch",
    learning_rate=lr,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate, log and checkpoint once per epoch, keeping a single checkpoint.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Restore the checkpoint with the best "f1_macro" when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

In [27]:
# A metrics callback is required here: training_args picks the best checkpoint
# by "f1_macro", and that value only appears in the evaluation results when
# compute_metrics reports it. With compute_metrics=None the metric is missing
# and best-model selection fails at the first checkpoint.
# compute_metrics must return a dict that contains the key "f1_macro".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_split_ds['train'],
    eval_dataset=encoded_split_ds['test'],
    compute_metrics=compute_metrics,
)

In [28]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

  0%|          | 1/735 [00:40<8:18:42, 40.77s/it]