In [1]:
import math
import numpy as np
import torch
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaTokenizerFast, RobertaConfig, RobertaModelWithHeads
from transformers import Trainer, TrainingArguments, EvalPrediction
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaForMaskedLM
from transformers import AdapterType
from sklearn.metrics import f1_score

In [2]:
def check_gpu():
    '''
    Print a short report of the CUDA devices visible to PyTorch.

    Always prints whether CUDA is available and how many GPUs were detected.
    When CUDA is available it also prints the id and name of the current
    device; otherwise it prints 'GPU not available'.

    Returns None.
    '''
    cuda_available = torch.cuda.is_available()
    print('GPU available:', cuda_available)
    print(torch.cuda.device_count(), 'GPUs detected')
    if not cuda_available:
        # Without CUDA there is no current device to query, so stop here.
        print('GPU not available')
        return
    try:
        device_id = torch.cuda.current_device()
        print('Current GPU id:', device_id)
        print('Current GPU Name:', torch.cuda.get_device_name(device_id))
    except (RuntimeError, AssertionError):
        # Only CUDA initialisation failures are reported here; anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        print('GPU not available')

In [3]:
# Ask PyTorch directly whether CUDA is usable; the cell displays the boolean result.
torch.cuda.is_available()

True

In [4]:
def encode_batch(batch, max_length=120):
    '''
    Encodes a batch of input data using the model tokenizer

    batch: mapping whose "text" entry holds the raw strings to tokenize
    max_length: length passed to the tokenizer; with truncation enabled and
        padding="max_length" every example is cut or padded to this length
        (defaults to 120, the value this notebook previously hard-coded)
    returns: the result of calling the global `tokenizer` on the batch
    '''
    return tokenizer(batch["text"], max_length=max_length, truncation=True, padding="max_length")

In [5]:
# Print the GPU status report produced by check_gpu (defined in an earlier cell).
check_gpu()

GPU available: True
1 GPUs detected
Current GPU id: 0
Current GPU Name: GeForce RTX 2070 SUPER


In [6]:
# Per-dataset settings, keyed by dataset name: (number of classes, type of F1 averaging).
# The averaging entry is either 'micro' or 'macro'.
dataset_dict = {'chemprot': (13, 'micro'), 'rct': (5, 'micro'),
                'CI': (6, 'macro'), 'sciie': (7, 'macro'),
                'HN': (2, 'macro'), 'ag': (4, 'macro'),
                'amazon': (2, 'macro'), 'imdb': (2, 'macro')}

In [7]:
# Choose the task for this run and unpack its settings from dataset_dict:
# the number of classes and the F1 averaging mode.
ds_name = 'amazon'
n_labels, f1_type = dataset_dict[ds_name]

In [8]:
# Load the selected task through its loading script; the script path is built
# from ds_name and is relative to the current working directory.
dataset = load_dataset(f'data_loaders/{ds_name}_data_loader.py')

Reusing dataset task_dataset (/home/qiyuan/.cache/huggingface/datasets/task_dataset/task/1.0.0/87bbe42c6ce54a8fea3de56fe84f48ddf4bf3723241ccf57cfe917510b90442d)


In [9]:
# Tokenizer for the 'roberta-base' checkpoint; encode_batch reads this global.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [10]:
# Start from the public 'roberta-base' weights; later cells attach an adapter
# and a classification head to this model.
model = RobertaModelWithHeads.from_pretrained('roberta-base')
# Disabled alternative: load the locally saved MLM checkpoint instead.
# model = RobertaForMaskedLM.from_pretrained('model/amazon_mlm.pt', return_dict=True).to('cuda')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [11]:
# Register a text-task adapter named "classifier" using the "pfeiffer" adapter configuration.
model.add_adapter("classifier", adapter_type=AdapterType.text_task, config="pfeiffer")

In [12]:
# Put the model in adapter-training mode for the "classifier" adapter.
# NOTE(review): presumably this freezes the base model weights so that only the
# adapter is updated — confirm against the adapter library documentation.
model.train_adapter(['classifier'])

In [13]:
# Attach a classification head named "classifier" with one output per class
# of the selected dataset (n_labels comes from dataset_dict).
model.add_classification_head("classifier", num_labels=n_labels)

In [14]:
# Make "classifier" the active adapter for subsequent forward passes.
# NOTE(review): the nested list is presumably the adapter-stack format this
# API expects — confirm against the adapter library documentation.
model.set_active_adapters([["classifier"]])

In [None]:
# test model output
# Send the encoded inputs to whichever device the model currently lives on.
# Hard-coding 'cuda' here fails with a device mismatch, because no earlier cell
# moves the model to the GPU.
model(**tokenizer("Hello, my dog is cute", return_tensors="pt").to(model.device))

In [None]:
# model.add_adapter('language', AdapterType.text_lang)

In [None]:
# model.train_adapter('language')

In [15]:
# Tokenize every split by applying encode_batch batch-wise.
# Disabled alternatives kept for reference:
# dataset_encoded = dataset.map(encode_batch, batched=True, batch_size=512, remove_columns=["text"])
# tokenized_datasets = dataset.map(tokenizer, batched=True, num_proc=4, remove_columns=["text"])
dataset_encoded = dataset.map(encode_batch, batched=True)

HBox(children=(FloatProgress(value=0.0, max=116.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))




In [None]:
# Display the encoded dataset to inspect its contents.
dataset_encoded

In [16]:
# Have the dataset return torch tensors for the listed columns only.
# Assumes the data loader script provides a 'labels' column — TODO confirm.
dataset_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [17]:
# Settings for the fine-tuning run: results go to "test-classification",
# evaluation runs once per epoch, and dataset columns are passed through as-is.
training_args = TrainingArguments(
    output_dir="test-classification",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    remove_unused_columns=False,
)

In [18]:
def compute_f1(p: EvalPrediction):
    '''
    Compute the F1 score for one evaluation pass.

    p: predictions and label ids collected by the Trainer
    returns: dict with a single "f1" entry, averaged according to the
        module-level `f1_type` chosen for the selected dataset
    '''
    # NOTE(review): depending on the model outputs, predictions may arrive as a
    # tuple; the first element is assumed to hold the logits — confirm.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    predicted_labels = np.argmax(logits, axis=-1)
    return {"f1": f1_score(p.label_ids, predicted_labels, average=f1_type)}


# Train on the "train" split and evaluate on the "validation" split, reporting
# F1 alongside the loss at every evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_f1,
)

In [19]:
# Run fine-tuning; training_args sets evaluation_strategy="epoch", so the
# validation split is evaluated after every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 