In [1]:
import math
import numpy as np
import torch
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaTokenizerFast, RobertaConfig, RobertaModelWithHeads
from transformers import Trainer, TrainingArguments, EvalPrediction
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaTokenizer, RobertaForMaskedLM
from sklearn.metrics import f1_score

In [3]:
def check_gpu():
    '''
    Print a short summary of the CUDA devices visible to PyTorch.

    Always prints whether CUDA is available and how many devices were
    detected. The current device id and name are only queried when CUDA is
    available; otherwise 'GPU not available' is printed instead.

    Returns:
        None. The function only prints.
    '''
    cuda_available = torch.cuda.is_available()
    print('GPU available:', cuda_available)
    print(torch.cuda.device_count(), 'GPUs detected')

    if not cuda_available:
        # Explicit guard instead of relying on current_device() raising.
        print('GPU not available')
        return

    try:
        current_id = torch.cuda.current_device()
        print('Current GPU id:', current_id)
        print('Current GPU Name:', torch.cuda.get_device_name(current_id))
    except (RuntimeError, AssertionError):
        # Only CUDA initialisation problems are treated as "no GPU";
        # KeyboardInterrupt and unrelated errors are allowed to propagate.
        print('GPU not available')

In [62]:
def encode_batch(batch, max_length=80):
    '''
    Encodes a batch of input data using the model tokenizer.

    Relies on the notebook-level `tokenizer`, which must be defined before
    this function is called (it is not passed in as an argument).

    Args:
        batch: mapping with a "text" entry holding the raw strings to encode.
        max_length: sequence length every example is truncated / padded to.
            Defaults to 80, the value this notebook was originally run with.

    Returns:
        Whatever the tokenizer returns for the batch.
    '''
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

In [4]:
# Report the CUDA devices PyTorch can see before any training starts.
check_gpu()

GPU available: True
1 GPUs detected
Current GPU id: 0
Current GPU Name: NVIDIA GeForce GTX 980 Ti


In [5]:
# Per-dataset settings, keyed by dataset name:
#   name -> (number of classes, F1 averaging mode)
# The averaging mode must be a value accepted by sklearn's f1_score(average=...).
dataset_dict = {'chemprot': (13, 'micro'), 'rct': (5, 'micro'),
                'CI': (6, 'macro'), 'sciie': (7, 'macro'),
                'HN': (2, 'macro'), 'ag': (4, 'macro'),
                'amazon': (2, 'macro'), 'imdb': (2, 'macro')}

In [26]:
# Choose the dataset for this run and unpack its
# (number of classes, F1 averaging mode) entry in one step.
ds_name = 'amazon'
n_labels, f1_type = dataset_dict[ds_name]

In [27]:
# Load the dataset through the local loader script selected by ds_name
# (path is relative to the notebook's working directory).
dataset = load_dataset(f'data_loaders/{ds_name}_data_loader.py')

Downloading and preparing dataset task_dataset/task (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\snow-\.cache\huggingface\datasets\task_dataset\task\1.0.0\d4dbb1ae1e5b21302597f18c62e58ab7f320999e2bdffea6d0514c3c329ad9ae...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset task_dataset downloaded and prepared to C:\Users\snow-\.cache\huggingface\datasets\task_dataset\task\1.0.0\d4dbb1ae1e5b21302597f18c62e58ab7f320999e2bdffea6d0514c3c329ad9ae. Subsequent calls will reuse this data.


In [60]:
# Tokenizer and masked-LM model, both loaded from the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
# return_dict=True: the forward pass returns an output object (MaskedLMOutput) rather than a tuple.
model = RobertaForMaskedLM.from_pretrained('roberta-base', return_dict=True)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# Tokenize every split with encode_batch, 512 examples per call.
# The original columns are kept alongside the new 'input_ids' / 'attention_mask'.
dataset_encoded = dataset.map(encode_batch, batched=True, batch_size=512)

HBox(children=(FloatProgress(value=0.0, max=226.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=49.0), HTML(value='')))




In [75]:
# Show the splits, their columns and row counts after encoding.
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 115251
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'text'],
        num_rows: 25000
    })
})

In [76]:
# Raw text of the first training example.
# Row-first indexing fetches a single row instead of materialising the whole 'text' column.
dataset_encoded['train'][0]['text']

'Works fine for my new Samsung Galaxy S3\n\nWorks great. No problems with my Samsung Galaxy S3.  It charges with no problems.  It lights up when it is charging.'

In [77]:
# Decode the first training example back to text to verify special tokens and padding.
# Row-first indexing fetches a single row instead of materialising the whole 'input_ids' column.
tokenizer.decode(dataset_encoded['train'][0]['input_ids'])

'<s>Works fine for my new Samsung Galaxy S3\n\nWorks great. No problems with my Samsung Galaxy S3.  It charges with no problems.  It lights up when it is charging.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [49]:
# Collator for masked-language-model batches; mlm_probability=0.15 is the
# masking probability handed to the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [78]:
# Training configuration for the masked-LM run: checkpoints go to "test-mlm"
# and the validation set is evaluated once per epoch.
training_args = TrainingArguments(
    output_dir="test-mlm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [79]:
# Trainer for the masked-LM objective: trains on the 'train' split,
# evaluates on 'validation', and builds batches with the MLM data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    data_collator=data_collator,
)

In [80]:
# Run training; the table below reports training / validation loss per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.65609,1.57583
2,1.595609,1.496033
3,1.521609,1.474264


TrainOutput(global_step=43221, training_loss=1.6328712460378056)

In [81]:
# Evaluate on the validation split given to the Trainer and report
# perplexity, computed as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 4.37


In [84]:
# Save the model in Trainer / from_pretrained format.
# The file name is derived from ds_name so it always matches the dataset that was
# trained on (for ds_name = 'amazon' this is the same path as before).
# NOTE(review): save_model's argument is an output directory, so the '.pt' suffix
# presumably names a folder here — confirm before relying on it as a file.
trainer.save_model(f'model/{ds_name}_mlm.pt')

In [87]:
# Save the whole model object with torch.save.
# The file name is derived from ds_name so it always matches the dataset that was
# trained on (for ds_name = 'amazon' this is the same path as before).
# NOTE(review): this pickles the full module rather than its state_dict, so loading
# requires the same model classes to be importable; kept as-is for compatibility
# with whatever already reads this file.
torch.save(trainer.model, f'model/{ds_name}_mlm.pth')

In [100]:
# Sanity check: one encoded example is a 1-D tensor whose length equals the
# max_length used in encode_batch.
torch.tensor(dataset_encoded['train'][0]['input_ids']).shape

torch.Size([80])

In [107]:
# Smoke-test a forward pass on one example: add a batch dimension and move the
# input to whatever device the model lives on (instead of hard-coding 'cuda',
# which fails on a CPU-only machine).
# NOTE(review): no attention_mask is passed, so padding positions are presumably
# attended to — fine for a shape check, confirm before using these logits.
trainer.model(
    torch.tensor(dataset_encoded['train'][0]['input_ids'])
    .unsqueeze(0)
    .to(trainer.model.device)
)

MaskedLMOutput(loss=None, logits=tensor([[[15.2053, -3.2850,  8.9295,  ...,  2.3371,  5.3764,  7.6053],
         [ 5.9016, -2.7083,  9.5977,  ..., -1.4241,  1.3352,  2.5618],
         [-0.7304, -4.5036,  3.2780,  ..., -0.7873, -1.0853, -0.2154],
         ...,
         [-0.1011, -5.6306,  5.3763,  ..., -5.6786, -3.1615,  1.2951],
         [-0.1011, -5.6306,  5.3763,  ..., -5.6786, -3.1615,  1.2951],
         [-0.1011, -5.6306,  5.3763,  ..., -5.6786, -3.1615,  1.2951]]],
       device='cuda:0', grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [109]:
# Check the model's mode flag: False means the module is in eval mode, True means train mode.
trainer.model.training

False

## Compare with the raw RoBERTa MLM model

In [110]:
# Baseline: a fresh copy of the pretrained 'roberta-base' masked-LM, without the training done above.
model_raw = RobertaForMaskedLM.from_pretrained('roberta-base', return_dict=True)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [112]:
# Wrap the baseline model in a Trainer that reuses the same arguments, datasets
# and collator as the fine-tuned run, so evaluate() runs under the same setup.
# Only evaluate() is called on this trainer in the cells shown here.
trainer_raw = Trainer(
    model=model_raw,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    data_collator=data_collator,
)

In [113]:
# Baseline perplexity of the untouched checkpoint on the same validation split,
# computed as exp(eval_loss), for comparison with the fine-tuned model above.
# NOTE(review): masking is applied by the data collator at batch time, so the masked
# positions presumably differ between the two evaluate() calls — confirm the
# numbers are comparable (e.g. by fixing the seed before each evaluation).
eval_results_raw = trainer_raw.evaluate()
print(f"Perplexity: {math.exp(eval_results_raw['eval_loss']):.2f}")

Perplexity: 13.99
