In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, AutoTokenizer, RobertaTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,BertForSequenceClassification, RobertaForSequenceClassification
from datasets import load_dataset, ClassLabel, Value, load_metric
from utils import *
from train import *
import os

# load the dataset and set up the hyperparameters

In [2]:
# Hyperparameters for this fine-tuning run; the cells below look them up by key.
config = dict(
    model_name='roberta',  # which pretrained checkpoint to load: 'bert' or 'roberta'
    batch_size=32,         # reused as both the train and the eval per-device batch size
    lr=5e-5,
    num_epochs=4,
    warmup_steps=0,
)

In [3]:
# Load the three predefined splits of the `health_fact` dataset, in train/validation/test order.
Train, Val, Test = (
    load_dataset('health_fact', split=split_name)
    for split_name in ('train', 'validation', 'test')
)

Using custom data configuration default
Reusing dataset health_fact (/home/zh2095/.cache/huggingface/datasets/health_fact/default/1.1.0/99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19)
Using custom data configuration default
Reusing dataset health_fact (/home/zh2095/.cache/huggingface/datasets/health_fact/default/1.1.0/99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19)
Using custom data configuration default
Reusing dataset health_fact (/home/zh2095/.cache/huggingface/datasets/health_fact/default/1.1.0/99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19)


# preprocessing and embedding

In [4]:
# Apply the project helper `data_preprocess` to each raw split (train, then val, then test).
train, val, test = map(data_preprocess, (Train, Val, Test))

Loading cached processed dataset at /home/zh2095/.cache/huggingface/datasets/health_fact/default/1.1.0/99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19/cache-673275d01d038318.arrow
Loading cached processed dataset at /home/zh2095/.cache/huggingface/datasets/health_fact/default/1.1.0/99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19/cache-f48eafa5a39b424d.arrow
Loading cached processed dataset at /home/zh2095/.cache/huggingface/datasets/health_fact/default/1.1.0/99503637e4255bd805f84d57031c18fe4dd88298f00299d56c94fc59ed68ec19/cache-e2847b2b1fc0054c.arrow


In [5]:
# Encode the splits used for fine-tuning with the project helper `embedding`
# (defined outside this notebook; presumably tokenises according to `config` — confirm).
# Only train and val are encoded here; the preprocessed `test` split is not.
train_dataset = embedding(train, config)
val_dataset = embedding(val, config)

  0%|          | 0/9804 [00:00<?, ?ex/s]

  0%|          | 0/1214 [00:00<?, ?ex/s]

# model finetuning

In [6]:
# Load the pretrained encoder selected by config['model_name'] and attach a
# sequence-classification head with 4 output labels.
if config['model_name'] == 'bert':
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=4)
elif config['model_name'] == 'roberta':
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
else:
    # Fail fast on an unsupported name. Without this branch `model` is never bound,
    # so the Trainer cell either raises NameError or silently picks up a stale
    # `model` left in the kernel from an earlier run.
    raise ValueError(
        f"Unsupported model_name {config['model_name']!r}; expected 'bert' or 'roberta'."
    )

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [8]:
# Training arguments: tunable values come from `config`; anything not listed
# here keeps the library default.
# NOTE(review): logging_steps=1000 against the 1228 optimisation steps of the recorded
# run means the training loss is logged once, hence "No log" for epochs 1-3 — confirm
# this cadence is intended.
training_args = TrainingArguments(
    # where artefacts are written
    output_dir='../output',
    logging_dir='../logs',
    # optimisation schedule
    learning_rate=config['lr'],
    warmup_steps=config['warmup_steps'],
    num_train_epochs=config['num_epochs'],
    # per-device batch sizes (same value for training and evaluation)
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    # reporting cadence: evaluate once per epoch, log the loss every 1000 steps
    evaluation_strategy='epoch',
    logging_steps=1000,
)

In [9]:
# Bundle the model, training arguments, datasets and metric function into a Trainer.
# Evaluation during training runs on the validation split (`val_dataset`), not the test split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_accuracy,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune. Pass resume_from_checkpoint=True to continue from a previously saved
# checkpoint instead of starting over.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: explanation. If explanation are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9804
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1228


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.705412,0.719934
2,No log,0.617166,0.751236
3,No log,0.660078,0.76771
4,0.577200,0.755386,0.765239


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: explanation. If explanation are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1214
  Batch size = 32
Saving model checkpoint to ../output/checkpoint-500
Configuration saved in ../output/checkpoint-500/config.json
Model weights saved in ../output/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: explanation. If explanation are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1214
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward

TrainOutput(global_step=1228, training_loss=0.5235991835205873, metrics={'train_runtime': 678.133, 'train_samples_per_second': 57.829, 'train_steps_per_second': 1.811, 'total_flos': 5159174174408704.0, 'train_loss': 0.5235991835205873, 'epoch': 4.0})

In [11]:
# save the model
# Hands the in-memory `model` and `config` to the project helper `save_model`
# (brought in by one of the star imports at the top of the notebook).
# NOTE(review): `model` holds the weights as they are after the final epoch; the
# TrainingArguments above do not ask for the best checkpoint to be reloaded, and the
# recorded run scored higher validation accuracy at epoch 3 than at epoch 4 — confirm
# this is the model that should be saved.
save_model(model, config)

Configuration saved in roberta-pubhealth/config.json
Model weights saved in roberta-pubhealth/pytorch_model.bin


The best fine-tuned model has been saved as:  roberta-pubhealth
