# Fine-tuning LLM

## Install and import libraries

In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import torch

from datasets import load_dataset
from datasets import load_metric

from transformers import AutoConfig
from transformers import AutoModelForCausalLM # Zero-shot LLaMA-2-7B
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import TrainingArguments
from transformers import Trainer

# Modify and push files
from huggingface_hub import login, logout
from huggingface_hub import HfApi

## Data import from HuggingFace

Change the code below to load the data from Hugging Face instead of gtfintechlab.

In [3]:
# Hugging Face Hub repo ID. It is reused below as the dataset source, as the
# Trainer's output directory, and as the hub model ID.
HUGGING_FACE_LINK = "roymgabriel/BioPharma"

In [4]:
# Map each split name to its CSV file in the Hub repo, then load both splits.
data_files = dict(train="train.csv", test="test.csv")
dataset = load_dataset(path=HUGGING_FACE_LINK, data_files=data_files)
print(dataset)

Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/143k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ticker', 'disease', 'stage', 'date', 'catalyst', 'label'],
        num_rows: 6450
    })
    test: Dataset({
        features: ['ticker', 'disease', 'stage', 'date', 'catalyst', 'label'],
        num_rows: 785
    })
})


## Fine-Tune TinySapBERT + TinyPubMedBERT model
Taken from: dmis-lab/TinySapBERT-from-TinyPubMedBERT-v1.0

[TinySapBERT](https://aclanthology.org/2021.naacl-main.334.pdf) is a tiny-sized biomedical entity representation (language) model trained using the official SapBERT code and instructions (Liu et al., NAACL 2021). It is used in conjunction with TinyPubMedBERT for optimal classification.

### Data processing and tokenization

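The code below loads the `AutoTokenizer` that belongs to the checkpoint named in `MODEL` (here `'dmis-lab/biobert-v1.1'`). Using the tokenizer that matches the model is important for several reasons: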

* <b>Model-Specific Tokenization</b>: Different models can have different tokenization approaches. For example:

  * RoBERTa uses Byte-Pair Encoding (BPE).
  * [BERT](https://arxiv.org/abs/1810.04805) uses WordPiece tokenization.

* <b>Vocabulary Matching</b>: Each pretrained model comes with a specific vocabulary that it has been trained on.

* <b>Model Configuration and Special Tokens</b>: Pretrained models often come with specific configurations, including special tokens (like padding tokens, mask tokens, etc.).

* <b>Preprocessing Consistency</b>: If you are fine-tuning a pretrained model on a new task or dataset, it's important to preprocess the new data like how the original training data was processed.

In [5]:
# Checkpoint names kept for reference (presumably alternatives that can be
# assigned to MODEL — confirm before relying on them):
# allenai/scibert_scivocab_uncased
# microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext
# dmis-lab/biobert-v1.1
# bert-base-uncased
# dmis-lab/TinySapBERT-from-TinyPubMedBERT-v1.0

# Checkpoint used for both the tokenizer (here) and the model (loaded later).
MODEL = 'dmis-lab/biobert-v1.1'
# Load the tokenizer that ships with the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize_data(example):
    """Tokenize the ``catalyst`` text of a dataset example (or batch).

    Each sequence is truncated and padded to exactly 512 tokens.

    Returns the encoding produced by the module-level ``tokenizer``.
    """
    catalyst_text = example['catalyst']
    encoded = tokenizer(
        catalyst_text,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Raw metadata/text columns that the model does not consume.
remove_columns = ['ticker', 'disease', 'stage', 'date', 'catalyst']

# Tokenize and drop the raw columns in a single pass over each split. A separate
# function-less `map(remove_columns=...)` call would re-process every split
# just to discard columns.
dataset_new = dataset.map(tokenize_data, batched=True, remove_columns=remove_columns)


print(dataset_new)

# Named handles for the two splits used by the Trainer below.
train_dataset = dataset_new['train']
eval_dataset = dataset_new['test']

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/6450 [00:00<?, ? examples/s]

Map:   0%|          | 0/785 [00:00<?, ? examples/s]

Map:   0%|          | 0/6450 [00:00<?, ? examples/s]

Map:   0%|          | 0/785 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6450
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 785
    })
})


### Set training arguments

`learning_rate`: Anywhere from 1e-5 to 1e-7 typically works well.

`push_to_hub`: False for now because we don't want to push to Hugging Face until we are happy with the model.

In [6]:
# Training configuration. Checkpoints go to a local directory named after the
# Hub repo ID; nothing is pushed automatically because push_to_hub is False.
training_args = TrainingArguments(
    output_dir=HUGGING_FACE_LINK,
    hub_model_id=HUGGING_FACE_LINK,
    push_to_hub=False,
    num_train_epochs=100,
    learning_rate=1e-6,
    per_device_train_batch_size=4,
)

### Load Pre-trained Language Model (PLM)

The code below loads the pretrained checkpoint named in `MODEL` (here `dmis-lab/biobert-v1.1`) from Hugging Face's model hub and adds a sequence-classification head. Note that sequence classification is a task where a model assigns a label to an entire sequence (or sentence) rather than to individual tokens.

In [7]:
# Load the MODEL checkpoint with a sequence-classification head sized for 3 labels
# (assumes the dataset's `label` column has three classes — TODO confirm).
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load and create function to compute metric

This F1 score is weighted by the number of true instances for each label. It accounts for class imbalance by giving more weight to the classes with more instances.

In [8]:
# F1 metric object shared by every call to compute_metrics.
metric = load_metric("f1")


def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the argmax over the last axis of ``logits``.

    Returns the result of ``metric.compute`` with ``average="weighted"``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(
        references=labels,
        predictions=predicted_classes,
        average="weighted",
    )

  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

### Create trainer object

The shard method is used to divide the dataset into multiple smaller "shards" and then select one of those shards. In this case, both the training and evaluation datasets are divided into 10 shards, and only the first shard (index=0) is selected for training and evaluation. This is useful when you want to train or evaluate on a subset of the data, possibly for faster experimentation.

In [9]:
import gc
# Run a garbage-collection pass before the Trainer is built in the next cell;
# the value shown for this cell is what gc.collect() returned.
gc.collect()

289

In [10]:
# Keep only the first of 10 shards of each split for a faster run.
train_subset = train_dataset.shard(num_shards=10, index=0)
eval_subset = eval_dataset.shard(num_shards=10, index=0)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

### Train (Fine-tune) the model

In [11]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.055
1000,0.9582
1500,0.8698
2000,0.7957
2500,0.7327
3000,0.6943
3500,0.6489
4000,0.6158
4500,0.5986
5000,0.5758


Step,Training Loss
500,1.055
1000,0.9582
1500,0.8698
2000,0.7957
2500,0.7327
3000,0.6943
3500,0.6489
4000,0.6158
4500,0.5986
5000,0.5758


TrainOutput(global_step=16200, training_loss=0.5408316948973102, metrics={'train_runtime': 6802.005, 'train_samples_per_second': 9.482, 'train_steps_per_second': 2.382, 'total_flos': 1.6970815443456e+16, 'train_loss': 0.5408316948973102, 'epoch': 100.0})

### Evaluate the model

In [12]:
# Score the fine-tuned model on the evaluation dataset that was passed to the Trainer.
evaluate_output = trainer.evaluate()
print(evaluate_output)
# BioBERT (presumably the checkpoint behind the recorded result — confirm)

{'eval_loss': 1.4312114715576172, 'eval_f1': 0.6674167838724802, 'eval_runtime': 2.6706, 'eval_samples_per_second': 29.581, 'eval_steps_per_second': 3.744, 'epoch': 100.0}


## Deploy model on HuggingFace

### Login to HuggingFace

This function call prompts the user to log in to their Hugging Face account. Once authenticated, an access token will be saved locally, allowing the user to interact with the Hugging Face Hub (e.g., push models, datasets) programmatically without needing to re-authenticate every time.

In [None]:
# Authenticate with the Hugging Face Hub before pushing anything.
login()

### Push tokenizer and trained model

After pushing, you can check out the model on Hugging Face. It is also possible to do some inference on Hugging Face (test the model).

In [None]:
# Target Hub repo ID, defined once and reused instead of repeating the literal.
MODEL_PATH = "roymgabriel/BioPharma"
# Upload the tokenizer files to the target repo, then push the trained model
# through the Trainer; the URL it returns is shown as the cell result.
tokenizer.push_to_hub(MODEL_PATH)
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

events.out.tfevents.1701240478.deaaef5c326c.201.4:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

events.out.tfevents.1701240507.deaaef5c326c.201.5:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

events.out.tfevents.1701240523.deaaef5c326c.201.6:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

events.out.tfevents.1701244181.deaaef5c326c.201.7:   0%|          | 0.00/405 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

'https://huggingface.co/roymgabriel/BioPharma/tree/main/'

### Modify and push additional files

The code below modifies a local tokenizer configuration file, then uploads the updated configuration to a specified repository on the Hugging Face Model Hub.

In [None]:
# Locate the tokenizer config inside the Trainer's output directory rather than
# through a hard-coded absolute Colab path, so the cell also works when the
# notebook is not running from /content.
tokenizer_config_path = f"{training_args.output_dir}/tokenizer_config.json"

with open(tokenizer_config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Make the necessary changes to the config file.
config["name_or_path"] = MODEL

with open(tokenizer_config_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)

# Upload the edited file to the same model repo the Trainer is configured for;
# the URL returned by upload_file is shown as the cell result.
api = HfApi()
api.upload_file(
    path_or_fileobj=tokenizer_config_path,
    path_in_repo="tokenizer_config.json",
    repo_id=training_args.hub_model_id,
    repo_type="model",
)

'https://huggingface.co/roymgabriel/BioPharma/blob/main/tokenizer_config.json'

## Logout from HuggingFace
Logout so your tokens won’t be used by someone else.

In [None]:
logout()  # log out of the Hugging Face Hub completely

Successfully logged out.
