# Modeling with Meditron

code :

In [2]:
!pip -q install datasets loralib sentencepiece trl mlflow bitsandbytes
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
HUGGING_FACE_TOKEN='ADD_YOUR_HUGGING_FACE_TOKEN'

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%cd gdrive/MyDrive/dl_chatbot

/content/gdrive/MyDrive/dl_chatbot


## Data processing for finetuning

In [4]:
import torch
import torch.nn as nn
import pandas as pd
import bitsandbytes as bnb
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from transformers import TrainingArguments
from trl import SFTTrainer

from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model

from datasets import load_dataset, load_from_disk, Dataset, DatasetDict



In [5]:
tokenizer = AutoTokenizer.from_pretrained("epfl-llm/meditron-7b", token=HUGGING_FACE_TOKEN, add_eos_token=True)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer_config.json:   0%|          | 0.00/4.08k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [6]:
def generate_prompt(data_point):
    # taken from https://github.com/tloen/alpaca-lora
    return f""" Below is a medical question below, provide an answer to it.
    ### Question :
    {data_point["question"]}

    ### Answer :
    {data_point["answer"]}"""

In [11]:
# Load the dataset
data  = load_dataset("json", data_files="final_qa_dataset.json")

data = pd.DataFrame(data['train'])

In [12]:
# We reduce the size of our dataset because of limited ressources
DATA_LIMIT = 120000

In [13]:
data = data[:DATA_LIMIT]

In [14]:
data['text'] = data.apply(lambda x : generate_prompt(x), axis=1)

In [None]:
data.head()

Unnamed: 0,question,answer,text
0,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...,"Below is a medical question below, provide an..."
1,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...,"Below is a medical question below, provide an..."
2,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen...","Below is a medical question below, provide an..."
3,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...,"Below is a medical question below, provide an..."
4,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...,"Below is a medical question below, provide an..."


In [15]:
dataset = Dataset.from_pandas(data[['question', 'text']]).train_test_split(test_size=0.05, seed=42)

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'text'],
        num_rows: 190
    })
    test: Dataset({
        features: ['question', 'text'],
        num_rows: 10
    })
})

In [16]:
# Tokenizing our data for the fine-tuning
CUTOFF_LEN = 4096

dataset = dataset.map(
    lambda data_point: tokenizer(
        data_point['text'],
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",
    )
)

Map:   0%|          | 0/114000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
dataset.save_to_disk('tokenized_dataset_100k')

In [22]:
#%cd ../../../
!zip -r /tokenized_dataset_100k.zip tokenized_dataset_100k

from google.colab import files
files.download("/tokenized_dataset_100k.zip")

  adding: tokenized_dataset_100k/ (stored 0%)
  adding: tokenized_dataset_100k/test/ (stored 0%)
  adding: tokenized_dataset_100k/test/dataset_info.json (deflated 67%)
  adding: tokenized_dataset_100k/test/state.json (deflated 38%)
  adding: tokenized_dataset_100k/test/data-00000-of-00001.arrow (deflated 98%)
  adding: tokenized_dataset_100k/dataset_dict.json (stored 0%)
  adding: tokenized_dataset_100k/train/ (stored 0%)
  adding: tokenized_dataset_100k/train/data-00003-of-00005.arrow (deflated 98%)
  adding: tokenized_dataset_100k/train/data-00004-of-00005.arrow (deflated 98%)
  adding: tokenized_dataset_100k/train/data-00001-of-00005.arrow (deflated 98%)
  adding: tokenized_dataset_100k/train/dataset_info.json (deflated 67%)
  adding: tokenized_dataset_100k/train/state.json (deflated 65%)
  adding: tokenized_dataset_100k/train/data-00002-of-00005.arrow (deflated 98%)
  adding: tokenized_dataset_100k/train/data-00000-of-00005.arrow (deflated 98%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Script like modeling

In [3]:
data  = load_from_disk("tokenized_dataset_100k")

print("Data loaded \n")

Data loaded 



In [4]:
LORA_R = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

base_dir = "lora-dolly"

per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-5
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"
EPOCHS = 2

In [5]:
model = AutoModelForCausalLM.from_pretrained("epfl-llm/meditron-7b", token=HUGGING_FACE_TOKEN, load_in_8bit=True,
    device_map="auto")

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

print("Model loaded \n")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

trainable params: 2,097,152 || all params: 6,740,652,032 || trainable%: 0.031112005041117066
Model loaded 



In [6]:
# Training Arguments and Trainer Initialization
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10,
    load_best_model_at_end=True
)

trainer = SFTTrainer(
   model,
   train_dataset=dataset['train'],
   eval_dataset = dataset['test'],
   dataset_text_field="text",
   max_seq_length=512,
   args=training_args,
)

print("Trainer set \n")

#Upcast layer norms to float 32 for stability
for name, module in trainer.model.named_modules():
 if "norm" in name:
   module = module.to(torch.float32)

Trainer set 



In [7]:
print("Training starting \n")

# Initiate the training process
trainer.train()

Training starting 



TypeError: ignored

In [None]:
print("Training finished")

model.save_pretrained("meditron-lora-dolly")