In [None]:
# Install the libraries this notebook relies on. `%pip` is used instead of
# `!pip` so the packages land in the environment of the running kernel rather
# than in whichever `pip` happens to be first on the shell PATH.
%pip install -q transformers einops accelerate langchain bitsandbytes


In [None]:
# Hub client first, then upgrade TRL/PEFT and Datasets/W&B to their latest
# releases. `%pip` installs into the running kernel's environment.
# NOTE(review): no versions are pinned, so the installed APIs depend on what is
# current at install time.
%pip install -q huggingface_hub
%pip install -q -U trl peft
%pip install -q -U datasets wandb

In [None]:
# Optional: uncomment to install PEFT and Transformers from source when the released versions do not yet support a newer model (e.g. Llama 2).
# !pip install git+https://github.com/huggingface/peft.git
# !pip install git+https://github.com/huggingface/transformers.git

In [None]:
# Authenticate this runtime with the Hugging Face Hub. When prompted, paste the
# HF access token you created earlier; the base-model download further down
# asks for authenticated access.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Hub id of the checkpoint that gets fine-tuned.
base_model_name = "meta-llama/Llama-2-7b-hf"

# The empty-string key stands for the whole model, so everything is placed on
# device index 0 (the same device the inference cell moves its inputs to).
device_map = {"": 0}

# 4-bit loading with the "nf4" quantization type; float16 is the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Load the base checkpoint using the quantization settings in `bnb_config`
# and the placement in `device_map`.
# NOTE(review): `trust_remote_code=True` lets code shipped in the model repo
# run locally — confirm it is actually needed for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    # `token=True` picks up the credential stored by `notebook_login()`. It
    # replaces the deprecated `use_auth_token` argument, which matters here
    # because the notebook installs an unpinned `transformers`.
    token=True,
)
# The generation cache is switched off on the config before training.
base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1

In [None]:
# Directory handed to `TrainingArguments` as the trainer's output location.
output_dir = "./results"

# Tokenizer of the base checkpoint; its EOS token is reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter settings for causal-LM fine-tuning: rank 64, alpha 16,
# dropout 0.1, no bias parameters trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Alternative: load a dataset hosted on the Hub instead of a local file.
#dataset_name = "<your_hf_dataset>"
#dataset = load_dataset(dataset_name, split="train")

# Local JSON Lines file with the training records. Loading it through the
# "json" builder without a `split` yields an object indexed by split name
# (the training cell reads `dataset['train']`).
train = "icddx_train.jsonl"
dataset = load_dataset("json", data_files=train)

In [None]:
# Display the loaded dataset object as a sanity check before training.
dataset

In [None]:
import os

# Training schedule: batches of 4 per device with 4 gradient-accumulation
# steps, learning rate 2e-4, a log entry every 10 steps, stop after 100 steps.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=100,
)

max_seq_length = 512

# Supervised fine-tuning on the "text" column of the train split, with the
# LoRA settings from `peft_config` applied to the quantized base model.
# NOTE(review): TRL is installed unpinned; newer releases may expect
# `dataset_text_field` / `max_seq_length` / `tokenizer` to be passed
# differently — confirm against the installed version.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)

trainer.train()

# Save the trained model under "<output_dir>/final_checkpoint". The path is
# kept in its own variable: rebinding `output_dir` here would make a second run
# of this cell train into ".../final_checkpoint" and save into
# ".../final_checkpoint/final_checkpoint".
final_checkpoint_dir = os.path.join(output_dir, "final_checkpoint")
trainer.model.save_pretrained(final_checkpoint_dir)

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the fine-tuned checkpoint for inference. Training saves it under
# "./results/final_checkpoint", and the archive cells zip/unzip "results/", so
# that is the directory to read back. The previous
# "./icd_dx_model/results/final_checkpoint" path is not created anywhere in
# this notebook.
output_dir = "./results/final_checkpoint"
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
)


In [None]:
# Sample prompt in the instruction / input / output layout.
# NOTE(review): "\\n" is a literal backslash followed by "n", not a line break —
# confirm this matches how the training text is formatted.
text = "###Instruction\\nGive ICD10 Dx code for given text\\n###Input:\\nSepsis due to septic shock\\n###Output:\\n"

# Tokenize and move the whole encoding to device 0 in one step, so the
# individual tensors need no further transfer before generation.
inputs = tokenizer(text, return_tensors="pt").to(0)
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (only) returned sequence, dropping special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# List the working directory. The explicit `%ls` magic is used because a bare
# `ls` depends on automagic, which only applies to single-line cells and stops
# working if a variable named `ls` exists.
%ls

In [None]:
# Recursively archive the results/ directory into icd_dx_model.zip.
!zip -r icd_dx_model.zip results/

In [None]:
# Download the model archive from the Colab runtime. `files` is reused by the
# later download cells.
from google.colab import files
files.download('icd_dx_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download the training data file (relies on `files` imported in an earlier cell).
files.download('icddx_train.jsonl')

In [None]:
# Recursively archive the wandb/ directory into wandb.zip.
!zip -r wandb.zip wandb/

In [None]:
# Download the zipped wandb/ directory (relies on `files` imported in an earlier cell).
files.download('wandb.zip')

In [None]:
# Extract icd_dx_model.zip into the current working directory.
!unzip icd_dx_model.zip

In [None]:
# List the working directory. The explicit `%ls` magic is used because a bare
# `ls` depends on automagic, which only applies to single-line cells and stops
# working if a variable named `ls` exists.
%ls


[0m[01;34mdrive[0m/  icd_dx_model.zip  [01;34msample_data[0m/


In [None]:
# Extract the model archive. The file name is spelled out in full, matching the
# earlier unzip cell, instead of relying on unzip appending ".zip" when the
# bare name is not found.
!unzip icd_dx_model.zip