In [None]:
# Environment setup: install pinned versions of PyTorch (CUDA 12.1 wheels from the
# PyTorch index) and the Hugging Face fine-tuning stack used by the cells below.
# NOTE(review): sentence-transformers is uninstalled first, presumably because the
# preinstalled copy conflicts with the pinned transformers version — confirm.
!pip uninstall -y sentence-transformers
!pip install torch==2.3.1+cu121 torchvision==0.18.1+cu121 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
!pip install -q "transformers==4.43.2" "datasets==2.18.0" "accelerate==0.29.3" "peft==0.10.0" "bitsandbytes==0.43.1" "trl==0.8.6" "protobuf==3.20.3"
# These helper packages are not version-pinned; pip picks whatever it resolves.
!pip install -q einops scipy sentencepiece tensorboard

import os
# Sends signal 9 to this kernel's own process. Presumably this forces the runtime to
# restart so the freshly installed packages are the ones imported afterwards — confirm.
# Anything placed after this line in the same cell will not run.
os.kill(os.getpid(), 9)

Found existing installation: sentence-transformers 4.1.0
Uninstalling sentence-transformers-4.1.0:
  Successfully uninstalled sentence-transformers-4.1.0
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.3.1+cu121
  Downloading https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp311-cp311-linux_x86_64.whl (781.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.0/781.0 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.18.1+cu121
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.18.1%2Bcu121-cp311-cp311-linux_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.3.1
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.3.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m102.5 M

In [None]:

import json
import torch
from datasets import load_dataset
from google.colab import userdata
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training



# Log in to the Hugging Face Hub with the `HF_TOKEN` secret stored in Colab.
# Any failure (missing secret, rejected token, network error) is reported and
# stops the notebook, since the later model download depends on this login.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN)
except Exception as e:
    print(f"🚨 Login failed. Please set your HF_TOKEN in Colab Secrets. Error: {e}")
    raise SystemExit("Stopping due to failed Hugging Face login.")
else:
    print("Successfully logged into Hugging Face.")

# Load the fine-tuning corpus from a JSON Lines file. Each record is expected to
# carry an 'input' field (the patient narrative) and an 'output' field (the target
# completion); both are read by `format_example` further down.
DATA_FILE = 'training_data_llm_generated_final.jsonl'
REQUIRED_COLUMNS = ('input', 'output')

try:
    dataset = load_dataset('json', data_files=DATA_FILE, split='train')
except Exception as e:
    print(f"🚨 Failed to load dataset. Make sure '{DATA_FILE}' is uploaded. Error: {e}")
    raise SystemExit("Stopping due to dataset loading failure.") from e

# Fail fast on a malformed file instead of surfacing a KeyError in the middle of
# the formatting map (or silently training on nothing).
missing_columns = [col for col in REQUIRED_COLUMNS if col not in dataset.column_names]
if missing_columns:
    print(f"🚨 Dataset '{DATA_FILE}' is missing required column(s): {missing_columns}")
    raise SystemExit("Stopping due to dataset loading failure.")
if len(dataset) == 0:
    print(f"🚨 Dataset '{DATA_FILE}' contains no examples.")
    raise SystemExit("Stopping due to dataset loading failure.")

print("\nDataset loaded successfully.")
print(dataset)


def format_example(example):
    """Render one record as a single Llama 3 chat-formatted training string.

    Args:
        example: Mapping with an 'input' entry (the patient narrative) and an
            'output' entry (the target completion).

    Returns:
        A dict with one key, 'text': a user turn containing the fixed
        instruction and the narrative, followed by an assistant turn
        containing the output.
    """
    input_text, output_text = example['input'], example['output']
    instruction = "Analyze the provided patient narrative about their experience with an Abdominal Wall Hernia (AWH) and generate a structured JSON output that summarizes your findings, adhering to the specified format and terminology."
    # Role headers and the <|eot_id|> turn terminators are written out literally.
    text = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{instruction}\n\n**Patient Narrative:**\n{input_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{output_text}<|eot_id|>"
    return {'text': text}

# Render every record into its chat-formatted 'text' field.
formatted_dataset = dataset.map(format_example)

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use a dedicated padding token instead of aliasing EOS. The
# DataCollatorForLanguageModeling(mlm=False) used for training replaces every
# label equal to pad_token_id with -100. With pad == EOS, every end-of-turn
# token in the targets is dropped from the loss, so the fine-tuned model is
# never trained to stop generating.
# NOTE(review): current Hub revisions of this tokenizer report EOS as <|eot_id|>,
# which is exactly the terminator used in the training text — confirm.
# The reserved token below should already exist in the Llama 3 vocabulary
# (the check after it verifies this), so no embedding resize is needed.
tokenizer.pad_token = "<|reserved_special_token_0|>"
if tokenizer.pad_token_id is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    raise SystemExit("Stopping: could not assign a padding token distinct from EOS.")

def tokenize_function(examples):
    """Tokenize a batch of pre-formatted chat strings.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings that
            already contain the Llama 3 special tokens, including the leading
            <|begin_of_text|>.

    Returns:
        The tokenizer's output for the batch, truncated to 2048 tokens.
    """
    # The text already starts with a literal <|begin_of_text|>. Letting the
    # tokenizer add its own special tokens would prepend a second BOS to every
    # training sequence, so automatic insertion is disabled.
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=2048,
        add_special_tokens=False,
    )

# Tokenize in batches and drop the raw string columns so that only the
# tokenizer's outputs remain in the dataset handed to the trainer.
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['input', 'output', 'text'],
)
print("Dataset formatted and tokenized.")




# Seed torch before the LoRA adapter is created. The adapter weights are
# randomly initialised inside get_peft_model(), which runs before the Trainer
# is constructed, so without this the run is not repeatable.
# NOTE(review): 42 is chosen to match what is believed to be the
# TrainingArguments default seed — confirm.
SEED = 42
torch.manual_seed(SEED)

# 4-bit NF4 quantization of the base weights, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized base model; device placement is left to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
model.config.use_cache = False # Important for fine-tuning
# PEFT's preparation step for training on top of a quantized base model.
model = prepare_model_for_kbit_training(model)

# LoRA adapter: rank 16, alpha 32, attached to the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down); no bias terms are trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
print("Model configured with QLoRA.")

# Single source of truth for where checkpoints and the final adapter are written,
# so the training arguments, the save call and the message cannot drift apart.
OUTPUT_DIR = './llama3-hernia-analyst-v1'

# NOTE(review): the batch-size comment mentions a T4 GPU, yet bf16=True is set.
# T4 has no native bfloat16 support, so this may be slow or rejected depending on
# the installed torch/transformers versions — confirm the intended hardware.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=1, # Reduced for safety on T4 GPU
    gradient_accumulation_steps=8, # Effective batch size = 1 * 8 = 8
    learning_rate=2e-4,
    bf16=True,
    save_strategy="epoch",
    logging_steps=25,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
)

# Causal-LM collator (mlm=False): pads each batch and derives labels from input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("\n🚀 Starting the fine-tuning process...")
trainer.train()
print("✅ Training complete!")

# Write the trained model (the PEFT adapter) to the same directory as the checkpoints.
trainer.save_model(OUTPUT_DIR)
print(f"✅ Fine-tuned model adapter saved successfully to '{OUTPUT_DIR}'")

✅ Successfully logged into Hugging Face.


Generating train split: 0 examples [00:00, ? examples/s]


✅ Dataset loaded successfully.
Dataset({
    features: ['input', 'output'],
    num_rows: 263
})


Map:   0%|          | 0/263 [00:00<?, ? examples/s]

Map:   0%|          | 0/263 [00:00<?, ? examples/s]

✅ Dataset formatted and tokenized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Model configured with QLoRA.





🚀 Starting the fine-tuning process...


[34m[1mwandb[0m: Currently logged in as: [33mlaxmikant-nishad2308[0m ([33mlaxmikant-nishad2308-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
25,1.3972


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


✅ Training complete!
✅ Fine-tuned model adapter saved successfully to './llama3-hernia-analyst-v1'


In [None]:
# Recursively copy the saved adapter directory into Google Drive, presumably so it
# survives the end of the Colab session.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount call
# is visible in this part of the notebook; confirm.
!cp -r ./llama3-hernia-analyst-v1/ '/content/drive/My Drive/'