In [34]:
import os
import torch
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType
)

In [35]:
# Select the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [36]:
# Identifier of the base checkpoint passed to `from_pretrained` for both the
# model and the tokenizer.
model_id = "microsoft/phi-2"
# Root directory for training outputs: the Trainer writes here, the tokenizer is
# saved here, and the adapter is saved under its "peft_model" subdirectory.
output_dir = "./phi2-qlora-finetuned-med/"

In [37]:
# Quantization settings for the base model (the "Q" in QLoRA). The resulting
# object is handed to `from_pretrained` through `quantization_config`.
bnb_config = BitsAndBytesConfig(
    # Load the weights in 4-bit, using the "nf4" quantization type ...
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # ... with double quantization switched on ...
    bnb_4bit_use_double_quant=True,
    # ... and float16 as the compute dtype.
    bnb_4bit_compute_dtype=torch.float16,
)

In [38]:
# Instantiate the base checkpoint named by `model_id`, quantized according to
# `bnb_config`; `device_map="auto"` leaves device placement to the loader.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally -- confirm it is still required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [39]:
# Tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Pad on the right so real tokens keep their positions at the start of the row.
tokenizer.padding_side = "right"
# Reuse EOS as the padding token. After this, padding and a genuine EOS share
# one token id and can only be told apart through the attention mask.
tokenizer.pad_token = tokenizer.eos_token

In [40]:
# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                       # Rank
    lora_alpha=16,             # Alpha parameter
    lora_dropout=0.1,          # Dropout probability
    # Module names as they appear in the Hugging Face Phi implementation:
    # attention projections (q_proj/k_proj/v_proj/dense) and the MLP (fc1/fc2).
    # The previous list used GPT-NeoX-style names (query_key_value,
    # dense_h_to_4h, dense_4h_to_h) that do not exist in Phi-2, so only "dense"
    # was actually adapted. PEFT only errors when *no* name matches, which is
    # why the mismatch went unnoticed. Run `model.print_trainable_parameters()`
    # after `get_peft_model` to confirm the adapters landed where expected.
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
    bias="none",
    inference_mode=False,
)

In [41]:
# Ready the quantized model for k-bit training, then wrap it with the LoRA
# adapters described by `lora_config`.
# NOTE: this rebinds `model`; re-running the cell would wrap an already-wrapped model.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)


In [42]:
# Read the CSV into a Dataset and hold out 10% of the rows; the fixed seed makes
# the shuffled split reproducible.
dataset = Dataset.from_csv("final_dataset.csv")
dataset_dict = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

In [43]:
# Re-key the held-out split from 'test' to 'validation'.
# NOTE: this rebinds `dataset_dict`, so the cell cannot be re-run on its own
# (the 'test' key no longer exists afterwards).
dataset_dict = DatasetDict(
    train=dataset_dict["train"],
    validation=dataset_dict["test"],
)

In [44]:
def create_conversation_pairs(hf_dataset):
    """
    Convert dataset rows into instruction/output records.

    Args:
        hf_dataset: An iterable of row mappings (e.g. a HuggingFace Dataset),
            each with the keys 'input' and 'output'.

    Returns:
        A list of dicts, one per row, with the keys 'instruction' (taken from
        the row's 'input') and 'output' (copied unchanged).

    Raises:
        KeyError: If a row lacks 'input' or 'output'.
    """
    return [
        {"instruction": example["input"], "output": example["output"]}
        for example in hf_dataset
    ]

In [45]:
# Turn the training split into instruction/output records and eyeball the result.
conversation_pairs = create_conversation_pairs(dataset_dict["train"])
print(f"Number of pairs: {len(conversation_pairs)}")
print(f"First pair: {conversation_pairs[0]}")

Number of pairs: 76047
First pair: {'instruction': 'What is suggested by the presence of a new murmur after a prosthetic valve has been implanted?', 'output': 'The presence of a new murmur after a prosthetic valve has been implanted is suggestive of prosthetic valve dysfunction. A new murmur can indicate that there is a problem with the valve, such as a leak or a blockage, which can lead to reduced blood flow and other complications. In some cases, additional testing, such as an echocardiogram or cardiac catheterization, may be needed to confirm the diagnosis and determine the best course of treatment.'}


In [46]:
# Same conversion for the validation split, with the same sanity prints.
conversation_pairs_validation = create_conversation_pairs(dataset_dict["validation"])
print(f"Number of pairs: {len(conversation_pairs_validation)}")
print(f"First pair: {conversation_pairs_validation[0]}")

Number of pairs: 8450
First pair: {'instruction': 'What is subacute sclerosing panencephalitis (SSPE), and what causes it?', 'output': 'Subacute sclerosing panencephalitis (SSPE) is a rare, progressive, and usually fatal neurological disorder that occurs as a result of persistent infection of the brain by the measles virus. The virus causes inflammation and damage to the brain, leading to symptoms such as seizures, dementia, and loss of motor function. SSPE typically develops several years after a person has had measles, and is more common in individuals who contracted the virus at a young age. There is currently no cure for SSPE, and treatment is mainly focused on managing the symptoms.'}


In [47]:
from datasets import Dataset

# Wrap the instruction/output records back into Dataset objects so they can be
# tokenized with `.map`.
train_dataset = Dataset.from_list(conversation_pairs)
# The validation tokenization cell maps over `val_dataset`; it was never
# defined, which raised NameError on a fresh Restart & Run All.
val_dataset = Dataset.from_list(conversation_pairs_validation)

In [48]:
# Function to tokenize the dataset
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of instruction/output pairs for causal-LM training.

    Each pair is rendered as
    ``<|user|>{instruction}<|assistant|>{output}<|endoftext|>`` and then padded
    or truncated to exactly ``max_length`` tokens. The ``<|user|>`` and
    ``<|assistant|>`` markers are never registered as special tokens in this
    notebook, so the tokenizer treats them as ordinary text.

    Args:
        examples: Batch mapping with parallel lists under "instruction" and
            "output" (the shape `Dataset.map(..., batched=True)` passes in).
        max_length: Fixed sequence length after padding/truncation.

    Returns:
        The tokenizer's output with an added "labels" entry: a copy of
        "input_ids" in which every padding position (attention mask 0) is
        replaced by -100 so it is ignored by the loss.
    """
    prompts = [
        f"<|user|>{instruction}<|assistant|>{output}<|endoftext|>"
        for instruction, output in zip(examples["instruction"], examples["output"])
    ]

    # Plain Python lists are enough here: `Dataset.map` stores the result as
    # lists anyway, so asking the tokenizer for tensors only adds conversions.
    tokenized_inputs = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Mask by attention mask rather than by token id: the pad token is the EOS
    # token, so masking by id would also hide the real end-of-text target.
    tokenized_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(
            tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"]
        )
    ]

    return tokenized_inputs

In [49]:
# Tokenize the datasets
train_tokenized = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["instruction", "output"],
)

Map:   0%|          | 0/76047 [00:00<?, ? examples/s]

In [50]:
# Apply the same batched tokenization to the validation records.
val_tokenized = val_dataset.map(
    tokenize_function,
    remove_columns=["instruction", "output"],
    batched=True,
)

Map:   0%|          | 0/8450 [00:00<?, ? examples/s]

In [51]:
# Create data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [52]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    report_to="tensorboard",
    gradient_checkpointing=True,
)

In [53]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
)


In [None]:
# Run fine-tuning with the settings in `training_args` (evaluation and a
# checkpoint every 500 steps, written under `output_dir`).
trainer.train()

  0%|          | 0/23765 [00:00<?, ?it/s]

In [None]:
# Save the fine-tuned model
# The model is saved under "<output_dir>/peft_model"; the tokenizer goes to
# `output_dir` itself, so the two live in different directories.
peft_model_path = os.path.join(output_dir, "peft_model")
model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(output_dir)

# Load the PEFT config back from the directory just written and save it to the
# same place again.
# NOTE(review): this round-trip looks redundant with `model.save_pretrained`
# above -- confirm whether it is still needed.
from peft import PeftConfig
config = PeftConfig.from_pretrained(peft_model_path)
config.save_pretrained(peft_model_path)

print(f"Model saved to {peft_model_path}")
print(f"Tokenizer saved to {output_dir}")

In [None]:
# Function for inference
def generate_response(instruction, model, tokenizer, max_length=512):
    prompt = f"<|user|>{instruction}<|assistant|>"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    # Extract only the assistant's response
    response = response.split("<|assistant|>")[1].split("<|endoftext|>")[0].strip()
    return response

In [None]:
# Test the model with a sample question
test_instruction = " how long will 6 mgs of suboxone block opiates?"
response = generate_response(test_instruction, model, tokenizer)
print(f"Instruction: {test_instruction}")
print(f"Response: {response}")