# ICD-10 Two-Stage Fine-Tuning Process

This notebook implements the two-stage fine-tuning approach for medical coding automation:
1. Initial fine-tuning with complete ICD-10 code set
2. Enhanced fine-tuning for handling linguistic and lexical variations

In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
import deepspeed
import json
from datasets import Dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer

## Configuration

Choose between proprietary (GPT-4o mini) and open-source (Llama) models.

In [2]:
# Model selection - choose between proprietary and open-source models
MODEL_TYPE = "llama"  # Options: "llama" or "gpt4o_mini"

# Configuration
# Every supported MODEL_TYPE is matched explicitly. The training functions in
# this notebook treat anything other than "gpt4o_mini" as a Hugging Face model,
# so a typo must fail here instead of silently selecting the OpenAI settings.
if MODEL_TYPE == "llama":
    BASE_MODEL = "meta-llama/Llama-3.2-1B"  # Options: Llama-3.2-1B, Llama-3.2-3B, or Llama-3.1-8B
    USE_LORA = True  # For memory efficiency with Llama models
elif MODEL_TYPE == "gpt4o_mini":
    BASE_MODEL = "gpt-4o-mini"  # For OpenAI API calls
    USE_LORA = False
else:
    raise ValueError(
        f"Unsupported MODEL_TYPE {MODEL_TYPE!r}; expected 'llama' or 'gpt4o_mini'"
    )

# Paths
initial_output_dir = "./icd10_initial_model"    # Stage 1 checkpoints and final model
enhanced_output_dir = "./icd10_enhanced_model"  # Stage 2 checkpoints and final model

## Data Loading Function

Function to load ICD-10 data from JSONL format with system, user, and assistant messages.

In [3]:
def load_icd10_training_data(jsonl_path):
    """Load ICD-10 chat-format training records from a JSONL file.

    Every non-blank line must be a JSON object with a "messages" list whose
    entries are dicts with "role" and "content" keys. For each record the
    first message of each role is used.

    Args:
        jsonl_path: Path to the JSONL file (read as UTF-8).

    Returns:
        A ``Dataset`` with one row per record and the columns:
          - "system": system prompt ("" when the record has none)
          - "user": user message
          - "assistant": assistant message
          - "text": the non-empty parts above joined with newlines. The
            trainers in this notebook pass ``dataset_text_field="text"``,
            so this column has to exist.

    Raises:
        ValueError: If a record has no user or no assistant message; the
            message includes the offending line number.
    """
    def first_content(messages, role):
        # Content of the first message with the given role, or None if absent
        return next((msg["content"] for msg in messages if msg["role"] == role), None)

    formatted_data = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue

            messages = json.loads(line)["messages"]
            system_content = first_content(messages, "system") or ""
            user_content = first_content(messages, "user")
            assistant_content = first_content(messages, "assistant")

            if user_content is None or assistant_content is None:
                raise ValueError(
                    f"{jsonl_path}, line {line_number}: record needs both a "
                    f"'user' and an 'assistant' message"
                )

            # Single training string: prompt parts first, target last.
            # NOTE(review): plain newline-joined text is an assumption; swap in
            # a prompt template here if the model expects one.
            text = "\n".join(
                part for part in (system_content, user_content, assistant_content) if part
            )

            formatted_data.append({
                "system": system_content,
                "user": user_content,
                "assistant": assistant_content,
                "text": text,
            })

    return Dataset.from_list(formatted_data)

## Stage 1: Initial Fine-Tuning

The first stage of fine-tuning uses the complete ICD-10 code set (74,260 code-description pairs) to provide the model with comprehensive medical coding knowledge.

In [4]:
def run_initial_finetuning(train_data_path, output_dir, epochs=10, max_seq_length=50):
    """Stage 1: fine-tune the base model on the complete ICD-10 code set.

    Behaviour depends on the notebook-level MODEL_TYPE:

    * "gpt4o_mini": creates an OpenAI fine-tuning job and returns immediately
      (the job runs remotely).
    * anything else: fine-tunes BASE_MODEL locally with TRL's SFTTrainer,
      wrapping it in LoRA adapters when USE_LORA is set.

    Args:
        train_data_path: Path to the JSONL training file. For OpenAI models an
            already-uploaded file ID is accepted as well.
        output_dir: Directory for checkpoints and the final model (local
            training only).
        epochs: Number of training epochs.
        max_seq_length: Maximum number of tokens per training example passed
            to SFTTrainer (local training only).
            NOTE(review): 50 was chosen for bare ICD-10 descriptions; confirm
            it is large enough for the full system + user + assistant text.

    Returns:
        For OpenAI models, the fine-tuning job ID. Otherwise the path
        ``<output_dir>/final_model`` where the trained model was saved.
    """
    if MODEL_TYPE == "gpt4o_mini":
        # OpenAI API-based fine-tuning
        from openai import OpenAI
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        # The fine-tuning endpoint expects the ID of an uploaded file, not a
        # local path, so upload local files first. Anything that is not an
        # existing file is assumed to already be a file ID.
        if os.path.isfile(train_data_path):
            with open(train_data_path, "rb") as training_fh:
                uploaded = client.files.create(file=training_fh, purpose="fine-tune")
            training_file_id = uploaded.id
        else:
            training_file_id = train_data_path

        # Start fine-tuning job
        response = client.fine_tuning.jobs.create(
            training_file=training_file_id,
            model=BASE_MODEL,  # "gpt-4o-mini" when MODEL_TYPE is "gpt4o_mini"
            hyperparameters={
                "n_epochs": epochs,
                "learning_rate_multiplier": 1.8
            }
        )
        print(f"Fine-tuning job created: {response.id}")
        return response.id

    # Hugging Face-based fine-tuning for Llama models
    train_dataset = load_icd10_training_data(train_data_path)

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    # Reuse EOS as the padding token so batches can be padded
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    if USE_LORA:
        # Use LoRA for parameter-efficient fine-tuning
        # NOTE(review): the model above is not loaded in 8-/4-bit; confirm that
        # prepare_model_for_kbit_training is still wanted for a plain fp16 model.
        model = prepare_model_for_kbit_training(model)
        lora_config = LoraConfig(
            r=16,  # rank
            lora_alpha=32,  # scaling factor
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",  # wrap as a causal-LM PEFT model
        )
        model = get_peft_model(model, lora_config)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,  # effective batch size of 8 per device
        learning_rate=1e-5,
        lr_scheduler_type="cosine",
        warmup_steps=100,
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
        fp16=True,
        report_to="tensorboard",
        # DeepSpeed is only enabled on multi-GPU machines and then requires a
        # ds_config.json in the working directory
        deepspeed="ds_config.json" if torch.cuda.device_count() > 1 else None
    )

    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        args=training_args,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        dataset_text_field="text"
    )

    # Start training
    trainer.train()

    # NOTE(review): with USE_LORA this directory presumably holds adapter
    # weights only; confirm before loading it as a standalone model.
    final_model_path = os.path.join(output_dir, "final_model")
    trainer.save_model(final_model_path)
    return final_model_path

## Stage 2: Enhanced Fine-Tuning

The second stage focuses on improving the model's ability to handle linguistic and lexical variations in clinical documentation, including:
- Reordered diagnostic expressions
- Medical abbreviations
- Typographical errors
- Multiple concurrent conditions
- Sentences with embedded diagnostic information

In [5]:
def run_enhanced_finetuning(initial_model_path, variation_data_paths, output_dir):
    """
    Enhanced fine-tuning to handle linguistic and lexical variations

    Behaviour depends on the notebook-level MODEL_TYPE:

    * "gpt4o_mini": creates one OpenAI fine-tuning job per variation type, each
      starting from the Stage 1 model. The jobs are independent of each other.
    * anything else: trains one local model on each variation dataset in turn
      (in the dict's iteration order), carrying the weights forward.

    Args:
        initial_model_path: Path to the initially fine-tuned model. For OpenAI
            models this is the fine-tuned model name, or the Stage 1
            fine-tuning job ID ("ftjob-..."), which is resolved to the model
            name once the job has finished.
        variation_data_paths: Dictionary mapping variation types to data paths
            (for OpenAI models, already-uploaded file IDs are accepted too)
        output_dir: Directory to save the enhanced model

    Returns:
        For OpenAI models, a dict mapping variation type to fine-tuning job ID.
        Otherwise the path ``<output_dir>/final_enhanced_model``.

    Raises:
        RuntimeError: If a Stage 1 OpenAI job ID is given but that job has not
            produced a fine-tuned model yet.
    """
    if MODEL_TYPE == "gpt4o_mini":
        from openai import OpenAI
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        # Stage 1 returns a fine-tuning *job* ID for OpenAI models, but a new
        # job must be given a model name; resolve the job to its output model.
        base_model_name = initial_model_path
        if str(initial_model_path).startswith("ftjob-"):
            initial_job = client.fine_tuning.jobs.retrieve(initial_model_path)
            if not initial_job.fine_tuned_model:
                raise RuntimeError(
                    f"Initial fine-tuning job {initial_model_path} has not produced "
                    f"a model yet (status: {initial_job.status})"
                )
            base_model_name = initial_job.fine_tuned_model

        # Start fine-tuning jobs for each variation type
        job_ids = {}
        for variation_type, data_path in variation_data_paths.items():
            # The endpoint expects an uploaded file ID, not a local path.
            # Anything that is not an existing file is assumed to be a file ID.
            if os.path.isfile(data_path):
                with open(data_path, "rb") as training_fh:
                    uploaded = client.files.create(file=training_fh, purpose="fine-tune")
                training_file_id = uploaded.id
            else:
                training_file_id = data_path

            response = client.fine_tuning.jobs.create(
                training_file=training_file_id,
                model=base_model_name,  # Use the initially fine-tuned model
                hyperparameters={
                    "n_epochs": 5,  # Fewer epochs for enhanced tuning
                    "learning_rate_multiplier": 1.0  # Lower learning rate
                }
            )
            job_ids[variation_type] = response.id
            print(f"Enhanced fine-tuning job for {variation_type} created: {response.id}")
        return job_ids

    # Initialize model from initial fine-tuning
    tokenizer = AutoTokenizer.from_pretrained(initial_model_path, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    adapter_config_path = os.path.join(initial_model_path, "adapter_config.json")
    if os.path.isfile(adapter_config_path):
        # The Stage 1 output is a LoRA adapter checkpoint: rebuild the base
        # model the same way Stage 1 does and attach the adapter in trainable
        # form so its weights keep being updated.
        from peft import PeftModel

        with open(adapter_config_path, "r", encoding="utf-8") as config_fh:
            base_model_name = json.load(config_fh)["base_model_name_or_path"]

        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
        base_model = prepare_model_for_kbit_training(base_model)
        model = PeftModel.from_pretrained(base_model, initial_model_path, is_trainable=True)
    else:
        # Full fine-tuning: the trainable weights must be float32 because
        # fp16=True below uses a gradient scaler, which refuses to unscale
        # float16 gradients.
        model = AutoModelForCausalLM.from_pretrained(
            initial_model_path,
            torch_dtype=torch.float32,
            device_map="auto",
            trust_remote_code=True
        )

    # Process each variation type sequentially
    for variation_type, data_path in variation_data_paths.items():
        print(f"Starting enhanced fine-tuning for {variation_type}")

        # Load variation-specific data
        variation_dataset = load_icd10_training_data(data_path)

        # Training arguments with lower learning rate and fewer epochs
        training_args = TrainingArguments(
            output_dir=os.path.join(output_dir, variation_type),
            num_train_epochs=5,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=8,
            learning_rate=5e-6,  # Lower learning rate for enhanced tuning
            lr_scheduler_type="cosine",
            warmup_steps=50,
            logging_steps=10,
            save_strategy="epoch",
            save_total_limit=1,
            fp16=True,
            report_to="tensorboard",
            # DeepSpeed is only enabled on multi-GPU machines and then requires
            # a ds_config.json in the working directory
            deepspeed="ds_config.json" if torch.cuda.device_count() > 1 else None
        )

        # Initialize trainer
        trainer = SFTTrainer(
            model=model,
            train_dataset=variation_dataset,
            args=training_args,
            tokenizer=tokenizer,
            max_seq_length=100,  # Longer context for variations
            dataset_text_field="text"
        )

        # Fine-tune on this variation
        trainer.train()

        # Save intermediate model after each variation
        model_path = os.path.join(output_dir, f"{variation_type}_model")
        trainer.save_model(model_path)

        # Continue with the weights that were just trained. Reloading them from
        # disk would briefly keep two copies of the model in memory.
        model = trainer.model

    # Save final enhanced model
    # NOTE(review): when the model is a PEFT model, save_pretrained presumably
    # writes adapter weights only; confirm this is the artifact consumers expect.
    final_model_path = os.path.join(output_dir, "final_enhanced_model")
    model.save_pretrained(final_model_path)
    tokenizer.save_pretrained(final_model_path)
    return final_model_path

## Running the Complete Two-Stage Process

Now let's run both stages to train the model for medical coding.

In [6]:
# Stage 1 input: the complete ICD-10 code/description dataset
icd10_complete_path = "icd10_complete_codes.jsonl"

# Stage 2 inputs: one JSONL file per variation type, keyed by a short name.
# The enhanced fine-tuning loop walks this mapping in insertion order.
variation_datasets = dict(
    reordered="icd10_reordered_descriptions.jsonl",
    abbreviations="icd10_medical_abbreviations.jsonl",
    typos="icd10_typographical_errors.jsonl",
    multiple_conditions="icd10_multiple_conditions.jsonl",
    sentence_embedding="icd10_sentence_embedding.jsonl",
)

print("Starting Stage 1: Initial Fine-Tuning...")

Starting Stage 1: Initial Fine-Tuning...


In [7]:
# Stage 1 training is opt-in so that Restart & Run All stays cheap:
# set the flag to True to actually run the initial fine-tuning.
RUN_INITIAL_FINETUNING = False

if RUN_INITIAL_FINETUNING:
    initial_model = run_initial_finetuning(icd10_complete_path, initial_output_dir, epochs=10)
else:
    # For demonstration, simulate completion by pointing at the location where
    # the local Stage 1 run saves its final model
    initial_model = os.path.join(initial_output_dir, "final_model")

print(f"Initial fine-tuning complete! Model saved at: {initial_model}")

Initial fine-tuning complete! Model saved at: ./icd10_initial_model/final_model


In [8]:
# Banner marking the start of the second stage in the notebook output
print("Starting Stage 2: Enhanced Fine-Tuning...")

Starting Stage 2: Enhanced Fine-Tuning...


In [9]:
# Stage 2 training is opt-in so that Restart & Run All stays cheap:
# set the flag to True to actually run the enhanced fine-tuning.
RUN_ENHANCED_FINETUNING = False

if RUN_ENHANCED_FINETUNING:
    enhanced_model = run_enhanced_finetuning(initial_model, variation_datasets, enhanced_output_dir)
else:
    # For demonstration, simulate completion by pointing at the location where
    # the local Stage 2 run saves its final model
    enhanced_model = os.path.join(enhanced_output_dir, "final_enhanced_model")

print(f"Enhanced fine-tuning complete! Final model saved at: {enhanced_model}")

Enhanced fine-tuning complete! Final model saved at: ./icd10_enhanced_model/final_enhanced_model
