# 🚀 Kenya Clinical Reasoning - PRODUCTION ML TRAINING
**FLAN-T5-small Fine-tuning on Expert Clinical Data**

**Target:** Competition-winning model using REAL expert responses  
**Hardware:** Kaggle P100 GPU acceleration  
**Model:** Google FLAN-T5-small (77M params, edge-deployable)

In [None]:
# Install dependencies (run once)
!pip install rouge-score datasets accelerate transformers torch -q

# Setup
import torch
import pandas as pd
import numpy as np
from datetime import datetime
import json
import sys
import os

# Check PyTorch and transformers compatibility
print(f"🔥 PyTorch version: {torch.__version__}")

# Test AdamW import (fixed in newer versions)
try:
    from torch.optim import AdamW
    print("✅ AdamW imported from torch.optim (recommended)")
except ImportError:
    try:
        from transformers import AdamW
        print("⚠️ AdamW imported from transformers (deprecated)")
    except ImportError:
        print("❌ AdamW not found - installing latest transformers")
        !pip install --upgrade transformers torch

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🔥 Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
else:
    print("⚠️ No GPU available - training will be slower on CPU")

🔥 Using device: cuda
GPU: Tesla P100-PCIE-16GB
Memory: 17.1GB


In [2]:
!git clone https://github.com/jnopareboateng/kenyan-medical-reasoning.git

Cloning into 'kenyan-medical-reasoning'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 77 (delta 9), reused 77 (delta 9), pack-reused 0 (from 0)[K
Receiving objects: 100% (77/77), 2.63 MiB | 21.56 MiB/s, done.
Resolving deltas: 100% (9/9), done.


In [2]:
os.getcwd()

'/kaggle/working'

In [3]:
os.listdir()

['.virtual_documents', 'kenyan-medical-reasoning']

In [4]:
%cd kenyan-medical-reasoning

/kaggle/working/kenyan-medical-reasoning


In [None]:
# Ensure all dependencies are imported first
import torch
import numpy as np
import pandas as pd

# Import our existing modules
import sys
sys.path.append(".")
from core.ml_model import MLPipeline, ClinicalT5Model, ClinicalExample
from utils.logger import CompetitionLogger

# Initialize
logger = CompetitionLogger("ML_Training")
logger.info("🚀 PRODUCTION ML TRAINING STARTED")

# Load training data
train_df = pd.read_csv("data/train.csv")
print(f"📊 Loaded {len(train_df)} training cases")
print(f"Columns: {list(train_df.columns)}")

# Check expert response columns
expert_cols = [
    "Nursing Competency",
    "Clinical Panel", 
    "Clinician",
    "GPT4.0",
    "LLAMA",
    "GEMINI",
]
for col in expert_cols:
    if col in train_df.columns:
        filled = train_df[col].notna().sum()
        print(
            f"✅ {col}: {filled}/{len(train_df)} responses ({filled/len(train_df)*100:.1f}%)"
        )

2025-06-16 18:30:01.296078: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750098601.318735     142 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750098601.325706     142 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


ImportError: cannot import name 'AdamW' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [None]:
# Quick import test to verify everything works
print("🔍 Testing imports...")

try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    from torch.optim import AdamW
    print("✅ Transformers and PyTorch imports successful")
    
    from core.ml_model import ClinicalT5Model
    print("✅ Custom ML model import successful")
    
    print("🎯 All imports working - ready for training!")
    
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Installing missing dependencies...")
    !pip install rouge-score datasets accelerate -q

In [None]:
# Initialize FLAN-T5 model
model = ClinicalT5Model("google/flan-t5-small")
logger.info(f"Model loaded: {sum(p.numel() for p in model.model.parameters()):,} parameters")

# Prepare training examples from REAL expert data
training_examples = model.prepare_training_data(train_df)
logger.info(f"✅ Prepared {len(training_examples)} training examples")

# Show sample
if training_examples:
    sample = training_examples[0]
    print("📋 SAMPLE TRAINING EXAMPLE:")
    print(f"Input: {sample.input_text[:200]}...")
    print(f"Target: {sample.target_response[:200]}...")
    print(f"Length: {len(sample.target_response)} chars")

In [None]:
# Split training data
train_size = int(0.85 * len(training_examples))
train_examples = training_examples[:train_size]
val_examples = training_examples[train_size:]

logger.info(f"📈 Training: {len(train_examples)}, Validation: {len(val_examples)}")

# Training configuration for GPU acceleration
config = {
    'epochs': 3,
    'batch_size': 8,  # Increase for P100
    'learning_rate': 3e-5,
}

logger.info(f"🔧 Training config: {config}")

# Start training (this will take several minutes on P100)
print("🚀 STARTING FINE-TUNING...")
training_results = model.fine_tune(
    train_examples=train_examples,
    val_examples=val_examples,
    **config
)

logger.info("✅ Training completed!")
print("📊 Training Results:")
for stat in training_results['training_stats']:
    print(f"Epoch {stat['epoch']}: Loss={stat['train_loss']:.4f}, ROUGE-L={stat.get('rouge_l', 0):.4f}")

In [None]:
# Load test data and generate predictions
test_df = pd.read_csv('data/test.csv')
logger.info(f"📋 Generating predictions for {len(test_df)} test cases...")

predictions = []
for idx, row in test_df.iterrows():
    # Create input prompt
    input_prompt = model._create_input_prompt(row)
    
    # Generate response
    response = model.generate_response(input_prompt, max_length=200)
    predictions.append(response)
    
    if idx % 10 == 0:
        print(f"Generated {idx+1}/{len(test_df)} predictions")

logger.info("✅ All predictions generated!")

# Analyze prediction lengths
lengths = [len(p) for p in predictions]
print(f"📏 Prediction lengths: Mean={np.mean(lengths):.1f}, Range={min(lengths)}-{max(lengths)}")
target_range = [(l >= 600 and l <= 800) for l in lengths]
print(f"🎯 Target range (600-800 chars): {sum(target_range)}/{len(target_range)} ({np.mean(target_range)*100:.1f}%)")

In [None]:
# Create submission file
submission_df = pd.DataFrame({
    'id': range(len(predictions)),
    'response': predictions
})

# Save submission
submission_path = 'flan_t5_submission.csv'
submission_df.to_csv(submission_path, index=False)
logger.info(f"💾 Submission saved: {submission_path}")

# Save model
model_path = 'flan_t5_clinical_model'
model.save_model(model_path)
logger.info(f"🤖 Model saved: {model_path}")

# Create final summary
summary = {
    'timestamp': datetime.now().isoformat(),
    'model': 'FLAN-T5-small',
    'parameters': sum(p.numel() for p in model.model.parameters()),
    'training_examples': len(train_examples),
    'validation_examples': len(val_examples),
    'test_predictions': len(predictions),
    'mean_response_length': float(np.mean(lengths)),
    'target_range_percentage': float(np.mean(target_range) * 100),
    'training_results': training_results,
    'submission_file': submission_path,
    'model_path': model_path
}

with open('training_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print("🏆 PRODUCTION ML TRAINING COMPLETE!")
print(f"✅ Model: {summary['parameters']:,} parameters")
print(f"✅ Submission: {submission_path}")
print(f"✅ Mean length: {summary['mean_response_length']:.1f} chars")
print(f"✅ Target range: {summary['target_range_percentage']:.1f}%")

In [None]:
# Show sample predictions
print("🔍 SAMPLE PREDICTIONS:")
for i in range(min(3, len(predictions))):
    print(f"\n--- CASE {i+1} ---")
    print(f"Length: {len(predictions[i])} chars")
    print(f"Response: {predictions[i]}")

# Quantize model for edge deployment (optional)
print("\n🔧 Quantizing model for edge deployment...")
quantized_model = model.quantize_for_edge()
print("✅ Quantized model ready for Jetson Nano deployment")

# Final download instructions
print("\n📥 DOWNLOAD FILES:")
print("1. flan_t5_submission.csv - Competition submission")
print("2. flan_t5_clinical_model/ - Trained model directory") 
print("3. training_summary.json - Training metrics")

logger.info("🎯 READY FOR COMPETITION SUBMISSION!")

In [6]:
# Core ML imports
import torch
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer,
)
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset as HFDataset
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
import json
import yaml
import os
import re
from pathlib import Path
from datetime import datetime
import time
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import warnings

warnings.filterwarnings("ignore")

print("✅ All libraries imported successfully!")

ImportError: cannot import name 'AdamW' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

## 3. 📁 Configure Project Directory Structure

Set up clean separation of concerns with core/, utils/, logs/ directories.

In [None]:
# Set up project structure
PROJECT_ROOT = Path("/kaggle/working")
DATA_DIR = Path("/kaggle/input")  # Kaggle input data location

# Create required directories
directories = {
    'core': PROJECT_ROOT / 'core',
    'utils': PROJECT_ROOT / 'utils', 
    'logs': PROJECT_ROOT / 'logs',
    'models': PROJECT_ROOT / 'models',
    'results': PROJECT_ROOT / 'results',
    'configs': PROJECT_ROOT / 'configs'
}

for name, path in directories.items():
    path.mkdir(parents=True, exist_ok=True)
    print(f"✅ Created {name}/ directory")

# Set up paths
PATHS = {
    'project_root': PROJECT_ROOT,
    'data_dir': DATA_DIR,
    'train_data': DATA_DIR / 'train.csv',  # Update with actual Kaggle dataset path
    'test_data': DATA_DIR / 'test.csv',
    'logs': directories['logs'],
    'models': directories['models'],
    'results': directories['results']
}

print(f"\n📁 Project structure ready at: {PROJECT_ROOT}")
print(f"📊 Data expected at: {DATA_DIR}")

## 4. ⚙️ Load Configuration and Set Up Logging

Configure training parameters and initialize logging system.

In [None]:
# Training configuration (optimized for P100)
CONFIG = {
    'model': {
        'name': 'google/flan-t5-small',  # 77M parameters
        'max_length': 512,
        'target_length': 200
    },
    'training': {
        'batch_size': 8,  # Optimized for P100 16GB
        'learning_rate': 5e-5,
        'epochs': 3,
        'warmup_ratio': 0.1,
        'weight_decay': 0.01,
        'gradient_accumulation_steps': 2
    },
    'evaluation': {
        'eval_steps': 100,
        'save_steps': 500,
        'logging_steps': 50
    }
}

# Simple logging setup
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Save config
config_path = PATHS['project_root'] / 'configs' / 'training_config.yaml'
with open(config_path, 'w') as f:
    yaml.dump(CONFIG, f, default_flow_style=False)

logger.info(f"✅ Configuration loaded and saved to {config_path}")
print(f"🔧 Training Config: {CONFIG['model']['name']} on {device}")
print(f"📊 Batch size: {CONFIG['training']['batch_size']}, Epochs: {CONFIG['training']['epochs']}")