In [12]:
# Cell 1: Excellence Setup and Imports
import sys
sys.path.append('..')

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Import excellence modules
from src.data.dataset import ChestXrayDataset, create_data_splits, calculate_class_weights
from src.data.dataloader import create_rtx3060_dataloaders
from src.models.excellence_ensemble import (
    create_excellence_efficientnet,
    ExcellenceLoss
)
from src.trainer.excellence_trainer import ExcellenceTrainer

print("✅ Excellence modules imported successfully")
print("🎯 Target: 0.85+ AUC with excellence metrics")

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🚀 Using device: {device}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

✅ Excellence modules imported successfully
🎯 Target: 0.85+ AUC with excellence metrics
🚀 Using device: cuda
   GPU: NVIDIA GeForce RTX 3060
   VRAM: 12.9 GB


In [13]:
# Cell 2: Load Data Pipeline for Excellence Training
BASE_PATH = Path(r"D:/Projects/CLARITY/Model/Dataset/archive")  # Update this!

print("🔄 Loading data pipeline for excellence training...")

# Load metadata
data_entry_path = BASE_PATH / "Data_Entry_2017.csv"
df = pd.read_csv(data_entry_path)
print(f"✅ Metadata loaded: {len(df):,} entries")

# Create image mapping
image_mapping = {}
for main_folder in sorted(BASE_PATH.iterdir()):
    if main_folder.is_dir() and main_folder.name.startswith('images_'):
        images_subfolder = main_folder / 'images'
        if images_subfolder.exists():
            for img_file in images_subfolder.glob("*.png"):
                image_name = img_file.name
                if image_name not in image_mapping:
                    image_mapping[image_name] = img_file

print(f"✅ Image mapping: {len(image_mapping):,} images")

# Create data splits
train_df, val_df, test_df = create_data_splits(df, 
                                               test_size=0.15,
                                               val_size=0.10,
                                               random_seed=42)

print(f"✅ Data splits: Train({len(train_df):,}) Val({len(val_df):,}) Test({len(test_df):,})")

🔄 Loading data pipeline for excellence training...
✅ Metadata loaded: 112,120 entries
✅ Image mapping: 112,120 images
Patient-level data splits:
  Train: 83,847 images from 23,105 patients (74.8%)
  Val:   11,550 images from 3,080 patients (10.3%)
  Test:  16,723 images from 4,620 patients (14.9%)
✅ No patient overlap verified - clean splits!
✅ Data splits: Train(83,847) Val(11,550) Test(16,723)


In [14]:
# Cell 3: Create Excellence Datasets with Optimized Settings
print("🔄 Creating excellence datasets...")

# EXCELLENCE SETTINGS - Optimized for maximum performance
EXCELLENCE_IMAGE_SIZE = 512  # Higher resolution for better features
EXCELLENCE_BATCH_SIZE = 16    # Smaller batch for larger images
EXCELLENCE_WORKERS = 4       # Optimized workers
EXCELLENCE_ACCUMULATION = 6  # Effective batch = 36

print(f"⚙️  Excellence Configuration:")
print(f"   Image size: {EXCELLENCE_IMAGE_SIZE}×{EXCELLENCE_IMAGE_SIZE}")
print(f"   Batch size: {EXCELLENCE_BATCH_SIZE}")
print(f"   Effective batch: {EXCELLENCE_BATCH_SIZE * EXCELLENCE_ACCUMULATION}")
print(f"   Target: 0.85+ AUC")

# Create excellence datasets with aggressive augmentation
train_dataset = ChestXrayDataset(
    df=train_df,
    image_mapping=image_mapping,
    image_size=EXCELLENCE_IMAGE_SIZE,
    is_training=True,
    augmentation_prob=0.95  # Very aggressive augmentation
)

val_dataset = ChestXrayDataset(
    df=val_df,
    image_mapping=image_mapping,
    image_size=EXCELLENCE_IMAGE_SIZE,
    is_training=False
)

test_dataset = ChestXrayDataset(
    df=test_df,
    image_mapping=image_mapping,
    image_size=EXCELLENCE_IMAGE_SIZE,
    is_training=False
)

print(f"✅ Excellence datasets created!")

🔄 Creating excellence datasets...
⚙️  Excellence Configuration:
   Image size: 512×512
   Batch size: 16
   Effective batch: 96
   Target: 0.85+ AUC
Dataset created with 83847 samples
Training mode: True
Image size: 512x512

Label matrix created: (83847, 15)
Positive samples per class:
  No Finding...............  45146
  Atelectasis..............   8720
  Cardiomegaly.............   2019
  Effusion.................  10071
  Infiltration.............  14772
  Mass.....................   4477
  Nodule...................   4691
  Pneumonia................   1062
  Pneumothorax.............   3981
  Consolidation............   3458
  Edema....................   1738
  Emphysema................   1794
  Fibrosis.................   1236
  Pleural_Thickening.......   2562
  Hernia...................    171
Transforms created for training
Dataset created with 11550 samples
Training mode: False
Image size: 512x512

Label matrix created: (11550, 15)
Positive samples per class:
  No Finding.....

In [15]:
# Cell 4: Create Excellence DataLoaders
print("🔄 Creating excellence dataloaders...")

# Calculate enhanced class weights
class_weights = calculate_class_weights(train_dataset.labels, method='inverse_freq_sqrt')
print(f"✅ Enhanced class weights calculated")

# Create excellence dataloaders
train_loader, val_loader, test_loader = create_rtx3060_dataloaders(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    batch_size=EXCELLENCE_BATCH_SIZE,
    num_workers=EXCELLENCE_WORKERS,
    use_weighted_sampling=True
)

print(f"✅ Excellence dataloaders created!")

🔄 Creating excellence dataloaders...

Class weights (inverse_freq_sqrt):
--------------------------------------------------
No Finding...............    1.000 (pos:  45146)
Atelectasis..............    1.000 (pos:   8720)
Cardiomegaly.............    1.000 (pos:   2019)
Effusion.................    1.000 (pos:  10071)
Infiltration.............    1.000 (pos:  14772)
Mass.....................    1.000 (pos:   4477)
Nodule...................    1.000 (pos:   4691)
Pneumonia................    1.000 (pos:   1062)
Pneumothorax.............    1.000 (pos:   3981)
Consolidation............    1.000 (pos:   3458)
Edema....................    1.000 (pos:   1738)
Emphysema................    1.000 (pos:   1794)
Fibrosis.................    1.000 (pos:   1236)
Pleural_Thickening.......    1.000 (pos:   2562)
Hernia...................    1.000 (pos:    171)
✅ Enhanced class weights calculated
Creating RTX 3060 optimized dataloaders:
  Batch size: 16
  Num workers: 4
  Weighted sampling: True
  We

In [16]:
# Cell 5: Create Excellence Model (CORRECTED)
print("🔄 Creating excellence model...")

# Create high-performance EfficientNet-B4
model = create_excellence_efficientnet(
    num_classes=15,
    model_size='b4',  # High performance backbone
    class_weights=class_weights
)

# Move to device
model = model.to(device)

# Create excellence loss function (FIXED - 3 components only)
criterion = ExcellenceLoss(
    class_weights=class_weights,
    focal_alpha=0.25,
    focal_gamma=2.5,
    label_smoothing=0.15,
    loss_weights={
        'focal': 0.5,      # Increased weight
        'asymmetric': 0.3,
        'bce': 0.2         # Only 3 components, sum = 1.0
        # Removed 'consistency' to fix the KeyError
    }
)

# Print model statistics
total_params = sum(p.numel() for p in model.parameters())
print(f"\n📊 Excellence Model Statistics:")
print(f"   Total parameters: {total_params:,}")
print(f"   Architecture: Excellence EfficientNet-B4")
print(f"   Features: Multi-scale fusion + Advanced attention")
print(f"   Loss components: 3 (Focal + Asymmetric + BCE)")

# Enable optimizations
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

print("✅ Excellence model created successfully!")

🔄 Creating excellence model...


Unexpected keys (bn2.num_batches_tracked, bn2.bias, bn2.running_mean, bn2.running_var, bn2.weight, classifier.bias, classifier.weight, conv_head.weight) found while loading pretrained weights. This may be expected if model is being adapted.


✅ Excellence EfficientNet Created:
   Backbone: efficientnet_b4
   Advanced Attention: True
   Multi-scale Fusion: True
   Classifier dim: 3072
✅ Excellence Loss Function Created
   Focal weight: 0.5
   Asymmetric weight: 0.3
   BCE weight: 0.2
   Components: 3 (consistency disabled)

📊 Excellence Model Statistics:
   Total parameters: 31,730,490
   Architecture: Excellence EfficientNet-B4
   Features: Multi-scale fusion + Advanced attention
   Loss components: 3 (Focal + Asymmetric + BCE)
✅ Excellence model created successfully!


In [17]:
# Cell 6: Initialize Excellence Trainer
print("🔄 Initializing excellence trainer...")

# Excellence training configuration
trainer = ExcellenceTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    device=device,
    learning_rate=8e-5,
    weight_decay=1e-5,
    accumulation_steps=EXCELLENCE_ACCUMULATION,
    mixed_precision=True,
    patience=15,
    max_epochs=20,
    scheduler_type='cosine_warm_restarts',
    warmup_epochs=5,
    test_time_augmentation=True,
    gradient_clipping=1.0,
    checkpoint_dir='../models/excellence_checkpoints'
)

print("✅ Excellence trainer initialized!")
print(f"   Target: 0.85+ AUC with 90%+ accuracy")

🔄 Initializing excellence trainer...
✅ Mixed precision training enabled (FP16)
🚀 Excellence Trainer initialized:
   Target: 0.85+ AUC
   Max epochs: 20
   Batch accumulation: 6
   Scheduler: cosine_warm_restarts
   Test-time augmentation: True
✅ Excellence trainer initialized!
   Target: 0.85+ AUC with 90%+ accuracy


In [18]:
# Cell 7: Start Excellence Training
print("🎯 Starting CLARITy Excellence Training...")
print("   Target: 0.85+ AUC")
print("   Expected time: 8-12 hours")
print("=" * 90)

import time
start_time = time.time()
# Run excellence training
best_auc = trainer.train_for_excellence()

training_time = time.time() - start_time

print(f"\n🏆 EXCELLENCE TRAINING COMPLETED!")
print(f"🎯 Final Results:")
print(f"   Best AUC: {best_auc:.4f}")
print(f"   Target (0.85): {'✅ ACHIEVED!' if best_auc >= 0.85 else '🔸 Close' if best_auc >= 0.80 else '❌ Needs work'}")
print(f"   Training time: {training_time/3600:.2f} hours")

🎯 Starting CLARITy Excellence Training...
   Target: 0.85+ AUC
   Expected time: 8-12 hours
🎯 Starting Excellence Training for 0.85+ AUC
   Max epochs: 20
   Early stopping patience: 15


                                                                                                                       

💎 New best model saved: AUC = 0.5987
Epoch   1/20 | Loss: 0.2001/0.1612 | AUC: 0.5987 | F1: 0.0000 | IoU: 0.0000 | LR: 7.95e-05 | Time: 2547.5s


                                                                                                                       

💎 New best model saved: AUC = 0.6248
Epoch   2/20 | Loss: 0.1781/0.1592 | AUC: 0.6248 | F1: 0.0000 | IoU: 0.0000 | LR: 7.80e-05 | Time: 2313.6s


                                                                                                                       

Epoch   3/20 | Loss: 0.1749/0.1589 | AUC: 0.6149 | F1: 0.0000 | IoU: 0.0000 | LR: 7.56e-05 | Time: 2399.1s


                                                                                                                       

💎 New best model saved: AUC = 0.6385
Epoch   4/20 | Loss: 0.1730/0.1586 | AUC: 0.6385 | F1: 0.0000 | IoU: 0.0000 | LR: 7.24e-05 | Time: 2407.5s


                                                                                                                       

💎 New best model saved: AUC = 0.6543
Epoch   5/20 | Loss: 0.1713/0.1578 | AUC: 0.6543 | F1: 0.0000 | IoU: 0.0000 | LR: 6.83e-05 | Time: 2216.4s


                                                                                                                       

💎 New best model saved: AUC = 0.6618
Epoch   6/20 | Loss: 0.1699/0.1581 | AUC: 0.6618 | F1: 0.0000 | IoU: 0.0000 | LR: 6.35e-05 | Time: 2213.0s


                                                                                                                       

💎 New best model saved: AUC = 0.6984
Epoch   7/20 | Loss: 0.1687/0.1567 | AUC: 0.6984 | F1: 0.0000 | IoU: 0.0000 | LR: 5.82e-05 | Time: 2219.8s


                                                                                                                       

💎 New best model saved: AUC = 0.6985
Epoch   8/20 | Loss: 0.1678/0.1556 | AUC: 0.6985 | F1: 0.0000 | IoU: 0.0000 | LR: 5.24e-05 | Time: 2614.8s


                                                                                                                       

💎 New best model saved: AUC = 0.7013
Epoch   9/20 | Loss: 0.1670/0.1558 | AUC: 0.7013 | F1: 0.0000 | IoU: 0.0000 | LR: 4.63e-05 | Time: 2217.4s


                                                                                                                       

💎 New best model saved: AUC = 0.7035
Epoch  10/20 | Loss: 0.1662/0.1556 | AUC: 0.7035 | F1: 0.0000 | IoU: 0.0000 | LR: 4.00e-05 | Time: 2213.1s

📊 Progress Report (Epoch 10):
   Best AUC: 0.7035
   Current AUC: 0.7035
   Mean F1: 0.0000
   Mean Precision: 0.0000
   Mean Recall: 0.0000
   Mean IoU: 0.0000
   Overall Accuracy: 0.9159
   Top 3 classes:
     Pneumothorax: 0.828
     Edema: 0.863
     Emphysema: 0.869



                                                                                                                       

💎 New best model saved: AUC = 0.7146
Epoch  11/20 | Loss: 0.1659/0.1550 | AUC: 0.7146 | F1: 0.0000 | IoU: 0.0000 | LR: 3.38e-05 | Time: 2203.4s


                                                                                                                       

Epoch  12/20 | Loss: 0.1655/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 2.77e-05 | Time: 2201.5s


                                                                                                                       

Epoch  13/20 | Loss: 0.1650/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 2.19e-05 | Time: 2201.2s


                                                                                                                       

Epoch  14/20 | Loss: 0.1645/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 1.66e-05 | Time: 2201.4s


                                                                                                                       

Epoch  15/20 | Loss: 0.1642/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 1.18e-05 | Time: 2201.7s


                                                                                                                       

Epoch  16/20 | Loss: 0.1642/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 7.71e-06 | Time: 2201.7s


                                                                                                                       

Epoch  17/20 | Loss: 0.1641/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 4.44e-06 | Time: 2201.8s


                                                                                                                       

Epoch  18/20 | Loss: 0.1641/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 2.04e-06 | Time: 2210.8s


                                                                                                                       

Epoch  19/20 | Loss: 0.1637/nan | AUC: 0.5000 | F1: 0.0000 | IoU: 0.0000 | LR: 5.72e-07 | Time: 2211.5s


                                                                                                                       

KeyboardInterrupt: 

In [23]:
# Cell: Resume from Best Checkpoint (FIXED)
import torch
import numpy as np

print("🔄 Loading best checkpoint...")

# Load checkpoint
checkpoint_path = "../models/excellence_checkpoints/best_excellence_model.pth"
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

# Load model state
model.load_state_dict(checkpoint['model_state_dict'])
print(f"✅ Loaded model from epoch {checkpoint['epoch']} with AUC {checkpoint['best_val_auc']:.4f}")

# Create new stable trainer with fixes
trainer_resume = ExcellenceTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    device=device,
    
    # STABILITY FIXES
    learning_rate=2e-5,              # Lower LR
    gradient_clipping=0.5,           # Stronger clipping
    mixed_precision=False,           # Disable FP16
    scheduler_type='cosine_warm_restarts',  # Use cosine instead of plateau
    
    max_epochs=15,                   # Continue for 15 more epochs
    patience=8,
    checkpoint_dir="../models/excellence_checkpoints_resume"  # FIXED PATH
)

# Set starting values from checkpoint
trainer_resume.best_val_auc = checkpoint['best_val_auc']
trainer_resume.best_epoch = checkpoint['epoch']

print(f"🎯 Ready to resume training!")
print(f"   Starting AUC: {checkpoint['best_val_auc']:.4f}")
print(f"   Target: 0.80+ AUC")

# Resume training
print("🚀 Starting resumed training...")
final_auc = trainer_resume.train_for_excellence()

🔄 Loading best checkpoint...


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 12.00 GiB of which 0 bytes is free. Of the allocated memory 17.28 GiB is allocated by PyTorch, and 110.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [26]:
# Cell: Save Current Training State (FIXED)
import torch
from pathlib import Path
from datetime import datetime

print("💾 Saving interrupted training state...")

# Create save directory
save_dir = Path("../models/interrupted_training")
save_dir.mkdir(parents=True, exist_ok=True)

# Save current model state
torch.save({
    'model_state_dict': model.state_dict(),
    'training_interrupted_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'completed_epochs': 11,  # Last successful epoch
    'best_auc_achieved': 0.7146,
    'training_status': 'interrupted_due_to_nan_loss',
    'next_steps': 'resume_with_stability_fixes',
    'model_architecture': 'Excellence_EfficientNet_B4',
    'training_config': {
        'image_size': 512,
        'batch_size': 16,
        'max_epochs': 20,
        'scheduler': 'cosine_warm_restarts'
    }
}, save_dir / "interrupted_training_state.pth")

# Save training summary (FIXED - UTF-8 encoding and no Unicode arrows)
with open(save_dir / "training_summary.txt", 'w', encoding='utf-8') as f:
    f.write(f"""CLARITy Excellence Training - Interrupted Session Summary
    
Training Period: 20 epochs planned
Completed: 11 successful epochs
Status: Interrupted due to NaN loss after epoch 11

Performance:
- Best AUC: 0.7146 (Epoch 11)
- Progress: 59.7% to 71.46% AUC (+11.76%)
- Target: 0.85+ AUC
- Gap to target: +0.135 AUC needed

Top Performing Classes (Epoch 10):
- Emphysema: 0.869
- Edema: 0.863  
- Pneumothorax: 0.828

Issue: NaN loss starting from epoch 12
Cause: Gradient explosion/numerical instability

Next Steps:
1. Resume from best checkpoint (epoch 11)
2. Apply stability fixes (lower LR, gradient clipping)
3. Continue training for 15 more epochs
4. Expected final AUC: 0.80-0.85+

Files Saved:
- Best model: ../models/excellence_checkpoints/best_excellence_model.pth
- Interrupted state: ../models/interrupted_training/interrupted_training_state.pth

Training Time: ~11 hours for 11 epochs
GPU Temperature: 77C (within safe range)
""")

print("✅ Training state saved!")
print(f"   Directory: {save_dir}")
print(f"   Best AUC achieved: 0.7146")
print(f"   Status: Ready to resume with fixes")
print(f"   Time saved: ~11 hours of training preserved!")

💾 Saving interrupted training state...
✅ Training state saved!
   Directory: ..\models\interrupted_training
   Best AUC achieved: 0.7146
   Status: Ready to resume with fixes
   Time saved: ~11 hours of training preserved!


In [24]:
# Cell 8: Save Excellence Model
print("💾 Saving excellence model...")

# Create save directory
save_dir = Path("../models/saved_models/excellence_ensemble/excellence_model")
save_dir.mkdir(parents=True, exist_ok=True)

# Save model
torch.save({
    'model_state_dict': model.state_dict(),
    'model_architecture': 'Excellence_EfficientNet_B4',
    'num_classes': 15,
    'image_size': EXCELLENCE_IMAGE_SIZE,
    'best_auc': best_auc,
    'training_config': {
        'max_epochs': 100,
        'learning_rate': 8e-5,
        'scheduler_type': 'cosine_warm_restarts',
        'test_time_augmentation': True,
        'mixed_precision': True
    },
    'model_features': {
        'advanced_attention': True,
        'multiscale_fusion': True,
        'excellence_loss': True,
        'test_time_augmentation': True
    }
}, save_dir / "excellence_model.pth")

# Save training history
pd.DataFrame({
    'epoch': range(1, len(trainer.val_aucs) + 1),
    'train_loss': trainer.train_losses,
    'val_loss': trainer.val_losses,
    'val_auc': trainer.val_aucs,
    'val_ap': trainer.val_aps,
    'val_f1': trainer.val_f1s,
    'val_precision': trainer.val_precisions,
    'val_recall': trainer.val_recalls,
    'learning_rate': trainer.learning_rates[:len(trainer.val_aucs)]
}).to_csv(save_dir / "excellence_training_history.csv", index=False)

print(f"✅ Excellence model saved to: {save_dir}")
print(f"\n🎉 CLARITy Excellence Training Complete!")
print(f"   Final AUC: {best_auc:.4f}")
print(f"   Status: {'🏆 EXCELLENCE ACHIEVED!' if best_auc >= 0.85 else '🥈 HIGH PERFORMANCE' if best_auc >= 0.80 else '🥉 GOOD PROGRESS'}")

💾 Saving excellence model...


NameError: name 'best_auc' is not defined