### Setup

In [41]:
import os
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
# import seaborn as sns
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import Sequential, Linear, ReLU, Dropout, BatchNorm1d
from train import _compute_metrics, train_epoch, eval_epoch, train_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# import matplotlib.pyplot as plt

from train import train_model
import pickle
import optunahub


In [42]:
# Project root: two directory levels above the current working directory.
project_path = Path.cwd().parent.parent

In [43]:
# Shared random seed; passed as `random_state` to both train_test_split calls below.
SEED = 7777

In [44]:
class Dummy:
    """Inert placeholder used in place of classes that cannot be imported while unpickling.

    Any constructor arguments are accepted and ignored. Calling the instance,
    indexing it, or reading an attribute that is not otherwise defined all
    return the same instance; attribute assignment is discarded; iteration
    yields nothing.
    """

    def __new__(cls, *_args, **_kwargs):
        # Ignore whatever arguments the pickle stream supplies.
        return super().__new__(cls)

    def __init__(self, *_args, **_kwargs):
        return None

    def __repr__(self):
        return "<Dummy>"

    def __call__(self, *_args, **_kwargs):
        return self

    def __getitem__(self, _key):
        return self

    def __getattr__(self, _attr):
        # Only reached for attributes that normal lookup did not find.
        return self

    def __setattr__(self, _attr, _value):
        # Deliberately drop every assignment so no state is ever stored.
        return None

    def __iter__(self):
        return iter(())

class SafeUnpickler(pickle.Unpickler):
    """Unpickler that substitutes ``Dummy`` for anything from ``optunahub_registry*`` modules.

    Every other module is resolved by the standard ``pickle.Unpickler`` lookup,
    so this works around missing modules only; it is not a security sandbox.
    """

    def find_class(self, module, name):
        """Return ``Dummy`` for optunahub_registry lookups, else defer to the default resolver."""
        from_registry = module.startswith("optunahub_registry")
        return Dummy if from_registry else super().find_class(module, name)

### Load Data

In [45]:
# Read the dataset from the project's input directory.
cancer_detection_path = project_path / "data/inputs/Lung Cancer Dataset.csv"

# Normalise the headers while loading: lower-case, spaces replaced by underscores.
df_detection = pd.read_csv(cancer_detection_path).rename(
    columns=lambda header: header.lower().replace(" ", "_")
)

df_detection.head(10)

Unnamed: 0,age,gender,smoking,finger_discoloration,mental_stress,exposure_to_pollution,long_term_illness,energy_level,immune_weakness,breathing_issue,alcohol_consumption,throat_discomfort,oxygen_saturation,chest_tightness,family_history,smoking_family_history,stress_immune,pulmonary_disease
0,68,1,1,1,1,1,0,57.831178,0,0,1,1,95.977287,1,0,0,0,NO
1,81,1,1,0,0,1,1,47.694835,1,1,0,1,97.184483,0,0,0,0,YES
2,58,1,1,0,0,0,0,59.577435,0,1,1,0,94.974939,0,0,0,0,NO
3,44,0,1,0,1,1,0,59.785767,0,1,0,1,95.1879,0,0,0,0,YES
4,72,0,1,1,1,1,1,59.733941,0,1,0,1,93.503008,0,0,0,0,YES
5,37,1,1,1,1,1,1,57.684285,0,1,1,1,94.057151,1,0,0,0,YES
6,50,0,1,1,1,0,1,52.647022,1,1,1,0,96.773598,0,0,0,1,NO
7,68,0,1,1,1,0,1,53.306451,0,0,0,1,95.019018,0,0,0,0,NO
8,48,0,1,1,0,1,1,64.272789,1,1,0,1,98.539379,1,0,0,0,YES
9,52,0,0,0,1,1,1,58.319319,0,1,0,1,96.055097,0,0,0,0,NO


In [46]:
# Show (rows, columns) of the loaded frame.
df_detection.shape

(5000, 18)

### Preprocessing

In [47]:
# Convert label column to numerical values: 'NO' -> 0, 'YES' -> 1
label_map_dict = {
    'NO': 0,
    'YES': 1
}

# Series.map turns every value that is not a key of the mapping into NaN.
# Validate before overwriting the column so that an unexpected label -- or an
# accidental re-run of this cell on already-encoded labels -- raises a clear
# error instead of silently filling the target with NaN.
encoded_labels = df_detection['pulmonary_disease'].map(label_map_dict)
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unexpected = df_detection.loc[unmapped_rows, 'pulmonary_disease'].unique().tolist()
    raise ValueError(
        f"Cannot encode 'pulmonary_disease': unexpected label(s) {unexpected}; "
        f"expected only {list(label_map_dict)}. "
        "If this cell was already run, reload the data first."
    )

df_detection['pulmonary_disease'] = encoded_labels

In [48]:
# Columns treated as binary (the target included); each is cast to pandas' category dtype.
binary_columns = [
    'gender', 'smoking', 'finger_discoloration', 'mental_stress',
    'exposure_to_pollution', 'long_term_illness', 'immune_weakness',
    'breathing_issue', 'alcohol_consumption', 'throat_discomfort',
    'chest_tightness', 'family_history', 'smoking_family_history',
    'stress_immune', 'pulmonary_disease',
]

for column in binary_columns:
    df_detection[column] = df_detection[column].astype('category')

In [49]:
# Separate the feature matrix from the target column.
feature_frame = df_detection.drop(columns=['pulmonary_disease'])
X = feature_frame.values
y = df_detection['pulmonary_disease'].values

# Two stratified splits with the same settings: first hold out 15% as the test
# set, then hold out 15% of the remainder as the validation set.
split_options = dict(test_size=0.15, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, **split_options)

In [50]:
# Report the size of each split.
for caption, split in (
    ("Train set shape:", X_train),
    ("Validation set shape:", X_val),
    ("Test set shape:", X_test),
):
    print(caption, split.shape)

Train set shape: (3612, 17)
Validation set shape: (638, 17)
Test set shape: (750, 17)


In [51]:
# Per-feature mean and standard deviation of the training split, before scaling.
np.mean(X_train, axis=0), np.std(X_train, axis=0)

(array([57.37015504,  0.49612403,  0.66196013,  0.60354374,  0.54512735,
         0.51522702,  0.43992248, 54.99256295,  0.39451827,  0.79983389,
         0.35022148,  0.69988926, 94.9913959 ,  0.6013289 ,  0.303433  ,
         0.20265781,  0.21179402]),
 array([15.83079571,  0.49998498,  0.47304219,  0.48916121,  0.49795936,
         0.49976808,  0.49637757,  7.84740972,  0.48874697,  0.40012453,
         0.4770392 ,  0.45830589,  1.49321387,  0.48962481,  0.4597406 ,
         0.40197963,  0.40857963]))

In [52]:
# Load the stored genetic-algorithm results (path is relative to the working directory).
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
ga_results_file = Path("ga_results.pkl")
ga_results = pickle.loads(ga_results_file.read_bytes())

In [53]:
# The GA's best individual is a 0/1 vector; turn it into a boolean column mask.
best_feature_set = ga_results['best_individual']
best_feature_mask = np.asarray(best_feature_set, dtype=bool)

for shown in (best_feature_set, best_feature_mask):
    print(shown)

[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1]
[ True False  True  True False  True False  True  True  True False  True
 False False  True  True  True]


In [54]:
# Keep only the columns selected by the mask, identically in every split.
X_train, X_val, X_test = (
    split[:, best_feature_mask] for split in (X_train, X_val, X_test)
)

In [55]:
# Fit the scaler on the training split only, then apply the same
# transformation to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### PSO performance

In [56]:
# Load the PSO results through SafeUnpickler, which swaps optunahub_registry
# classes for Dummy placeholders.
with open("pso_results_2.pkl", "rb") as pso_file:
    unpickler = SafeUnpickler(pso_file)
    pso_results = unpickler.load()

print("Loaded successfully:", type(pso_results))


Loaded successfully: <class 'dict'>


In [57]:
# Show the hyper-parameters stored under 'best_params' in the PSO results.
best_params = pso_results['best_params']
best_params

{'use_batch_norm': True,
 'learning_rate': 0.004355387279653677,
 'weight_decay': 1e-06,
 'batch_size': 96,
 'hidden_size_0': 32,
 'hidden_size_1': 16,
 'hidden_size_2': 16,
 'dropout_rate_0': 0.6,
 'dropout_rate_1': 0.5086790371135944,
 'dropout_rate_2': 0.1293341508821494}

In [58]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): `epochs` is not passed to train_model in the visible cells -- confirm it is still needed.
epochs = 300

In [63]:
# Unpack the PSO-selected hyper-parameters into individual variables.
best_params = pso_results['best_params']

use_batch_norm = best_params['use_batch_norm']

# Optimiser and batching settings
learning_rate, weight_decay, batch_size = (
    best_params[key] for key in ('learning_rate', 'weight_decay', 'batch_size')
)

# Width of each of the three hidden layers
hidden_size_0, hidden_size_1, hidden_size_2 = (
    best_params[key] for key in ('hidden_size_0', 'hidden_size_1', 'hidden_size_2')
)

# Dropout probability applied after each hidden layer
dropout_rate_0, dropout_rate_1, dropout_rate_2 = (
    best_params[key] for key in ('dropout_rate_0', 'dropout_rate_1', 'dropout_rate_2')
)

# Create datasets
def _to_dataset(features, labels):
    """Wrap features and labels as a TensorDataset (float32 inputs, int64 targets)."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )


train_dataset = _to_dataset(X_train, y_train)
val_dataset = _to_dataset(X_val, y_val)
test_dataset = _to_dataset(X_test, y_test)

# Build the MLP from the PSO-selected hyper-parameters.
#
# Each hidden block is Linear -> ReLU -> [BatchNorm1d] -> Dropout, followed by
# a final Linear layer producing two logits. The layer order (and therefore the
# Sequential indices / state-dict keys) is the same as in the previous
# hand-written if/else version, for both settings of `use_batch_norm`.

# Seed torch's global RNG so the weight initialisation is reproducible.
torch.manual_seed(SEED)

# Take the input width from the feature-selected training matrix rather than
# hard-coding it, so a different feature mask does not break the model.
n_input_features = X_train.shape[1]
hidden_sizes = (hidden_size_0, hidden_size_1, hidden_size_2)
dropout_rates = (dropout_rate_0, dropout_rate_1, dropout_rate_2)

layers = []
in_width = n_input_features
for out_width, drop_p in zip(hidden_sizes, dropout_rates):
    layers.append(Linear(in_width, out_width))
    layers.append(ReLU())
    if use_batch_norm:
        layers.append(BatchNorm1d(out_width))  # BatchNorm AFTER activation
    layers.append(Dropout(drop_p))
    in_width = out_width

# Output layer (no BatchNorm on output)
layers.append(Linear(in_width, 2))

mlp = Sequential(*layers)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Pinned host memory only helps host-to-GPU copies; requesting it on a CPU-only
# machine just produces a warning, so enable it only when training on CUDA.
use_pinned_memory = device.type == "cuda"

# Dedicated, seeded generator so the training shuffle order is reproducible.
shuffle_generator = torch.Generator().manual_seed(SEED)

# Batch size used for the evaluation loaders (no gradient step, so it does not
# need to match the tuned training batch size).
eval_batch_size = 128

# Create data loaders; the training loader uses the PSO-optimised batch size
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=use_pinned_memory,
    generator=shuffle_generator,
)
val_loader = DataLoader(val_dataset, batch_size=eval_batch_size, shuffle=False, pin_memory=use_pinned_memory)
test_loader = DataLoader(test_dataset, batch_size=eval_batch_size, shuffle=False, pin_memory=use_pinned_memory)

# Print model architecture. The header reflects the actual configuration
# instead of always claiming that BatchNorm is present.
batch_norm_label = "with BatchNorm" if use_batch_norm else "without BatchNorm"
print(f"Model Architecture ({batch_norm_label}):")
print("=" * 60)
for i, layer in enumerate(mlp):
    print(f"Layer {i}: {layer}")
print("=" * 60)

# Count parameters (all, and the subset that receives gradients)
total_params = sum(p.numel() for p in mlp.parameters())
trainable_params = sum(p.numel() for p in mlp.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Model Architecture (with BatchNorm):
Layer 0: Linear(in_features=11, out_features=32, bias=True)
Layer 1: ReLU()
Layer 2: BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
Layer 3: Dropout(p=0.6, inplace=False)
Layer 4: Linear(in_features=32, out_features=16, bias=True)
Layer 5: ReLU()
Layer 6: BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
Layer 7: Dropout(p=0.5086790371135944, inplace=False)
Layer 8: Linear(in_features=16, out_features=16, bias=True)
Layer 9: ReLU()
Layer 10: BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
Layer 11: Dropout(p=0.1293341508821494, inplace=False)
Layer 12: Linear(in_features=16, out_features=2, bias=True)

Total parameters: 1,346
Trainable parameters: 1,346


In [64]:
# Train the model with the project's train_model helper (imported from train.py).
# NOTE(review): the `epochs` variable defined earlier is not passed here; the
# logged "300 epochs with patience=75" presumably comes from train_model's own
# defaults -- confirm against train.py.
results = train_model(mlp, train_loader, val_loader, criterion, optimizer, device)

Starting training for 300 epochs with patience=75
Epoch   1/300 | Train: Loss 0.6481 Acc 61.85% F1 0.5540 AUROC 0.6648 | Val: Loss 0.4666 Acc 85.27% F1 0.8066 AUROC 0.9213
Epoch  10/300 | Train: Loss 0.3322 Acc 88.93% F1 0.8628 AUROC 0.9183 | Val: Loss 0.2453 Acc 92.48% F1 0.9055 AUROC 0.9467
Epoch  20/300 | Train: Loss 0.3162 Acc 89.65% F1 0.8732 AUROC 0.9203 | Val: Loss 0.2502 Acc 92.01% F1 0.9014 AUROC 0.9503
Epoch  30/300 | Train: Loss 0.3126 Acc 89.95% F1 0.8756 AUROC 0.9198 | Val: Loss 0.2361 Acc 93.42% F1 0.9180 AUROC 0.9498
Epoch  40/300 | Train: Loss 0.3097 Acc 90.25% F1 0.8790 AUROC 0.9185 | Val: Loss 0.2426 Acc 92.63% F1 0.9084 AUROC 0.9453
Epoch  50/300 | Train: Loss 0.3108 Acc 90.23% F1 0.8802 AUROC 0.9190 | Val: Loss 0.2300 Acc 93.26% F1 0.9165 AUROC 0.9499
Epoch  60/300 | Train: Loss 0.3078 Acc 90.31% F1 0.8808 AUROC 0.9180 | Val: Loss 0.2411 Acc 93.57% F1 0.9207 AUROC 0.9488
Epoch  70/300 | Train: Loss 0.3095 Acc 90.39% F1 0.8814 AUROC 0.9134 | Val: Loss 0.2363 Acc 93.2

In [66]:
# Displays the entire results dict, including the full per-epoch history lists,
# which makes for a very long output. Consider showing only a summary.
results

{'history': {'train_loss': [0.6480894684791565,
   0.5067323629444224,
   0.41958325715160055,
   0.38745966613094673,
   0.36369902044039626,
   0.3530406354867739,
   0.34543462310518536,
   0.3390187689434254,
   0.3317756957786028,
   0.3322473281740746,
   0.3322278246333037,
   0.32027956675057395,
   0.32053898111530316,
   0.31825077226787707,
   0.32481922758774107,
   0.3231137124604957,
   0.3218630981009664,
   0.3133955521639003,
   0.3152867945721775,
   0.3161604312369198,
   0.3244854280124867,
   0.30965123669649675,
   0.31935147302491323,
   0.30140225732841364,
   0.3229357551696689,
   0.31039020617142865,
   0.31589120071987775,
   0.3121609287800583,
   0.3171883160093694,
   0.31256204338564825,
   0.30977453251811754,
   0.31596082845003504,
   0.3009523396872207,
   0.3143972111896819,
   0.31043925752671453,
   0.30834243137179024,
   0.31369625076503055,
   0.31382785846426636,
   0.3148805736703334,
   0.30968328033174786,
   0.30434166622716324,
   0.30641

In [65]:
# Load best model and evaluate on test set.
# map_location=device remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine still loads on a CPU-only one.
# NOTE(review): 'best_model.pth' is presumably written by train_model -- confirm.
best_state = torch.load('best_model.pth', map_location=device)
mlp.load_state_dict(best_state)

test_loss, test_acc, test_labels, test_preds, test_probs = eval_epoch(mlp, test_loader, criterion, device)
test_precision, test_recall, test_f1, test_auroc = _compute_metrics(test_labels, test_preds, test_probs)

# User-facing report (kept in Spanish, as in the original notebook)
print(f"\nRESULTADOS DEL CONJUNTO DE PRUEBA (usando el mejor modelo):")
print(f"Pérdida: {test_loss:.4f}, Accuracy: {test_acc:.2f}%, Precision: {test_precision:.4f}, "
      f"Recall: {test_recall:.4f}, F1: {test_f1:.4f}, AUROC: {test_auroc:.4f}")


RESULTADOS DEL CONJUNTO DE PRUEBA (usando el mejor modelo):
Pérdida: 0.3034, Accuracy: 90.53%, Precision: 0.8730, Recall: 0.8987, F1: 0.8857, AUROC: 0.9163


In [40]:
# NOTE(review): this import belongs in the notebook's import cell; it is kept
# here so the cell still runs on its own.
from torchinfo import summary

# Read the input width from the model's first Linear layer instead of
# hard-coding 11, so the summary stays valid if the feature selection changes.
summary(mlp, input_size=(1, mlp[0].in_features))

Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               [1, 2]                    --
├─Linear: 1-1                            [1, 32]                   384
├─ReLU: 1-2                              [1, 32]                   --
├─BatchNorm1d: 1-3                       [1, 32]                   64
├─Dropout: 1-4                           [1, 32]                   --
├─Linear: 1-5                            [1, 16]                   528
├─ReLU: 1-6                              [1, 16]                   --
├─BatchNorm1d: 1-7                       [1, 16]                   32
├─Dropout: 1-8                           [1, 16]                   --
├─Linear: 1-9                            [1, 16]                   272
├─ReLU: 1-10                             [1, 16]                   --
├─BatchNorm1d: 1-11                      [1, 16]                   32
├─Dropout: 1-12                          [1, 16]                   --
├─Linear: 1-