### SetUp

In [1]:
import os
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
# import seaborn as sns
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import Sequential, Linear, ReLU, LeakyReLU, Sigmoid, Dropout
from train import _compute_metrics, train_epoch, eval_epoch, train_model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# import matplotlib.pyplot as plt

from train import train_model

In [2]:
project_path = Path(Path.cwd()).parent.parent

In [3]:
SEED = 7777

### Load Data

In [4]:
cancer_detection_path = project_path / "data/inputs/Lung Cancer Dataset.csv"
df_detection = pd.read_csv(cancer_detection_path)

df_detection.columns = [x for x in df_detection.columns.str.lower().str.replace(" ", "_")]

df_detection.head(10)

Unnamed: 0,age,gender,smoking,finger_discoloration,mental_stress,exposure_to_pollution,long_term_illness,energy_level,immune_weakness,breathing_issue,alcohol_consumption,throat_discomfort,oxygen_saturation,chest_tightness,family_history,smoking_family_history,stress_immune,pulmonary_disease
0,68,1,1,1,1,1,0,57.831178,0,0,1,1,95.977287,1,0,0,0,NO
1,81,1,1,0,0,1,1,47.694835,1,1,0,1,97.184483,0,0,0,0,YES
2,58,1,1,0,0,0,0,59.577435,0,1,1,0,94.974939,0,0,0,0,NO
3,44,0,1,0,1,1,0,59.785767,0,1,0,1,95.1879,0,0,0,0,YES
4,72,0,1,1,1,1,1,59.733941,0,1,0,1,93.503008,0,0,0,0,YES
5,37,1,1,1,1,1,1,57.684285,0,1,1,1,94.057151,1,0,0,0,YES
6,50,0,1,1,1,0,1,52.647022,1,1,1,0,96.773598,0,0,0,1,NO
7,68,0,1,1,1,0,1,53.306451,0,0,0,1,95.019018,0,0,0,0,NO
8,48,0,1,1,0,1,1,64.272789,1,1,0,1,98.539379,1,0,0,0,YES
9,52,0,0,0,1,1,1,58.319319,0,1,0,1,96.055097,0,0,0,0,NO


In [5]:
df_detection.shape

(5000, 18)

### Preprocessing

In [11]:
# Convert label column to numerical values
label_map_dict = {
    'NO': 0,
    'YES': 1
}

df_detection['pulmonary_disease'] = df_detection['pulmonary_disease'].map(label_map_dict)

In [12]:
# Convert binary columns to categorical
binary_columns = [
    'gender',
    'smoking',
    'finger_discoloration',
    'mental_stress',
    'exposure_to_pollution',
    'long_term_illness',
    'immune_weakness',
    'breathing_issue',
    'alcohol_consumption',
    'throat_discomfort',
    'chest_tightness',
    'family_history',
    'smoking_family_history',
    'stress_immune',
    'pulmonary_disease'
]

df_detection[binary_columns] = df_detection[binary_columns].astype('category')

In [13]:
X = df_detection.drop(columns=['pulmonary_disease']).values
y = df_detection['pulmonary_disease'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=SEED, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=SEED, stratify=y_train)

ValueError: Input y contains NaN.

In [None]:
print("Train set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)

Train set shape: (3612, 17)
Validation set shape: (638, 17)
Test set shape: (750, 17)


In [None]:
X_train.mean(axis=0), X_train.std(axis=0)

(array([57.37015504,  0.49612403,  0.66196013,  0.60354374,  0.54512735,
         0.51522702,  0.43992248, 54.99256295,  0.39451827,  0.79983389,
         0.35022148,  0.69988926, 94.9913959 ,  0.6013289 ,  0.303433  ,
         0.20265781,  0.21179402]),
 array([15.83079571,  0.49998498,  0.47304219,  0.48916121,  0.49795936,
         0.49976808,  0.49637757,  7.84740972,  0.48874697,  0.40012453,
         0.4770392 ,  0.45830589,  1.49321387,  0.48962481,  0.4597406 ,
         0.40197963,  0.40857963]))

In [None]:
# scaler = StandardScaler()

# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)
# X_test = scaler.transform(X_test)

In [None]:
# X_train.mean(axis=0), X_train.std(axis=0)

### GA: Feature Selection

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device)

In [15]:
device

device(type='cuda')

In [None]:
from genetic_feature_selection import run_genetic_algorithm

# Run genetic algorithm for feature selection
ga_results = run_genetic_algorithm(
    X_train=X_train,  # Use unscaled data (scaling happens inside)
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    device=device,
    n_generations=20,
    population_size=25,
    cx_prob=0.7,
    mut_prob=0.2,
    epochs_per_individual=300,
    patience=75,
    verbose=True,
    verbose_individuals=True 
)

Starting genetic algorithm with 17 features
Population size: 25, Generations: 20
Individual finished: 12 features | Val Loss: 0.5118 | Fitness: -0.5118 | Epochs: 168
Individual finished: 7 features | Val Loss: 0.5944 | Fitness: -0.5944 | Epochs: 64
Individual finished: 9 features | Val Loss: 0.3310 | Fitness: -0.3310 | Epochs: 250
Individual finished: 12 features | Val Loss: 0.3700 | Fitness: -0.3700 | Epochs: 113
Individual finished: 9 features | Val Loss: 0.4437 | Fitness: -0.4437 | Epochs: 152
Individual finished: 8 features | Val Loss: 0.5761 | Fitness: -0.5761 | Epochs: 134
Individual finished: 7 features | Val Loss: 0.6205 | Fitness: -0.6205 | Epochs: 87
Individual finished: 10 features | Val Loss: 0.3250 | Fitness: -0.3250 | Epochs: 204
Individual finished: 6 features | Val Loss: 0.5054 | Fitness: -0.5054 | Epochs: 134
Individual finished: 7 features | Val Loss: 0.4315 | Fitness: -0.4315 | Epochs: 58
Individual finished: 7 features | Val Loss: 0.4748 | Fitness: -0.4748 | Epochs:

In [17]:
ga_results['best_individual']

[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1]

In [22]:
ga_results['hall_of_fame'][2]

[1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1]

In [25]:
ga_results['n_selected_features']

11

In [32]:
ga_results['hall_of_fame'] = [i for i in ga_results['hall_of_fame']]

In [34]:
import pickle
with open("ga_results.pkl", "wb") as f:
    pickle.dump(ga_results, f)