### Setup

In [2]:
import os
from pathlib import Path
import pickle

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from utils import custom_reports_from_proba

In [3]:
# Project root: two directory levels above the notebook's working directory
project_path = Path.cwd().parent.parent

In [4]:
# Seed passed as random_state to both train_test_split calls so the splits are reproducible
SEED = 7777

### Load Data

In [5]:
# Read the lung-cancer CSV from the project's data/inputs folder
cancer_detection_path = project_path / "data/inputs/Lung Cancer Dataset.csv"
df_detection = pd.read_csv(cancer_detection_path)

# Normalise the headers: lower-case, with spaces replaced by underscores
normalised_headers = df_detection.columns.str.lower().str.replace(" ", "_")
df_detection.columns = normalised_headers

df_detection.head(10)

Unnamed: 0,age,gender,smoking,finger_discoloration,mental_stress,exposure_to_pollution,long_term_illness,energy_level,immune_weakness,breathing_issue,alcohol_consumption,throat_discomfort,oxygen_saturation,chest_tightness,family_history,smoking_family_history,stress_immune,pulmonary_disease
0,68,1,1,1,1,1,0,57.831178,0,0,1,1,95.977287,1,0,0,0,NO
1,81,1,1,0,0,1,1,47.694835,1,1,0,1,97.184483,0,0,0,0,YES
2,58,1,1,0,0,0,0,59.577435,0,1,1,0,94.974939,0,0,0,0,NO
3,44,0,1,0,1,1,0,59.785767,0,1,0,1,95.1879,0,0,0,0,YES
4,72,0,1,1,1,1,1,59.733941,0,1,0,1,93.503008,0,0,0,0,YES
5,37,1,1,1,1,1,1,57.684285,0,1,1,1,94.057151,1,0,0,0,YES
6,50,0,1,1,1,0,1,52.647022,1,1,1,0,96.773598,0,0,0,1,NO
7,68,0,1,1,1,0,1,53.306451,0,0,0,1,95.019018,0,0,0,0,NO
8,48,0,1,1,0,1,1,64.272789,1,1,0,1,98.539379,1,0,0,0,YES
9,52,0,0,0,1,1,1,58.319319,0,1,0,1,96.055097,0,0,0,0,NO


In [6]:
# (rows, columns) of the loaded frame, target column included
df_detection.shape

(5000, 18)

### Preprocessing

In [7]:
# Encode the target column as integers (NO -> 0, YES -> 1).
label_map_dict = {
    'NO': 0,
    'YES': 1
}

label_column = df_detection['pulmonary_disease']

# Skip the mapping when the column is already encoded: re-running .map() on
# 0/1 values would turn every label into NaN because the keys no longer match.
if not label_column.isin(list(label_map_dict.values())).all():
    encoded_labels = label_column.map(label_map_dict)

    # .map() silently yields NaN for anything outside the dictionary, so fail
    # loudly instead of carrying missing labels into the split and the model.
    if encoded_labels.isna().any():
        unexpected = label_column[encoded_labels.isna()].unique().tolist()
        raise ValueError(f"Unexpected pulmonary_disease labels: {unexpected}")

    df_detection['pulmonary_disease'] = encoded_labels

In [8]:
# Columns holding 0/1 flags (the encoded target included) are switched to the
# pandas 'category' dtype, one column at a time.
binary_columns = [
    'gender',
    'smoking',
    'finger_discoloration',
    'mental_stress',
    'exposure_to_pollution',
    'long_term_illness',
    'immune_weakness',
    'breathing_issue',
    'alcohol_consumption',
    'throat_discomfort',
    'chest_tightness',
    'family_history',
    'smoking_family_history',
    'stress_immune',
    'pulmonary_disease'
]

for flag_column in binary_columns:
    df_detection[flag_column] = df_detection[flag_column].astype('category')

In [9]:
# Feature frame (kept as a DataFrame for later column lookups) and its array form
df_X = df_detection.drop(columns=['pulmonary_disease'])
X = df_X.values
y = df_detection['pulmonary_disease'].values

# Hold out 15% as the test set, then 15% of the remainder as validation;
# both splits are stratified on the label and share the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=SEED, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=SEED, stratify=y_train
)

In [10]:
# Report the size of each split
for caption, split_matrix in (
    ("Train set shape:", X_train),
    ("Validation set shape:", X_val),
    ("Test set shape:", X_test),
):
    print(caption, split_matrix.shape)

Train set shape: (3612, 17)
Validation set shape: (638, 17)
Test set shape: (750, 17)


In [11]:
# Per-column mean and standard deviation of the training features
X_train.mean(axis=0), X_train.std(axis=0)

(array([57.37015504,  0.49612403,  0.66196013,  0.60354374,  0.54512735,
         0.51522702,  0.43992248, 54.99256295,  0.39451827,  0.79983389,
         0.35022148,  0.69988926, 94.9913959 ,  0.6013289 ,  0.303433  ,
         0.20265781,  0.21179402]),
 array([15.83079571,  0.49998498,  0.47304219,  0.48916121,  0.49795936,
         0.49976808,  0.49637757,  7.84740972,  0.48874697,  0.40012453,
         0.4770392 ,  0.45830589,  1.49321387,  0.48962481,  0.4597406 ,
         0.40197963,  0.40857963]))

In [12]:
# Load the saved genetic-algorithm results from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load a ga_results.pkl
# that comes from a trusted source.
with open("ga_results.pkl", "rb") as pickle_file:
    ga_results = pickle.load(pickle_file)

In [13]:
# Best individual found by the GA, and the same flags as a boolean array
# usable for column indexing
best_feature_set = ga_results['best_individual']
best_feature_mask = np.array(best_feature_set, dtype=bool)

for shown in (best_feature_set, best_feature_mask):
    print(shown)

[1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1]
[ True False  True  True False  True False  True  True  True False  True
 False False  True  True  True]


In [14]:
# Keep only the GA-selected columns in every split
X_train_fs, X_val_fs, X_test_fs = (
    split_matrix[:, best_feature_mask]
    for split_matrix in (X_train, X_val, X_test)
)

In [16]:
# Display the feature frame (all columns except the target)
df_X

Unnamed: 0,age,gender,smoking,finger_discoloration,mental_stress,exposure_to_pollution,long_term_illness,energy_level,immune_weakness,breathing_issue,alcohol_consumption,throat_discomfort,oxygen_saturation,chest_tightness,family_history,smoking_family_history,stress_immune
0,68,1,1,1,1,1,0,57.831178,0,0,1,1,95.977287,1,0,0,0
1,81,1,1,0,0,1,1,47.694835,1,1,0,1,97.184483,0,0,0,0
2,58,1,1,0,0,0,0,59.577435,0,1,1,0,94.974939,0,0,0,0
3,44,0,1,0,1,1,0,59.785767,0,1,0,1,95.187900,0,0,0,0
4,72,0,1,1,1,1,1,59.733941,0,1,0,1,93.503008,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,32,0,1,1,0,0,1,60.700696,1,1,1,1,94.012495,0,1,1,0
4996,80,0,1,1,1,1,1,50.751741,0,1,1,1,94.394968,0,0,0,0
4997,51,1,0,0,1,0,0,61.063496,1,0,0,0,98.108901,0,0,0,1
4998,76,1,0,1,0,0,0,48.662872,0,1,0,1,95.577773,1,0,0,0


In [20]:
# Spot-check columns 0 and 4 of the selected-feature training matrix; going by
# the selected-column listing these positions are age and energy_level
X_train_fs[:, [0,4]]

array([[84.        , 34.82742583],
       [46.        , 60.97233322],
       [82.        , 45.0186885 ],
       ...,
       [61.        , 55.51090135],
       [39.        , 61.09246986],
       [55.        , 50.93563797]], shape=(3612, 2))

In [17]:
# Spot-check columns 0, 7 and 12 of the full training matrix; going by the
# feature-frame column order these are age, energy_level and oxygen_saturation
X_train[:, [0,7,12]]

array([[84.        , 34.82742583, 96.70544603],
       [46.        , 60.97233322, 95.49773073],
       [82.        , 45.0186885 , 94.36662329],
       ...,
       [61.        , 55.51090135, 94.11784314],
       [39.        , 61.09246986, 93.30814995],
       [55.        , 50.93563797, 94.5186515 ]], shape=(3612, 3))

In [14]:
# Standardise the features. Each feature matrix gets its own scaler, fitted on
# the training split only and then applied unchanged to validation and test.
# Sharing one scaler would let the second fit_transform overwrite the fitted
# statistics, leaving `scaler` inconsistent with the full feature matrix.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Separate scaler for the GA-selected feature subset
scaler_fs = StandardScaler()

X_train_fs = scaler_fs.fit_transform(X_train_fs)
X_val_fs = scaler_fs.transform(X_val_fs)
X_test_fs = scaler_fs.transform(X_test_fs)

In [15]:
# Feature names in the same column order as the feature matrix
X_columns = df_X.columns

# Boolean-mask indexing keeps only the names flagged in the GA mask
selected_columns = X_columns[best_feature_mask]

# Show the selected feature names
print(selected_columns)

Index(['age', 'smoking', 'finger_discoloration', 'exposure_to_pollution',
       'energy_level', 'immune_weakness', 'breathing_issue',
       'throat_discomfort', 'family_history', 'smoking_family_history',
       'stress_immune'],
      dtype='object')


In [16]:
# Feature frame restricted to the GA-selected columns (all rows)
df_X.loc[:, selected_columns]

Unnamed: 0,age,smoking,finger_discoloration,exposure_to_pollution,energy_level,immune_weakness,breathing_issue,throat_discomfort,family_history,smoking_family_history,stress_immune
0,68,1,1,1,57.831178,0,0,1,0,0,0
1,81,1,0,1,47.694835,1,1,1,0,0,0
2,58,1,0,0,59.577435,0,1,0,0,0,0
3,44,1,0,1,59.785767,0,1,1,0,0,0
4,72,1,1,1,59.733941,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4995,32,1,1,0,60.700696,1,1,1,1,1,0
4996,80,1,1,1,50.751741,0,1,1,0,0,0
4997,51,0,0,0,61.063496,1,0,0,0,0,1
4998,76,0,1,0,48.662872,0,1,1,0,0,0


### Naive Bayes

In [32]:
# Gaussian Naive Bayes trained on the full feature matrix, with an explicit
# var_smoothing of 1e-12
model = GaussianNB(var_smoothing=1e-12).fit(X_train, y_train)

# Class probabilities for the held-out test split
y_proba = model.predict_proba(X_test)

In [33]:
# Evaluate the predicted probabilities against the test labels. The helper
# comes from utils and is not shown here; judging by the recorded output it
# prints a classification report, confusion matrix and summary metrics and
# returns a one-row metrics table.
custom_reports_from_proba(y_test, y_proba)

* Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       444
           1       0.79      0.86      0.82       306

    accuracy                           0.85       750
   macro avg       0.84      0.85      0.85       750
weighted avg       0.85      0.85      0.85       750

* Confusion Matrix:
[[374  70]
 [ 42 264]]

AUROC: 0.8739
Accuracy: 0.8507
Recall: 0.8627
Precision: 0.7904
F1 Score: 0.8250


Unnamed: 0,accuracy,roc_auc,precision,recall,f1_score
0,0.850667,0.873903,0.790419,0.862745,0.825


AUROC: 0.8739
Accuracy: 0.8507
Recall: 0.8627
Precision: 0.7904
F1 Score: 0.8250

In [24]:
# Gaussian Naive Bayes with default settings, trained on the GA-selected
# feature subset (rebinds `model` and `y_proba`)
model = GaussianNB().fit(X_train_fs, y_train)

# Class probabilities for the held-out test split, selected features only
y_proba = model.predict_proba(X_test_fs)

In [25]:
# Same evaluation as for the full-feature model, applied to whatever y_proba
# currently holds (the selected-feature model when cells run top to bottom)
custom_reports_from_proba(y_test, y_proba)

* Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.83      0.86       444
           1       0.78      0.86      0.82       306

    accuracy                           0.84       750
   macro avg       0.84      0.85      0.84       750
weighted avg       0.85      0.84      0.84       750

* Confusion Matrix:
[[369  75]
 [ 42 264]]

AUROC: 0.8759
Accuracy: 0.8440
Recall: 0.8627
Precision: 0.7788
F1 Score: 0.8186


Unnamed: 0,accuracy,roc_auc,precision,recall,f1_score
0,0.844,0.875891,0.778761,0.862745,0.818605


In [1]:
# NOTE(review): the recorded NameError carries execution count 1, so this cell
# was presumably run on a fresh kernel before the cells that define X_train_fs;
# run the notebook top to bottom (or remove this leftover inspection cell).
X_train_fs

NameError: name 'X_train_fs' is not defined