# 03. Feature Selection

In [10]:
import pandas as pd

# Path to the input CSV (cleaned and scaled data, judging by the file name).
file_cleaned = "../data/interim/cleaning/data_without_na_scaled.csv"

# Read the CSV using its first column as the row index, then discard the
# "cohorte" column in the same expression so `df` is bound exactly once.
df = pd.read_csv(file_cleaned, index_col=0).drop(columns=["cohorte"])

## Removing features with low variance

In [13]:
import numpy as np

# Columns that are not gene-expression features. They are excluded from the
# variance computation by name rather than by assuming they are the last two
# columns, so the filter stays correct if the column order ever changes.
NON_GENE_COLS = ['relapse', 'grado_histologico']
LOW_VARIANCE_PERCENTILE = 20  # percentage of lowest-variance genes to drop

# NOTE(review): the input file name says the data is already scaled. If every
# gene was standardized, all variances are nearly identical and this filter is
# not informative — confirm whether it should run on unscaled data instead.
variances = df.drop(columns=NON_GENE_COLS).var()
threshold = np.percentile(variances, LOW_VARIANCE_PERCENTILE)
selected_genes = variances[variances > threshold].index

# Keep the retained genes followed by the non-gene columns.
df_var = df[selected_genes.tolist() + NON_GENE_COLS]

In [14]:
# Print a summary of the variance-filtered frame (index range, column count,
# dtypes and memory usage) to check how many columns survived the filter.
df_var.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191 entries, CM.1.0001.1 to GSM1817999
Columns: 13633 entries, RPS28 to grado_histologico
dtypes: float64(13632), int64(1)
memory usage: 19.9+ MB


In [15]:
import numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator

# Per-gene variance. The two non-gene columns are excluded by name instead of
# by position so a change in column order cannot leak them into the ranking.
variances = df.drop(columns=['relapse', 'grado_histologico']).var()
sorted_var = np.sort(variances.values)

# Locate the elbow of the ascending variance curve.
knee = KneeLocator(range(len(sorted_var)), sorted_var, curve='convex', direction='increasing').knee
# `knee` is None when KneeLocator finds no elbow. Indexing with None would
# yield a 2-D array and then fail with an obscure TypeError in the f-string
# below, so fail early with an explicit message instead.
if knee is None:
    raise ValueError("KneeLocator found no elbow in the sorted gene variances")
threshold = sorted_var[knee]
print(f"Automatic variance threshold (elbow method): {threshold:.6f}")

# Keep only the genes whose variance is strictly above the elbow value.
selected_genes = variances[variances > threshold].index

print(f"Number of genes selected: {len(selected_genes)}")

Automatic variance threshold (elbow method): 1.005263
Number of genes selected: 2


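By the elbow method only two genes are selected. Note that the threshold (1.005263) equals 191/190, which is the sample variance of a standardized column with 191 samples; this suggests that, after scaling, nearly all genes share the same variance, so variance-based filtering is not informative on this dataset. See also [01. Exploration.ipynb](../notebooks/01_Exploration.ipynb).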

## XGBoost

In [20]:
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel

# NOTE(review): the model is fitted on the full `df`, not on the
# variance-filtered `df_var` built earlier — confirm this is intended.
# NOTE(review): the selector is fitted on every sample; if the selected features
# feed a model that is evaluated later, fit it on the training split only.
X = df.drop(columns=['relapse'])
y = df['relapse']

SEED = 1
model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED
)
model.fit(X, y)

# SelectFromModel keeps every feature whose importance is >= the threshold.
# Features never used in a split have importance 0, and 500 trees of depth 3
# can use at most 3500 features. When more than half of the columns are unused,
# the plain median is 0 and threshold='median' keeps *all* features.
# Taking the median over the features the model actually used makes the
# selection effective.
importances = model.feature_importances_
used_importances = importances[importances > 0]
if used_importances.size:
    importance_threshold = float(np.median(used_importances))
else:
    # No feature was used at all: fall back to the original rule.
    importance_threshold = 'median'

selector = SelectFromModel(model, prefit=True, threshold=importance_threshold)

X_selected = selector.transform(X)
selected_features = X.columns[selector.get_support()]
print(f"Number of XGBoost-selected features: {len(selected_features)}")


Unnamed: 0_level_0,RPS28,IPO13,FAM86FP,CDT1,CCNE2,BORCS5,SNIP1,COL17A1,BCL6B,ATP13A4,...,MIR551B,MIR643,MIRLET7A1,SNORD114.13,MIR335,MIR663B,MIR320C1,LINC00293,LINC00173,grado_histologico
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CM.1.0001.1,1.373487,1.716361,-0.089564,0.225902,-0.202395,-0.569692,0.161433,-0.244178,-0.239723,-0.279126,...,-1.214030,-0.268478,-1.214020,-0.203979,-1.214639,-1.165553,-0.151698,-0.282754,-0.787219,5.0
CM.1.0002.1,-0.706680,0.127347,-0.907079,-0.986200,-1.153092,-1.146802,-1.457550,-0.255718,-0.563852,-0.279126,...,-1.214030,-0.268478,-1.214020,-0.203979,-1.214639,-0.045805,-0.151698,-0.282754,0.360444,2.0
CM.1.0003.1,-0.346822,1.092426,-0.769052,0.797883,-0.779179,-1.392658,-0.766612,-0.638518,-0.957907,-0.279126,...,-1.214030,-0.268478,-1.214020,-0.203979,-1.214639,-1.165553,-0.151698,-0.282754,0.156276,3.0
CM.1.0004.1,-0.700655,-0.018314,-0.639758,-0.755927,0.422051,-1.205923,-0.423498,0.594531,-0.625982,0.149261,...,-1.214030,-0.268478,-1.214020,-0.203979,-1.214639,-1.165553,-0.151698,-0.282754,-0.405295,3.0
CM.1.0005.1,-0.490950,0.224831,-0.389833,-0.746900,-0.728839,-0.655022,-0.225483,1.167036,-0.928943,0.221714,...,-1.214030,-0.268478,-1.214020,-0.203979,-1.214639,-1.165553,-0.151698,-0.282754,0.259722,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM1817994,-0.115253,-0.768386,2.974915,-0.302277,-0.059741,-1.077355,0.483120,-0.455476,-0.606866,-1.022700,...,0.727899,-0.028622,0.768371,-0.009156,0.750808,0.033327,-0.060937,-1.433715,1.465470,2.0
GSM1817995,0.018916,-0.461088,-0.310531,-0.196803,-0.188263,-0.610565,0.339095,1.963493,-0.951843,-0.305429,...,0.743398,0.007609,0.734478,-0.006224,0.752778,0.033539,-0.063191,1.126295,-0.675912,2.0
GSM1817997,0.054486,1.387792,-0.204365,1.515658,-1.404169,-0.773733,-0.672459,-0.718600,0.198706,-0.903921,...,0.785174,-0.046465,0.731816,-0.006380,0.790351,0.036381,-0.031161,-0.120821,0.808856,2.0
GSM1817998,0.177533,-0.316735,1.153530,0.994171,1.307301,-0.292530,-0.519574,-1.308803,-0.928229,-1.189082,...,0.770549,-0.057475,0.741867,-0.004693,0.776392,0.035609,-0.190390,-2.675097,-0.655306,2.0
