# 03. Feature Selection

In [24]:
import pandas as pd

file_cleaned = "../data/interim/cleaning/data_without_na_scaled.csv"

df = pd.read_csv(file_cleaned, index_col=0)
df.drop(columns=["cohorte"], inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191 entries, CM.1.0001.1 to GSM1817999
Columns: 19021 entries, RPS28 to grado_histologico
dtypes: float64(19020), int64(1)
memory usage: 27.7+ MB


## Removing features with low variance

In [13]:
import numpy as np

variances = df.iloc[:, :-2].var()
threshold = np.percentile(variances, 20)  # drop bottom 20%
selected_genes = variances[variances > threshold].index

df_var = df[selected_genes.tolist() + ['relapse', 'grado_histologico']]

In [14]:
df_var.info()

<class 'pandas.core.frame.DataFrame'>
Index: 191 entries, CM.1.0001.1 to GSM1817999
Columns: 13633 entries, RPS28 to grado_histologico
dtypes: float64(13632), int64(1)
memory usage: 19.9+ MB


In [15]:
import numpy as np
import matplotlib.pyplot as plt
from kneed import KneeLocator

variances = df.iloc[:, :-2].var()
sorted_var = np.sort(variances.values)

knee = KneeLocator(range(len(sorted_var)), sorted_var, curve='convex', direction='increasing').knee
threshold = sorted_var[knee]
print(f"Automatic variance threshold (elbow method): {threshold:.6f}")

selected_genes = variances[variances > threshold].index

print(f"Number of genes selected: {len(selected_genes)}")

Automatic variance threshold (elbow method): 1.005263
Number of genes selected: 2


By the elbow method there are only two selected genes. Also from [01. Exploration.ipynb](../notebooks/01_Exploration.ipynb)

## XGBoost

In [21]:
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel

X = df.drop(columns=['relapse'])
y = df['relapse']

SEED = 1
model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED
)
model.fit(X, y)

selector = SelectFromModel(model, prefit=True, threshold='median')

X_selected = selector.transform(X)
selected_features = X.columns[selector.get_support()]
print(f"Number of XGBoost-selected features: {len(selected_features)}")

Number of XGBoost-selected features: 19020




In [22]:
selected_features

Index(['RPS28', 'IPO13', 'FAM86FP', 'CDT1', 'CCNE2', 'BORCS5', 'SNIP1',
       'COL17A1', 'BCL6B', 'ATP13A4',
       ...
       'MIR551B', 'MIR643', 'MIRLET7A1', 'SNORD114.13', 'MIR335', 'MIR663B',
       'MIR320C1', 'LINC00293', 'LINC00173', 'grado_histologico'],
      dtype='object', length=19020)

## Differential expression

Made with limma in R, see [03.1_Differential_expression.Rmd](03.1_Differential_expression.Rmd)