In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Feature extraction
from sklearn.feature_selection import VarianceThreshold

from boruta import BorutaPy
from sklearn.inspection import permutation_importance

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

# Classification models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data_path = "../../data/"

In [3]:
df_clinical = pd.read_excel(data_path + "clinical_data.xls")
df_clinical.replace("[Not Available]", np.nan, inplace=True)
df_clinical.rename(columns={'bcr_patient_barcode': 'Lesion Name'}, inplace=True)
df_clinical = df_clinical[['Lesion Name', 'age_at_initial_pathologic_diagnosis', 'ajcc_neoplasm_disease_stage', "ajcc_tumor_stage_code", "number_of_lymphnodes_positive_by_he",
                          "breast_carcinoma_estrogen_receptor_status", "breast_carcinoma_progesterone_receptor_status"]]

target_class = pd.read_csv(data_path + 'target_class.csv')
target_class.rename(columns={'CLID': 'Lesion Name'}, inplace=True)

  df_clinical.replace("[Not Available]", np.nan, inplace=True)


In [4]:
df = pd.merge(df_clinical, target_class, on='Lesion Name', how='inner')
df = df.drop(columns=df.columns[0])
df

Unnamed: 0,age_at_initial_pathologic_diagnosis,ajcc_neoplasm_disease_stage,ajcc_tumor_stage_code,number_of_lymphnodes_positive_by_he,breast_carcinoma_estrogen_receptor_status,breast_carcinoma_progesterone_receptor_status,Pam50.Call
0,64,Stage IIA,T2,0.0,Positive,Negative,LumB
1,47,Stage IIA,T2,0.0,Positive,Positive,LumA
2,60,Stage I,T1c,0.0,Positive,Negative,LumB
3,41,Stage IIA,T1c,1.0,Positive,Positive,LumA
4,50,Stage IIA,T2,0.0,Positive,Positive,LumA
...,...,...,...,...,...,...,...
71,73,Stage I,T1,0.0,Negative,Negative,Basal
72,40,Stage IIIA,T2,5.0,Positive,Positive,LumA
73,60,Stage II,T2,0.0,Positive,Positive,LumB
74,47,Stage I,T1c,0.0,Positive,Positive,LumA


In [5]:
df_dummies = pd.get_dummies(df, columns=['ajcc_neoplasm_disease_stage', "ajcc_tumor_stage_code",
                                        "breast_carcinoma_estrogen_receptor_status", 
                                         "breast_carcinoma_progesterone_receptor_status"], drop_first=True)

new_dummy_columns = set(df_dummies.columns) - set(df.columns)

df = df_dummies
df = df.dropna()

In [6]:
features = df.columns.difference(['Pam50.Call'])
target_col = 'Pam50.Call'

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target_col],
    test_size=0.2,
    random_state=42,
    stratify=df[target_col]
)

In [7]:
under_sampler = RandomUnderSampler(sampling_strategy={"LumA": 30}, random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

over_sampler = SMOTE(sampling_strategy="not majority", random_state=42, k_neighbors=3)
X_train_resampled, y_train_resampled = over_sampler.fit_resample(X_train_under, y_train_under)

print("\nDistribución tras Under-Sampling + Over-Sampling en train:")
print(y_train_resampled.value_counts())


Distribución tras Under-Sampling + Over-Sampling en train:
Pam50.Call
Basal    30
Her2     30
LumA     30
LumB     30
Name: count, dtype: int64


In [8]:
df_resampled = pd.concat(
    [pd.DataFrame(X_train_resampled), pd.Series(y_train_resampled, name='Pam50.Call')],
    axis=1
)

In [9]:
selected_features = list(new_dummy_columns) + ['age_at_initial_pathologic_diagnosis', "number_of_lymphnodes_positive_by_he"]

In [10]:
X_train_resampled_selected = X_train_resampled[selected_features]
X_test_selected = X_test[selected_features]

In [11]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test, features, target_col, model, imprimir=False):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    if imprimir:
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
    return macro_f1

In [12]:
rf_model = RandomForestClassifier(random_state=42)

macro_f1_score = train_and_evaluate_model(
    X_train_resampled_selected, X_test_selected, y_train_resampled, y_test,
    features=[col for col in df.columns if col != "Pam50.Call"],
    target_col="Pam50.Call",
    model=rf_model,
    imprimir=True
)

Classification Report:
              precision    recall  f1-score   support

       Basal       0.67      1.00      0.80         2
        Her2       0.00      0.00      0.00         1
        LumA       0.80      0.80      0.80        10
        LumB       0.00      0.00      0.00         2

    accuracy                           0.67        15
   macro avg       0.37      0.45      0.40        15
weighted avg       0.62      0.67      0.64        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
