In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

# Class rebalancing
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Feature extraction
from boruta import BorutaPy
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Classification models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Directory (relative path, with trailing slash) that the loading cell
# prefixes to every input file name via string concatenation.
data_path = "../../data/"

In [3]:
# --- Radiomic features -------------------------------------------------------
df_radiomics = pd.read_excel(data_path + 'quantitative_radiomic_features.xls')
# Strip the lesion-file suffixes so 'Lesion Name' can serve as the join key
# (presumably this leaves the bare patient barcode -- confirm against the data).
df_radiomics['Lesion Name'] = df_radiomics['Lesion Name'].str.replace(r'-1\.les|-S2-1\.les', '', regex=True)

# --- Clinical covariates -----------------------------------------------------
# Only these clinical columns are carried into the analysis.
clinical_columns = [
    'Lesion Name',
    'age_at_initial_pathologic_diagnosis',
    'ajcc_neoplasm_disease_stage',
    "ajcc_tumor_stage_code",
    "number_of_lymphnodes_positive_by_he",
    "breast_carcinoma_estrogen_receptor_status",
    "breast_carcinoma_progesterone_receptor_status",
]
# "[Not Available]" is parsed as NaN while reading, so numeric columns get a
# numeric dtype straight away. The previous read -> replace(inplace=True)
# sequence depended on pandas' deprecated silent downcasting in `replace`
# to achieve the same thing.
df_clinical = (
    pd.read_excel(data_path + "clinical_data.xls", na_values=["[Not Available]"])
    .rename(columns={'bcr_patient_barcode': 'Lesion Name'})
    .loc[:, clinical_columns]
)

# --- Target labels -----------------------------------------------------------
# 'CLID' is renamed so that all three tables share the same key column name.
target_class = (
    pd.read_csv(data_path + 'target_class.csv')
    .rename(columns={'CLID': 'Lesion Name'})
)

  df_clinical.replace("[Not Available]", np.nan, inplace=True)


In [4]:
# Inner-join radiomics, target labels and clinical covariates on the shared
# 'Lesion Name' key; lesions missing from any of the three tables are dropped.
# The key is then removed *by name*: the original positional drop
# (df.columns[0]) would silently remove a different column if the column
# order of the radiomics spreadsheet ever changed.
df = (
    df_radiomics
    .merge(target_class, on='Lesion Name', how='inner')
    .merge(df_clinical, on='Lesion Name', how='inner')
    .drop(columns='Lesion Name')
)
df

Unnamed: 0,Maximum enhancement (K1),Time to peak (K2),Uptake rate (K3),Washout rate (K4),Curve shape index (K5),E1 (K6),Signal Enhancement Ratio (SER) (K7),Maximum enhancement-variance (E1),Enhancement-Variance Time to Peak (E2),Enhancement-variance Increasing Rate (E3),...,Surface Area to Volume ratio (G3),Volume of most enhancing voxels (S4),Maximum Diameter (S5),Pam50.Call,age_at_initial_pathologic_diagnosis,ajcc_neoplasm_disease_stage,ajcc_tumor_stage_code,number_of_lymphnodes_positive_by_he,breast_carcinoma_estrogen_receptor_status,breast_carcinoma_progesterone_receptor_status
0,1.602573,145.433,0.011019,0.000809,-0.099998,1.550204,1.111109,0.125963,60.000,0.002099,...,0.922937,22.15705,27.60382,LumA,29,Stage IA,T1c,0.0,Positive,Positive
1,4.072152,144.752,0.028132,0.001510,-0.007311,3.973258,1.007365,4.054312,229.504,0.017666,...,0.849641,111.70860,21.31066,LumA,41,Stage IA,T1c,0.0,Positive,Positive
2,1.303264,168.383,0.007740,0.000667,-0.146959,1.273733,1.172277,0.534411,60.000,0.008907,...,0.842762,236.23630,41.33787,LumA,61,Stage IIIC,T2,15.0,Positive,Positive
3,0.758630,60.000,0.012644,0.000479,-0.150482,0.758630,1.177138,0.037956,298.418,0.000127,...,1.028034,122.69640,11.39449,LumA,56,Stage IIA,T1c,0.0,Positive,Negative
4,3.353556,60.000,0.055893,0.000920,-0.080066,3.353556,1.087034,2.330160,60.000,0.038836,...,0.918724,245.36130,68.62797,LumB,40,Stage IIB,T2,2.0,Positive,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,1.277688,60.000,0.021295,0.000611,-0.149392,1.277688,1.175629,0.271695,60.000,0.004528,...,0.978195,94.23584,51.20215,LumA,66,Stage IIA,T2,0.0,Positive,Positive
72,1.495674,60.000,0.024928,0.000093,-0.017541,1.495674,1.017854,0.123041,60.000,0.002051,...,1.142819,88.69263,22.50068,LumA,46,Stage IIA,T2,0.0,Positive,Positive
73,2.182750,60.000,0.036379,0.001081,-0.125678,2.182750,1.143744,0.225339,313.733,0.000718,...,0.679105,89.18701,30.87951,Basal,44,Stage IIA,T2,0.0,Negative,Negative
74,1.354828,192.786,0.007028,0.000067,0.043301,1.290058,0.958496,0.108778,60.000,0.001813,...,0.986278,36.73828,23.13763,LumA,61,Stage IIB,T2,1.0,Positive,Positive


In [5]:
# Columns that bypass standardisation: the target label plus the clinical
# covariates. They are re-attached unchanged after scaling, in this order.
passthrough_columns = [
    "Pam50.Call",
    'age_at_initial_pathologic_diagnosis',
    'ajcc_neoplasm_disease_stage',
    "ajcc_tumor_stage_code",
    "number_of_lymphnodes_positive_by_he",
    "breast_carcinoma_estrogen_receptor_status",
    "breast_carcinoma_progesterone_receptor_status",
]
# Clinical covariates that get one-hot encoded.
categorical_columns = [
    'ajcc_neoplasm_disease_stage',
    "ajcc_tumor_stage_code",
    "breast_carcinoma_estrogen_receptor_status",
    "breast_carcinoma_progesterone_receptor_status",
]

radiomic_features = df.drop(columns=passthrough_columns)

# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed in a later cell, so test-set statistics leak into the
# scaled training features. Consider fitting on the training split only.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(radiomic_features)

df_scaled = pd.DataFrame(scaled_features, columns=radiomic_features.columns)

# `.values` copies each column positionally, so no index alignment happens.
for column in passthrough_columns:
    df_scaled[column] = df[column].values

# NOTE(review): `df` is rebound below, so this cell cannot be re-run on its
# own; re-run the merge cell first.
df = df_scaled
df_dummies = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Names of the indicator columns created by get_dummies (kept as a set).
new_dummy_columns = set(df_dummies.columns) - set(df.columns)

df = df_dummies

In [6]:
# Display the frame as rebound by the preceding cell (scaled radiomic
# features plus one-hot encoded clinical columns).
df

Unnamed: 0,Maximum enhancement (K1),Time to peak (K2),Uptake rate (K3),Washout rate (K4),Curve shape index (K5),E1 (K6),Signal Enhancement Ratio (SER) (K7),Maximum enhancement-variance (E1),Enhancement-Variance Time to Peak (E2),Enhancement-variance Increasing Rate (E3),...,ajcc_neoplasm_disease_stage_Stage IIA,ajcc_neoplasm_disease_stage_Stage IIB,ajcc_neoplasm_disease_stage_Stage IIIA,ajcc_neoplasm_disease_stage_Stage IIIC,ajcc_tumor_stage_code_T1b,ajcc_tumor_stage_code_T1c,ajcc_tumor_stage_code_T2,ajcc_tumor_stage_code_T3,breast_carcinoma_estrogen_receptor_status_Positive,breast_carcinoma_progesterone_receptor_status_Positive
0,-0.195663,0.011278,-0.496149,-0.151982,-0.297797,-0.155749,0.009570,-0.130554,-1.065664,-0.123696,...,False,False,False,False,False,True,False,False,True,True
1,2.618630,0.003479,0.917726,0.397861,0.255429,2.655287,-0.257523,-0.074160,0.222799,-0.093276,...,False,False,False,False,False,True,False,False,True,True
2,-0.536750,0.274115,-0.767103,-0.263946,-0.578093,-0.476489,0.167049,-0.124690,-1.065664,-0.110393,...,False,False,False,True,False,False,True,False,True,True
3,-1.157407,-0.967149,-0.361928,-0.411282,-0.599120,-1.074071,0.179564,-0.131817,0.746640,-0.127551,...,True,False,False,False,False,True,False,False,True,False
4,1.799730,-0.967149,3.211365,-0.065331,-0.178825,1.936358,-0.052412,-0.098911,-1.065664,-0.051902,...,False,True,False,False,False,False,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.565896,-0.967149,0.352830,-0.307554,-0.592611,-0.471901,0.175679,-0.128462,-1.065664,-0.118950,...,True,False,False,False,False,False,True,False,True,True
72,-0.317483,-0.967149,0.653004,-0.714435,0.194371,-0.219010,-0.230518,-0.130596,-1.065664,-0.123792,...,True,False,False,False,False,False,True,False,True,True
73,0.465498,-0.967149,1.599129,0.061532,-0.451073,0.578081,0.093590,-0.129127,0.863055,-0.126395,...,True,False,False,False,False,False,True,False,False,False
74,-0.477989,0.553592,-0.825950,-0.734472,0.557517,-0.457550,-0.383337,-0.130801,-1.065664,-0.124256,...,False,True,False,False,False,False,True,False,True,True


In [7]:
# Everything except the label column is a predictor. Index.difference
# returns the remaining column names in sorted order.
target_col = 'Pam50.Call'
features = df.columns.difference([target_col])

# Stratified 80/20 split with a fixed seed.
labels = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    df[features], labels, test_size=0.2, random_state=42, stratify=labels
)

In [8]:
# Display the training split of the feature matrix.
X_train

Unnamed: 0,Contrast (T1),Correlation (T2),Curve shape index (K5),Difference Entropy (T3),Difference Variance (T4),E1 (K6),Effective Diameter (S2),Energy (T5),Enhancement-Variance Time to Peak (E2),Enhancement-variance Decreasing Rate (E4),...,ajcc_neoplasm_disease_stage_Stage IIB,ajcc_neoplasm_disease_stage_Stage IIIA,ajcc_neoplasm_disease_stage_Stage IIIC,ajcc_tumor_stage_code_T1b,ajcc_tumor_stage_code_T1c,ajcc_tumor_stage_code_T2,ajcc_tumor_stage_code_T3,breast_carcinoma_estrogen_receptor_status_Positive,breast_carcinoma_progesterone_receptor_status_Positive,number_of_lymphnodes_positive_by_he
11,-0.684527,0.679692,-0.349218,-0.555099,-0.695339,0.592338,0.478739,-0.177537,-1.065664,-0.124209,...,False,False,False,False,False,False,False,True,True,0.0
70,1.180559,-1.166517,-0.231014,1.075001,1.211064,-0.376303,0.086192,0.208081,1.47308,-0.126759,...,False,False,False,False,False,True,False,True,True,0.0
44,0.425652,-0.410766,-0.190543,0.454262,0.503337,-0.527604,-0.472722,-0.692318,-1.065664,-0.124914,...,False,False,False,False,False,True,False,True,True,0.0
36,-0.247246,0.327783,-0.586582,-0.107842,-0.166853,-0.226578,0.448158,0.283336,0.583773,-0.124998,...,True,False,False,False,False,True,False,True,True,3.0
22,-1.962088,1.926856,-0.609998,-2.151944,-2.038913,0.364438,-0.814722,1.708167,-1.065664,-0.120363,...,False,False,False,False,True,False,False,True,True,0.0
16,-1.51947,1.495774,0.193791,-1.540339,-1.494517,-0.17675,0.397194,1.037686,-1.065664,-0.126152,...,True,False,False,False,False,True,False,False,False,2.0
12,0.365084,-0.408202,-0.692573,0.422172,0.274985,-0.280404,-0.760233,-1.100867,0.72826,-0.126759,...,False,False,False,False,False,True,False,True,True,0.0
55,-1.416246,1.406506,-0.551006,-1.439514,-1.342228,-0.157363,0.710036,1.22446,-1.065664,-0.122676,...,True,False,False,False,False,True,False,True,True,2.0
9,1.94863,-1.850265,-0.388592,1.529725,1.920157,0.503623,0.494166,-1.65645,1.883668,-0.126759,...,False,False,False,False,False,True,False,False,False,0.0
51,-1.266737,1.256714,0.680154,-1.11813,-1.378161,6.396529,0.043263,1.122538,-0.487177,8.631473,...,False,False,False,False,False,True,False,True,True,0.0


In [9]:
# Step 1: randomly under-sample the LumA class to 30 training samples.
under_sampler = RandomUnderSampler(
    sampling_strategy={"LumA": 30},
    random_state=42,
)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# Step 2: over-sample with SMOTE (strategy "not majority", 3 neighbours).
# Only the training split is resampled; the test split is left untouched.
over_sampler = SMOTE(
    sampling_strategy="not majority",
    k_neighbors=3,
    random_state=42,
)
X_train_resampled, y_train_resampled = over_sampler.fit_resample(X_train_under, y_train_under)

# Report the resulting class counts.
print(
    "\nDistribución tras Under-Sampling + Over-Sampling en train:",
    y_train_resampled.value_counts(),
    sep="\n",
)


Distribución tras Under-Sampling + Over-Sampling en train:
Pam50.Call
Basal    30
Her2     30
LumA     30
LumB     30
Name: count, dtype: int64


In [10]:
# Put the resampled features and their labels side by side in one frame,
# with the label column named 'Pam50.Call'.
resampled_labels = y_train_resampled.rename('Pam50.Call')
df_resampled = pd.concat([X_train_resampled, resampled_labels], axis=1)

In [11]:
# Alias, not a copy: df_train refers to the same object as df_resampled.
df_train = df_resampled

In [12]:
# Three radiomic features plus every one-hot clinical indicator column.
# `new_dummy_columns` is a set of strings, whose iteration order changes
# between interpreter sessions (string hash randomisation). Sorting pins the
# column order, so the feature matrix -- and therefore the seeded model fitted
# on it -- is reproducible across kernel restarts.
selected_features = [
    'Margin Sharpness (M1)',
    'Maximum enhancement-variance (E1)',
    'Surface Area (S3)',
] + sorted(new_dummy_columns)

In [13]:
# Restrict both splits to the same selected columns, in the same order.
X_train_resampled_selected = X_train_resampled.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

In [14]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test, features, target_col, model, imprimir=False):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Test features (passed to ``model.predict``) and the true labels the
        predictions are compared against.
    features, target_col
        Accepted but not used by this function.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    imprimir : bool, default False
        When true, also print the classification report for the test data.

    Returns
    -------
    The macro-averaged F1 score computed by ``f1_score`` on the test data.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = f1_score(y_test, predictions, average='macro')

    # Quiet mode: nothing to print, hand back the score straight away.
    if not imprimir:
        return score

    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return score

In [15]:
# Random forest with default hyper-parameters and a fixed seed.
rf_model = RandomForestClassifier(random_state=42)

# The helper's `features` / `target_col` arguments are required by its
# signature, although its body does not read them.
all_feature_names = [name for name in df.columns if name != "Pam50.Call"]

# Train on the resampled, column-selected training data; evaluate on the
# test split and print the per-class report.
macro_f1_score = train_and_evaluate_model(
    X_train_resampled_selected,
    X_test_selected,
    y_train_resampled,
    y_test,
    all_feature_names,
    "Pam50.Call",
    rf_model,
    imprimir=True,
)

Classification Report:
              precision    recall  f1-score   support

       Basal       1.00      1.00      1.00         2
        Her2       1.00      1.00      1.00         1
        LumA       0.89      0.73      0.80        11
        LumB       0.25      0.50      0.33         2

    accuracy                           0.75        16
   macro avg       0.78      0.81      0.78        16
weighted avg       0.83      0.75      0.78        16

