In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Feature extraction
from sklearn.feature_selection import VarianceThreshold

from boruta import BorutaPy
from sklearn.inspection import permutation_importance

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

# Classification models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Relative path of the directory holding the input files. The trailing slash is
# required: file names are appended to this value with plain string concatenation.
data_path = "../../data/"

In [3]:
# Multigene assay table: discard the column at position 1, rename the 'CLID'
# identifier to 'Lesion Name', and keep only the identifier plus the three
# assay columns listed below.
df_assays = pd.read_excel(data_path + "multigenic_assays.xlsx")
df_assays = (
    df_assays
    .drop(columns=df_assays.columns[1])
    .rename(columns={'CLID': 'Lesion Name'})
    .loc[:, [
        'Lesion Name',
        'GHI_RS Score',
        'Mammaprint Pcorr_NKI70_Good_Correlation_Nature.2002_PMID.11823860',
        'UNC_Proliferation_11_Mean_JCO.2009_PMID.19204204',
    ]]
)

# Class labels, renamed so they share the 'Lesion Name' key with the assay table.
target_class = pd.read_csv(data_path + 'target_class.csv').rename(columns={'CLID': 'Lesion Name'})

  warn(msg)


In [4]:
# Inner-join assays and labels on 'Lesion Name', then discard the first column
# of the merged frame (the join key, given the column order selected for
# df_assays) so only the assay columns and the label remain.
df = df_assays.merge(target_class, how='inner', on='Lesion Name')
df = df.drop(columns=[df.columns[0]])
df

Unnamed: 0,GHI_RS Score,Mammaprint Pcorr_NKI70_Good_Correlation_Nature.2002_PMID.11823860,UNC_Proliferation_11_Mean_JCO.2009_PMID.19204204,Pam50.Call
0,100.000000,0.458,-0.232964,LumA
1,48.574519,0.628,-0.673673,LumA
2,55.213839,0.567,-0.263108,LumA
3,72.668351,0.420,-0.398098,LumA
4,100.000000,0.121,0.233268,LumB
...,...,...,...,...
71,100.000000,0.319,0.061330,LumA
72,38.471147,0.469,-0.341741,LumA
73,100.000000,-0.541,0.263237,Basal
74,54.557275,0.704,-0.345247,LumA


In [5]:
from sklearn.preprocessing import StandardScaler

# Standardise every column except the label and rebuild `df` from the scaled
# values, re-attaching the label column by position.
# NOTE(review): the scaler is fitted on ALL rows here, i.e. before the
# train/test split performed further down, so test rows contribute to the
# scaling statistics. Consider fitting on the training split only.
scaler = StandardScaler()
radiomic_features = df.drop(columns="Pam50.Call")
scaled_features = scaler.fit_transform(radiomic_features)

df_scaled = pd.DataFrame(
    data=scaled_features,
    columns=radiomic_features.columns,
).assign(**{"Pam50.Call": df["Pam50.Call"].values})

# From here on `df` refers to the scaled frame; the unscaled one is no longer bound.
df = df_scaled

In [6]:
# Split into train and test sets (20% held out), stratified on the label so
# both parts keep the class proportions; the seed is fixed for reproducibility.
target_col = 'Pam50.Call'
features = df.columns.difference([target_col])

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target_col],
    stratify=df[target_col],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Class balancing, applied to the training split only.
# Step 1: randomly under-sample 'LumA' down to 30 training rows.
under_sampler = RandomUnderSampler(random_state=42, sampling_strategy={"LumA": 30})
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# Step 2: SMOTE with 3 neighbours and the "not majority" strategy, i.e. every
# class except the majority one is over-sampled with synthetic rows.
over_sampler = SMOTE(k_neighbors=3, random_state=42, sampling_strategy="not majority")
X_train_resampled, y_train_resampled = over_sampler.fit_resample(X_train_under, y_train_under)

# Report the resulting per-class counts.
print(
    "\nDistribución tras Under-Sampling + Over-Sampling en train:",
    y_train_resampled.value_counts(),
    sep="\n",
)


Distribución tras Under-Sampling + Over-Sampling en train:
Pam50.Call
Basal    30
Her2     30
LumA     30
LumB     30
Name: count, dtype: int64


In [8]:
# Put the resampled features and their labels side by side in a single frame,
# with the label column named 'Pam50.Call'.
df_resampled = pd.concat(
    [
        pd.DataFrame(X_train_resampled),
        pd.Series(y_train_resampled, name='Pam50.Call'),
    ],
    axis="columns",
)

In [9]:
# Alias, not a copy: `df_train` and `df_resampled` refer to the same object.
df_train = df_resampled

In [10]:
# Columns fed to the classifier, in the order they are passed to the model.
selected_features = [
    'GHI_RS Score',
    'Mammaprint Pcorr_NKI70_Good_Correlation_Nature.2002_PMID.11823860',
    'UNC_Proliferation_11_Mean_JCO.2009_PMID.19204204',
]

In [11]:
# Restrict both the resampled training features and the untouched test features
# to the selected columns, in the order given by `selected_features`.
X_train_resampled_selected, X_test_selected = (
    frame[selected_features] for frame in (X_train_resampled, X_test)
)

In [12]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test, features, target_col, model, imprimir=False):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        X_train: Training features, passed to ``model.fit``.
        X_test: Test features, passed to ``model.predict``.
        y_train: Training labels, passed to ``model.fit``.
        y_test: True test labels the predictions are compared against.
        features: Accepted but not used by this function.
        target_col: Accepted but not used by this function.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        imprimir: When truthy, also print a classification report for the
            test predictions.

    Returns:
        The value of ``f1_score(y_test, predictions, average='macro')``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = f1_score(y_test, predictions, average='macro')

    # Quiet mode: nothing to print, just hand back the score.
    if not imprimir:
        return score

    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return score

In [13]:
# Random forest with a fixed seed, trained on the resampled training set and
# evaluated on the untouched test set. `features` and `target_col` are passed
# because the helper's signature requires them; the helper does not read them.
rf_model = RandomForestClassifier(random_state=42)

macro_f1_score = train_and_evaluate_model(
    X_train_resampled_selected,
    X_test_selected,
    y_train_resampled,
    y_test,
    features=df.columns[df.columns != "Pam50.Call"].tolist(),
    target_col="Pam50.Call",
    model=rf_model,
    imprimir=True,
)

Classification Report:
              precision    recall  f1-score   support

       Basal       1.00      1.00      1.00         2
        Her2       0.33      1.00      0.50         1
        LumA       1.00      0.82      0.90        11
        LumB       0.00      0.00      0.00         2

    accuracy                           0.75        16
   macro avg       0.58      0.70      0.60        16
weighted avg       0.83      0.75      0.78        16

