# Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Feature extraction
from sklearn.feature_selection import VarianceThreshold

from boruta import BorutaPy
from sklearn.inspection import permutation_importance

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

# Classification models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Jose\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Jose\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\Jose\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Jose\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 985, in launch_instance
    app.start()
  File "C:\Use

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Jose\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Jose\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\Jose\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Jose\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 985, in launch_instance
    app.start()
  File "C:\Use

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Jose\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Jose\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\Jose\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Jose\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 985, in launch_instance
    app.start()
  File "C:\Use

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [None]:
# Relative directory prefix prepended to the input file names read below.
data_path = "../../data/"

# Data

## Original

In [None]:
# Radiomic features: remove the "-1.les" / "-S2-1.les" fragments from the
# lesion identifiers so they can be joined with the label table on 'Lesion Name'.
df_radiomics = pd.read_excel(data_path + 'quantitative_radiomic_features.xls').assign(
    **{'Lesion Name': lambda frame: frame['Lesion Name'].str.replace(r'-1\.les|-S2-1\.les', '', regex=True)}
)

# Labels: expose the 'CLID' identifier column under the name used by the features.
target_class = pd.read_csv(data_path + 'target_class.csv').rename(columns={'CLID': 'Lesion Name'})

In [None]:
# Inner join keeps only the lesions present in both tables; the first column
# of the joined frame is then dropped before the result is displayed.
df = (
    df_radiomics
    .merge(target_class, on='Lesion Name', how='inner')
    .pipe(lambda merged: merged.drop(columns=merged.columns[0]))
)
df

In [None]:
radiomic_features = df.drop(columns=["Pam50.Call"])

# Learn the scaling statistics from the training rows only, so the held-out
# test rows do not leak into preprocessing. The split arguments mirror the
# train_test_split call used later to build X_train / X_test (test_size=0.2,
# random_state=42, stratified on "Pam50.Call"), which therefore selects the
# same row positions; keep the two calls in sync.
train_positions, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df["Pam50.Call"],
)

scaler = StandardScaler()
scaler.fit(radiomic_features.iloc[train_positions])
# Every row (train and test) is transformed with the training-set mean / std.
scaled_features = scaler.transform(radiomic_features)

df_scaled = pd.DataFrame(scaled_features, columns=radiomic_features.columns)

# Re-attach the labels positionally (df_scaled has a fresh default index).
df_scaled["Pam50.Call"] = df["Pam50.Call"].values

df = df_scaled

In [None]:
# Display the standardized feature table together with the "Pam50.Call" label column.
df

## Train/test split and resampling (under-sampling + SMOTE)

In [None]:
# Every column except the label is a predictor (Index.difference returns the
# names in sorted order). A stratified 20% of the rows is held out for testing.
target_col = 'Pam50.Call'
features = df.columns.difference([target_col])

X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, features],
    df.loc[:, target_col],
    stratify=df[target_col],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 1: randomly under-sample the "LumA" class to 30 training samples.
under_sampler = RandomUnderSampler(random_state=42, sampling_strategy=dict(LumA=30))
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# Step 2: SMOTE (3 nearest neighbours) over-samples every class except the majority one.
over_sampler = SMOTE(k_neighbors=3, random_state=42, sampling_strategy="not majority")
X_train_resampled, y_train_resampled = over_sampler.fit_resample(X_train_under, y_train_under)

# Class counts of the resampled training labels.
print(
    "\nDistribución tras Under-Sampling + Over-Sampling en train:",
    y_train_resampled.value_counts(),
    sep="\n",
)

In [None]:
# Put the resampled predictors and their labels side by side in one frame.
df_resampled = pd.concat(
    objs=[
        pd.DataFrame(X_train_resampled),
        pd.Series(y_train_resampled, name='Pam50.Call'),
    ],
    axis='columns',
)

In [None]:
# The resampled training frame is the input of the feature-selection steps below.
df_train = df_resampled

# Feature extraction

## Functions

In [None]:
def apply_variance_threshold(df, target_col, threshold=0.01):
    """Filter the predictor columns of ``df`` with ``VarianceThreshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the predictors plus the ``target_col`` column.
    target_col : str
        Column removed before the filter is fitted.
    threshold : float, default 0.01
        Forwarded to ``VarianceThreshold(threshold=...)``.

    Returns
    -------
    tuple
        ``(df_features, selected_columns)``: the result of ``fit_transform``
        on the predictors and the labels of the columns the filter kept.
    """
    predictors = df.drop(columns=[target_col])
    variance_filter = VarianceThreshold(threshold=threshold)
    kept_values = variance_filter.fit_transform(predictors)
    # get_support() is a boolean mask aligned with the predictor columns.
    kept_columns = predictors.columns[variance_filter.get_support()]
    return kept_values, kept_columns

In [None]:
def apply_boruta(df_features, target, selected_columns, model_name="RandomForest", n_estimators=100, random_state=42):
    """Run Boruta feature selection and return the names of the selected features.

    Parameters
    ----------
    df_features : array-like of shape (n_samples, n_features)
        Predictor values; converted to a NumPy array before fitting.
    target : str or array-like of shape (n_samples,)
        Either the class labels themselves or, for backward compatibility,
        the name of the label column in the notebook-level ``df_train`` frame.
        Passing the labels directly avoids the dependency on that global.
    selected_columns : pandas.Index or array-like
        Feature names aligned with the columns of ``df_features``.
    model_name : {"RandomForest", "GradientBoosting", "ExtraTrees"}
        Tree ensemble used as Boruta's base estimator.
    n_estimators : int, default 100
        Number of trees used to construct the base estimator.
        NOTE(review): BorutaPy is created with ``n_estimators='auto'``, which
        presumably overrides this value during fitting — confirm.
    random_state : int, default 42
        Seed shared by the base estimator and BorutaPy.

    Returns
    -------
    pandas.Index
        The entries of ``selected_columns`` flagged by ``support_``.

    Raises
    ------
    ValueError
        If ``model_name`` is not supported, or if the number of labels does
        not match the number of rows in ``df_features``.
    """
    # Map names to classes so that only the requested estimator is built.
    model_factories = {
        "RandomForest": RandomForestClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        "ExtraTrees": ExtraTreesClassifier,
    }

    if model_name not in model_factories:
        raise ValueError(
            f"Model '{model_name}' is not supported by Boruta. "
            f"Choose one of: {', '.join(model_factories)}."
        )

    model = model_factories[model_name](n_estimators=n_estimators, random_state=random_state)

    # Legacy call style: a column name is resolved against the global df_train.
    labels = df_train[target] if isinstance(target, str) else target

    feature_values = np.asarray(df_features)
    label_values = np.asarray(labels)
    if feature_values.shape[0] != label_values.shape[0]:
        raise ValueError(
            f"df_features has {feature_values.shape[0]} rows but target has "
            f"{label_values.shape[0]} labels."
        )

    boruta_selector = BorutaPy(model, n_estimators='auto', random_state=random_state)
    boruta_selector.fit(feature_values, label_values)

    # pd.Index makes boolean-mask selection work for plain lists as well.
    selected_features = pd.Index(selected_columns)[boruta_selector.support_]

    return selected_features

In [None]:
def cluster_and_select_representative_features(correlation_matrix, num_clusters):
    """Cluster features by correlation and pick one representative per cluster.

    Features are grouped with Ward hierarchical clustering on the distance
    ``1 - |correlation|``. Within each cluster the representative is the
    feature with the highest mean *absolute* correlation to the cluster
    members, i.e. the same similarity notion that defined the clusters.

    Parameters
    ----------
    correlation_matrix : pandas.DataFrame
        Square correlation matrix whose index and columns carry the same
        feature names.
    num_clusters : int
        Maximum number of flat clusters (``fcluster`` with ``maxclust``).

    Returns
    -------
    list
        One feature name per cluster, in order of first cluster appearance.
        With fewer than two features the feature names are returned as-is,
        because there is nothing to cluster.
    """
    feature_names = list(correlation_matrix.columns)

    # linkage() cannot run on an empty condensed distance vector.
    if len(feature_names) < 2:
        return feature_names

    abs_correlation = correlation_matrix.abs()
    distance_matrix = 1 - abs_correlation
    # checks=False tolerates tiny floating-point asymmetries / non-zero diagonal.
    linkage_matrix = linkage(squareform(distance_matrix.to_numpy(), checks=False), method='ward')

    clusters = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

    cluster_df = pd.DataFrame({
        "Feature": feature_names,
        "Cluster": clusters
    })

    representative_features = []
    for cluster_id in cluster_df["Cluster"].unique():
        cluster_features = cluster_df.loc[cluster_df["Cluster"] == cluster_id, "Feature"].values
        # Absolute values: a strongly anti-correlated feature is just as
        # central to the cluster as a strongly correlated one.
        cluster_corr = abs_correlation.loc[cluster_features, cluster_features].mean(axis=1)
        representative_features.append(cluster_corr.idxmax())

    return representative_features

## Procedure

In [None]:
# Variance filter on the resampled training frame (default threshold of 0.01).
df_features, selected_columns_after_variance = apply_variance_threshold(df_train, "Pam50.Call")

In [None]:
# Report how many (and which) columns passed the variance filter.
print(f"Number of features retained after applying variance threshold: {len(selected_columns_after_variance)}")
print(f"Selected features: {selected_columns_after_variance}")

In [None]:
# Boruta with a gradient-boosting base estimator; the "Pam50.Call" labels are
# looked up in df_train inside apply_boruta.
selected_features = apply_boruta(df_features, "Pam50.Call", selected_columns_after_variance, model_name="GradientBoosting")
print(f"Number of features retained after applying Boruta: {len(selected_features)}")
print(f"Selected features: {selected_features}")

In [None]:
# Correlations are estimated on the training rows only (X_train): using the
# full `df` would let the held-out test rows influence the feature clustering.
correlation_matrix = X_train[selected_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
# Optional line that saves the plot to disk.
# NOTE(review): the file name reads "heatmat" (possibly a typo for "heatmap");
# kept unchanged in case other material references this path.
fig.savefig('correlation_heatmat.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Build the tree from the condensed 1 - |correlation| distances, exactly as
# cluster_and_select_representative_features does. Passing the square
# correlation matrix directly would make linkage() treat each row as an
# observation vector, so the plot would not match the clusters actually used.
distance_matrix = 1 - correlation_matrix.abs()
linkage_matrix = linkage(squareform(distance_matrix.to_numpy(), checks=False), method='ward')

fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(linkage_matrix, labels=list(correlation_matrix.columns), leaf_rotation=90, ax=ax)
ax.set_title("Dendrogram for Feature Clustering")
# Optional line that saves the plot to disk.
fig.savefig('dendrogram.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Group the Boruta-selected features into at most 3 correlation clusters and
# keep one representative feature per cluster.
representative_features = cluster_and_select_representative_features(
    correlation_matrix=correlation_matrix,
    num_clusters=3
)

In [None]:
# Report the cluster representatives.
print(f"Number of representative features: {len(representative_features)}")
print(f"Representative features selected from clusters: {representative_features}")

# Train

In [None]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test, features, target_col, model, imprimir=False):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Data passed to ``model.fit``.
    X_test, y_test
        Data used for prediction and scoring.
    features, target_col
        Accepted for interface compatibility; not used by this function.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    imprimir : bool, default False
        When true, the classification report is printed.

    Returns
    -------
    float
        Macro-averaged F1 score of the predictions on ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = f1_score(y_test, predictions, average='macro')

    if not imprimir:
        return score

    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return score

In [None]:
# Restrict both splits to the Boruta-selected features (selected_features),
# not to the cluster representatives.
X_train_resampled_selected = X_train_resampled[selected_features]
X_test_selected = X_test[selected_features]

In [None]:
# Random forest trained on the resampled training split and evaluated on the
# untouched test split.
rf_model = RandomForestClassifier(random_state=42)

# `features` and `target_col` are part of the function signature but are not
# used inside train_and_evaluate_model; the model sees only the X/y arguments.
macro_f1_score = train_and_evaluate_model(
    X_train_resampled_selected, X_test_selected, y_train_resampled, y_test,
    features=[col for col in df.columns if col != "Pam50.Call"],
    target_col="Pam50.Call",
    model=rf_model,
    imprimir=True
)