# Análisis de características Radiométricas

Se procesan las características radiométricas y se experimenta con ellas para evaluar los resultados que obtienen los modelos.
**Roberto Araya**


In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2
#import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Extraction parameters. In this section they are only used to build the name
# of the CSV file that holds the extracted radiomic features.
binwidth = 5
sigma = [1, 2, 3]
normalize = True
imageTypes = ['Original', 'LoG', 'Square', 'SquareRoot']

# The table that is actually loaded was extracted with the 'Original' image
# type only, not with the full `imageTypes` list above. Keeping the override as
# an explicit parameter (instead of overwriting the file name with a literal)
# makes that mismatch visible.
# NOTE(review): presumably the CSV for the remaining image types does not exist
# yet — switch this to `imageTypes` once it does.
loaded_image_types = ['Original']

# Lists are interpolated with their Python repr (e.g. "[1, 2, 3]"), which is
# how the file on disk is named.
sm_radiometrics_file = (
    f'santamaria_data_all__binwidth_{binwidth}_sigma_{sigma}'
    f'_imtype_{loaded_image_types}_normalize_{normalize}.csv'
)
sm_radiometrics = pd.read_csv(sm_radiometrics_file)

In [13]:
# Preview the first rows of the table as loaded from the CSV.
sm_radiometrics.head()

Unnamed: 0,PATIENT_ID,SEXO_MASCULINO,EDAD,FECHA_CIRUGIA,BIOPSIA_QX_PULMONAR,BIOPSIA_FBC-EBUS,BIOPSIA_OTRO_SITIO,RESULTADO_BP,BP_COMPLETA,HISTOLOGIA,...,torax3d_original_shape_Maximum2DDiameterColumn,torax3d_original_shape_Maximum2DDiameterRow,torax3d_original_shape_Maximum2DDiameterSlice,torax3d_original_shape_Maximum3DDiameter,torax3d_original_shape_MeshVolume,torax3d_original_shape_MinorAxisLength,torax3d_original_shape_Sphericity,torax3d_original_shape_SurfaceArea,torax3d_original_shape_SurfaceVolumeRatio,torax3d_original_shape_VoxelVolume
0,sm_001,1,70,,NO,SI,NO,BIOPSIAS TRANSBRONQUIALES-ENDOBRONQUIALES: VAR...,Fecha Informe : 26/03/2018\n\n \n\nINF...,ADENOCARCINOMA,...,77.951427,71.542106,74.982329,84.53377,55279.552157,49.332435,0.340003,20640.080294,0.373376,55429.621807
1,sm_002,0,49,2017-05-19 00:00:00,SI,SI,NO,1.- PUNCION ENDOSONOGRAFICA DE LINFONODOS 4L: ...,Fecha Informe : 17/02/2017\n \n\n\n\nI...,ADENOCARCINOMA,...,32.497872,30.012252,33.959077,34.225753,8613.947922,24.832407,0.637823,3186.045702,0.369871,8634.885267
2,sm_003,0,43,2017-07-05 00:00:00,NO,NO,SI,1.- VENTANA AORTOPULMUNAR : METASTASIS DE ADEN...,Fecha Informe : 10/07/2017\n\n\n\nINFORME ANAT...,ADENOCARCINOMA,...,15.506804,14.49942,18.110513,18.164537,956.215685,10.689812,0.774616,605.947428,0.633693,965.217617
3,sm_004,0,54,2017-05-05 00:00:00,SI,SI,NO,2.- PUNCION EBUS LINFONODO 4L: ABUNDANTE MATER...,Fecha Informe : 04/01/2017\n\nINFORME ANATOMO-...,ADENOCARCINOMA,...,52.867093,38.3428,51.380028,56.21877,21895.758462,28.599848,0.659739,5737.0095,0.262015,21915.678306
4,sm_005,0,44,2016-01-06 00:00:00,SI,NO,SI,2.- LOBECTOMIA SUPERIOR IZQUIERDA (124 GR.): A...,Fecha Informe : 11 DE ENERO DE 2016\n\nINFORME...,ADENOCARCINOMA,...,45.658721,47.321523,45.846671,48.005364,26342.882775,33.996793,0.552775,7745.406975,0.294023,26405.130223


## Procesamiento de datos
- Se eliminan columnas no relevantes.
- Se vectorizan las características compuestas por categorías de *strings* con el método *one-hot-encoding*.
- Se eliminan las columnas que posean alguna fila con valor nulo (INVESTIGAR ESTOS CASOS: CASOS SIN EXÁMENES TORAX3D).

In [14]:
# Clinical / bookkeeping columns that are removed from the feature table.
original_drop_columns = [
    'PATIENT_ID', 'FECHA_CIRUGIA', 'BIOPSIA_QX_PULMONAR', 'BIOPSIA_FBC-EBUS',
    'BIOPSIA_OTRO_SITIO', 'RESULTADO_BP', 'BP_COMPLETA', 'HISTOLOGIA',
    'MUTACION_EGFR', 'MUTACION_PDL-1', 'MUTACION_ROS', 'RECIDIVA',
    'COMENTARIO', '3D_TORAX_SEG', 'PET_SEG', 'BODY_CT_SEG',
]

# Diagnostic columns (without exam prefix) that are removed for every exam type.
diagnostic_suffixes = [
    'diagnostics_Configuration_EnabledImageTypes',
    'diagnostics_Configuration_Settings',
    'diagnostics_Image-original_Dimensionality',
    'diagnostics_Mask-original_BoundingBox',
    'diagnostics_Versions_PyRadiomics',
    'diagnostics_Mask-original_CenterOfMassIndex',
    'diagnostics_Image-original_Hash',
    'diagnostics_Image-original_Size',
    'diagnostics_Image-original_Spacing',
    'diagnostics_Mask-original_Hash',
    'diagnostics_Mask-original_Size',
    'diagnostics_Mask-original_Spacing',
    'diagnostics_Versions_Numpy',
    'diagnostics_Versions_PyWavelet',
    'diagnostics_Versions_Python',
    'diagnostics_Versions_SimpleITK',
]
exam_types = ['body_', 'pet_', 'torax3d_']

# Prefix every diagnostic column with each exam type
# (order: all exam types for the first suffix, then the second suffix, ...).
images_columns = []
for suffix in diagnostic_suffixes:
    images_columns.extend(exam + suffix for exam in exam_types)

# Extra columns that are not relevant.
# Note: ADENOPATIAS and STAGE are dropped here; the one-hot encoding that was
# once applied to them (pd.get_dummies) is currently disabled.
extra_columns = ['ALK', 'MUTACION_ALK', 'PDL-1', 'ROS', 'ADENOPATIAS', 'STAGE',
                 'IV CONTRAST', 'TAMAÑO_BP_mm', 'TAMAÑO_CT_mm']

drop_columns = [*original_drop_columns, *images_columns, *extra_columns]

# Raises KeyError if any listed column is missing, so this cell cannot be
# re-run on an already-reduced table.
sm_radiometrics = sm_radiometrics.drop(drop_columns, axis=1)

In [15]:
# One center-of-mass column per exam type; each cell is a single string in
# "(x, y, z)" format.
td_values = [prefix + 'diagnostics_Mask-original_CenterOfMass' for prefix in exam_types]
print(td_values)

# Three capture groups: the x, y and z components of the "(x, y, z)" string.
coordinate_pattern = r'\(([^,]+), ([^,]+), ([^)]+)\)'

coordinate_frames = []
for column in td_values:
    # str.extract labels the unnamed groups 0, 1, 2 — map them to <column>_x/_y/_z.
    axis_names = {0: f'{column}_x', 1: f'{column}_y', 2: f'{column}_z'}
    coordinates = (
        sm_radiometrics[column]
        .str.extract(coordinate_pattern)
        .rename(columns=axis_names)
        .apply(pd.to_numeric)  # extracted values are strings
    )
    coordinate_frames.append(coordinates)

# Append the numeric coordinate columns (same order as td_values) and remove
# the original string columns.
sm_radiometrics = pd.concat([sm_radiometrics, *coordinate_frames], axis=1).drop(td_values, axis=1)


['body_diagnostics_Mask-original_CenterOfMass', 'pet_diagnostics_Mask-original_CenterOfMass', 'torax3d_diagnostics_Mask-original_CenterOfMass']


In [16]:
# Collect the names of all columns that contain at least one missing value
# (the list is kept in `columns_with_nan`; nothing is printed).
has_missing = sm_radiometrics.isna().any(axis=0)
columns_with_nan = list(sm_radiometrics.columns[has_missing])

# Remove those columns entirely, so the remaining table has no NaN values.
sm_radiometrics = sm_radiometrics.drop(columns_with_nan, axis=1)

In [17]:
# Preview the table after the column drops and the center-of-mass expansion.
sm_radiometrics.head()

Unnamed: 0,SEXO_MASCULINO,EDAD,EGFR,body_diagnostics_Image-original_Maximum,body_diagnostics_Image-original_Mean,body_diagnostics_Image-original_Minimum,body_diagnostics_Mask-original_VolumeNum,body_diagnostics_Mask-original_VoxelNum,body_original_firstorder_10Percentile,body_original_firstorder_90Percentile,...,body_original_shape_Maximum3DDiameter,body_original_shape_MeshVolume,body_original_shape_MinorAxisLength,body_original_shape_Sphericity,body_original_shape_SurfaceArea,body_original_shape_SurfaceVolumeRatio,body_original_shape_VoxelVolume,body_diagnostics_Mask-original_CenterOfMass_x,body_diagnostics_Mask-original_CenterOfMass_y,body_diagnostics_Mask-original_CenterOfMass_z
0,1,70,1,3071.0,-1202.938331,-3023.0,1,22827,1.182854,1.231058,...,61.41487,29498.918155,41.718609,0.562609,8206.310718,0.27819,29659.219826,-86.126472,9.774562,-445.508455
1,0,49,1,2976.0,-807.028785,-1024.0,1,3313,2.130548,2.372647,...,40.317339,9817.039549,26.071662,0.653165,3394.546571,0.345781,9900.213696,-79.187021,185.140809,-679.330969
2,0,43,1,2976.0,-797.993197,-1024.0,1,231,1.565286,2.353336,...,16.004838,778.305287,10.336201,0.738423,554.130684,0.711971,814.29034,38.859199,108.446252,-670.824675
3,0,54,1,2976.0,-780.499167,-1024.0,1,6970,2.089147,2.294552,...,52.592522,22286.466742,29.75824,0.714336,5361.369664,0.240566,22406.078339,34.583872,196.977285,-585.562841
4,0,44,1,2976.0,-749.704437,-1024.0,1,6418,0.798574,2.031847,...,44.837914,19561.026772,30.574122,0.653921,5368.916809,0.27447,19707.90976,55.264782,125.363621,-631.178716


## Entrenamiento de modelos y selección de características

Se sigue la metodología realizada por Hector Henriquez en el trabajo [EGFR mutation prediction using F18-FDG PET-CT based radiomics features in non-small cell lung cancer](https://arxiv.org/pdf/2303.08569.pdf) para evaluar el rendimiento de modelos en las características radiométricas.

Algunas configuraciones importantes:
- Se entrena con un modelo RandomForestClassifier.
- Se entrena y evalúa el rendimiento para KFold con $k=3$.
- Se obtienen los resultados en las métricas *accuracy*, *AUC*, *True Positive Rate*, *False Positive Rate*.

### I. Evaluar el rendimiento del modelo base

In [18]:
def evaluate_model(model, X, y):
    """Score a fitted binary classifier on one evaluation set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature matrix passed to the model.
    y : true labels for ``X``.

    Returns
    -------
    tuple
        ``(accuracy, auc, recall, false_positive_rate)`` where recall is the
        true positive rate. Rows/columns 0 and 1 of the confusion matrix are
        treated as the negative and positive class respectively.
    """
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)

    # AUC is computed from the scores in column 1 of predict_proba.
    positive_scores = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, positive_scores)

    # Confusion matrix layout: rows = true class, columns = predicted class.
    cm = confusion_matrix(y, y_pred)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]

    recall = tp / (tp + fn)               # true positive rate (sensitivity)
    false_positive_rate = fp / (fp + tn)

    return accuracy, auc, recall, false_positive_rate


# The EGFR column is the prediction target; every other remaining column is a feature.
y = sm_radiometrics['EGFR']
X = sm_radiometrics.drop(columns=['EGFR'])

# Rescale every feature with MinMaxScaler so that values are non-negative,
# rebuilding a DataFrame to keep the original column names.
# NOTE(review): the scaler is fitted on the full table before the
# cross-validation split, so test-fold rows contribute to the scaling ranges.
scaler = MinMaxScaler()
feature_names = X.columns
X = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)

# Shared cross-validation splitter: shuffled K-fold with a fixed seed.
k_folds = 3
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

In [19]:
# Baseline: cross-validated performance of a RandomForest on all features.
initial_results = []

for train_idx, test_idx in kf.split(X):
    # A fresh, identically seeded forest is trained for every fold.
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X.iloc[train_idx], y.iloc[train_idx])

    fold_metrics = evaluate_model(rf_model, X.iloc[test_idx], y.iloc[test_idx])
    initial_results.append(fold_metrics)

# Average each metric over the folds
# (order: accuracy, AUC, true positive rate, false positive rate).
initial_average_results = np.mean(initial_results, axis=0)
mean_accuracy, mean_auc, mean_tpr, mean_fpr = initial_average_results

print("Initial Results:")
print(f"Accuracy: {mean_accuracy}")
print(f"AUC: {mean_auc}")
print(f"True Positive Rate: {mean_tpr}")
print(f"False Positive Rate: {mean_fpr}")

Initial Results:
Accuracy: 0.5707070707070707
AUC: 0.4702380952380953
True Positive Rate: 0.16666666666666666
False Positive Rate: 0.18333333333333335


### II. Selección univariada de características
Es un paso de preprocesamiento para el entrenamiento de estimadores.

In [20]:
# Significance level for the chi-squared test between each feature and the target.
P_VALUE_THRESHOLD = 0.05

selected_features_per_fold = []

# Only the training part of each fold is used to score the features.
for train_index, _ in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]

    # k='all' keeps every column: SelectKBest is used here only to obtain the
    # per-feature chi-squared p-values, not to transform the data.
    selector = SelectKBest(chi2, k='all').fit(X_train, y_train)

    # Keep the features that are significantly associated with the target,
    # i.e. p-value BELOW the threshold (the previous `> 0.05` test kept the
    # non-significant ones). NaN p-values compare False and are excluded.
    significant = selector.pvalues_ < P_VALUE_THRESHOLD
    selected_features = X_train.columns[significant]

    # Store selected features for the current fold
    selected_features_per_fold.append(selected_features)

# Merge the selections of all folds (union), listed in the column order of X
# so that the result is deterministic across runs.
selected_union = set().union(*selected_features_per_fold)
all_selected_features = [column for column in X.columns if column in selected_union]

### III. Selección de características con *backward selection*
Se filtran las características menos relevantes de acuerdo al proceso de *backward selection* usando el modelo *RandomForestClassifier* con KFold con *k=3* (el mismo particionado usado en las secciones anteriores).

In [None]:
# Backward feature selection with k-fold cross-validation.
#
# At every step each remaining feature is tentatively removed, a RandomForest
# is cross-validated on the rest, and the feature whose removal yields the
# highest mean accuracy is dropped for good. The loop runs until a single
# feature is left, so the best-scoring subset seen along the way is recorded
# separately in `best_selected_features`.
#
# NOTE(review): the folds that drive the selection are the same ones used to
# report the metrics, so the printed results are optimistic estimates.
# NOTE(review): the cost grows quadratically with the number of features
# (one cross-validated forest per candidate per step).
selected_features = list(X.columns)

# Best subset over all elimination steps. The full feature set itself is not
# scored in this cell, so it only serves as the initial placeholder.
best_selected_features = list(selected_features)
best_overall_accuracy = -np.inf
best_overall_results = None

for _ in range(len(selected_features) - 1):
    # Best candidate of the current step. Starting from -inf guarantees that a
    # feature is always chosen, even if every candidate scores 0 accuracy.
    best_accuracy = -np.inf
    best_results = None
    feature_to_remove = None

    # Try removing each feature and evaluate the model using k-fold cross-validation
    for feature in selected_features:
        current_features = [f for f in selected_features if f != feature]
        X_current = X[current_features]  # slice once per candidate, not per fold
        results_per_fold = []

        for train_index, test_index in kf.split(X):
            rf_model = RandomForestClassifier(random_state=42)
            rf_model.fit(X_current.iloc[train_index], y.iloc[train_index])
            results_per_fold.append(
                evaluate_model(rf_model, X_current.iloc[test_index], y.iloc[test_index])
            )

        # Mean of (accuracy, AUC, TPR, FPR) over the folds; index 0 is accuracy.
        mean_results = np.mean(results_per_fold, axis=0)

        # Strict '>' keeps the first candidate on ties.
        if mean_results[0] > best_accuracy:
            best_accuracy = mean_results[0]
            best_results = mean_results
            feature_to_remove = feature

    # Remove the feature whose absence gave the best cross-validated accuracy
    selected_features.remove(feature_to_remove)
    print(f"Removed feature {feature_to_remove}, Current results: {best_results}")

    # '>=' prefers the smaller subset when two steps reach the same accuracy.
    if best_accuracy >= best_overall_accuracy:
        best_overall_accuracy = best_accuracy
        best_overall_results = best_results
        best_selected_features = list(selected_features)

# Features left after the full elimination (a single feature when the loop completes)
print("Final selected features:", selected_features)

# Subset that achieved the highest cross-validated accuracy during the elimination
print(f"Best subset ({len(best_selected_features)} features), results: {best_overall_results}")
print("Best selected features:", best_selected_features)

Removed feature body_original_firstorder_90Percentile, Current results: [0.6010101  0.55833333 0.21428571 0.175     ]
Removed feature body_diagnostics_Image-original_Maximum, Current results: [0.65656566 0.54742063 0.21428571 0.075     ]
Removed feature body_original_glszm_SizeZoneNonUniformityNormalized, Current results: [0.65909091 0.59861111 0.32539683 0.10833333]
Removed feature body_original_firstorder_Median, Current results: [0.65909091 0.57698413 0.21428571 0.1       ]
Removed feature body_original_firstorder_10Percentile, Current results: [0.65909091 0.54722222 0.32539683 0.10833333]
Removed feature body_original_gldm_DependenceEntropy, Current results: [0.65909091 0.59325397 0.32539683 0.10833333]
Removed feature body_original_firstorder_Range, Current results: [0.63131313 0.49206349 0.21428571 0.13333333]
Removed feature body_original_shape_MinorAxisLength, Current results: [0.65909091 0.58234127 0.37301587 0.175     ]
Removed feature body_diagnostics_Image-original_Mean, Cu

### Preguntas
- Revisar el procesamiento y aplicación del excel (filtros, normalización y otros) para la configuración del extractor - comparar con el paper de Hector.
- Notar el problema del filtro *wavelet* que aparece *killed*.
- Consultar si, para las imágenes PET de los resultados del paper, se realiza la normalización con el PET del hígado (*liver*).
- Se tienen que filtrar las columnas de torax3d porque para algunos pacientes no está aquella información.

### Falta
- Implementar para todos los filtros posibles.
- Normalizar imágenes PET (creo).
- Replicar la evaluación del paper: «hyperparameter search was performed with gridsearch and the performance metrics were calculated with 100 repetitions of 5-fold cross-validation».
- Implementar lo anterior para que sea entrenado y validado en Stanford, para luego testear en Santa María.