## OBTENCION DE VARIABLES

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Hiperparámetros para VAE (diccionario, optimización manual)
param_grid_vae = {
    'vae__latent_dim': [8, 16, 32],
    'vae__epochs': [50, 100, 150],
    'vae__batch_size': [128, 256, 512]
}

# Cargar los conjuntos de entrenamiento y prueba
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

print("Conjuntos de datos cargados y variable objetivo codificada exitosamente.")

# Columnas categóricas 
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'T', 'N', 'M', 'Stage', 'Response']

# Indices de columnas categóricas en el conjunto de entrenamiento
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_columns]
print(f"Índices de las columnas categóricas: {categorical_indices}")

Conjuntos de datos cargados y variable objetivo codificada exitosamente.
Índices de las columnas categóricas: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


## VAE

In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras import layers, models, backend as K
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from ray import tune
import ray
import time
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Inicializar Ray
ray.init(ignore_reinit_error=True)

# Cargar los conjuntos de entrenamiento y prueba
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Normalizar las características numéricas
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Definir la arquitectura del Conditional VAE
def sampling(args):
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

class ConditionalVAE(models.Model):
    def __init__(self, input_shape, latent_dim):
        super(ConditionalVAE, self).__init__()
        self.latent_dim = latent_dim

        # Encoder
        self.encoder_dense = layers.Dense(64, activation='relu')
        self.z_mean = layers.Dense(latent_dim)
        self.z_log_var = layers.Dense(latent_dim)
        self.sampling_layer = layers.Lambda(sampling)

        # Decoder
        self.decoder_dense1 = layers.Dense(64, activation='relu')
        self.decoder_dense2 = layers.Dense(input_shape - 2, activation='sigmoid')

    def call(self, inputs):
        # Encoder
        x = self.encoder_dense(inputs)
        z_mean = self.z_mean(x)
        z_log_var = self.z_log_var(x)
        z = self.sampling_layer([z_mean, z_log_var])

        # Decoder (incluye la etiqueta)
        z_conditional = tf.concat([z, inputs[:, -2:]], axis=1)  # Añadir etiqueta al vector latente
        x_decoded = self.decoder_dense1(z_conditional)
        outputs = self.decoder_dense2(x_decoded)  # Solo reconstruir características

        return outputs, z_mean, z_log_var

    def train_step(self, data):
        inputs, target = data
        inputs = tf.concat([inputs, target], axis=1)  # Concatenar características y etiquetas

        with tf.GradientTape() as tape:
            outputs, z_mean, z_log_var = self(inputs, training=True)
            reconstruction_loss = tf.reduce_mean(
                tf.keras.losses.mse(inputs[:, :-2], outputs)  # Reconstruir solo características
            )
            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return {"loss": total_loss, "reconstruction_loss": reconstruction_loss, "kl_loss": kl_loss}

# Definir la función de entrenamiento para el VAE
def train_vae(config):
    start_time = time.time()

    # Concatenar etiquetas con las características de entrenamiento
    y_train_one_hot = tf.keras.utils.to_categorical(y_train)
    X_train_with_target = np.concatenate([X_train_scaled, y_train_one_hot], axis=1)

    # Crear y entrenar el VAE
    vae = ConditionalVAE(input_shape=X_train_with_target.shape[1], latent_dim=config["vae__latent_dim"])
    vae.compile(optimizer='adam')
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train_one_hot)).batch(config["vae__batch_size"])
    vae.fit(train_dataset, epochs=config["vae__epochs"], verbose=0)

    # Generar datos sintéticos para la clase minoritaria
    majority_class = y_train.value_counts().idxmax()
    minority_class = y_train.value_counts().idxmin()
    samples_to_generate = y_train.value_counts()[majority_class] - y_train.value_counts()[minority_class]

    if samples_to_generate > 0:
        minority_target = np.array([[0, 1]] * samples_to_generate)  # One-hot para clase minoritaria
        synthetic_inputs = np.concatenate([X_train_scaled[:samples_to_generate], minority_target], axis=1)
        synthetic_data = vae(synthetic_inputs)[0].numpy()
        synthetic_data = scaler.inverse_transform(synthetic_data)  # Volver a escala original
        y_synthetic = [minority_class] * samples_to_generate

        # Combinar datos originales y sintéticos
        X_balanced = np.vstack([X_train, synthetic_data])
        y_balanced = np.hstack([y_train, y_synthetic])
    else:
        X_balanced = X_train
        y_balanced = y_train

    # Entrenar y evaluar Logistic Regression con datos balanceados
    log_reg = LogisticRegression(max_iter=1000, random_state=42)
    log_reg.fit(X_balanced, y_balanced)

    # Evaluar en el conjunto de prueba original
    y_pred_prob_lr = log_reg.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_prob_lr)

    elapsed_time = time.time() - start_time

    return {"auc_roc": auc_roc, "elapsed_time": elapsed_time}

# Definir los hiperparámetros a explorar con Ray Tune
param_grid_vae = {
    "vae__latent_dim": tune.grid_search([8, 16, 32, 64]),
    "vae__epochs": tune.grid_search([50, 100, 150]),
    "vae__batch_size": tune.grid_search([128, 256, 512])
}

# Ejecutar la búsqueda con Ray Tune
analysis = tune.run(
    train_vae,
    config=param_grid_vae,
    resources_per_trial={"cpu": 4, "gpu": 0.5},  # Ajusta según tus recursos
    verbose=1
)

# Obtener los mejores hiperparámetros y el mejor AUC-ROC
best_result = analysis.get_best_config(metric="auc_roc", mode="max")
best_aucroc = analysis.get_best_trial(metric="auc_roc", mode="max").last_result["auc_roc"]

# Imprimir los mejores resultados
print(f"Mejores hiperparámetros para VAE: {best_result}")
print(f"Mejor AUC-ROC obtenido: {best_aucroc:.4f}")


0,1
Current time:,2024-11-14 01:48:40
Running for:,00:00:41.52
Memory:,15.7/125.6 GiB

Trial name,status,loc,vae__batch_size,vae__epochs,vae__latent_dim,iter,total time (s),auc_roc,elapsed_time
train_vae_62fe4_00000,TERMINATED,172.22.2.44:2075186,128,50,8,1,1.21642,0.952893,1.21614
train_vae_62fe4_00001,TERMINATED,172.22.2.44:2075187,256,50,8,1,1.19985,0.947934,1.19959
train_vae_62fe4_00002,TERMINATED,172.22.2.44:2075188,512,50,8,1,1.15361,0.947934,1.15332
train_vae_62fe4_00003,TERMINATED,172.22.2.44:2075189,128,100,8,1,1.52479,0.96281,1.52456
train_vae_62fe4_00004,TERMINATED,172.22.2.44:2092572,256,100,8,1,1.58133,0.952893,1.5809
train_vae_62fe4_00005,TERMINATED,172.22.2.44:2092573,512,100,8,1,1.5006,0.949587,1.50036
train_vae_62fe4_00006,TERMINATED,172.22.2.44:2092574,128,150,8,1,1.91844,0.963636,1.91813
train_vae_62fe4_00007,TERMINATED,172.22.2.44:2092575,256,150,8,1,1.86303,0.961157,1.86278
train_vae_62fe4_00008,TERMINATED,172.22.2.44:2123764,512,150,8,1,1.72641,0.959504,1.72607
train_vae_62fe4_00009,TERMINATED,172.22.2.44:2123840,128,50,16,1,1.18961,0.952066,1.18936


[36m(pid=2075186)[0m 2024-11-14 01:48:00.533668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(pid=2075186)[0m 2024-11-14 01:48:00.546160: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(pid=2075186)[0m 2024-11-14 01:48:00.549953: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[36m(pid=2075186)[0m 2024-11-14 01:48:00.559731: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(pid=2075186)[0m To enable the following instructions: AVX2 FMA, in other operat

Mejores hiperparámetros para VAE: {'vae__latent_dim': 8, 'vae__epochs': 150, 'vae__batch_size': 128}
Mejor AUC-ROC obtenido: 0.9636




In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, confusion_matrix, matthews_corrcoef
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K

# Cargar los conjuntos de entrenamiento y prueba
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Normalizar las características numéricas
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Crear representación de los targets como one-hot
y_train_one_hot = tf.keras.utils.to_categorical(y_train)
y_test_one_hot = tf.keras.utils.to_categorical(y_test)

# Definir el Conditional VAE
class ConditionalVAE(models.Model):
    def __init__(self, input_dim, latent_dim):
        super(ConditionalVAE, self).__init__()
        self.latent_dim = latent_dim

        # Encoder
        self.encoder_dense = layers.Dense(128, activation='relu')
        self.z_mean = layers.Dense(latent_dim)
        self.z_log_var = layers.Dense(latent_dim)
        self.sampling_layer = layers.Lambda(self.sampling)

        # Decoder
        self.decoder_dense1 = layers.Dense(128, activation='relu')
        self.decoder_dense2 = layers.Dense(input_dim, activation='sigmoid')

    def call(self, inputs):
        data, target = inputs
        target = tf.cast(target, dtype=tf.float32)  # Asegurarse de que `target` sea float32
        x = tf.concat([data, target], axis=1)
        x = self.encoder_dense(x)
        z_mean = self.z_mean(x)
        z_log_var = self.z_log_var(x)
        z = self.sampling_layer([z_mean, z_log_var])

        z_conditional = tf.concat([z, target], axis=1)
        x_decoded = self.decoder_dense1(z_conditional)
        outputs = self.decoder_dense2(x_decoded)
        return outputs, z_mean, z_log_var

    def sampling(self, args):
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon

    def train_step(self, data):
        inputs, target = data
        with tf.GradientTape() as tape:
            outputs, z_mean, z_log_var = self([inputs, target], training=True)
            reconstruction_loss = tf.reduce_mean(
                tf.keras.losses.mse(inputs, outputs)
            )
            kl_loss = -0.5 * tf.reduce_mean(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            )
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {"loss": total_loss, "reconstruction_loss": reconstruction_loss, "kl_loss": kl_loss}

# Configuración del Conditional VAE
input_dim = X_train_scaled.shape[1]
latent_dim = 16                                                                                        # aca cambiar
vae = ConditionalVAE(input_dim=input_dim, latent_dim=latent_dim)
vae.compile(optimizer='adam')

# Crear un dataset de TensorFlow para manejar las entradas como tuplas
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train_one_hot)).batch(128)       # aca cambiar

# Entrenar el modelo
vae.fit(
    train_dataset,
    epochs=100,                         # aca cambiar
    verbose=1
)

# Generar datos sintéticos para balancear las clases
majority_class = y_train.value_counts().idxmax()
minority_class = y_train.value_counts().idxmin()
samples_to_generate = y_train.value_counts()[majority_class] - y_train.value_counts()[minority_class]

if samples_to_generate > 0:
    minority_target = np.array([[0, 1]] * samples_to_generate)  # One-hot para clase minoritaria
    synthetic_data = vae([X_train_scaled[:samples_to_generate], minority_target])[0].numpy()
    synthetic_data = scaler.inverse_transform(synthetic_data)  # Volver a la escala original
    y_synthetic = [minority_class] * samples_to_generate

    # Combinar los datos originales con los sintéticos
    X_balanced = np.vstack([X_train, synthetic_data])  # Usar características originales
    y_balanced = np.hstack([y_train, y_synthetic])
else:
    X_balanced = X_train
    y_balanced = y_train

# Transformar el conjunto de prueba al formato original
X_test_original = X_test

# Clasificadores
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42, max_iter=1000),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

# Evaluar los clasificadores
results = []
for name, clf in classifiers.items():
    clf.fit(X_balanced, y_balanced)  # Entrenar con datos originales balanceados
    y_pred = clf.predict(X_test_original)  # Predecir usando características originales
    y_pred_prob = clf.predict_proba(X_test_original)[:, 1]  # Obtener probabilidades para AUC-ROC

    # Calcular métricas
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred_prob)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append({
        'Modelo': name,
        'Accuracy': accuracy,
        'Recall': recall,
        'F1-Score': f1,
        'AUC-ROC': auc_roc,
        'Specificity': specificity,
        'MCC': mcc
    })

    print(f"{name} - Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}, Specificity: {specificity:.4f}, MCC: {mcc:.4f}")

# Guardar resultados
results_df = pd.DataFrame(results)
results_df.to_csv('resultados_clasificadores_vae.csv', index=False)
print("\nResultados guardados exitosamente en 'resultados_clasificadores_vae.csv'")


Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - kl_loss: 0.1529 - loss: 1.3866 - reconstruction_loss: 1.2337  
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - kl_loss: 0.1139 - loss: 1.3261 - reconstruction_loss: 1.2123
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - kl_loss: 0.0877 - loss: 1.2757 - reconstruction_loss: 1.1879
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - kl_loss: 0.0702 - loss: 1.2380 - reconstruction_loss: 1.1677
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - kl_loss: 0.0585 - loss: 1.2060 - reconstruction_loss: 1.1475
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - kl_loss: 0.0505 - loss: 1.1872 - reconstruction_loss: 1.1368
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - kl_loss: 0.0446 - loss: 1.1669 - reco



MLP - Accuracy: 0.8831, Recall: 0.9091, F1-Score: 0.8163, AUC-ROC: 0.9653, Specificity: 0.8727, MCC: 0.7402
Logistic Regression - Accuracy: 0.8961, Recall: 0.9545, F1-Score: 0.8400, AUC-ROC: 0.9512, Specificity: 0.8727, MCC: 0.7769

Resultados guardados exitosamente en 'resultados_clasificadores_vae.csv'


