# Evaluación de Clasificadores con CTGAN
Este notebook contiene los pasos para balancear un dataset desbalanceado utilizando CTGAN, optimizando sus hiperparámetros con Ray Tune y evaluando clasificadores en los datos generados.

## 1. Carga de Datos y Configuración Inicial
Se cargan los datos de entrenamiento y prueba y se configuran las columnas categóricas para su uso en CTGAN.

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Índices de las columnas categóricas: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [None]:
# Cargar los conjuntos de entrenamiento y prueba
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

In [None]:
# Columnas categóricas 
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'T', 'N', 'M', 'Stage', 'Response']

In [None]:
# Indices de columnas categóricas en el conjunto de entrenamiento
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_columns]
print(f"Índices de las columnas categóricas: {categorical_indices}")

## 2. Optimización de Hiperparámetros para CTGAN con Ray Tune
Se utilizan diferentes combinaciones de hiperparámetros para optimizar el rendimiento de CTGAN usando AUC-ROC como métrica objetivo.

In [None]:
import pandas as pd
from ctgan import CTGAN
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from ray import tune
import ray
import time

In [None]:
# Inicializar Ray
ray.init(ignore_reinit_error=True)

In [None]:
# Añadir la variable de clase al conjunto de entrenamiento para que CTGAN pueda condicionarse a ella
X_train['target'] = y_train
X_test['target'] = y_test

In [None]:
# Columnas categóricas
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'T', 'N', 'M', 'Stage', 'Response', 'target']  # Incluimos 'target' como categórica

In [None]:
# Definir la función de entrenamiento para CTGAN
def train_ctgan(config):
    start_time = time.time()
    
    # Inicializar CTGAN con los hiperparámetros del config
    ctgan = CTGAN(
        epochs=config["ctgan__epochs"],
        batch_size=config["ctgan__batch_size"],
        discriminator_steps=config["ctgan__discriminator_steps"],
        generator_dim=config["ctgan__generator_dim"],
        discriminator_dim=config["ctgan__discriminator_dim"],
        pac=config["ctgan__pac"]
    )

    # Entrenar CTGAN con todo el conjunto de entrenamiento
    ctgan.fit(X_train, categorical_columns)

    # Identificar la clase mayoritaria y minoritaria
    majority_class = y_train.value_counts().idxmax()
    minority_class = y_train.value_counts().idxmin()

    # Calcular cuántas muestras generar para equilibrar las clases
    samples_to_generate = y_train.value_counts()[majority_class] - y_train.value_counts()[minority_class]

    # Generar datos sintéticos condicionados a la clase minoritaria
    if samples_to_generate > 0:
        # Generar datos solo de la clase minoritaria (target = minority_class)
        synthetic_data = ctgan.sample(
            samples_to_generate, 
            condition_column='target', 
            condition_value=minority_class
        )
        
        # Quitar la columna 'target' de los datos sintéticos generados
        synthetic_data = synthetic_data.drop(columns=['target'])
        y_synthetic = [minority_class] * samples_to_generate
        
        # Combinar los datos sintéticos con los originales
        X_balanced = pd.concat([X_train.drop(columns=['target']), synthetic_data], ignore_index=True)
        y_balanced = pd.concat([y_train, pd.Series(y_synthetic)], ignore_index=True)
    else:
        X_balanced = X_train.drop(columns=['target'])
        y_balanced = y_train

    # Entrenar y evaluar Logistic Regression
    log_reg = LogisticRegression(max_iter=1000, random_state=42)
    log_reg.fit(X_balanced, y_balanced)
    
    # Predicción en el conjunto de prueba
    y_pred_prob_lr = log_reg.predict_proba(X_test.drop(columns=['target']))[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_prob_lr)
    
    # Calcular el tiempo de ejecución
    elapsed_time = time.time() - start_time
    
    return {"auc_roc": auc_roc, "elapsed_time": elapsed_time}

In [None]:
# Definir los hiperparámetros a explorar con Ray Tune
param_grid_ctgan = {
    "ctgan__epochs": tune.grid_search([100, 200]),
    "ctgan__batch_size": tune.grid_search([500, 1000, 2000]),
    "ctgan__discriminator_steps": tune.grid_search([1, 2]),
    "ctgan__generator_dim": tune.grid_search([(128, 128), (256, 256), (512, 256)]),
    "ctgan__discriminator_dim": tune.grid_search([(128, 128), (256, 256), (512, 256)]),
    "ctgan__pac": tune.grid_search([1, 5, 10])
}

In [None]:
# Ejecutar la búsqueda con Ray Tune
analysis = tune.run(
    train_ctgan,
    config=param_grid_ctgan,
    resources_per_trial={"cpu": 4, "gpu": 0},  # Ajusta según tus recursos; si no tienes GPU, pon 0
    metric="auc_roc",  # Especifica la métrica a optimizar
    mode="max",        # Especifica que quieres maximizar la métrica
    verbose=1
)

In [9]:
# Obtener los mejores resultados
best_result = analysis.get_best_config(metric="auc_roc", mode="max")
best_auc_roc = analysis.best_result["auc_roc"]
print(f"Mejores hiperparámetros para CTGAN: {best_result}")
print(f"Best AUC-ROC: {best_auc_roc:.4f}")

0,1
Current time:,2024-11-12 21:11:24
Running for:,00:11:06.10
Memory:,38.6/125.6 GiB

Trial name,status,loc,ctgan__batch_size,ctgan__discriminator _dim,ctgan__discriminator _steps,ctgan__epochs,ctgan__generator_dim,ctgan__pac,iter,total time (s),auc_roc,elapsed_time
train_ctgan_085ce_00000,TERMINATED,172.22.2.44:347189,500,"(128, 128)",1,100,"(128, 128)",1,1,5.70563,0.967769,5.70256
train_ctgan_085ce_00001,TERMINATED,172.22.2.44:347190,1000,"(128, 128)",1,100,"(128, 128)",1,1,9.6902,0.967769,9.6869
train_ctgan_085ce_00002,TERMINATED,172.22.2.44:347191,2000,"(128, 128)",1,100,"(128, 128)",1,1,14.0613,0.963636,14.0577
train_ctgan_085ce_00003,TERMINATED,172.22.2.44:347192,500,"(256, 256)",1,100,"(128, 128)",1,1,7.05253,0.971074,7.04858
train_ctgan_085ce_00004,TERMINATED,172.22.2.44:347194,1000,"(256, 256)",1,100,"(128, 128)",1,1,10.414,0.952893,10.4113
train_ctgan_085ce_00005,TERMINATED,172.22.2.44:347193,2000,"(256, 256)",1,100,"(128, 128)",1,1,17.1448,0.97438,17.1418
train_ctgan_085ce_00006,TERMINATED,172.22.2.44:347195,500,"(512, 256)",1,100,"(128, 128)",1,1,8.92673,0.970248,8.92179
train_ctgan_085ce_00007,TERMINATED,172.22.2.44:347196,1000,"(512, 256)",1,100,"(128, 128)",1,1,14.1739,0.969421,14.1705
train_ctgan_085ce_00008,TERMINATED,172.22.2.44:347197,2000,"(512, 256)",1,100,"(128, 128)",1,1,20.7143,0.979339,20.7111
train_ctgan_085ce_00009,TERMINATED,172.22.2.44:347198,500,"(128, 128)",2,100,"(128, 128)",1,1,8.92746,0.971074,8.92447


2024-11-12 21:00:27,666	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'ctgan__discriminator_dim': (128, 128), 'ctgan__generator_dim': (128, 128)}
2024-11-12 21:00:28,923	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'ctgan__discriminator_dim': (256, 256), 'ctgan__generator_dim': (128, 128)}
2024-11-12 21:00:30,940	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'ctgan__discriminator_dim': (128, 128), 'ctgan__generator_dim': (128, 128)}
2024-11-12 21:00:31,036	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'ctgan__discriminator_dim': (512, 256), 'ctgan__generator_dim': (128, 128)}
2024-11-12 21:00:31,742	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'ctgan__discriminator_dim': (128, 128), 'ctgan__generator_dim': (1

Mejores hiperparámetros para CTGAN: {'ctgan__epochs': 200, 'ctgan__batch_size': 1000, 'ctgan__discriminator_steps': 2, 'ctgan__generator_dim': (512, 256), 'ctgan__discriminator_dim': (256, 256), 'ctgan__pac': 10}
Best AUC-ROC: 0.9826


## 3. Generación de Datos Sintéticos con CTGAN
Usamos los mejores hiperparámetros para generar datos balanceados y combinarlos con los datos originales.

In [None]:
import pandas as pd
from ctgan import CTGAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, confusion_matrix, matthews_corrcoef

In [None]:
# Columnas categóricas
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'T', 'N', 'M', 'Stage', 'Response']

In [None]:
# 1. Generar datos sintéticos usando los mejores hiperparámetros de CTGAN
best_ctgan = CTGAN(
    epochs=best_result['ctgan__epochs'], 
    batch_size=best_result['ctgan__batch_size'], 
    discriminator_steps=best_result['ctgan__discriminator_steps'],
    generator_dim=best_result['ctgan__generator_dim'], 
    discriminator_dim=best_result['ctgan__discriminator_dim'], 
    pac=best_result['ctgan__pac']
)

In [None]:
# Entrenar CTGAN con el conjunto de datos completo
best_ctgan.fit(X_train, categorical_columns)

# Identificar la clase mayoritaria y minoritaria
y_train_series = pd.Series(y_train)
majority_class = y_train_series.value_counts().idxmax()
minority_class = y_train_series.value_counts().idxmin()

# Generar datos sintéticos solo para la clase minoritaria
samples_to_generate = y_train_series.value_counts()[majority_class] - y_train_series.value_counts()[minority_class]
X_synthetic = best_ctgan.sample(samples_to_generate)

# Crear etiquetas para los datos sintéticos
y_synthetic = [minority_class] * samples_to_generate

# Combinar los datos originales con los datos sintéticos
X_balanced = pd.concat([X_train, X_synthetic], ignore_index=True)
y_balanced = pd.concat([pd.Series(y_train), pd.Series(y_synthetic)], ignore_index=True)

## 4. Evaluación de Clasificadores en Datos Balanceados
Entrenamos clasificadores en los datos generados y evaluamos su rendimiento en el conjunto de prueba original.

In [None]:
# Definir los clasificadores que se van a evaluar
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42, max_iter=1000),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

In [None]:
# Lista para guardar los resultados
results = []

In [None]:
# Entrenar y evaluar cada clasificador en los datos generados
for name, clf in classifiers.items():
    clf.fit(X_balanced, y_balanced)  # Entrenar el clasificador en los datos balanceados
    
    # Predicciones en el conjunto de prueba original
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)[:, 1]  # Obtener probabilidades para AUC-ROC
    
    # Calcular las métricas
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred_prob)
    
    # Specificity (Verdaderos Negativos / Total de Negativos)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    # MCC (Matthews Correlation Coefficient)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    # Guardar los resultados en un diccionario
    results.append({
        'Modelo': name,
        'Accuracy': accuracy,
        'Recall': recall,
        'F1-Score': f1,
        'AUC-ROC': auc_roc,
        'Specificity': specificity,
        'MCC': mcc
    })
    
    # Imprimir las métricas
    print(f"{name} - Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}, Specificity: {specificity:.4f}, MCC: {mcc:.4f}")

In [10]:
# Convertir los resultados en un DataFrame para guardarlos
results_df = pd.DataFrame(results)

# Guardar los resultados en un archivo CSV
results_df.to_csv('resultados_clasificadores_ctgan.csv', index=False)

print("\nResultados guardados exitosamente en 'resultados_clasificadores_ctgan.csv'")

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Random Forest - Accuracy: 0.9610, Recall: 0.9091, F1-Score: 0.9302, AUC-ROC: 0.9917, Specificity: 0.9818, MCC: 0.9037
MLP - Accuracy: 0.9351, Recall: 0.9091, F1-Score: 0.8889, AUC-ROC: 0.9876, Specificity: 0.9455, MCC: 0.8435
Logistic Regression - Accuracy: 0.8961, Recall: 0.9545, F1-Score: 0.8400, AUC-ROC: 0.9620, Specificity: 0.8727, MCC: 0.7769

Resultados guardados exitosamente en 'resultados_clasificadores_ctgan.csv'
