## OBTENCION DE VARIABLES

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Cargar los conjuntos de entrenamiento y prueba
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Columnas categóricas 
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'T', 'N', 'M', 'Stage', 'Response']

# Indices de columnas categóricas en el conjunto de entrenamiento
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_columns]
print(f"Índices de las columnas categóricas: {categorical_indices}")

Índices de las columnas categóricas: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


## CT-GAN

In [5]:
ray.shutdown()

In [7]:
import pandas as pd
from ctgan import CTGAN
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from ray import tune
import ray
import time

# Inicializar Ray
ray.init(ignore_reinit_error=True)

# Cargar los conjuntos de entrenamiento y prueba
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Añadir la variable de clase al conjunto de entrenamiento para que CTGAN pueda condicionarse a ella
X_train['target'] = y_train
X_test['target'] = y_test

# Columnas categóricas
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'T', 'N', 'M', 'Stage', 'Response', 'target']  # Incluimos 'target' como categórica

# Definir la función de entrenamiento para CTGAN
def train_ctgan(config):
    start_time = time.time()
    
    # Inicializar CTGAN con los hiperparámetros del config
    ctgan = CTGAN(
        epochs=config["ctgan__epochs"],
        batch_size=config["ctgan__batch_size"],
        discriminator_steps=config["ctgan__discriminator_steps"],
        generator_dim=config["ctgan__generator_dim"],
        discriminator_dim=config["ctgan__discriminator_dim"],
        pac=config["ctgan__pac"]
    )

    # Entrenar CTGAN con todo el conjunto de entrenamiento
    ctgan.fit(X_train, categorical_columns)

    # Identificar la clase mayoritaria y minoritaria
    majority_class = y_train.value_counts().idxmax()
    minority_class = y_train.value_counts().idxmin()

    # Calcular cuántas muestras generar para equilibrar las clases
    samples_to_generate = y_train.value_counts()[majority_class] - y_train.value_counts()[minority_class]

    # Generar datos sintéticos condicionados a la clase minoritaria
    if samples_to_generate > 0:
        # Generar datos solo de la clase minoritaria (target = minority_class)
        synthetic_data = ctgan.sample(
            samples_to_generate, 
            condition_column='target', 
            condition_value=minority_class
        )
        
        # Quitar la columna 'target' de los datos sintéticos generados
        synthetic_data = synthetic_data.drop(columns=['target'])
        y_synthetic = [minority_class] * samples_to_generate
        
        # Combinar los datos sintéticos con los originales
        X_balanced = pd.concat([X_train.drop(columns=['target']), synthetic_data], ignore_index=True)
        y_balanced = pd.concat([y_train, pd.Series(y_synthetic)], ignore_index=True)
    else:
        X_balanced = X_train.drop(columns=['target'])
        y_balanced = y_train

    # Entrenar y evaluar Logistic Regression
    log_reg = LogisticRegression(max_iter=1000, random_state=42)
    log_reg.fit(X_balanced, y_balanced)
    
    # Predicción en el conjunto de prueba
    y_pred_prob_lr = log_reg.predict_proba(X_test.drop(columns=['target']))[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_prob_lr)
    
    # Calcular el tiempo de ejecución
    elapsed_time = time.time() - start_time
    
    return {"auc_roc": auc_roc, "elapsed_time": elapsed_time}

# Definir los hiperparámetros a explorar con Ray Tune
param_grid_ctgan = {
    "ctgan__epochs": tune.grid_search([100, 200]),
    "ctgan__batch_size": tune.grid_search([500, 1000, 2000]),
    "ctgan__discriminator_steps": tune.grid_search([1, 2]),
    "ctgan__generator_dim": tune.grid_search([(128, 128), (256, 256), (512, 256)]),
    "ctgan__discriminator_dim": tune.grid_search([(128, 128), (256, 256), (512, 256)]),
    "ctgan__pac": tune.grid_search([1, 5, 10])
}

# Ejecutar la búsqueda con Ray Tune
analysis = tune.run(
    train_ctgan,
    config=param_grid_ctgan,
    resources_per_trial={"cpu": 4, "gpu": 0.5},  # Ajusta según tus recursos; si no tienes GPU, pon 0
    metric="auc_roc",  # Especifica la métrica a optimizar
    mode="max",        # Especifica que quieres maximizar la métrica
    verbose=1
)

# Obtener los mejores resultados
best_result = analysis.get_best_config(metric="auc_roc", mode="max")
best_auc_roc = analysis.best_result["auc_roc"]
print(f"Mejores hiperparámetros para CTGAN: {best_result}")
print(f"Best AUC-ROC: {best_auc_roc:.4f}")


0,1
Current time:,2024-11-12 20:26:04
Running for:,00:00:11.96
Memory:,60.5/125.6 GiB

Trial name,# failures,error file
train_ctgan_38aca_00000,1,"/tmp/ray/session_2024-11-12_20-25-35_541712_291780/artifacts/2024-11-12_20-25-52/train_ctgan_2024-11-12_20-25-52/driver_artifacts/train_ctgan_38aca_00000_0_ctgan__batch_size=500,ctgan__discriminator_dim=128_128,ctgan__discriminator_steps=1,ctgan__epochs=100,ct_2024-11-12_20-25-52/error.txt"
train_ctgan_38aca_00001,1,"/tmp/ray/session_2024-11-12_20-25-35_541712_291780/artifacts/2024-11-12_20-25-52/train_ctgan_2024-11-12_20-25-52/driver_artifacts/train_ctgan_38aca_00001_1_ctgan__batch_size=500,ctgan__discriminator_dim=128_128,ctgan__discriminator_steps=2,ctgan__epochs=100,ct_2024-11-12_20-25-52/error.txt"
train_ctgan_38aca_00002,1,"/tmp/ray/session_2024-11-12_20-25-35_541712_291780/artifacts/2024-11-12_20-25-52/train_ctgan_2024-11-12_20-25-52/driver_artifacts/train_ctgan_38aca_00002_2_ctgan__batch_size=500,ctgan__discriminator_dim=128_128,ctgan__discriminator_steps=1,ctgan__epochs=200,ct_2024-11-12_20-25-52/error.txt"
train_ctgan_38aca_00003,1,"/tmp/ray/session_2024-11-12_20-25-35_541712_291780/artifacts/2024-11-12_20-25-52/train_ctgan_2024-11-12_20-25-52/driver_artifacts/train_ctgan_38aca_00003_3_ctgan__batch_size=500,ctgan__discriminator_dim=128_128,ctgan__discriminator_steps=2,ctgan__epochs=200,ct_2024-11-12_20-25-52/error.txt"

Trial name,status,loc,ctgan__batch_size,ctgan__discriminator _dim,ctgan__discriminator _steps,ctgan__epochs,ctgan__generator_dim,ctgan__pac
train_ctgan_38aca_00000,ERROR,172.22.2.44:333665,500,"(128, 128)",1,100,"(128, 128)",1
train_ctgan_38aca_00001,ERROR,172.22.2.44:333765,500,"(128, 128)",2,100,"(128, 128)",1
train_ctgan_38aca_00002,ERROR,172.22.2.44:333864,500,"(128, 128)",1,200,"(128, 128)",1
train_ctgan_38aca_00003,ERROR,172.22.2.44:333964,500,"(128, 128)",2,200,"(128, 128)",1


2024-11-12 20:25:54,706	ERROR tune_controller.py:1331 -- Trial task failed for trial train_ctgan_38aca_00000
Traceback (most recent call last):
  File "/home/nsaboya/.local/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/nsaboya/.local/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/nsaboya/.local/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/nsaboya/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 2691, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/nsaboya/.local/lib/python3.10/site-packages/ray/_private/worker.py", line 871, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): [36mray::ImplicitFunc

TuneError: ('Trials did not complete', [train_ctgan_38aca_00000, train_ctgan_38aca_00001, train_ctgan_38aca_00002, train_ctgan_38aca_00003])

In [9]:
import pandas as pd
from ctgan import CTGAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, confusion_matrix, matthews_corrcoef

# Cargar los conjuntos de entrenamiento y prueba
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Columnas categóricas
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'T', 'N', 'M', 'Stage', 'Response']

# 1. Generar datos sintéticos usando los mejores hiperparámetros de CTGAN
best_ctgan = CTGAN(
    epochs=best_result['ctgan__epochs'], 
    batch_size=best_result['ctgan__batch_size'], 
    discriminator_steps=best_result['ctgan__discriminator_steps'],
    generator_dim=best_result['ctgan__generator_dim'], 
    discriminator_dim=best_result['ctgan__discriminator_dim'], 
    pac=best_result['ctgan__pac']
)

# Entrenar CTGAN con el conjunto de datos completo
best_ctgan.fit(X_train, categorical_columns)

# Identificar la clase mayoritaria y minoritaria
y_train_series = pd.Series(y_train)
majority_class = y_train_series.value_counts().idxmax()
minority_class = y_train_series.value_counts().idxmin()

# Generar datos sintéticos solo para la clase minoritaria
samples_to_generate = y_train_series.value_counts()[majority_class] - y_train_series.value_counts()[minority_class]
X_synthetic = best_ctgan.sample(samples_to_generate)

# Crear etiquetas para los datos sintéticos
y_synthetic = [minority_class] * samples_to_generate

# Combinar los datos originales con los datos sintéticos
X_balanced = pd.concat([X_train, X_synthetic], ignore_index=True)
y_balanced = pd.concat([pd.Series(y_train), pd.Series(y_synthetic)], ignore_index=True)

# 2. Definir los clasificadores que se van a evaluar
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42, max_iter=1000),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

# Lista para guardar los resultados
results = []

# 3. Entrenar y evaluar cada clasificador en los datos generados
for name, clf in classifiers.items():
    clf.fit(X_balanced, y_balanced)  # Entrenar el clasificador en los datos balanceados
    
    # Predicciones en el conjunto de prueba original
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)[:, 1]  # Obtener probabilidades para AUC-ROC
    
    # Calcular las métricas
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred_prob)
    
    # Specificity (Verdaderos Negativos / Total de Negativos)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    # MCC (Matthews Correlation Coefficient)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    # Guardar los resultados en un diccionario
    results.append({
        'Modelo': name,
        'Accuracy': accuracy,
        'Recall': recall,
        'F1-Score': f1,
        'AUC-ROC': auc_roc,
        'Specificity': specificity,
        'MCC': mcc
    })
    
    # Imprimir las métricas
    print(f"{name} - Accuracy: {accuracy:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}, Specificity: {specificity:.4f}, MCC: {mcc:.4f}")

# 4. Convertir los resultados en un DataFrame para guardarlos
results_df = pd.DataFrame(results)

# Guardar los resultados en un archivo CSV
results_df.to_csv('resultados_clasificadores_ctgan.csv', index=False)

print("\nResultados guardados exitosamente en 'resultados_clasificadores_ctgan.csv'")


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Random Forest - Accuracy: 0.9481, Recall: 0.9091, F1-Score: 0.9091, AUC-ROC: 0.9872, Specificity: 0.9636, MCC: 0.8727
MLP - Accuracy: 0.9091, Recall: 0.9545, F1-Score: 0.8571, AUC-ROC: 0.9719, Specificity: 0.8909, MCC: 0.8004
Logistic Regression - Accuracy: 0.8442, Recall: 0.8636, F1-Score: 0.7600, AUC-ROC: 0.9306, Specificity: 0.8364, MCC: 0.6574

Resultados guardados exitosamente en 'resultados_clasificadores_ctgan.csv'
