In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from sklearn.metrics import roc_auc_score
from tensorflow.keras.callbacks import EarlyStopping
import random
#import keras_tuner as kt


### Cargue de los datos

In [2]:
# Load the three competition CSV files (paths are relative to notebooks/).
# The order of the paths below must match the order of the target names.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/train_data.csv",
        "../data/raw/test_data.csv",
        "../data/raw/sampleSubmission.csv",
    )
)

### Normalización de los datos

In [3]:
# Target is the "Bankruptcy" column; features are every other column except "ID"
y = train["Bankruptcy"]
X = train.drop(columns=["ID", "Bankruptcy"])

# Test set: keep the IDs aside for the submission file, use the rest as features
test_ids = test["ID"]
X_test = test.drop(columns=["ID"])

# Stratified 80/20 hold-out so train and validation keep the same class ratio
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize features: statistics are learned on the training split only and
# then reused unchanged for validation and test, to avoid leaking information.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_val, X_test)
)

#X_train_scaled[:3]  # Mostrar ejemplo de datos escalados

### Redes Neuronales

In [4]:
# Seed every RNG involved (NumPy, TensorFlow, Python's `random`) so that weight
# initialisation, dropout masks and batch shuffling are repeatable across runs.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

# Feed-forward binary classifier: three Dense(ReLU) -> BatchNorm -> Dropout
# stages of decreasing width, followed by a single sigmoid unit that outputs
# the predicted probability of the positive ("Bankruptcy") class.
model = models.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),

    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),

    layers.Dense(1, activation='sigmoid')
])

# Compile with an explicit learning rate; the metric is named 'auc' so Keras
# logs it as 'auc' / 'val_auc', which is what the early-stopping callback reads.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=[AUC(name='auc')])

# Stop once validation AUC has not improved for 5 consecutive epochs and roll
# the model back to the weights of the best epoch seen.
early_stop = EarlyStopping(monitor='val_auc', patience=5, mode='max', restore_best_weights=True)

history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=100,  # upper bound only; early stopping may end training sooner
                    batch_size=32,
                    callbacks=[early_stop],
                    verbose=1)

# Evaluate on the hold-out validation split and actually report the score
# (previously it was computed but never shown anywhere in the notebook).
val_predictions = model.predict(X_val_scaled)
auc_score = roc_auc_score(y_val, val_predictions)
print(f"Validation ROC AUC: {auc_score:.4f}")

# Predicted probabilities for the competition test file
test_predictions = model.predict(X_test_scaled)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


In [5]:
# Build the submission table from the model's test-set predictions. The model
# was trained on the standardized features; no PCA step is applied anywhere in
# this notebook. `test_ids` is the ID column saved when the test set was split
# off, and the predictions are flattened to one probability per row.
submission_df = (
    test_ids
    .to_frame(name='ID')
    .assign(Bankruptcy=test_predictions.flatten())
)

# Write the file without the DataFrame index
submission_df.to_csv("../results/submission_scaled1.csv", index=False)

# Quick visual check of the first rows
submission_df.head()

Unnamed: 0,ID,Bankruptcy
0,5,0.00846
1,14,0.007052
2,16,0.014421
3,26,0.00189
4,28,0.02056
