In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier


print("--- 1. PREPARACIÓN DE DATOS ---")

# Labeled training data, the unlabeled test data (the "exam"), and the
# external Statlog samples, read in that order.
train, test, df_statlog = (
    pd.read_csv(ruta)
    for ruta in ('train.csv', 'test.csv', 'statlog_limpio.csv')
)


print("--- 2. LIMPIEZA DE DATOS ---")

# =====================================================================
# 1 OLDPEAK (clean negative values)
# =====================================================================
for df in (train, test):
    # Parse oldpeak as a number; anything that cannot be parsed becomes NaN.
    df['oldpeak'] = pd.to_numeric(df['oldpeak'], errors='coerce')
    # Negative readings are treated as missing.
    df.loc[df['oldpeak'] < 0, 'oldpeak'] = np.nan

# =====================================================================
# 2 DEFINIR FUNCIÓN HOSPITAL
# =====================================================================
def asignar_hospital(fila):
    """Return the hospital label for a single row.

    Every value in the row is converted to a string and stripped of
    surrounding whitespace before being inspected.

    Args:
        fila: a pandas Series holding one row of the dataset.

    Returns:
        "Hospital B" when the row contains a '?' token and no '-9' / '-9.0'
        token; "Hospital A" in every other case (a '-9' marker is present,
        or neither marker is present).
    """
    valores = set(fila.astype(str).str.strip())

    # The '-9' sentinel takes precedence over '?', so '?' only decides the
    # outcome when no '-9' marker appears anywhere in the row.
    sin_menos_nueve = valores.isdisjoint({'-9', '-9.0'})
    if sin_menos_nueve and '?' in valores:
        return "Hospital B"

    return "Hospital A"

# Label every row with its hospital, then encode the label as 0/1:
# 1 when the lower-cased label contains the letter 'b', 0 otherwise.
for df in (train, test):
    df['hospital'] = (
        df.apply(asignar_hospital, axis=1)
        .str.lower()
        .str.strip()
        .map(lambda etiqueta: int('b' in etiqueta))
    )


# =====================================================================
# 3 DROP COLUMNS THAT HURT THE MODEL
# =====================================================================
for df in (train, test):
    # errors='ignore' drops whichever of the two columns exist and silently
    # skips the ones that are already absent.
    df.drop(columns=['ca', 'chol'], errors='ignore', inplace=True)


# =====================================================================
# 4 SLOPE AND THAL TREATMENT
# =====================================================================
for df in (train, test):
    # Only the *string* tokens are remapped ('1' -> '2' for slope,
    # '3' -> '7' for thal); numeric 1.0 / 3.0 values are left alone.
    # Afterwards both columns are parsed as numbers, with unparseable
    # entries turned into NaN.
    df['slope'] = pd.to_numeric(df['slope'].replace('1', '2'), errors='coerce')
    df['thal'] = pd.to_numeric(df['thal'].replace('3', '7'), errors='coerce')

# --- train ---
# Rule 1: slope == -9 -> slope = 1
train.loc[train['slope'].eq(-9), 'slope'] = 1
# Rule 2: slope == 1 and thal == -9 -> thal = 3
mask_train = train['slope'].eq(1) & train['thal'].eq(-9)
train.loc[mask_train, 'thal'] = 3
# Rule 3: thal == -9 and slope != 1 -> thal = 7
mask_train2 = train['thal'].eq(-9) & train['slope'].ne(1)
train.loc[mask_train2, 'thal'] = 7

# --- test (same three rules) ---
test.loc[test['slope'].eq(-9), 'slope'] = 1
mask_test = test['slope'].eq(1) & test['thal'].eq(-9)
test.loc[mask_test, 'thal'] = 3
mask_test2 = test['thal'].eq(-9) & test['slope'].ne(1)
test.loc[mask_test2, 'thal'] = 7


# =====================================================================
# 6 CONVERT -9, ? TO NaN
# =====================================================================
for df in [train, test]:

    # Promote a column to a numeric dtype only when every value parses.
    # (pd.to_numeric(errors='ignore') is deprecated; the try/except keeps the
    # same "leave the column untouched on failure" behaviour.)
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # The column still holds non-numeric tokens (e.g. '?'); keep it.
            continue

    # Missing-value markers stored as text in columns that stayed non-numeric.
    df.replace(['-9', '-9.0', '?'], np.nan, inplace=True)
    # Missing-value marker stored as a number. Replacing only the string
    # '-9.0' never matched numeric columns, so -9 used to survive as if it
    # were a real measurement.
    df.replace(-9, np.nan, inplace=True)


# =====================================================================
# 7 FILL NaN WITH THE TRAIN MEDIAN
# =====================================================================
# The medians come from the numeric columns of train only and are applied
# to both frames.
medianas = train.median(numeric_only=True)
for df in (train, test):
    df.fillna(medianas, inplace=True)


# =====================================================================
# 8 MERGE RESTECG CATEGORIES: 0 = normal, 1 = abnormal
# =====================================================================
for df in (train, test):
    # Make sure restecg is numeric, then collapse categories 1 and 2 into a
    # single "abnormal" class (1) while 0 stays "normal" (0). Any value other
    # than 0, 1 or 2 (including NaN) has no entry in the mapping and ends up
    # as NaN.
    df['restecg'] = pd.to_numeric(df['restecg'], errors='coerce').map({0: 0, 1: 1, 2: 1})


# =====================================================================
# 9 ADD NEW SAMPLES
# =====================================================================
# MERGE: train and statlog together form the initial labeled knowledge base.

# The external samples get hospital = 0 unless the column is already there.
if 'hospital' not in df_statlog.columns:
    df_statlog['hospital'] = 0

# Remove the same columns that were removed from train (absent ones are skipped).
df_statlog.drop(columns=['ca', 'chol'], errors='ignore', inplace=True)

# Stack the external rows below the train rows with a fresh 0..n-1 index.
cols_train = train.columns
df_train_full = pd.concat((train, df_statlog), ignore_index=True)
# df_test is another name for the same DataFrame object as test (no copy).
df_test = test


# =====================================================================
# 10 CLEAN THE MERGED DATAFRAME
# =====================================================================
# Drop identifier columns and convert every remaining column to numeric.
# Iterate over a snapshot of the column labels because columns are dropped
# inside the loop.
for col in list(df_train_full.columns):
    # NOTE(review): this is a substring match, so any column whose name merely
    # contains "id" is treated as an identifier — confirm that is intended.
    if 'id' in col.lower() or 'patient' in col.lower():
        df_train_full.drop(columns=[col], inplace=True)
        if col in df_test.columns:
            df_test.drop(columns=[col], inplace=True)
        # The column no longer exists; without this `continue` the conversion
        # below would raise KeyError on df_train_full[col].
        continue

    df_train_full[col] = pd.to_numeric(df_train_full[col], errors='coerce')
    if col in df_test.columns:
        df_test[col] = pd.to_numeric(df_test[col], errors='coerce')

# Fill any remaining nulls with the medians of the merged training frame.
medians = df_train_full.median()
df_train_full.fillna(medians, inplace=True)
df_test.fillna(medians, inplace=True)


print("\n--- 3. CREACIÓN DE SPLITS Y NORMALIZACIÓN ---")

# Split into target (y) and features (X).
target_col = 'label'
y = df_train_full[target_col]
X = df_train_full.drop(columns=[target_col])

# Align test: same feature columns, in the same order as X.
X_test_final = df_test.loc[:, X.columns]

# Scaling: the scaler is fitted on the training features only and then
# applied to both the training and the test features.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test_final)


print("\n--- 4. DEFINICIÓN DEL MODELO BASE (EL PROFESOR) ---")

# Ensemble of Logistic Regression + Random Forest.
# NOTE(review): `multi_class` is reported as deprecated in recent
# scikit-learn releases — confirm against the installed version.
clf1 = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=5000,
    C=1.0,
    random_state=42,
)
clf2 = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    random_state=42,
)
# Soft voting is required so the ensemble exposes class probabilities.
model = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2)],
    voting='soft',
)


print("\n--- 5. FASE 1: ENTRENAMIENTO INICIAL ---")

# First fit: labeled data only.
model.fit(X_scaled, y)
print("El profesor ha estudiado los datos etiquetados.")


print("\n--- 6. FASE 2: PSEUDO-LABELING (EL TRUCO) ---")

# Class probabilities and hard predictions of the fitted model on the test set.
probs = model.predict_proba(X_test_scaled)
preds = model.predict(X_test_scaled)

# Keep the test rows whose highest class probability exceeds the threshold.
threshold = 0.90
high_conf_indices = np.flatnonzero(probs.max(axis=1) > threshold)
if len(high_conf_indices) < 50:
    # Too few confident rows: relax the threshold once, to 0.80.
    print("Pocas muestras seguras. Bajando exigencia al 80%...")
    threshold = 0.80
    high_conf_indices = np.flatnonzero(probs.max(axis=1) > threshold)
print(f"¡Encontradas {len(high_conf_indices)} muestras en el Test con confianza > {threshold*100}%!")

# The selected rows and their predictions become pseudo-labeled samples.
X_pseudo_scaled = X_test_scaled[high_conf_indices]
y_pseudo = preds[high_conf_indices]

# Data augmentation: append the pseudo-labeled rows below the original
# (already scaled) training rows, and their labels after the original labels.
X_augmented = np.concatenate([X_scaled, X_pseudo_scaled], axis=0)
y_augmented = np.concatenate((y, y_pseudo))
print(f"Dataset original: {X_scaled.shape[0]} muestras.")
print(f"Dataset aumentado: {X_augmented.shape[0]} muestras.")


print("\n--- 7. FASE 3: RE-ENTRENAMIENTO FINAL ---")

# Refit the same ensemble on the augmented set (labeled + pseudo-labeled rows).
model.fit(X_augmented, y_augmented)
print("El modelo ha sido re-entrenado con éxito (Semi-Supervised Learning).")

# Final predictions over the whole test set.
y_pred = model.predict(X_test_scaled)


print("\n--- 8. GUARDANDO RESULTADOS ---")

# Submission file: a consecutive ID (0, 1, 2, ...) next to each prediction.
n_filas = len(y_pred)
submission = pd.DataFrame({'ID': range(n_filas), 'label': y_pred})
submission.to_csv('submission.csv', index=False)
print("CSV de submission creado con éxito.")




--- 1. PREPARACIÓN DE DATOS ---
--- 2. LIMPIEZA DE DATOS ---

--- 3. CREACIÓN DE SPLITS Y NORMALIZACIÓN ---

--- 4. DEFINICIÓN DEL MODELO BASE (EL PROFESOR) ---

--- 5. FASE 1: ENTRENAMIENTO INICIAL ---
El profesor ha estudiado los datos etiquetados.

--- 6. FASE 2: PSEUDO-LABELING (EL TRUCO) ---
Pocas muestras seguras. Bajando exigencia al 80%...
¡Encontradas 57 muestras en el Test con confianza > 80.0%!
Dataset original: 1002 muestras.
Dataset aumentado: 1059 muestras.

--- 7. FASE 3: RE-ENTRENAMIENTO FINAL ---
El modelo ha sido re-entrenado con éxito (Semi-Supervised Learning).

--- 8. GUARDANDO RESULTADOS ---
CSV de submission creado con éxito.
