In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# 1️⃣ Carga e imputación
df = pd.read_excel("datos_modelo_GRD.xlsx")
df.fillna(df.mean(numeric_only=True), inplace=True)
df.fillna('Desconocido', inplace=True)

# 2️⃣ One‑hot encoding de features
features = [
    'Edad','Sexo','Tipo de ingreso','Dx principal de egreso','Duración estancia',
    'Días de Unidad Cuidado Intensivo','Infecciones','Situacion al alta',
    'Num_Procedimientos','Tipo servicio'
]
X = pd.get_dummies(df[features], drop_first=True)

# 3️⃣ Filtrado de GRDs poco frecuentes
y_raw = df['GRD -Código'].astype(str)
counts = y_raw.value_counts()
threshold = counts.max() * 0.20
frequent = counts[counts >= threshold].index
y = y_raw.where(y_raw.isin(frequent), 'OTRO')

# 4️⃣ Label encoding y quitar clases con una sola muestra
le = LabelEncoder()
y_enc = le.fit_transform(y)
mask = np.array([np.sum(y_enc == lbl) > 1 for lbl in y_enc])
X, y_enc = X[mask], y_enc[mask]

# 5️⃣ Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# 6️⃣ Escalado
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# 7️⃣ Define los dos modelos base
mlp = MLPClassifier(
    hidden_layer_sizes=(128,64,32),
    activation='relu',
    solver='adam',
    alpha=1e-3,
    learning_rate_init=1e-3,
    batch_size=32,
    max_iter=100,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
    verbose=False
)

xgb = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    use_label_encoder=False,
    tree_method='hist',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    random_state=42,
    verbosity=0
)

# 8️⃣ Ensemble por votación mayoritaria
ensemble = VotingClassifier(
    estimators=[('mlp', mlp), ('xgb', xgb)],
    voting='hard',     # 'hard' para que cada clasificador vote la clase
    n_jobs=1
)

# 9️⃣ Entrena el ensemble
ensemble.fit(X_train_s, y_train)

# 🔟 Evaluación
y_pred = ensemble.predict(X_test_s)
acc = accuracy_score(y_test, y_pred)
print(f"\n🎯 Ensemble Test Accuracy: {acc:.4f}\n")

print("📋 Reporte de clasificación Ensemble:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("🔢 Matriz de confusión Ensemble:")
print(confusion_matrix(y_test, y_pred))



🎯 Ensemble Test Accuracy: 0.7600

📋 Reporte de clasificación Ensemble:
              precision    recall  f1-score   support

      104102       0.51      0.70      0.59        37
      104103       0.54      0.52      0.53        29
      111401       0.38      0.37      0.37        30
      114102       0.52      0.91      0.66        58
      114103       0.50      0.18      0.27        38
      114121       0.38      0.37      0.38        49
      114122       0.53      0.78      0.63       138
      114123       0.55      0.18      0.28        87
      114131       0.54      0.65      0.59        57
      114132       0.36      0.17      0.23        29
       14221       0.69      0.93      0.79        73
       14222       0.56      0.26      0.35        35
       14231       0.50      0.61      0.55        31
       14262       0.38      0.78      0.52        32
       14263       0.68      0.31      0.43        48
      174132       0.88      0.72      0.79        40
      174

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1️⃣ Cargar y preparar datos
df = pd.read_excel("dabe de datos .xlsx")
df.fillna(df.mean(numeric_only=True), inplace=True)
df.fillna('Desconocido', inplace=True)

features = [
    'Edad','Sexo','Tipo de ingreso','Dx principal de egreso','Duración estancia',
    'Días de Unidad Cuidado Intensivo','Infecciones','Situacion al alta',
    'Num_Procedimientos','Tipo servicio'
]
X = pd.get_dummies(df[features], drop_first=True)

y_raw = df['GRD -Código'].astype(str)
counts = y_raw.value_counts()
threshold = counts.max() * 0.20
frequent = counts[counts >= threshold].index
y = y_raw.where(y_raw.isin(frequent), 'OTRO')

le = LabelEncoder()
y_enc = le.fit_transform(y)
mask = np.array([np.sum(y_enc == lbl) > 1 for lbl in y_enc])
X = X[mask]
y_enc = y_enc[mask]

# 2️⃣ Análisis EDA
## a) Baja varianza
low_var_cols = X.loc[:, X.std() < 0.01].columns.tolist()
print(f"📉 Variables con baja varianza (<0.01): {len(low_var_cols)}")
print(low_var_cols)

## b) Colinealidad alta
corr_matrix = X.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_pairs = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]
print(f"\n🔁 Variables con alta colinealidad (>0.9): {len(high_corr_pairs)}")
print(high_corr_pairs)

## c) Ranking por información mutua
mi = mutual_info_classif(X, y_enc, discrete_features='auto', random_state=42)
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print(f"\n🎯 Top 15 variables por información mutua:")
print(mi_series.head(15))

# 3️⃣ Conjunto reducido para nuevo modelo
top_vars = mi_series.head(30).index.tolist()  # puedes ajustar este número
X_reduced = X[top_vars]

# 4️⃣ Split y escalado (para ambas versiones)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

Xr_train_s = scaler.fit_transform(X_reduced.loc[X_train.index])
Xr_test_s  = scaler.transform(X_reduced.loc[X_test.index])

# 5️⃣ Definir clasificadores
mlp = MLPClassifier(
    hidden_layer_sizes=(128,64,32),
    activation='relu',
    solver='adam',
    alpha=1e-3,
    learning_rate_init=1e-3,
    batch_size=32,
    max_iter=100,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42
)

xgb = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    use_label_encoder=False,
    tree_method='hist',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    random_state=42,
    verbosity=0
)

# 6️⃣ Ensemble original
ensemble = VotingClassifier(
    estimators=[('mlp', mlp), ('xgb', xgb)],
    voting='hard',
    n_jobs=1
)

ensemble.fit(X_train_s, y_train)
y_pred = ensemble.predict(X_test_s)
acc = accuracy_score(y_test, y_pred)
print(f"\n📦 Ensemble (TODAS las variables) Accuracy: {acc:.4f}")

# 7️⃣ Ensemble con top variables MI
ensemble_r = VotingClassifier(
    estimators=[('mlp', mlp), ('xgb', xgb)],
    voting='hard',
    n_jobs=1
)
ensemble_r.fit(Xr_train_s, y_train)
y_pred_r = ensemble_r.predict(Xr_test_s)
acc_r = accuracy_score(y_test, y_pred_r)
print(f"🧠 Ensemble (TOP 30 MI variables) Accuracy: {acc_r:.4f}")

# 8️⃣ Reporte comparativo
print("\n📋 Reporte Original:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\n📋 Reporte con Reducción:")
print(classification_report(y_test, y_pred_r, target_names=le.classes_))


📉 Variables con baja varianza (<0.01): 1277
['Dx principal de egreso_A021', 'Dx principal de egreso_A044', 'Dx principal de egreso_A058', 'Dx principal de egreso_A060', 'Dx principal de egreso_A062', 'Dx principal de egreso_A080', 'Dx principal de egreso_A085', 'Dx principal de egreso_A151', 'Dx principal de egreso_A157', 'Dx principal de egreso_A159', 'Dx principal de egreso_A161', 'Dx principal de egreso_A168', 'Dx principal de egreso_A182', 'Dx principal de egreso_A188', 'Dx principal de egreso_A191', 'Dx principal de egreso_A238', 'Dx principal de egreso_A268', 'Dx principal de egreso_A279', 'Dx principal de egreso_A321', 'Dx principal de egreso_A379', 'Dx principal de egreso_A410', 'Dx principal de egreso_A411', 'Dx principal de egreso_A412', 'Dx principal de egreso_A415', 'Dx principal de egreso_A419', 'Dx principal de egreso_A421', 'Dx principal de egreso_A422', 'Dx principal de egreso_A488', 'Dx principal de egreso_A499', 'Dx principal de egreso_A521', 'Dx principal de egreso_A

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1️⃣ Cargar y preparar datos# 1️⃣ Cargar y preparar datos
df = pd.read_excel("dabe de datos.xlsx")
df.columns = df.columns.str.strip()  # 🔧 CORRIGE nombres con espacios
df.fillna(df.mean(numeric_only=True), inplace=True)
df.fillna('Desconocido', inplace=True)


# 2️⃣ Definir todas las columnas útiles como features (evitando fechas e IDs)
features = [
    'Edad', 'Grupo Edad', 'Sexo', 'Codigo de ciudad', 'Tipo de ingreso',
    'Días estancia', 'ServicioAlta', 'Cuidados intensivos', 'Días de Unidad Cuidado Intensivo', 'Dx principal de egreso', 'Dx principal de egreso .1', 'Dx Ppal 3 Caracteres',
    'Dxr 1', 'Dxr 2', 'Dxr 3', 'Dxr 4', 'Dxr 5', 'Dxr-6', 'Dxr 7', 'Dxr 8', 'Dxr 9', 'Dxr 10',
    'Código causa externa', 'Causa externa', 'Situacion al alta',
    'Proc1', 'Proc2', 'Proc3', 'Proc4', 'Proc5', 'Proc6', 'Proc7', 'Proc8', 'Proc9', 'Proc10',
    'Proc11', 'Proc12', 'Proc13', 'Proc14', 'Proc15', 'Proc16', 'Proc17', 'Proc18', 'Proc19', 'Proc20',
    'Proc21', 'Proc22', 'Proc23', 'Proc24', 'Proc25', 'Proc26', 'Proc27', 'Proc28', 'Proc29', 'Proc30',
    'Tipo servicio', 'Causa Basica de muerte', 'Infecciones', 'Infección Quirurgica'
]

# 3️⃣ One-hot encoding para variables categóricas
X = pd.get_dummies(df[features], drop_first=True)

# 4️⃣ Codificar la etiqueta (GRD)
y_raw = df['GRD -Código'].astype(str)
counts = y_raw.value_counts()
threshold = counts.max() * 0.20
frequent = counts[counts >= threshold].index
y = y_raw.where(y_raw.isin(frequent), 'OTRO')

le = LabelEncoder()
y_enc = le.fit_transform(y)

# 5️⃣ Filtrar clases con solo una muestra
mask = np.array([np.sum(y_enc == lbl) > 1 for lbl in y_enc])
X, y_enc = X[mask], y_enc[mask]

# 6️⃣ División train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# 7️⃣ Escalado
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# 8️⃣ Red neuronal
mlp = MLPClassifier(
    hidden_layer_sizes=(128,64,32),
    activation='relu',
    solver='adam',
    alpha=1e-3,
    learning_rate_init=1e-3,
    batch_size=32,
    max_iter=100,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
    verbose=True
)

mlp.fit(X_train_s, y_train)

# 🔟 Evaluación
y_pred = mlp.predict(X_test_s)
acc = accuracy_score(y_test, y_pred)
print(f"\n🎯 Test Accuracy: {acc:.4f}\n")
print("📋 Reporte de clasificación:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("🔢 Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))


Iteration 1, loss = 1.38745345
Validation score: 0.760987





🎯 Test Accuracy: 0.7637

📋 Reporte de clasificación:
              precision    recall  f1-score   support

      104102       0.46      0.46      0.46        37
      104103       0.40      0.14      0.21        29
      111401       0.33      0.07      0.11        30
      114102       0.62      0.69      0.66        58
      114103       0.68      0.34      0.46        38
      114121       0.72      0.47      0.57        49
      114122       0.64      0.72      0.68       138
      114123       0.55      0.47      0.51        87
      114131       0.51      0.93      0.66        57
      114132       0.69      0.31      0.43        29
       14221       0.82      0.81      0.81        73
       14222       0.58      0.54      0.56        35
       14231       0.65      0.65      0.65        31
       14262       0.38      0.50      0.43        32
       14263       0.74      0.54      0.63        48
      174132       0.95      0.50      0.66        40
      174133       0.71    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
#prueba con db limpia y completa

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1️⃣ Cargar y preparar datos
df = pd.read_excel("dabe de datos.xlsx")
df.columns = df.columns.str.strip()  # 🔧 Corrige nombres con espacios
df.fillna(df.mean(numeric_only=True), inplace=True)
df.fillna('Desconocido', inplace=True)

# 2️⃣ Definir todas las columnas útiles como features (evitando fechas e IDs)
features = [
    'Edad', 'Grupo Edad', 'Sexo', 'Codigo de ciudad', 'Tipo de ingreso',
    'Días estancia', 'ServicioAlta', 'Cuidados intensivos', 'Días de Unidad Cuidado Intensivo',
    'Dx de ingreso', 'Dx principal de egreso', 'Dx principal de egreso .1', 'Dx Ppal 3 Caracteres',
    'Dxr 1', 'Dxr 2', 'Dxr 3', 'Dxr 4', 'Dxr 5',
    'Código causa externa', 'Causa externa', 'Situacion al alta',
    'Proc1', 'Proc2', 'Proc3', 'Proc4', 'Proc5', 'Proc6', 'Proc7', 'Proc8', 'Proc9', 'Proc10',
    'Tipo servicio', 'Causa Basica de muerte', 'Infecciones', 'Infección Quirurgica'
]

# 3️⃣ One-hot encoding para variables categóricas
X = pd.get_dummies(df[features], drop_first=True)

# 4️⃣ Codificar la etiqueta (GRD), sin eliminar ni agrupar ninguna clase
y_raw = df['GRD -Código'].astype(str)
le = LabelEncoder()
y_enc = le.fit_transform(y_raw)

# 5️⃣ Filtrar clases con solo una muestra (el modelo no puede aprender con 1 muestra)
mask = np.array([np.sum(y_enc == lbl) > 1 for lbl in y_enc])
X = X[mask]
y_enc = y_enc[mask]

# 6️⃣ División train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# 7️⃣ Escalado
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# 8️⃣ Red neuronal
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    alpha=1e-3,
    learning_rate_init=1e-3,
    batch_size=32,
    max_iter=100,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    random_state=42,
    verbose=True
)

mlp.fit(X_train_s, y_train)

# 🔟 Evaluación
y_pred = mlp.predict(X_test_s)
acc = accuracy_score(y_test, y_pred)

print(f"\n🎯 Test Accuracy: {acc:.4f}\n")
print("📋 Reporte de clasificación:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("🔢 Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))


Iteration 1, loss = 4.08754181
Validation score: 0.361922
Iteration 2, loss = 2.15358058
Validation score: 0.460260
Iteration 3, loss = 1.27815987
Validation score: 0.475079
Iteration 4, loss = 0.80351743
Validation score: 0.495734
Iteration 5, loss = 0.54390322
Validation score: 0.507409
Iteration 6, loss = 0.40816124
Validation score: 0.502021
Iteration 7, loss = 0.32322568
Validation score: 0.511450
Iteration 8, loss = 0.27661245
Validation score: 0.512797
Iteration 9, loss = 0.24662271
Validation score: 0.506960
Iteration 10, loss = 0.21050463
Validation score: 0.514594
Iteration 11, loss = 0.17487568
Validation score: 0.510103
Iteration 12, loss = 0.16784361
Validation score: 0.512348
Iteration 13, loss = 0.16516545
Validation score: 0.519982
Iteration 14, loss = 0.17487914
Validation score: 0.525819
Iteration 15, loss = 0.17912832
Validation score: 0.497530
Iteration 16, loss = 0.19836622
Validation score: 0.519982
Iteration 17, loss = 0.21168254
Validation score: 0.509654
Iterat

ValueError: Number of classes, 557, does not match size of target_names, 608. Try specifying the labels parameter