In [12]:
import pandas as pd

# Load the raw workbook and strip stray whitespace from the column headers.
file_path = "dabe de datos.xlsx"
df = pd.read_excel(file_path)
df.columns = df.columns.str.strip()

# Threshold: keep only columns with at least 2% non-null values.
min_valid_ratio = 0.02
# int() truncates, so the required count is rounded down to a whole row count.
min_valid_count = int(len(df) * min_valid_ratio)

# Count non-null cells for every column in one pass and build a positional
# boolean mask; selecting by position (not by label) stays well-defined even
# if stripping the headers produced duplicate column names.
keep_mask = (df.notna().sum() >= min_valid_count).to_numpy()
cols_to_keep = df.columns[keep_mask].tolist()
df_cleaned = df.loc[:, keep_mask]

# Persist the reduced table; the modelling cells read this file.
df_cleaned.to_excel("datos_limpios.xlsx", index=False)

print(f"Columnas originales: {len(df.columns)}")
print(f"Columnas conservadas (>= {min_valid_ratio*100:.0f}% datos no nulos): {len(df_cleaned.columns)}")


Columnas originales: 68
Columnas conservadas (>= 2% datos no nulos): 49


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# 1️⃣ Load the cleaned table written by the column-filtering cell.
df = pd.read_excel("datos_limpios.xlsx")

# 2️⃣ Impute missing values: numeric columns get their column mean, whatever is
#    still missing afterwards gets the 'Desconocido' placeholder. The frame is
#    reassigned instead of being mutated with inplace=True.
# NOTE(review): the means are computed on the full table, i.e. before the
# train/test split, so test rows influence the imputed values — confirm this
# is acceptable.
df = df.fillna(df.mean(numeric_only=True))
df = df.fillna('Desconocido')

# 3️⃣ Predictor variables (chosen from the columns kept in the cleaned file).
features = [
    'Edad',
    'Sexo', 'Tipo de ingreso', 'Dx principal de egreso',
    'Días estancia', 'ServicioAlta', 'Días de Unidad Cuidado Intensivo',
    'Dxr 1', 'Dxr 2', 'Dxr 3', 'Dxr 4', 'Dxr 5', 'Dxr-6', 'Dxr 7', 'Dxr 8', 'Dxr 9',
    'Código causa externa',  'Situacion al alta',
    'Proc1', 'Proc2', 'Proc3', 'Proc4', 'Proc5', 'Proc6',
    'Proc7', 'Proc8', 'Proc9', 'Proc10', 'Proc11', 'Proc12',
    'Tipo servicio', 'Infecciones'
]

# One-hot encode the text/categorical predictors.
X = pd.get_dummies(df[features])

# 4️⃣ Target labels (GRD code), handled as strings.
y_raw = df['GRD -Código'].astype(str)

# Keep only frequent GRDs (count >= 20% of the most frequent GRD's count);
# every other code is pooled into the catch-all class 'OTRO'.
grd_counts = y_raw.value_counts()
umbral = grd_counts.max() * 0.20
grds_frec = grd_counts[grd_counts >= umbral].index
y_filtered = y_raw.where(y_raw.isin(grds_frec), 'OTRO')

# Drop classes that have a single sample (the stratified split below needs at
# least two per class). The class sizes are computed once, and the rows are
# removed BEFORE fitting the encoder so that le.classes_ only contains classes
# that are actually present in the modelling data.
mask = y_filtered.map(y_filtered.value_counts()).gt(1).to_numpy()
X = X[mask]
y_kept = y_filtered[mask]

# Encode the retained labels as integers 0..n_classes-1.
le = LabelEncoder()
y_encoded = le.fit_transform(y_kept)

# 5️⃣ Stratified 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# 6️⃣ Standardise features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 7️⃣ Model definition.
mlp = MLPClassifier(hidden_layer_sizes=(128, 64, 32),
                    activation='relu',
                    solver='adam',
                    alpha=0.001,
                    batch_size=32,
                    # NOTE(review): scikit-learn documents learning_rate as
                    # only used with solver='sgd'; it has no effect with adam.
                    learning_rate='adaptive',
                    max_iter=100,
                    early_stopping=True,
                    validation_fraction=0.1,
                    n_iter_no_change=10,
                    random_state=42,
                    verbose=True)

# 8️⃣ Train.
mlp.fit(X_train, y_train)

# 9️⃣ Evaluate on the held-out split.
y_pred = mlp.predict(X_test)
y_pred_proba = mlp.predict_proba(X_test)

print("\n📋 Reporte de clasificación:")
# Passing labels explicitly keeps target_names aligned with the encoder even
# when some class is absent from both y_test and y_pred; zero_division=0 gives
# the same numbers as the default but without the undefined-metric warning.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

# # 🔯 Matriz de confusión
# cm = confusion_matrix(y_test, y_pred)
# plt.figure(figsize=(8,6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#             xticklabels=le.classes_, yticklabels=le.classes_)
# plt.title("Matriz de Confusión - MLPClassifier (Model_2)")
# plt.xlabel("Predicción")
# plt.ylabel("Etiqueta Real")
# plt.tight_layout()
# plt.show()

# # 🔯 Curvas ROC por clase
# y_test_bin = label_binarize(y_test, classes=np.arange(len(le.classes_)))
# plt.figure(figsize=(8,6))
# for i in range(len(le.classes_)):
#     fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
#     roc_auc = auc(fpr, tpr)
#     plt.plot(fpr, tpr, label=f'{le.classes_[i]} (AUC = {roc_auc:.2f})')
# plt.plot([0, 1], [0, 1], 'k--')
# plt.title('Curvas ROC por clase - MLPClassifier')
# plt.xlabel('Tasa de Falsos Positivos')
# plt.ylabel('Tasa de Verdaderos Positivos')
# plt.legend(loc='lower right')
# plt.grid(True)
# plt.tight_layout()
# plt.show()


Iteration 1, loss = 1.60865625
Validation score: 0.737220
Iteration 2, loss = 0.50468571
Validation score: 0.760987
Iteration 3, loss = 0.21295736
Validation score: 0.756951
Iteration 4, loss = 0.12213428
Validation score: 0.755157
Iteration 5, loss = 0.08079491
Validation score: 0.768161
Iteration 6, loss = 0.06310853
Validation score: 0.763677
Iteration 7, loss = 0.05638753
Validation score: 0.765471
Iteration 8, loss = 0.05627546
Validation score: 0.769955
Iteration 9, loss = 0.05541248
Validation score: 0.757848
Iteration 10, loss = 0.06407508
Validation score: 0.753363
Iteration 11, loss = 0.08584341
Validation score: 0.752466
Iteration 12, loss = 0.07837976
Validation score: 0.770852
Iteration 13, loss = 0.06517766
Validation score: 0.765919
Iteration 14, loss = 0.06007152
Validation score: 0.763677
Iteration 15, loss = 0.05510257
Validation score: 0.777130
Iteration 16, loss = 0.05482527
Validation score: 0.765919
Iteration 17, loss = 0.06282414
Validation score: 0.778027
Iterat

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# 1️⃣ Load the cleaned table written by the column-filtering cell.
df = pd.read_excel("datos_limpios.xlsx")

# 2️⃣ Impute missing values: numeric columns get their column mean, whatever is
#    still missing afterwards gets the 'Desconocido' placeholder. The frame is
#    reassigned instead of being mutated with inplace=True.
# NOTE(review): the means are computed on the full table, i.e. before the
# train/test split, so test rows influence the imputed values — confirm this
# is acceptable.
df = df.fillna(df.mean(numeric_only=True))
df = df.fillna('Desconocido')

# 3️⃣ Predictor variables.
# NOTE(review): this list includes 'Num caso' and the admission/discharge date
# columns; if those are record identifiers or raw dates stored as text,
# one-hot encoding adds one column per distinct value — confirm they belong.
features = [
    'Num caso', 'Aseguradora -Código-', 'Aseguradora -Descripción-', 'Edad', 'Grupo Edad',
    'Sexo', 'Codigo de ciudad', 'Fecha de ingreso', 'Tipo de ingreso', 'Fecha de egreso',
    'Días estancia', 'ServicioAlta', 'Cuidados intensivos', 'Días de Unidad Cuidado Intensivo',
    'Dx de ingreso', 'Dx principal de egreso', 'Dx principal de egreso .1', 'Dx Ppal 3 Caracteres',
    'Dxr 1', 'Dxr 2', 'Dxr 3', 'Dxr 4', 'Dxr 5', 'Dxr-6', 'Dxr 7', 'Dxr 8', 'Dxr 9',
    'Código causa externa', 'Causa externa', 'Situacion al alta',
    'Proc1', 'Proc2', 'Proc3', 'Proc4', 'Proc5', 'Proc6',
    'Proc7', 'Proc8', 'Proc9', 'Proc10', 'Proc11', 'Proc12',
    'Tipo servicio', 'Causa Basica de muerte', 'Infecciones', 'Infección Quirurgica'
]

# One-hot encode the text/categorical predictors.
X = pd.get_dummies(df[features])

# 4️⃣ Target labels (GRD code), handled as strings.
y_raw = df['GRD -Código'].astype(str)

# Frequent GRDs: count >= 20% of the most frequent GRD's count.
grds_counts = y_raw.value_counts()
umbral = grds_counts.max() * 0.20
grds_frec = grds_counts[grds_counts >= umbral].index

# Drop the rows whose GRD is infrequent (they are removed, not pooled).
mask_frec = y_raw.isin(grds_frec)
df = df[mask_frec]
X = X[mask_frec]
y_raw = y_raw[mask_frec]

# Drop classes that have a single sample (the stratified split below needs at
# least two per class). The class sizes are computed once, and the rows are
# removed BEFORE fitting the encoder so that le.classes_ only contains classes
# that are actually present in the modelling data.
mask = y_raw.map(y_raw.value_counts()).gt(1).to_numpy()
X = X[mask]
y_kept = y_raw[mask]

# Encode the retained labels as integers 0..n_classes-1.
le = LabelEncoder()
y_encoded = le.fit_transform(y_kept)

# 5️⃣ Stratified 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# 6️⃣ Standardise features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 7️⃣ Model definition.
mlp = MLPClassifier(hidden_layer_sizes=(256, 128, 64),
                    activation='relu',
                    solver='adam',
                    alpha=0.001,
                    batch_size=32,
                    # NOTE(review): scikit-learn documents learning_rate as
                    # only used with solver='sgd'; it has no effect with adam.
                    learning_rate='adaptive',
                    max_iter=100,
                    early_stopping=True,
                    validation_fraction=0.1,
                    n_iter_no_change=10,
                    random_state=42,
                    verbose=True)

# 8️⃣ Train.
mlp.fit(X_train, y_train)

# 9️⃣ Evaluate on the held-out split.
y_pred = mlp.predict(X_test)
y_pred_proba = mlp.predict_proba(X_test)

print("\n📋 Reporte de clasificación:")
# Passing labels explicitly keeps target_names aligned with the encoder even
# when some class is absent from both y_test and y_pred; zero_division=0 gives
# the same numbers as the default but without the undefined-metric warning.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

# # 🔯 Matriz de confusión
# cm = confusion_matrix(y_test, y_pred)
# plt.figure(figsize=(8,6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#             xticklabels=le.classes_, yticklabels=le.classes_)
# plt.title("Matriz de Confusión - MLPClassifier")
# plt.xlabel("Predicción")
# plt.ylabel("Etiqueta Real")
# plt.tight_layout()
# plt.show()

# # 🔯 Curvas ROC por clase
# y_test_bin = label_binarize(y_test, classes=np.arange(len(le.classes_)))
# plt.figure(figsize=(8,6))
# for i in range(len(le.classes_)):
#     fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
#     roc_auc = auc(fpr, tpr)
#     plt.plot(fpr, tpr, label=f'{le.classes_[i]} (AUC = {roc_auc:.2f})')
# plt.plot([0, 1], [0, 1], 'k--')
# plt.title('Curvas ROC por clase - MLPClassifier')
# plt.xlabel('Tasa de Falsos Positivos')
# plt.ylabel('Tasa de Verdaderos Positivos')
# plt.legend(loc='lower right')
# plt.grid(True)
# plt.tight_layout()
# plt.show()

Iteration 1, loss = 1.81973398
Validation score: 0.691011
Iteration 2, loss = 0.29584614
Validation score: 0.719101
Iteration 3, loss = 0.11079354
Validation score: 0.734831
Iteration 4, loss = 0.06410279
Validation score: 0.752809
Iteration 5, loss = 0.04627133
Validation score: 0.786517
Iteration 6, loss = 0.03802257
Validation score: 0.788764
Iteration 7, loss = 0.03283653
Validation score: 0.795506
Iteration 8, loss = 0.03013335
Validation score: 0.797753
Iteration 9, loss = 0.02698330
Validation score: 0.793258
Iteration 10, loss = 0.03052640
Validation score: 0.794382
Iteration 11, loss = 0.02957596
Validation score: 0.798876
Iteration 12, loss = 0.02745518
Validation score: 0.796629
Iteration 13, loss = 0.02840545
Validation score: 0.808989
Iteration 14, loss = 0.03330989
Validation score: 0.796629
Iteration 15, loss = 0.34097177
Validation score: 0.721348
Iteration 16, loss = 0.33924197
Validation score: 0.730337
Iteration 17, loss = 0.14601229
Validation score: 0.755056
Iterat




📋 Reporte de clasificación:
              precision    recall  f1-score   support

      104102       0.69      0.95      0.80        37
      104103       0.94      0.52      0.67        29
      111401       0.61      0.67      0.63        30
      114102       0.84      0.93      0.89        58
      114103       0.88      0.74      0.80        38
      114121       0.65      0.63      0.64        49
      114122       0.74      0.80      0.77       138
      114123       0.78      0.70      0.74        87
      114131       0.77      0.81      0.79        57
      114132       0.85      0.79      0.82        29
       14221       0.84      0.92      0.88        73
       14222       0.77      0.69      0.73        35
       14231       1.00      1.00      1.00        31
       14262       0.69      0.69      0.69        32
       14263       0.80      0.77      0.79        48
      174132       0.98      1.00      0.99        40
      174133       1.00      1.00      1.00        5

In [5]:
git remote add origin https://github.com/julianag18/FinalInfoMedica.git

SyntaxError: invalid syntax (1398598700.py, line 1)