# 🏥 Predicción de No-Show en Citas Médicas
### Modelo de Machine Learning para predecir inasistencias

**Autora:** Jessica Elizondo Treviño  
**Objetivo:** Construir un modelo capaz de predecir si un paciente asistirá o no a su cita médica, optimizando F1.  
**Métrica principal:** PR-AUC (clase positiva = no_show); también reportamos ROC-AUC y F1 con el umbral óptimo por F1.

---


In [None]:

# ===========================
# 0) Imports y setup
# ===========================
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    precision_recall_curve, confusion_matrix, classification_report
)

import matplotlib.pyplot as plt

# Seed passed to train_test_split and RandomForestClassifier so runs are reproducible.
RANDOM_STATE = 42
# Let pandas show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)


## 📥 Carga de Datos


In [None]:

# ===========================
# 1) Load
# ===========================
csv_path = Path("no_show_appointments.csv")
# Raise explicitly instead of asserting: assertions are stripped under
# `python -O`, which would lose this hint about where to put the file.
if not csv_path.exists():
    raise FileNotFoundError("Coloca el archivo 'no_show_appointments.csv' junto a esta libreta.")
df = pd.read_csv(csv_path)

# Normalize column names: trimmed, lower-case, '-' and ' ' replaced by '_'.
# regex=False keeps the replacements literal whatever the pandas default is.
df.columns = (
    df.columns.str.strip()
              .str.lower()
              .str.replace('-', '_', regex=False)
              .str.replace(' ', '_', regex=False)
)

# First look at the data: sample rows, dimensions and missing values per column.
display(df.head())
print("Shape:", df.shape)
print("\nNulos por columna:")
print(df.isna().sum().sort_values(ascending=False))


## 🧹 Limpieza y Preprocesamiento


In [None]:

# ===========================
# 2) Sanity check & cleaning
# ===========================
# Drop exact duplicate rows.
df = df.drop_duplicates()

# Drop rows with a negative age. Rows whose age is missing are kept in every
# case; the previous `age >= 0` filter dropped them too, but only on runs
# where at least one negative age happened to be present.
if 'age' in df.columns:
    df = df[~(df['age'] < 0)].copy()

# Parse the date columns (adjust the names if they differ); unparseable
# values become NaT instead of raising.
for col in ['scheduledday', 'appointmentday']:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Target (Yes/No -> 1/0). Some versions of the file name it 'No-show', others
# 'no_show'; both spellings are accepted.
target_col = next((c for c in ('no_show', 'no-show') if c in df.columns), None)
if target_col is None:
    # Explicit raise rather than assert, which is stripped under `python -O`.
    raise KeyError("No se encontró la columna objetivo ('No-show' / 'no_show').")

# 0/1 are mapped onto themselves so that re-running this cell on an already
# encoded column is a no-op instead of turning every value into NaN.
encoded_target = df[target_col].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
unmapped = encoded_target.isna()
if unmapped.any():
    # Fail with the offending labels rather than an opaque NaN-to-int cast error.
    unexpected = df.loc[unmapped, target_col].unique()[:5].tolist()
    raise ValueError(
        f"Valores inesperados en la columna objetivo '{target_col}': {unexpected}"
    )
df['no_show'] = encoded_target.astype('int8')

display(df.head())


## Ingeniería de variables

In [None]:

# ===========================
# 3) Feature engineering
# ===========================
# waiting_days: whole calendar days between scheduling and the appointment.
if 'appointmentday' in df.columns and 'scheduledday' in df.columns:
    # normalize() resets the time of day to midnight, so the difference counts
    # calendar days. This stays vectorized and yields NaN where a date is NaT.
    # NOTE(review): assumes both columns share the same timezone awareness,
    # as they are parsed from the same file -- confirm for other data sources.
    day_gap = df['appointmentday'].dt.normalize() - df['scheduledday'].dt.normalize()
    # Negative gaps (appointment dated before it was scheduled) are clipped to 0.
    df['waiting_days'] = day_gap.dt.days.clip(lower=0)
else:
    df['waiting_days'] = np.nan  # placeholder when the date columns are missing

# Hour at which the appointment was booked.
if 'scheduledday' in df.columns:
    df['scheduled_hour'] = df['scheduledday'].dt.hour
else:
    df['scheduled_hour'] = np.nan

# Weekday name of the appointment.
if 'appointmentday' in df.columns:
    df['appointment_weekday'] = df['appointmentday'].dt.day_name()
else:
    df['appointment_weekday'] = np.nan

# Age bins: [0, 12], (12, 25], (25, 40], (40, 60], (60, 120].
# Ages outside 0-120 get a missing bin.
if 'age' in df.columns:
    age_bins = [0, 12, 25, 40, 60, 120]
    age_labels = ['0-12','13-25','26-40','41-60','61+']
    df['age_bin'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=True, include_lowest=True)
else:
    df['age_bin'] = np.nan

# Optional flag: 1 when the booking was made before noon.
# 'scheduled_hour' always exists at this point (created just above), so no
# column check is needed; a missing hour compares False and therefore gives 0.
df['is_morning'] = (df['scheduled_hour'] < 12).astype('int8')

display(df.head())


## 🔎 Análisis Exploratorio (EDA)
Exploraci√≥n inicial de las variables.

In [None]:

# Size of the cleaned frame and summary statistics of the numeric features.
print("Shape:", df.shape)
print(df[['age','waiting_days','scheduled_hour']].describe(include='all'))

# Distributions: one 30-bin histogram per numeric feature, ignoring missing
# values. Titles and labels are shown to the reader, so they carry the proper
# accented characters instead of mis-encoded bytes.
for column, title, x_label in (
    ('age', "Distribución de edad", "Edad"),
    ('waiting_days', "Distribución de días de espera", "Días"),
):
    plt.figure(figsize=(8,4))
    plt.hist(df[column].dropna(), bins=30)
    plt.title(title)
    plt.xlabel(x_label); plt.ylabel("Frecuencia")
    plt.show()

def rate_by(col):
    """Print the no-show rate (in percent) for each value of ``col``.

    Reads the module-level ``df``. Rates are the mean of ``no_show`` per
    group, sorted from highest to lowest, and only the first ten are printed.
    Nothing is printed when ``col`` is not a column of ``df``.
    """
    if col not in df.columns:
        return

    mean_by_group = df.groupby(col)['no_show'].mean()
    ranked = mean_by_group.sort_values(ascending=False)
    print(f"\nTasa de no-show por {col} (top 10):")
    as_percent = (ranked * 100).round(2)
    print(as_percent.head(10))

# Report the no-show rate per category for every candidate grouping column;
# rate_by skips the ones that are not present in df.
eda_group_columns = (
    'appointment_weekday', 'age_bin', 'is_morning', 'sms_received',
    'scholarship', 'hypertension', 'diabetes', 'alcoholism',
    'handicap', 'neighborhood',
)
for col in eda_group_columns:
    rate_by(col)


## Partición de datos

In [None]:

# Target vector: the 0/1 no_show column as a NumPy array.
target = 'no_show'
y = df[target].values

# Candidate features, restricted to the columns actually present in df.
present_columns = set(df.columns)
numeric_feats = [
    name for name in ('age', 'waiting_days', 'scheduled_hour')
    if name in present_columns
]
categorical_feats = [
    name
    for name in (
        'gender', 'appointment_weekday', 'age_bin', 'neighborhood',
        'scholarship', 'hypertension', 'diabetes', 'alcoholism',
        'handicap', 'sms_received', 'is_morning',
    )
    if name in present_columns
]

# Feature matrix: numeric columns first, then categorical ones.
X = df[numeric_feats + categorical_feats].copy()

# 80/20 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)

print("X_train:", X_train.shape, "| X_test:", X_test.shape)
positive_rate_test = np.mean(y_test)
print("Positivos (no_show=1) en test:", positive_rate_test.round(4))


## Preprocesamiento y modelos

In [None]:

def make_preprocessor():
    """Return a new, unfitted preprocessing step.

    Numeric features are passed through unchanged and categorical features are
    one-hot encoded; categories unseen during fit are ignored at predict time.
    The column lists are read from the module-level ``numeric_feats`` and
    ``categorical_feats`` when the function is called.
    """
    return ColumnTransformer(
        transformers=[
            ("num", "passthrough", numeric_feats),
            ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_feats)
        ]
    )


# Kept as a module-level name for any cell that refers to `pre`; the pipelines
# below each get their own instance. Pipeline.fit fits its steps in place, so
# sharing one transformer object would let fitting one model overwrite the
# fitted preprocessor held by the other.
pre = make_preprocessor()

# Model 1: logistic regression (class weights balance the two classes)
logreg = Pipeline(steps=[
    ("pre", make_preprocessor()),
    ("clf", LogisticRegression(max_iter=2000, class_weight='balanced'))
])

# Model 2: random forest (baseline)
rf = Pipeline(steps=[
    ("pre", make_preprocessor()),
    ("clf", RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=2,
        class_weight='balanced_subsample',
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

# Candidate pipelines keyed by the short name used in the reports.
models = {"logreg": logreg, "rf": rf}


## 🤖 Entrenamiento de Modelos


## Entrenamiento, umbral óptimo y métricas

In [None]:

def _best_f1_threshold(precision, recall, thresholds, default=0.5):
    """Return the decision threshold with the highest F1 on a P-R curve.

    ``precision`` and ``recall`` are expected in the layout returned by
    ``precision_recall_curve``: one entry per threshold plus a final
    (precision=1, recall=0) point that has no threshold. Entry ``i`` therefore
    belongs to ``thresholds[i]``, and the trailing point is excluded from the
    search. ``default`` is returned when there are no thresholds at all.
    """
    thresholds = np.asarray(thresholds)
    n_thr = thresholds.size
    if n_thr == 0:
        return default
    p = np.asarray(precision, dtype=float)[:n_thr]
    r = np.asarray(recall, dtype=float)[:n_thr]
    # The small epsilon avoids 0/0 when precision and recall are both zero.
    f1_scores = 2 * (p * r) / (p + r + 1e-10)
    return float(thresholds[int(np.nanargmax(f1_scores))])


def eval_model(name, pipe, X_tr, y_tr, X_te, y_te):
    """Fit ``pipe`` on the training split and report its metrics on the other.

    The pipeline is fitted in place. ROC-AUC and PR-AUC are computed from the
    predicted probability of class 1; F1, the confusion matrix and the
    classification report use the threshold that maximizes F1 on the
    precision-recall curve of ``(y_te, proba)``.

    NOTE(review): the threshold is chosen on the same data it is scored on, so
    the reported F1 is optimistic; consider tuning it on a validation split.

    Returns a dict with keys ``name``, ``roc_auc``, ``pr_auc``, ``f1``,
    ``best_thr``, ``pipe`` (the fitted pipeline) and ``cm``.
    """
    pipe.fit(X_tr, y_tr)
    proba = pipe.predict_proba(X_te)[:,1]

    roc = roc_auc_score(y_te, proba)
    pr_auc = average_precision_score(y_te, proba)

    # F1-optimal threshold on the P-R curve. precision[i]/recall[i] pair with
    # thr[i]; the previous `thr[best_idx-1]` lookup picked the threshold one
    # step too low and fell back to 0.5 when the optimum was the first point.
    p, r, thr = precision_recall_curve(y_te, proba)
    best_thr = _best_f1_threshold(p, r, thr)
    y_pred = (proba >= best_thr).astype(int)

    f1 = f1_score(y_te, y_pred)
    cm = confusion_matrix(y_te, y_pred)

    print(f"\n=== {name} ===")
    print(f"ROC-AUC: {roc:.4f} | PR-AUC (clase=1 no_show): {pr_auc:.4f}")
    print(f"F1@best_thr: {f1:.4f} | best_thr≈ {best_thr:.3f}")
    print("Matriz de confusión [ [TN FP] [FN TP] ]:")
    print(cm)
    print("\nReporte de clasificación:")
    print(classification_report(y_te, y_pred, digits=4))

    return {
        "name": name, "roc_auc": roc, "pr_auc": pr_auc,
        "f1": f1, "best_thr": best_thr, "pipe": pipe, "cm": cm
    }

# Fit and score every candidate pipeline on the same train/test split.
results = [
    eval_model(name, pipe, X_train, y_train, X_test, y_test)
    for name, pipe in models.items()
]

# Keep the model with the highest PR-AUC (the first one wins on a tie),
# together with its F1-optimal threshold.
best = max(results, key=lambda entry: entry['pr_auc'])
print("\n>>> Mejor modelo por PR-AUC:", best['name'])
final_model, final_thr = best['pipe'], best['best_thr']


##  Interpretabilidad (importancias / coeficientes)

In [None]:

# Feature names in the order the preprocessor emits them: the numeric
# passthrough columns first, then the one-hot encoded categorical columns.
ohe = final_model.named_steps['pre'].named_transformers_['cat']
if categorical_feats:
    cat_names = ohe.get_feature_names_out(categorical_feats)
else:
    cat_names = []
feat_names = numeric_feats + list(cat_names)

if best['name'] == 'rf':
    # Random forest: rank features by importance and show the 15 largest.
    importances = final_model.named_steps['clf'].feature_importances_
    top = pd.Series(importances, index=feat_names)
    top = top.sort_values(ascending=False).head(15)
    print("\nTop 15 features (RF):")
    display(top)
else:
    # Linear model: sort the coefficients and show both ends, i.e. the ten
    # most negative and the ten most positive.
    coefs = final_model.named_steps['clf'].coef_[0]
    s = pd.Series(coefs, index=feat_names)
    s = s.sort_values()
    print("\nTop -coef (menor prob. de no_show):")
    display(s.head(10))
    print("\nTop +coef (mayor prob. de no_show):")
    display(s.tail(10))


##  Umbral final 

In [None]:

# Report the decision threshold kept for the selected model (its F1-optimal one).
print(f"Umbral final recomendado (max F1): {final_thr:.3f}")
