# Factores asociados a enfermedades cardiovasculares
**Autor:** JLBM  
**Fecha:** 17/09/2025  
**Notebook:** eda_heart_disease.ipynb

---

## Propósito
Este cuaderno guía un **Análisis Exploratorio de Datos (EDA)** de un archivo `.csv` con miras a preparar un **modelo predictivo de *Machine Learning*** sobre enfermedades cardiovasculares.

> **Instrucciones rápidas**
> 1. Actualiza la variable `CSV_PATH` en la siguiente celda con la ruta a tu archivo `.csv`.
> 2. Ejecuta todas las celdas en orden.
> 3. Al finalizar, se generará automáticamente un reporte en Word: **“report_eda_heart_disease.docx”** con los principales hallazgos.


In [None]:

# ============ Initial configuration ============
# `Path` is imported here because this block runs before the main import
# section further down.
from pathlib import Path

# Point this at your CSV file before running. The default is only a
# placeholder (a folder, not a file): the script's folder when `__file__`
# exists, otherwise the current working directory, because notebook kernels
# do not define `__file__`.
CSV_PATH = Path(__file__).parent if "__file__" in globals() else Path.cwd()

# General options
SAVE_DIR = "outputs"  # folder created below and used by savefig()
RANDOM_STATE = 42  # seed passed to PCA, the random forests and the train/test split
REPORT_FILENAME = "report_eda_heart_disease.docx"  # file written by build_report()

import os, sys, json, math, textwrap, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Visualización (evitar seaborn por compatibilidad)
import matplotlib.pyplot as plt

# Modelos y utilidades
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, r2_score
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

from scipy import stats
from scipy.stats import chi2_contingency

# Reporte en Word
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH

# Create the output folder up front so savefig() can write into it;
# exist_ok=True keeps re-runs from failing when the folder already exists.
os.makedirs(SAVE_DIR, exist_ok=True)

def savefig(name):
    """Write the current matplotlib figure to ``SAVE_DIR/<name>.png``.

    The figure is saved with a tight bounding box at 140 dpi and is closed
    afterwards. Returns the path of the written file.
    """
    target = os.path.join(SAVE_DIR, "{}.png".format(name))
    plt.savefig(target, dpi=140, bbox_inches="tight")
    plt.close()
    return target

def detect_types(df, cat_threshold=20):
    """Split the columns of *df* into numeric and categorical names.

    A column is categorical when its dtype is object, category or bool, or
    when it is numeric with at most *cat_threshold* distinct non-null values.
    Every remaining numeric column is numeric. Boolean columns are selected
    explicitly because they match neither ``np.number`` nor ``object`` and
    would otherwise be missing from both lists.

    Returns ``(num_cols, cat_cols)`` as two lists of column names;
    low-cardinality numeric columns are appended after the dtype-based ones.
    """
    numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    # Numeric columns with few distinct values behave like categories.
    low_cardinality = [
        c for c in numeric if df[c].nunique(dropna=True) <= cat_threshold
    ]
    cat_cols.extend(low_cardinality)

    reclassified = set(low_cardinality)
    num_cols = [c for c in numeric if c not in reclassified]
    return num_cols, cat_cols

def summarize_missing(df):
    """Return the fraction of missing values per column, largest first.

    Columns without any missing value are left out of the result.
    """
    ranked = df.isna().mean().sort_values(ascending=False)
    has_missing = ranked > 0
    return ranked[has_missing]

def iqr_outliers(series):
    """Count values outside the 1.5 * IQR fences of *series*.

    Quartiles are computed with ``np.nanpercentile``. Returns a tuple
    ``(n_outliers, lower_fence, upper_fence)`` as ``(int, float, float)``.
    """
    lower_q, upper_q = np.nanpercentile(series, [25, 75])
    spread = upper_q - lower_q
    lo = lower_q - 1.5 * spread
    hi = upper_q + 1.5 * spread
    outside = (series < lo) | (series > hi)
    n_outside = int(outside.sum())
    return n_outside, float(lo), float(hi)

def auto_target(df):
    """Guess the label column of *df* from its name.

    Returns the first column (in DataFrame order) whose lower-cased name is
    one of the usual label names or contains "disease"; ``None`` when no
    column matches.
    """
    known_names = {"target", "disease", "heart_disease", "outcome", "label", "y"}
    matches = (
        col
        for col in df.columns
        if "disease" in col.lower() or col.lower() in known_names
    )
    return next(matches, None)

def is_classification(y):
    """Heuristic: treat *y* as a classification target.

    True when *y* has an integer dtype, or otherwise when it has at most 20
    distinct values.
    """
    if pd.api.types.is_integer_dtype(y):
        return True
    return y.nunique() <= 20

def corr_df(df, cols):
    """Return the Pearson correlation matrix of the columns *cols* of *df*."""
    subset = df[cols]
    # Pearson is the default method of DataFrame.corr().
    return subset.corr()

def plot_hist(series, title):
    """Save a 30-bin histogram of the non-null values of *series*.

    The x axis is labelled with the series name. Returns the path produced
    by savefig() for ``hist_<series name>``.
    """
    label = series.name
    values = series.dropna()

    plt.figure()
    plt.hist(values, bins=30)
    plt.title(title)
    plt.xlabel(label)
    plt.ylabel("Frecuencia")
    return savefig("hist_{}".format(label))

def plot_bar_counts(series, title):
    """Save a bar chart with the frequency of each value of *series*.

    Values are converted to strings before counting. Returns the path
    produced by savefig() for ``bar_<series name>``.
    """
    label = series.name

    plt.figure()
    as_text = series.astype(str)
    frequencies = as_text.value_counts(dropna=False)
    frequencies.plot.bar()
    plt.title(title)
    plt.xlabel(label)
    plt.ylabel("Cuenta")
    return savefig("bar_{}".format(label))

def plot_scatter(x, y, title):
    """Save a scatter plot of *y* against *x*.

    Axis labels come from the ``name`` attribute of each input. Returns the
    path produced by savefig() for ``scatter_<x name>_vs_<y name>``.
    """
    x_label, y_label = x.name, y.name

    plt.figure()
    plt.scatter(x, y, alpha=0.6)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    return savefig("scatter_{}_vs_{}".format(x_label, y_label))

def plot_box(y, x_cat, title):
    """Save box plots of *y*, one box per category of *x_cat*.

    Categories are the distinct values of *x_cat* after conversion to
    strings; null values of *y* are dropped within each group and fliers are
    hidden. Returns the path produced by savefig() for
    ``box_<y name>_by_<x_cat name>``.

    If plotting or saving fails, the figure opened here is closed before the
    exception propagates, so callers that catch the error do not accumulate
    open figures.
    """
    # Convert once instead of once per category.
    groups = x_cat.astype(str)
    cats = groups.unique()
    data = [y[groups == c].dropna() for c in cats]

    plt.figure()
    try:
        plt.boxplot(data, labels=cats, showfliers=False)
        plt.title(title)
        plt.xlabel(x_cat.name)
        plt.ylabel(y.name)
        return savefig(f"box_{y.name}_by_{x_cat.name}")
    except Exception:
        plt.close()
        raise

def plot_heatmap(mat, labels, title):
    """Save *mat* rendered as an image, with *labels* on both axes.

    X tick labels are rotated 90 degrees and a colorbar is added. The file
    name is fixed, so each call goes through savefig("heatmap_correlaciones");
    the resulting path is returned.
    """
    tick_positions = range(len(labels))

    plt.figure()
    plt.imshow(mat, interpolation="nearest")
    plt.xticks(tick_positions, labels, rotation=90)
    plt.yticks(tick_positions, labels)
    plt.title(title)
    plt.colorbar()
    return savefig("heatmap_correlaciones")

def pca_plot(X_num_scaled, title, y=None):
    """Project *X_num_scaled* onto two principal components and save a scatter.

    The plot title includes the summed explained-variance ratio of both
    components. *y* is accepted but not used. Returns a tuple
    ``(path, explained_variance_ratio)`` where the path comes from
    savefig("pca_2d").
    """
    reducer = PCA(n_components=2, random_state=RANDOM_STATE)
    projected = reducer.fit_transform(X_num_scaled)
    ratios = reducer.explained_variance_ratio_

    plt.figure()
    plt.scatter(projected[:, 0], projected[:, 1], alpha=0.6)
    plt.title(f"{title} (VarExp: {ratios.sum():.2f})")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    return savefig("pca_2d"), ratios

def _add_findings_section(doc, heading, findings, figures):
    """Append one findings section: heading, up to 6 bullets, up to 3 figures."""
    doc.add_heading(heading, level=2)
    for line in findings[:6]:
        doc.add_paragraph(f"• {line}")
    if figures:
        doc.add_paragraph("Ilustraciones:")
        for fig in figures[:3]:
            doc.add_picture(fig, width=Inches(5.5))


def build_report(context):
    """Build the Word report from *context* and save it as REPORT_FILENAME.

    *context* is a dict with the keys ``author``, ``date``, ``n_rows``,
    ``n_cols``, ``missing_top`` (list of ``(name, fraction)`` pairs),
    ``univariate_findings``, ``bivariate_findings``,
    ``multivariate_findings``, ``conclusions`` (lists of strings) and
    ``example_plots`` (dict that may hold ``univariate``, ``bivariate`` and
    ``multivariate`` lists of image paths). ``target`` is optional; a missing
    or empty value is shown as "N/D".

    The cover heading is taken from ``context["title"]`` if present, then
    from a module-level ``case_name`` if one is defined, and otherwise falls
    back to a default title, so the function no longer fails with a NameError
    when ``case_name`` does not exist.

    Returns the path the document was saved to.
    """
    doc = Document()

    # Simple cover page
    title = (
        context.get("title")
        or globals().get("case_name")
        or "Factores asociados a enfermedades cardiovasculares"
    )
    h = doc.add_heading(title, level=1)
    h.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p = doc.add_paragraph(f"Autor: {context['author']} | Fecha: {context['date']}")
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph("Reporte: report_eda_heart_disease v1")
    doc.add_page_break()

    # Objectives
    doc.add_heading("Objetivos del reporte", level=2)
    doc.add_paragraph("• Presentar un resumen claro y accionable del análisis exploratorio con miras a un modelo predictivo.")

    # Dataset description
    doc.add_heading("Descripción del conjunto de datos y sus variables", level=2)
    doc.add_paragraph(f"Registros: {context['n_rows']:,} | Variables: {context['n_cols']}")
    # `or` also covers an explicit None, which dict.get's default would not.
    doc.add_paragraph(f"Variable objetivo (si aplica): {context.get('target') or 'N/D'}")
    if context['missing_top']:
        doc.add_paragraph("Principales variables con valores faltantes:")
        for name, frac in context['missing_top']:
            doc.add_paragraph(f"• {name}: {frac:.1%}")
    else:
        doc.add_paragraph("No se identificaron valores faltantes relevantes.")

    # Univariate, bivariate and multivariate findings share one layout
    plots = context['example_plots']
    _add_findings_section(
        doc,
        "Principales hallazgos - Análisis univariado",
        context['univariate_findings'],
        plots.get('univariate'),
    )
    _add_findings_section(
        doc,
        "Principales hallazgos - Análisis bivariado",
        context['bivariate_findings'],
        plots.get('bivariate'),
    )
    _add_findings_section(
        doc,
        "Principales hallazgos - Análisis multivariado",
        context['multivariate_findings'],
        plots.get('multivariate'),
    )

    # Conclusions
    doc.add_heading("Conclusiones", level=2)
    for line in context['conclusions'][:6]:
        doc.add_paragraph(f"• {line}")

    # Save
    report_path = REPORT_FILENAME
    doc.save(report_path)
    return report_path

# Confirmation message printed once the imports and helper definitions above have run.
print("✅ Librerías importadas y utilidades definidas.")


In [None]:

# ============ Data loading ============
# CSV_PATH must point to a CSV file. A folder here means the placeholder from
# the configuration cell was never replaced, so stop with a clear message.
# An explicit raise is used because an assert is skipped under `python -O`,
# and this check needs neither `Path` nor `__file__`.
if os.path.isdir(CSV_PATH):
    raise ValueError(
        f"CSV_PATH apunta a una carpeta ({CSV_PATH}); "
        "actualízalo con la ruta a tu archivo .csv."
    )
df = pd.read_csv(CSV_PATH)
original_shape = df.shape
print("Dimensiones:", original_shape)
df.head(3)


In [None]:

# ============ General profile and light cleaning ============
# Work on a copy so the loaded frame `df` stays untouched.
df_ = df.copy()

# Column types
num_cols, cat_cols = detect_types(df_)
print("Numéricas:", len(num_cols), "| Categóricas:", len(cat_cols))

# Duplicates: report the count and drop exact duplicate rows
dups = df_.duplicated().sum()
print("Duplicados:", dups)
if dups > 0:
    df_ = df_.drop_duplicates()

# Missing values
missing = summarize_missing(df_)
display(missing.to_frame("proporción"))

# Basic statistics. describe() raises on a frame without columns, which
# happens when every numeric column was classified as categorical.
if num_cols:
    display(df_[num_cols].describe().T)
else:
    print("Sin columnas numéricas para describir.")

# Save the column lists
with open(os.path.join(SAVE_DIR, "columns.json"), "w") as f:
    json.dump({"numeric": num_cols, "categorical": cat_cols}, f, indent=2)


In [None]:

# ============ Univariate analysis ============
example_plots_uni = []

# Histograms for numeric columns (capped at 12 to keep the notebook light)
for col in num_cols[:12]:
    example_plots_uni.append(plot_hist(df_[col], f"Histograma: {col}"))

# Bar charts for categorical columns (same cap)
for col in cat_cols[:12]:
    example_plots_uni.append(plot_bar_counts(df_[col], f"Conteos: {col}"))

# Outliers by the IQR rule: one finding per numeric column
uni_findings = []
for col in num_cols:
    n_out, lo, hi = iqr_outliers(df_[col].dropna())
    if n_out <= 0:
        uni_findings.append(f"{col}: sin outliers relevantes por IQR")
        continue
    uni_findings.append(f"{col}: {n_out} potenciales outliers (IQR), límites [{lo:.2f}, {hi:.2f}]")

print("Ejemplos de gráficas univariadas guardadas en", SAVE_DIR)


In [None]:

# ============ Bivariate analysis ============
example_plots_bi = []

# Numeric correlations
if len(num_cols) >= 2:
    corr = corr_df(df_, num_cols)
    fig_path = plot_heatmap(corr.values, num_cols, "Matriz de correlaciones (Pearson)")
    example_plots_bi.append(fig_path)
    # Top absolute correlations. Only the strict upper triangle is kept: the
    # matrix is symmetric, so using every off-diagonal cell would list each
    # pair twice. dropna() removes the masked cells and undefined correlations.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    corr_vals = corr.abs().where(upper).stack().dropna().sort_values(ascending=False)
    top_corr = corr_vals.head(5)
else:
    top_corr = pd.Series(dtype=float)

# Box plots: categorical vs numeric
for cn in cat_cols[:5]:
    for nn in num_cols[:3]:
        try:
            p = plot_box(df_[nn], df_[cn], f"{nn} por {cn}")
        except Exception as e:
            # Best effort: one failed plot must not stop the analysis, but
            # report it instead of hiding it.
            print(f"⚠️ No se pudo generar el boxplot de {nn} por {cn}: {e}")
        else:
            example_plots_bi.append(p)

# Chi-square: categorical vs categorical
chi_findings = []
for i, c1 in enumerate(cat_cols[:5]):
    for c2 in cat_cols[i+1: i+6]:
        tab = pd.crosstab(df_[c1].astype(str), df_[c2].astype(str))
        # The test needs at least a 2x2 table.
        if tab.shape[0] > 1 and tab.shape[1] > 1:
            chi2, p_value, _, _ = chi2_contingency(tab)
            chi_findings.append(f"{c1} ~ {c2}: chi2={chi2:.1f}, p={p_value:.3g}")

bi_findings = []
for (a, b), v in top_corr.items():
    bi_findings.append(f"Fuerte relación {a}~{b} (|r|={v:.2f})")
bi_findings += chi_findings[:5]


In [None]:

# ============ Multivariate analysis ============
example_plots_multi = []
multi_findings = []

# PCA on the numeric columns, median-imputed and standardized
if len(num_cols) >= 2:
    X_num = df_[num_cols].copy()
    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    X_num_scaled = scaler.fit_transform(imputer.fit_transform(X_num))
    p_path, varexp = pca_plot(X_num_scaled, "PCA sobre variables numéricas")
    example_plots_multi.append(p_path)
    multi_findings.append(f"PCA: Varianza explicada por 2 componentes = {varexp.sum():.2f}")
else:
    varexp = np.array([])

# Feature importances (only when a target column is recognized)
target_col = auto_target(df_)
if target_col is not None:
    y = df_[target_col]
    X = df_.drop(columns=[target_col])
    num_cols2, cat_cols2 = detect_types(X)
    # Simple imputation / one-hot encoding, just for quick importances
    X_num = X[num_cols2]
    X_cat = X[cat_cols2].astype("category")
    X_cat = pd.get_dummies(X_cat, dummy_na=True, drop_first=False)
    X_all = pd.concat([X_num, X_cat], axis=1)
    X_all = X_all.fillna(X_all.median(numeric_only=True))

    # Decide the task once and reuse the answer below
    classification = is_classification(y)
    if classification:
        model = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
    else:
        model = RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)

    X_train, X_test, y_train, y_test = train_test_split(
        X_all,
        y,
        test_size=0.25,
        random_state=RANDOM_STATE,
        stratify=y if classification else None,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if classification:
        try:
            # Column 1 is used as the positive-class score; this assumes a binary target.
            y_proba = model.predict_proba(X_test)[:, 1]
            perf = roc_auc_score(y_test, y_proba)
        except (ValueError, IndexError):
            # AUC is not available (e.g. non-binary target or a single class):
            # fall back to accuracy. Only these errors are caught so that
            # interrupts and unrelated bugs are not silenced.
            acc = accuracy_score(y_test, y_pred)
            multi_findings.append(f"Modelo bosque aleatorio (baseline) ACC≈{acc:.3f}")
        else:
            multi_findings.append(f"Modelo bosque aleatorio (baseline) AUC≈{perf:.3f}")
    else:
        perf = r2_score(y_test, y_pred)
        multi_findings.append(f"Modelo bosque aleatorio (baseline) R2≈{perf:.3f}")

    # Top 10 importances, computed the same way for both tasks
    importances = pd.Series(model.feature_importances_, index=X_all.columns).sort_values(ascending=False).head(10)

    plt.figure()
    # Reverse so the most important feature ends up at the top of the chart.
    importances.iloc[::-1].plot(kind="barh")
    plt.title("Top 10 importancias de características (baseline)")
    imp_path = savefig("feature_importances")
    example_plots_multi.append(imp_path)
else:
    importances = pd.Series(dtype=float)


In [None]:

# ============ Report construction (Word) ============
# Results of earlier cells are looked up in globals() so this cell still runs
# when one of the analysis cells was skipped.
context = {
    "author": "JLBM",
    "date": "17/09/2025",
    "n_rows": int(df_.shape[0]),
    "n_cols": int(df_.shape[1]),
    "target": globals().get("target_col"),
    "missing_top": [(k, float(v)) for k, v in summarize_missing(df_).head(5).items()],
    "univariate_findings": globals().get("uni_findings", [])[:10],
    "bivariate_findings": globals().get("bi_findings", [])[:10],
    "multivariate_findings": globals().get("multi_findings", [])[:10],
    "conclusions": [],
    "example_plots": {
        "univariate": globals().get("example_plots_uni", []),
        "bivariate": globals().get("example_plots_bi", []),
        "multivariate": globals().get("example_plots_multi", []),
    },
}

# Automatic (brief) conclusion rules
if context['missing_top']:
    context['conclusions'].append("Existen variables con valores faltantes que requerirán imputación.")
if len(context['bivariate_findings']) > 0:
    context['conclusions'].append("Se identificaron relaciones entre variables que orientan la selección de características.")
# The baseline model is only trained when a target column was found; without
# a target the multivariate findings hold just the PCA line, so the baseline
# conclusion must not be added.
if context['target'] is not None and len(context['multivariate_findings']) > 0:
    context['conclusions'].append("Un modelo baseline ofrece una referencia inicial de desempeño.")
if len(context['conclusions']) == 0:
    context['conclusions'].append("No se detectaron hallazgos críticos en la exploración inicial.")

report_path = build_report(context)
print("✅ Reporte generado:", report_path)



---

## Notas
- Las figuras se guardan en la carpeta `outputs/`.
- El reporte Word se guarda en el directorio de trabajo actual como **report_eda_heart_disease.docx** (máximo ~7 páginas, con resúmenes e ilustraciones).
- Este EDA es reproducible y sirve como base para la siguiente fase de modelado (selección de variables y validación).

