<a href="https://colab.research.google.com/github/joshua12cx/tarea-7/blob/main/tarea_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tarea 7

- Alumno: Joshua Josue Pedraza Perez

In [None]:
# =========================================
# 1) Load the data and define the target
# =========================================
import os, warnings
import numpy as np
import pandas as pd

# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Single seed reused by the splits, CV objects and models further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_FILE = "insurance.csv"   # <-- your file
TARGET    = "charges"         # variable to predict
assert os.path.exists(DATA_FILE), f"No se encuentra {DATA_FILE}"

df = pd.read_csv(DATA_FILE)
df.info()
# A bare `df.head()` in the middle of a cell is evaluated and thrown away;
# display() is required for the preview to actually render.
display(df.head())

# Separate the target column from the predictors.
y  = df[TARGET]
X  = df.drop(columns=[TARGET])

print("Shape:", X.shape,
      "| y(mean):", round(y.mean(), 4),
      "| y(std):", round(y.std(), 4),
      "| y[min,max]:", (round(y.min(), 4), round(y.max(), 4)))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
Shape: (1338, 6) | y(mean): 13270.4223 | y(std): 12110.0112 | y[min,max]: (1121.8739, 63770.428)


In [None]:
# Show a bounded, richly rendered preview instead of the plain-text repr of a tuple.
display(X.head(), y.head())

(    edad  estudio  horas clase_salario
 0     27        9     40         MENOR
 1     30        9     40         MENOR
 2     42        9     40         MAYOR
 3     68        9     25         MAYOR
 4     34        8     40         MENOR
 ..   ...      ...    ...           ...
 95    32       13     55         MENOR
 96    20       10     25         MENOR
 97    46        3     40         MENOR
 98    37        9     45         MENOR
 99    47        9     12         MAYOR
 
 [100 rows x 4 columns],
 0     12967.33
 1     20171.40
 2     39051.04
 3     33567.25
 4      8247.42
         ...   
 95     3297.72
 96     2266.79
 97    17190.41
 98      552.72
 99    32805.86
 Name: valor_salario, Length: 100, dtype: float64)

In [None]:
# =========================================
# 2) Early hold-out split (80/20)
# =========================================
from sklearn.model_selection import train_test_split

# 20% of the rows are held out; the seed keeps the partition reproducible.
split_parts = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_parts

print(f"Train: {X_train.shape} | Test: {X_test.shape}")


Train: (1070, 6) | Test: (268, 6)


In [None]:
# =========================================
# 3) Preprocessing (inside a pipeline)
# =========================================
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.pipeline import Pipeline as ImbPipeline  # imblearn only for API consistency

# Column groups are derived from the training split's dtypes.
cat_features = X_train.select_dtypes(include=["object","category"]).columns.tolist()
num_features = X_train.select_dtypes(include=["number","bool"]).columns.tolist()

# Version-tolerant OneHotEncoder: fall back to the `sparse` keyword when the
# installed scikit-learn rejects `sparse_output` with a TypeError.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Template transformer: scale numeric columns, one-hot encode categorical ones,
# drop anything else.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", ohe,              cat_features),
    ],
    remainder="drop",
)

def build_pipe(model):
    """Return a prep -> variance-filter -> model pipeline around ``model``.

    Parameters
    ----------
    model : estimator
        Regressor placed in the final ``"model"`` step.

    Returns
    -------
    ImbPipeline
        Pipeline with steps ``"prep"``, ``"var0"`` and ``"model"``.

    Notes
    -----
    The module-level ``preprocessor`` is cloned so every pipeline owns its own
    transformer; otherwise fitting one pipeline would refit the transformer
    object shared by all previously built pipelines.
    """
    # Note: SMOTE is NOT used for regression
    return ImbPipeline([
        ("prep", clone(preprocessor)),
        ("var0", VarianceThreshold(0.0)),  # removes constant columns left after OHE
        ("model", model),
    ])

print(f"Features numéricas: {num_features}")
print(f"Features categóricas: {cat_features}")

Features numéricas: ['age', 'bmi', 'children']
Features categóricas: ['sex', 'smoker', 'region']


In [None]:
# =========================================
# 4) Modelos candidatos (REGRESIÓN)
# =========================================
%pip install -q catboost lightgbm xgboost
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor   # <- ahora sí funcionará



candidates = [
    ("LR",  LinearRegression()),
    ("RG",  Ridge(random_state=RANDOM_STATE)),
    ("LS",  Lasso(random_state=RANDOM_STATE, max_iter=5000)),
    ("EN",  ElasticNet(random_state=RANDOM_STATE, max_iter=5000)),
    ("KNR", KNeighborsRegressor()),
    ("DTR", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ("RFR", RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)),
    ("MLP", MLPRegressor(hidden_layer_sizes=(64,), max_iter=800, random_state=RANDOM_STATE)),
    ("XGB", XGBRegressor(tree_method="hist", random_state=RANDOM_STATE,
                         n_estimators=400, learning_rate=0.05, max_depth=6,
                         subsample=0.9, colsample_bytree=0.9, n_jobs=-1)),
    ("LGB", LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=-1,
                          subsample=0.9, colsample_bytree=0.9,
                          random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)),
    ("CAT", CatBoostRegressor(iterations=600, learning_rate=0.05, depth=6,
                              random_state=RANDOM_STATE, l2_leaf_reg=3.0,
                              verbose=False, allow_writing_files=False, thread_count=-1)),
]


In [None]:
# =========================================
# 5) Baseline with CV (no tuning)
# =========================================
from sklearn.model_selection import KFold, cross_validate

cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# The error scorers are negated ("neg_*"), so their sign is flipped back below.
scoring = dict(
    rmse="neg_root_mean_squared_error",
    mae="neg_mean_absolute_error",
    r2="r2",
)

rows = []
for name, model in candidates:
    pipe = build_pipe(model)
    scores = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    # Average each metric over the folds.
    fold_means = {metric: scores[f"test_{metric}"].mean() for metric in scoring}
    row = {
        "model": name,
        "rmse": -fold_means["rmse"],
        "mae":  -fold_means["mae"],
        "r2":    fold_means["r2"],
    }
    rows.append(row)
    print(f"{name:>3} | RMSE {row['rmse']:.3f} | MAE {row['mae']:.3f} | R² {row['r2']:.3f}")

# Lowest RMSE first; the top row is the baseline winner.
baseline_df = pd.DataFrame(rows)
baseline_df = baseline_df.sort_values("rmse")
display(baseline_df)

baseline_best_name  = baseline_df["model"].iloc[0]
baseline_best_model = next(est for key, est in candidates if key == baseline_best_name)
print(f">>> Baseline ganador: {baseline_best_name}")


 LR | RMSE 6123.354 | MAE 4234.984 | R² 0.739
 RG | RMSE 6123.344 | MAE 4240.596 | R² 0.739
 LS | RMSE 6123.254 | MAE 4234.960 | R² 0.739
 EN | RMSE 8535.779 | MAE 6287.739 | R² 0.493
KNR | RMSE 5929.600 | MAE 3654.660 | R² 0.754
DTR | RMSE 6509.477 | MAE 3062.401 | R² 0.705
RFR | RMSE 4882.499 | MAE 2743.686 | R² 0.834
MLP | RMSE 13817.309 | MAE 8327.476 | R² -0.327
XGB | RMSE 5085.556 | MAE 2954.053 | R² 0.820
LGB | RMSE 5169.661 | MAE 3225.067 | R² 0.814
CAT | RMSE 4806.357 | MAE 2755.987 | R² 0.839


Unnamed: 0,model,rmse,mae,r2
10,CAT,4806.357264,2755.98675,0.838629
6,RFR,4882.498571,2743.685581,0.834013
8,XGB,5085.556013,2954.053106,0.819977
9,LGB,5169.660763,3225.067165,0.814158
4,KNR,5929.599636,3654.66034,0.754348
2,LS,6123.253505,4234.960273,0.738862
1,RG,6123.344389,4240.596409,0.738855
0,LR,6123.353823,4234.98357,0.738853
5,DTR,6509.476765,3062.401066,0.704827
3,EN,8535.778876,6287.739072,0.493491


>>> Baseline ganador: CAT


In [None]:
# NOTE(review): catboost is already installed by the
# `%pip install -q catboost lightgbm xgboost` line in section 4, and
# CatBoostRegressor is imported and used before this cell runs.
%pip install catboost



In [None]:
# =========================================
# 6) Tuning with CV and choice of the winner
# =========================================
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
try:
    from scipy.stats import loguniform
except ImportError:
    # Only a missing symbol should trigger the fallback; a bare `except`
    # would also hide unrelated failures (including KeyboardInterrupt).
    from sklearn.utils.fixes import loguniform

# 5 folds for the cheap linear models, 3 folds for the tree ensembles.
cv_light = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_heavy = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
heavy_models = {"RFR", "XGB", "LGB", "CAT"}

# Search spaces keyed by model short name; the "model__" prefix routes each
# parameter to the pipeline's final step.
param_spaces = {
    "RG":  {"model__alpha": loguniform(1e-3, 1e3)},
    "LS":  {"model__alpha": loguniform(1e-3, 1e2)},
    "EN":  {"model__alpha": loguniform(1e-3, 1e2), "model__l1_ratio": uniform(0.0, 1.0)},
    "KNR": {"model__n_neighbors": randint(2, 50), "model__weights": ["uniform","distance"], "model__p":[1,2]},
    "DTR": {"model__max_depth": randint(3, 16), "model__min_samples_leaf": randint(1, 10)},
    "RFR": {"model__n_estimators": randint(200, 600), "model__max_depth": randint(4, 16),
            "model__min_samples_split": randint(2, 20), "model__min_samples_leaf": randint(1, 10),
            "model__max_features": ["sqrt","log2", None], "model__bootstrap": [True, False]},
    "MLP": {"model__alpha": loguniform(1e-4, 1e-1), "model__learning_rate_init": loguniform(1e-4, 1e-2)},
    "XGB": {"model__n_estimators": randint(250, 600), "model__learning_rate": loguniform(5e-3, 2e-1),
            "model__max_depth": randint(3, 9), "model__subsample": uniform(0.7, 0.3),
            "model__colsample_bytree": uniform(0.7, 0.3), "model__min_child_weight": randint(1, 6)},
    "LGB": {"model__n_estimators": randint(300, 800), "model__learning_rate": loguniform(5e-3, 2e-1),
            "model__num_leaves": randint(16, 128), "model__max_depth": randint(-1, 12),
            "model__min_child_samples": randint(10, 50), "model__subsample": uniform(0.7, 0.3),
            "model__colsample_bytree": uniform(0.7, 0.3), "model__reg_lambda": loguniform(1e-3, 10)},
    "CAT": {"model__iterations": randint(300, 700), "model__learning_rate": loguniform(5e-3, 2e-1),
            "model__depth": randint(4, 10), "model__l2_leaf_reg": loguniform(1e-2, 30),
            "model__border_count": randint(32, 255)},
}

# Models that are actually tuned. They run single-threaded because the search
# itself is parallelised with n_jobs=-1.
to_tune = [
    ("RG",  Ridge(random_state=RANDOM_STATE)),
    ("EN",  ElasticNet(random_state=RANDOM_STATE, max_iter=5000)),
    ("RFR", RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=1)),
    ("XGB", XGBRegressor(tree_method="hist", random_state=RANDOM_STATE, n_jobs=1)),
    ("LGB", LGBMRegressor(random_state=RANDOM_STATE, n_jobs=1, verbosity=-1)),
    ("CAT", CatBoostRegressor(random_state=RANDOM_STATE, verbose=False, allow_writing_files=False, thread_count=1)),
]

refit_metric = "rmse"
scoring = {"rmse": "neg_root_mean_squared_error", "mae": "neg_mean_absolute_error", "r2": "r2"}

# One (name, fitted pipeline, CV RMSE, best params) tuple per tuned model.
best_models = []
for name, base_model in to_tune:
    pipe = build_pipe(base_model)
    search = RandomizedSearchCV(
        pipe, param_spaces[name],
        n_iter=12, cv=(cv_heavy if name in heavy_models else cv_light),
        scoring=scoring, refit=refit_metric,  # was a hard-coded "rmse"; refit_metric was unused
        n_jobs=-1, random_state=RANDOM_STATE, verbose=1
    )
    search.fit(X_train, y_train)
    # best_score_ is the mean of the negated refit scorer, so flip the sign.
    best_models.append((name, search.best_estimator_, -search.best_score_, search.best_params_))

# Lowest CV RMSE first.
best_models.sort(key=lambda x: x[2])
best_name, final_pipe_opt, best_cv_rmse, best_params = best_models[0]
print(f">>> GANADOR OPTIMIZADO: {best_name} (RMSE CV={best_cv_rmse:.3f})")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Fitting 3 folds for each of 12 candidates, totalling 36 fits
>>> GANADOR OPTIMIZADO: RFR (RMSE CV=4544.495)


In [None]:
# =========================================
# 7) Final evaluation on TEST
# =========================================
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# NOTE(review): this scores the tuned pipeline on the test split before the
# baseline-vs-tuned comparison cell picks the final model — confirm that
# looking at TEST this early is intended.
winner_pipe = final_pipe_opt
winner_pipe.fit(X_train, y_train)
y_pred = winner_pipe.predict(X_test)

# `squared=False` is deprecated and later removed in scikit-learn; taking the
# square root of the MSE gives the RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"TEST → RMSE: {rmse:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
>>> GANADOR OPTIMIZADO: RFR (RMSE CV=6503.536)


In [None]:
# =========================================
# 7) Fair comparison (CV only) - baseline vs tuned winner
# =========================================
from sklearn.model_selection import KFold, cross_validate

# Both contenders are scored on the same folds, seeded independently of
# RANDOM_STATE.
same_cv = KFold(n_splits=5, shuffle=True, random_state=123)
pipe_baseline_best = build_pipe(baseline_best_model)
pipe_tuned_best    = final_pipe_opt

def cv_rmse(pipe, name):
    """Cross-validate ``pipe`` on the training split and return its mean RMSE.

    The value is also printed, prefixed with ``name``.
    """
    rmse_scoring = {"rmse": "neg_root_mean_squared_error"}
    cv_out = cross_validate(
        pipe, X_train, y_train,
        cv=same_cv, scoring=rmse_scoring, n_jobs=-1,
    )
    # The scorer is negated, so flip the sign of the fold average.
    rmse = -cv_out["test_rmse"].mean()
    print(f"{name}: RMSE {rmse:.4f}")
    return rmse

rmse_base = cv_rmse(pipe_baseline_best, f"Baseline({baseline_best_name})")
rmse_tune = cv_rmse(pipe_tuned_best,   f"Tuned({best_name})")

# Rule: the tuned model must beat the baseline RMSE by at least 1%;
# otherwise the (simpler) baseline is kept.
relative_gain = (rmse_base - rmse_tune) / rmse_base
winner_name, winner_pipe = (
    (best_name, pipe_tuned_best)
    if relative_gain >= 0.01
    else (baseline_best_name, pipe_baseline_best)
)

print(f">>> Modelo seleccionado para TEST: {winner_name}")


Baseline(CAT): RMSE 4838.5754
Tuned(RFR): RMSE 4555.8941
>>> Modelo seleccionado para TEST: RFR


In [None]:
# =========================================
# 8) Interpretability + Error Analysis
# =========================================
from sklearn.inspection import permutation_importance

# When the 1% rule keeps the baseline, winner_pipe is a freshly built pipeline
# that has never been fitted (cross_validate only fits clones), and
# permutation_importance requires a fitted estimator. Fit it here first.
winner_pipe.fit(X_train, y_train)

# Importance of the original (pre-encoding) features: each column of X_test
# is shuffled n_repeats times and the change in score is recorded.
r = permutation_importance(
    winner_pipe, X_test, y_test,
    n_repeats=10, random_state=RANDOM_STATE,
    scoring="neg_root_mean_squared_error"
)

imp = pd.DataFrame({
        "feature": X_test.columns,
        "importance": r.importances_mean,
        "std": r.importances_std
     }).sort_values("importance", ascending=False).head(10)

print("\nTop-10 variables más importantes:")
print(imp.to_string(index=False))



Top-10 variables más importantes:
 feature   importance        std
  smoker 11009.279084 499.842425
     bmi  3247.687885 255.720153
     age  2537.138266 157.873788
children   198.397191  64.897310
  region    29.550823  14.548357
     sex    -3.484138   4.795162


In [None]:
# =========================================
# 9) Final evaluation on TEST
# =========================================
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# winner_pipe is deliberately NOT reassigned to final_pipe_opt here: it already
# holds the pipeline picked by the 1% rule in the comparison cell, and
# overwriting it would silently discard that decision.
# Train the selected pipeline on the whole training split.
winner_pipe.fit(X_train, y_train)

# Predictions on the held-out test split
y_pred = winner_pipe.predict(X_test)

# Final metrics. mean_squared_error returns the MSE; its square root is the
# RMSE that the label below reports.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae  = mean_absolute_error(y_test, y_pred)
r2   = r2_score(y_test, y_pred)

print(f"TEST → RMSE: {rmse:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}")

# Quick look at the first 10 predictions; the index is reset so y_true lines
# up positionally with the y_pred array.
preview = pd.DataFrame({
    "y_true": y_test.reset_index(drop=True),
    "y_pred": pd.Series(y_pred)
}).head(10)
print("\nPreview:")
print(preview.to_string(index=False))

TEST → RMSE: 18771960.1786 | MAE: 2457.5694 | R²: 0.8791

Preview:
     y_true       y_pred
 9095.06825 10567.678342
 5272.17580  5899.859021
29330.98315 27540.767948
 9301.89355 10495.533324
33750.29180 34827.732703
 4536.25900  6413.515171
 2117.33885  2320.954717
14210.53595 14146.747310
 3732.62510  5481.393229
10264.44210 11342.579257


In [None]:
# =========================================
# 10) Interpretability + Error Analysis
# =========================================
from sklearn.inspection import permutation_importance

# 10.1 Variable importance (on the original, pre-encoding features)
r = permutation_importance(
    estimator=winner_pipe, X=X_test, y=y_test,
    scoring="neg_root_mean_squared_error",
    n_repeats=10, random_state=RANDOM_STATE,
)

importance_table = pd.DataFrame({
    "feature": X_test.columns,
    "importance": r.importances_mean,
    "std": r.importances_std,
})
imp = importance_table.sort_values("importance", ascending=False).head(10)
print("\nTop-10 variables más importantes:")
print(imp.to_string(index=False))

# 10.2 Error summary (positional alignment of truth and prediction)
res = pd.DataFrame({
    "y_true": y_test.reset_index(drop=True),
    "y_pred": pd.Series(y_pred),
})
res["abs_err"] = res["y_true"].sub(res["y_pred"]).abs()
abs_err_summary = res["abs_err"].describe(percentiles=[.1,.25,.5,.75,.9])
print("\nResumen de |error|:")
print(abs_err_summary.to_string())

# 10.3 Worst 10 cases, shown next to their input features
print("\nPeores 10 casos (|error| alto):")
top_bad_idx = res["abs_err"].nlargest(10).index
worst_errors = res.loc[top_bad_idx]
worst_inputs = X_test.reset_index(drop=True).loc[top_bad_idx]
worst_cases = pd.concat([worst_errors, worst_inputs], axis=1)
print(worst_cases.to_string(index=False))



Top-10 variables más importantes:
 feature   importance        std
  smoker 11009.279084 499.842425
     bmi  3247.687885 255.720153
     age  2537.138266 157.873788
children   198.397191  64.897310
  region    29.550823  14.548357
     sex    -3.484138   4.795162

Resumen de |error|:
count      268.000000
mean      2457.569382
std       3574.912513
min         32.646485
10%        506.357492
25%        940.310877
50%       1511.499691
75%       2208.876644
90%       3509.915637
max      21404.175218

Peores 10 casos (|error| alto):
     y_true       y_pred      abs_err  age    sex    bmi  children smoker    region
28476.73499  7072.559772 21404.175218   40 female 41.420         1     no northwest
33471.97189 12355.994136 21115.977754   52 female 37.525         2     no northwest
23082.95533  2413.921899 20669.033431   19   male 33.100         0     no southwest
30259.99556 13584.880776 16675.114784   60   male 28.595         0     no northeast
63770.42801 47147.751920 16622.676090   

In [None]:
# =========================================
# BONUS: Variable importance after OHE
# =========================================
prep_step = winner_pipe.named_steps["prep"]
var_step  = winner_pipe.named_steps["var0"]

# Transform X_test through EVERY step that precedes the model. The model was
# trained on the output of "var0", so skipping it would feed the model extra
# columns whenever the variance filter dropped something.
Xtr = var_step.transform(prep_step.transform(X_test))

# Extract the final model
model = winner_pipe.named_steps["model"]

# Permutation importances computed on the processed features.
# Stored under its own name so the R² metric held in `r2` is not overwritten.
perm_ohe = permutation_importance(
    model, Xtr, y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    scoring="neg_root_mean_squared_error"
)

# Feature names after preprocessing (scaled numerics + OHE), restricted to the
# columns kept by the variance filter so they line up with Xtr.
feat_names_ohe = prep_step.get_feature_names_out()[var_step.get_support()]

# DataFrame with the top 20 most important variables
imp_ohe = (pd.DataFrame({
                "feature": feat_names_ohe,
                "importance": perm_ohe.importances_mean,
                "std": perm_ohe.importances_std
          })
          .sort_values("importance", ascending=False)
          .head(20)
)
print("\nTop-20 variables más importantes (post-OHE):")
print(imp_ohe.to_string(index=False))



Top-20 variables más importantes (post-OHE):
              feature  importance        std
       cat__smoker_no 6773.926078 328.241829
             num__bmi 3247.687885 255.720153
             num__age 2537.138266 157.873788
      cat__smoker_yes 1824.191509 137.676467
        num__children  198.397191  64.897310
cat__region_northeast   14.668718  12.030894
cat__region_northwest    6.162347   5.273618
cat__region_southwest    4.759870   3.475133
cat__region_southeast    2.526905   2.494326
      cat__sex_female   -1.457321   2.152520
        cat__sex_male   -2.504692   2.667217


In [None]:
# Render the post-OHE importance table built in the previous cell.
imp_ohe

Unnamed: 0,feature,importance,std
5,cat__smoker_no,6773.926078,328.241829
1,num__bmi,3247.687885,255.720153
0,num__age,2537.138266,157.873788
6,cat__smoker_yes,1824.191509,137.676467
2,num__children,198.397191,64.89731
7,cat__region_northeast,14.668718,12.030894
8,cat__region_northwest,6.162347,5.273618
10,cat__region_southwest,4.75987,3.475133
9,cat__region_southeast,2.526905,2.494326
3,cat__sex_female,-1.457321,2.15252
