<a href="https://colab.research.google.com/github/lalithsrivatsa/Machine_Learning/blob/master/ML101.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Multi-model REGRESSION: Linear/Lasso/Ridge/DecisionTree/Bagging/RF/AdaBoost/CatBoost

# - CV hyperparameter tuning for each model

# - Single holdout evaluation

# ---------------------------------------------------------

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.base import clone

# CatBoost is a third-party package: run the %pip install cell above before this import.

from catboost import CatBoostRegressor


In [None]:
# 1) Create synthetic data ----------------------------------------------------
# One seeded generator drives every random draw in this cell, so the order of
# the rng calls below is part of the dataset definition.
rng = np.random.RandomState(42)
n = 3000

# Numeric features.
age = rng.normal(40, 12, n).clip(18, 80)
income = rng.lognormal(mean=10, sigma=0.5, size=n)
tenure = rng.exponential(scale=5, size=n)
score1 = rng.normal(0, 1, n)
score2 = rng.normal(5, 2, n)

# Categorical features, sampled with fixed level probabilities.
city = rng.choice(["NY", "SF", "CHI", "DAL"], size=n, p=[0.35, 0.25, 0.25, 0.15])
segment = rng.choice(["A", "B", "C"], size=n, p=[0.5, 0.3, 0.2])

# Target: linear signal in the features plus Gaussian noise (sd 1.2).
noise = rng.normal(0, 1.2, n)
y = (
    0.05 * age
    + 0.00003 * income
    + 0.2 * np.log1p(tenure)
    + 0.8 * score1
    - 0.4 * score2
    + (city == "SF") * 1.5
    + (segment == "A") * 0.8
    + noise
)

df = pd.DataFrame(
    {
        "age": age,
        "income": income,
        "tenure": tenure,
        "score1": score1,
        "score2": score2,
        "city": city,
        "segment": segment,
        "target": y,
    }
)

# Inject some missingness: 5% of rows in each of three numeric columns,
# then 2% of rows in "segment". The target column is left complete.
for col in ["age", "income", "tenure"]:
    missing_rows = rng.choice(n, size=int(0.05 * n), replace=False)
    df.loc[missing_rows, col] = np.nan
missing_rows = rng.choice(n, size=int(0.02 * n), replace=False)
df.loc[missing_rows, "segment"] = np.nan

# Column roles used by the preprocessing and modelling cells.
target = "target"
num_cols = ["age", "income", "tenure", "score1", "score2"]
cat_cols = ["city", "segment"]

# Feature matrix and target as independent copies of the frame's columns.
X = df[num_cols + cat_cols].copy()
y = df[target].copy()

In [None]:
# Quick look at the feature matrix: first rows, numeric summary, NaN count per column.
for view in (X.head(), X.describe(), X.isna().sum()):
    print(view)

         age        income     tenure    score1    score2 city segment
0  45.960570   8485.348260   2.050758  0.568645  4.093481  DAL       A
1  38.340828  14325.658292   1.186265  0.382922  8.436879   NY       C
2  47.772262  17911.481903   1.878430  1.626431  2.420648   NY       A
3  58.276358  56604.503529   0.985193  0.236043  7.507470   NY       A
4  37.190160  29093.689159  11.609661 -0.386023  3.470302   NY       A
               age         income       tenure       score1       score2
count  2850.000000    2850.000000  2850.000000  3000.000000  3000.000000
mean     40.470615   24531.223885     4.842064    -0.008811     5.040783
std      11.513878   13120.434262     4.721768     0.967599     2.021178
min      18.000000    4499.169456     0.001205    -3.922400    -1.642229
25%      32.357557   15430.128475     1.460349    -0.660915     3.694789
50%      40.223701   21743.307498     3.410642    -0.005663     5.037790
75%      48.064605   30308.949649     6.663398     0.639934    

In [None]:
# 2) Preprocessors ------------------------------------------------------------
# Two ColumnTransformers are built. They share the categorical branch and differ
# only in whether numeric columns are standardized after median imputation.

# Categorical branch: fill gaps with the most frequent level, then one-hot encode.
cat_ohe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Scaled preprocessor (for linear models)
num_scaled = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])
pre_scaled = ColumnTransformer(transformers=[
    ("num", num_scaled, num_cols),
    ("cat", cat_ohe, cat_cols),
])

# Unscaled preprocessor (trees/boosting don't need scaling)
num_unscaled = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
])
pre_unscaled = ColumnTransformer(transformers=[
    ("num", num_unscaled, num_cols),
    ("cat", cat_ohe, cat_cols),
])

In [None]:

# 3) Holdout split ------------------------------------------------------------
# Reserve 20% of the rows as a fixed (seeded) holdout set; the remaining 80%
# is what the cross-validated search is fitted on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# 4) Define models + param grids ---------------------------------------------
# Each entry maps a display name to a spec with three keys:
#   "pre"  - the ColumnTransformer placed in front of the estimator
#   "est"  - an unfitted estimator prototype
#   "grid" - the parameter grid to search; keys carry the "model__" prefix
# Entries are inserted in the order they should be tuned and reported.
models = {}

# --- Linear family (use scaled preprocessor) ---------------------------------
# LinearRegression has no classic hyperparameters; only the intercept is toggled
# ("model__positive" is another possible toggle in recent versions).
models["LinearRegression"] = {
    "pre": pre_scaled,
    "est": LinearRegression(),
    "grid": {"model__fit_intercept": [True, False]},
}
models["Lasso"] = {
    "pre": pre_scaled,
    "est": Lasso(max_iter=10000, random_state=42),
    "grid": {"model__alpha": [0.001, 0.01, 0.1, 1.0]},
}
models["Ridge"] = {
    "pre": pre_scaled,
    "est": Ridge(random_state=42),
    "grid": {"model__alpha": [0.1, 1.0, 10.0, 100.0]},
}

# --- Tree/ensemble family (use unscaled preprocessor) ------------------------
models["DecisionTree"] = {
    "pre": pre_unscaled,
    "est": DecisionTreeRegressor(random_state=42),
    "grid": {
        "model__max_depth": [None, 6, 10, 16],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 5],
    },
}
models["Bagging"] = {
    "pre": pre_unscaled,
    "est": BaggingRegressor(
        estimator=DecisionTreeRegressor(random_state=42),
        random_state=42,
        n_jobs=-1,
    ),
    "grid": {
        "model__n_estimators": [100, 300],
        "model__max_samples": [0.6, 0.8, 1.0],
        "model__max_features": [0.6, 0.8, 1.0],
    },
}
models["RandomForest"] = {
    "pre": pre_unscaled,
    "est": RandomForestRegressor(random_state=42, n_jobs=-1),
    "grid": {
        "model__n_estimators": [300, 600],
        "model__max_depth": [None, 10, 16],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2],
    },
}
models["AdaBoost"] = {
    "pre": pre_unscaled,
    "est": AdaBoostRegressor(
        estimator=DecisionTreeRegressor(random_state=42),
        random_state=42,
    ),
    "grid": {
        "model__n_estimators": [200, 400],
        "model__learning_rate": [0.05, 0.1, 0.2],
        # Reaches through AdaBoost into its base tree.
        "model__estimator__max_depth": [2, 3, 4],
    },
}
models["CatBoost"] = {
    "pre": pre_unscaled,
    "est": CatBoostRegressor(
        loss_function="RMSE",
        random_seed=42,
        verbose=False,  # silence training logs
    ),
    "grid": {
        "model__depth": [4, 6, 8],
        "model__learning_rate": [0.03, 0.06, 0.1],
        "model__l2_leaf_reg": [1, 3, 5],
        "model__iterations": [500, 800],
    },
}


In [None]:

# 5) CV + GridSearch for each model ------------------------------------------
# For every entry in `models`:
#   1. wrap its preprocessor and a fresh clone of its estimator in a Pipeline,
#   2. grid-search the pipeline with 5-fold CV on the training split (scored by RMSE),
#   3. evaluate the refitted best pipeline once on the holdout split,
#   4. for models exposing `coef_`, print the ten largest-magnitude coefficients,
#   5. append the metrics and best parameters to `results`.

cv = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, cfg in models.items():
    print(f"\n=== {name} ===")
    pipe = Pipeline([
        ("prep", cfg["pre"]),
        ("model", clone(cfg["est"]))
    ])

    gs = GridSearchCV(
        estimator=pipe,
        param_grid=cfg["grid"],
        scoring="neg_root_mean_squared_error",
        cv=cv,
        n_jobs=-1,
        refit=True,
        verbose=0
    )

    gs.fit(X_train, y_train)
    # The scorer is negated RMSE, so flip the sign back for reporting.
    best_rmse_cv = -gs.best_score_
    print("Best CV RMSE:", round(best_rmse_cv, 4))
    print("Best params:", gs.best_params_)

    # Holdout evaluation
    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)
    print(f"Holdout -> RMSE: {rmse:.4f} | MAE: {mae:.4f} | R2: {r2:.4f}")

    fitted_estimator = best_model.named_steps["model"]
    if hasattr(fitted_estimator, "coef_"):
        # Label each coefficient with the feature name emitted by the preprocessing step.
        feature_names = best_model.named_steps["prep"].get_feature_names_out()
        importances = pd.Series(fitted_estimator.coef_, index=feature_names)

        print("\n--- Top 10 Feature Importances ---")
        # Rank by absolute value to surface the most impactful features, but print
        # the signed coefficient so the direction of each effect is not lost.
        print(importances.sort_values(key=np.abs, ascending=False).head(10))

    results.append({
        "model": name,
        "cv_rmse": best_rmse_cv,
        "holdout_rmse": rmse,
        "holdout_mae": mae,
        "holdout_r2": r2,
        "best_params": gs.best_params_
    })



=== LinearRegression ===
Best CV RMSE: 1.2183
Best params: {'model__fit_intercept': True}
Holdout -> RMSE: 1.2143 | MAE: 0.9794 | R2: 0.6182

--- Top 10 Feature Importances ---
cat__city_SF      1.086871
num__score2       0.849713
num__score1       0.821383
num__age          0.557701
cat__segment_A    0.490010
cat__city_DAL     0.375353
cat__city_CHI     0.374654
num__income       0.351114
cat__city_NY      0.336864
cat__segment_C    0.276981
dtype: float64

=== Lasso ===
Best CV RMSE: 1.2182
Best params: {'model__alpha': 0.001}
Holdout -> RMSE: 1.2146 | MAE: 0.9796 | R2: 0.6181

--- Top 10 Feature Importances ---
cat__city_SF      1.422554
num__score2       0.848706
num__score1       0.820325
cat__segment_A    0.701170
num__age          0.556819
num__income       0.350333
num__tenure       0.118395
cat__segment_C    0.058741
cat__city_CHI     0.030735
cat__city_DAL     0.029423
dtype: float64

=== Ridge ===
Best CV RMSE: 1.2183
Best params: {'model__alpha': 1.0}
Holdout -> RMSE: 1.21



Best CV RMSE: 1.3088
Best params: {'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 300}
Holdout -> RMSE: 1.3102 | MAE: 1.0493 | R2: 0.5556

=== AdaBoost ===
Best CV RMSE: 1.3268
Best params: {'model__estimator__max_depth': 4, 'model__learning_rate': 0.2, 'model__n_estimators': 400}
Holdout -> RMSE: 1.3263 | MAE: 1.0580 | R2: 0.5446

=== CatBoost ===




Best CV RMSE: 1.2418
Best params: {'model__depth': 4, 'model__iterations': 500, 'model__l2_leaf_reg': 3, 'model__learning_rate': 0.03}
Holdout -> RMSE: 1.2333 | MAE: 0.9884 | R2: 0.6062


In [None]:
# 6) Compare models -----------------------------------------------------------
# Tabulate the per-model metrics collected in `results`, best holdout RMSE first,
# then list the winning hyperparameters of each model in the order they were tuned.
summary_cols = ["model", "cv_rmse", "holdout_rmse", "holdout_mae", "holdout_r2"]
res_df = pd.DataFrame(results).sort_values("holdout_rmse")

print("\n=== Model Comparison (sorted by holdout RMSE) ===")
print(res_df[summary_cols])

print("\nBest params per model (peek):")
for entry in results:
    print(entry["model"], "->", entry["best_params"])


=== Model Comparison (sorted by holdout RMSE) ===
              model   cv_rmse  holdout_rmse  holdout_mae  holdout_r2
0  LinearRegression  1.218256      1.214317     0.979415    0.618232
2             Ridge  1.218250      1.214325     0.979442    0.618227
1             Lasso  1.218179      1.214552     0.979571    0.618084
7          CatBoost  1.241800      1.233347     0.988358    0.606173
5      RandomForest  1.308784      1.310164     1.049334    0.555587
4           Bagging  1.308481      1.310270     1.049200    0.555515
6          AdaBoost  1.326802      1.326264     1.058007    0.544598
3      DecisionTree  1.510409      1.526592     1.209347    0.396633

Best params per model (peek):
LinearRegression -> {'model__fit_intercept': True}
Lasso -> {'model__alpha': 0.001}
Ridge -> {'model__alpha': 1.0}
DecisionTree -> {'model__max_depth': 6, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5}
Bagging -> {'model__max_features': 1.0, 'model__max_samples': 0.6, 'model__n_esti