In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score

In [3]:
# Sample dataset: Study hours, previous exam scores, and pass/fail labels
data = {
    'StudyHours': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'PrevExamScore': [30, 40, 45, 50, 60, 65, 70, 75, 80, 85],
    'Pass': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  # 0 = Fail, 1 = Pass
}

df = pd.DataFrame(data)

# Features and target variable
X = df[['StudyHours', 'PrevExamScore']]
y = df['Pass']

### LASSO (without feature standardization)
Coefficients are not directly comparable across features with different scales; the L1 penalty becomes unfair and may zero out the “smaller-scale” features. Results and feature selection can be unstable. 


In [4]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Lasso model with alpha (λ) as the regularization parameter
lasso_model = Lasso(alpha=0.01)
lasso_model.fit(X_train, y_train)

# Display the coefficients of the features
print(f"Lasso Coefficients: {lasso_model.coef_}")

Lasso Coefficients: [0.08153909 0.01180619]


### LASSO (with feature standardization)
All features are scaled to mean=0 and std=1, making the L1 penalty fair and coefficients comparable (per 1 SD). Feature selection becomes more meaningful and CV/hyperparameter tuning more stable. Use a Pipeline to avoid data leakage.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

# L1 regularization is sensitive to feature scale.
# Use a Pipeline so the scaler is fit ONLY on the training folds (no data leakage).
pipe = Pipeline([
    ("scaler", StandardScaler()),                 # mean=0, std=1 for each feature
    ("lasso", LassoCV(alphas=[0.001, 0.01, 0.1, 1],
                      cv=5, max_iter=5000, random_state=42))
])

pipe.fit(X_train, y_train)

best_alpha = pipe.named_steps["lasso"].alpha_
coefs = pipe.named_steps["lasso"].coef_

print("Best alpha (via CV):", best_alpha)
print("Selected (non-zero) features:", (coefs != 0).sum(), "/", len(coefs))

# Map coefficients back to feature names for readability
coef_df = pd.DataFrame({"Feature": X_train.columns, "Lasso_Coefficient": coefs})
print(coef_df.sort_values("Lasso_Coefficient", ascending=False).reset_index(drop=True))

# Optional: evaluate with CV on the whole pipeline (scaler refit inside each fold)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="r2")
print("CV R^2 per fold:", cv_scores)
print("Mean CV R^2:", np.mean(cv_scores))


Best alpha (via CV): 0.1
Selected (non-zero) features: 1 / 2
         Feature  Lasso_Coefficient
0     StudyHours           0.317815
1  PrevExamScore           0.000000
CV R^2 per fold: [0.32578012 0.78164702 0.2458622         nan        nan]
Mean CV R^2: nan




## Ridge (without feature standardization)

**What this block does**
- Fits a `Ridge(alpha=1.0)` model on raw (unscaled) features.
- Prints raw coefficients and intercept.
- Reports cross-validated \(R^2\) with `cv=4`.

**Why results can be misleading**
- **Different feature scales** (e.g., `StudyHours: 0–10` vs `PrevExamScore: 0–100`) make raw coefficients **not directly comparable**.
- The L2 penalty acts on the **magnitude of coefficients**. Small-scale features typically need **larger numeric coefficients** to have a similar effect on \(y\), so they get **penalized more**, which can distort interpretability.
- CV scores can be more **variable/unstable** when scale differs across features.

**How to read the output**
- `Coefficients`: effects **per 1 unit** in the *original units* (not comparable across features of different scales).
- `Intercept`: baseline prediction when all features are zero (in original units).
- `Cross-validated R^2`: average out-of-fold fit; treat cautiously when inputs are unscaled.

**Takeaway**
> Use this block only for demonstration. For fair regularization and interpretable coefficients, prefer the standardized pipeline below.


In [6]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# NOTE: No standardization here.
# If features have different scales (e.g., StudyHours: 0–10 vs PrevExamScore: 0–100),
# the raw coefficients are NOT directly comparable.
# L2 penalty in Ridge acts on |beta|, so a small-scale feature tends to get a larger
# numerical coefficient, which is penalized more—even if its "true" effect is similar.
# This can distort interpretability and make regularization "unfair" across features.

ridge = Ridge(alpha=1.0)  # alpha = λ
ridge.fit(X_train, y_train)

print("Coefficients:", ridge.coef_) # L2 regularization coefficient
print("Intercept:", ridge.intercept_) # L2 regularization intercept

score = np.mean(cross_val_score(ridge, X_train, y_train, cv=4)) # cv=3 because not enough samples
print("Cross-validated R^2:", score)

Coefficients: [0.07407407 0.01304121]
Intercept: -0.6898800208659339
Cross-validated R^2: 0.3662920165248332


## Ridge (with feature standardization via Pipeline + RidgeCV)

**What this block does**
- Builds a `Pipeline` with `StandardScaler` (mean=0, std=1) and `RidgeCV` to **select alpha via CV**.
- Fits the pipeline, prints the **best alpha**, standardized **coefficients**, a tidy coefficient table, and **CV \(R^2\)** per fold.

**Why standardization matters**
- Puts all features on the **same scale**, so L2 penalty treats coefficients **evenly** (fair regularization).
- Coefficients become **comparable**: each reflects the change in \(y\) (in SD units) per **1 SD** increase in the feature.
- `Pipeline` avoids **data leakage**: the scaler is fit **inside each fold** during CV.
- Typically yields **more stable** CV/hyperparameter selection and better generalization.

**How to read the output**
- `Best alpha (via CV)`: regularization strength chosen to balance bias–variance.
- `Standardized coefficients`: effects **per 1 SD** of each feature (now comparable across features).
- `CV R^2 per fold` & `Mean CV R^2`: out-of-fold performance; inspect spread to gauge stability.

**Extra notes**
- With **highly correlated features**, Ridge tends to **share weight** across them (no coefficients exactly zero).
- For automatic feature elimination, consider **LASSO** or **Elastic Net** (with nonzero L1 component).

**Takeaway**
> Standardization + Pipeline + RidgeCV = fair penalties, comparable coefficients, and reliable model selection.


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

# Standardization puts all features on the same scale (mean=0, std=1).
# Benefits:
# - Fair regularization: L2 penalty treats all coefficients evenly.
# - Coefficient comparability: each coefficient ≈ effect per 1 SD increase in the feature.
# - More stable CV / hyperparameter selection.
# Using a Pipeline ensures the scaler is fit ONLY on training folds (no data leakage).

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=[0.1, 1.0, 10.0], cv=4))  # CV selects the best alpha
])

pipe.fit(X_train, y_train)

best_alpha = pipe.named_steps["ridge"].alpha_
coefs = pipe.named_steps["ridge"].coef_

print("Best alpha (via CV):", best_alpha)
print("Standardized coefficients:", dict(zip(X_train.columns, coefs)))
# Interpretation: each coefficient is the change in y (in SD units) per 1 SD increase in that feature.

# Optional: show coefficients neatly
coef_df = pd.DataFrame({"Feature": X_train.columns, "Std_Coefficient": coefs})
print(coef_df)

# Evaluate the whole pipeline with CV (scaler is refit in each fold properly)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=4, scoring="r2")
print("CV R^2 per fold:", cv_scores)
print("Mean CV R^2:", np.mean(cv_scores))


Best alpha (via CV): 10.0
Standardized coefficients: {'StudyHours': np.float64(0.12959298903600444), 'PrevExamScore': np.float64(0.1270166191709816)}
         Feature  Std_Coefficient
0     StudyHours         0.129593
1  PrevExamScore         0.127017
CV R^2 per fold: [-0.03476487  0.91030999  0.02135171  0.66114942]
Mean CV R^2: 0.3895115658031173




### Why the coefficients look so different — and why standardization matters

When features live on very different scales (e.g., **StudyHours: 0–10** vs **PrevExamScore: 0–100**), raw linear/Ridge coefficients are **not directly comparable**. A one-unit increase in StudyHours is a much bigger relative change than a one-unit increase in PrevExamScore, so the model assigns a **larger numeric coefficient** to StudyHours (e.g., 0.074 vs 0.013) even if their *true* effects are similar. This also makes L2 regularization **unfair**: Ridge penalizes the magnitude of coefficients, so small-scale features are effectively penalized more.

Standardizing features to **mean = 0, std = 1** fixes this:
- The L2 penalty treats all coefficients evenly (“fair regularization”).
- Coefficients become **interpretable per 1 standard deviation** of each feature and thus comparable (e.g., both ≈0.13 after scaling).
- Cross-validation and hyperparameter selection become more stable.
- Using a `Pipeline` ensures the scaler is fit **inside each CV fold** (no data leakage).

**Bottom line:** Standardize continuous features before Ridge/LASSO/Elastic Net to get fair penalties, comparable coefficients, and more reliable model selection.


## Elastic Net (without feature standardization)

**What this block does**
- Fits `ElasticNetCV` directly on raw (unscaled) features.
- Uses CV to select the best `alpha` (overall penalty) and `l1_ratio` (L1–L2 balance).
- Prints intercept, coefficients, and a tidy coefficient table; also inspects feature correlations.

**Why results can be misleading**
- With features on **different scales** (e.g., 0–10 vs 0–100), the **L1/L2 penalties act unfairly** because they penalize the *numeric size* of coefficients, not the real effect size.
- Coefficients across unscaled features are **not directly comparable**; feature selection (zeros from L1) can become **unstable**.
- CV may prefer hyperparameters that compensate for scale, not true predictive value.

**How to read the output**
- `Best alpha` and `Best l1_ratio`: chosen by CV but **scale-dependent**.
- `Coefficients`: effects **per 1 unit in original units**; not comparable across features with different magnitudes.
- Correlation matrix helps explain **redundancy**; highly correlated features can cause one coefficient to be shrunk toward zero.

**Takeaway**
> Use this unscaled version only as a baseline demo. For fair penalties, stable selection, and interpretable coefficients, prefer the standardized pipeline below.


In [8]:
from sklearn.linear_model import ElasticNetCV
import pandas as pd

# Initialize Elastic Net model with built-in cross-validation
# - l1_ratio controls the balance between L1 (LASSO) and L2 (Ridge) penalties
# - alphas are the candidate regularization strengths (λ values)
# - cv sets the number of cross-validation folds
model = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.9],
    alphas=[0.001, 0.01, 0.1, 1, 10],
    cv=5
)

# Fit the model on the training data
model.fit(X_train, y_train)

# Display the best hyperparameters selected via cross-validation
print("Best alpha:", model.alpha_)       # Optimal regularization strength (λ)
print("Best l1_ratio:", model.l1_ratio_) # Optimal balance between L1 and L2 penalties

# Display model parameters
print("Intercept:", model.intercept_)    # Intercept term (bias)
print("Coefficients:", model.coef_)      # Coefficients of the selected features

# Create a readable summary of coefficients mapped to their corresponding features
coef_df = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficient": model.coef_
})
print(coef_df)

# Examine feature correlations — helps explain why some coefficients are zero
print("\nFeature correlation matrix:")
print(X_train.corr())


Best alpha: 10.0
Best l1_ratio: 0.1
Intercept: -0.7411971830985915
Coefficients: [0.         0.02068662]
         Feature  Coefficient
0     StudyHours     0.000000
1  PrevExamScore     0.020687

Feature correlation matrix:
               StudyHours  PrevExamScore
StudyHours       1.000000       0.993809
PrevExamScore    0.993809       1.000000


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


## Elastic Net (with feature standardization via Pipeline + ElasticNetCV)

**What this block does**
- Builds a `Pipeline` with `StandardScaler` (mean=0, std=1) and `ElasticNetCV`.
- Performs CV *inside the pipeline* (the scaler is fit **within each fold**, avoiding data leakage).
- Prints the best `alpha`, best `l1_ratio`, intercept, standardized coefficients, and a readable coefficient table.

**Why standardization matters**
- Puts all features on the **same scale**, making L1/L2 penalties **fair and comparable**.
- Coefficients become **interpretable per 1 standard deviation** increase in each feature (now comparable across features).
- Leads to **more stable** CV/hyperparameter selection and typically better generalization.
- `Pipeline` ensures proper **train/test separation** for scaling during CV.

**How to read the output**
- `Best alpha` / `Best l1_ratio`: data-driven balance between sparsity (L1) and shrinkage (L2).
- `Coefficients`: change in \(y\) (in its native units) per **1 SD** increase in each feature (since only \(X\) is standardized).
- Intercept reflects the baseline when standardized features are 0 (i.e., at their mean levels), **not necessarily near 0**.

**Extra notes**
- If \(X\) is sparse, use `StandardScaler(with_mean=False)`.
- Increase `max_iter` if you see convergence warnings.
- With highly correlated features, Elastic Net (nonzero L2) tends to **share weight** more stably than pure LASSO.

**Takeaway**
> Standardization + Pipeline + ElasticNetCV = fair regularization, comparable coefficients, stable feature selection, and leakage-free model evaluation.


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNetCV
import pandas as pd
import numpy as np

# Standardize features inside CV to ensure fair L1/L2 penalties and no data leakage
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("enet", ElasticNetCV(
        l1_ratio=[0.1, 0.5, 0.9],
        alphas=[0.001, 0.01, 0.1, 1, 10],
        cv=5,
        max_iter=5000,
        random_state=42
    ))
])

pipe.fit(X_train, y_train)

enet = pipe.named_steps["enet"]
print("Best alpha:", enet.alpha_)        # Optimal regularization strength (λ)
print("Best l1_ratio:", enet.l1_ratio_)  # Optimal L1/L2 balance
print("Intercept:", enet.intercept_)     # Note: after standardization, intercept often near 0
print("Coefficients:", enet.coef_)       # Comparable: effect per 1 SD increase in each feature

coef_df = pd.DataFrame({"Feature": X_train.columns, "Coefficient": enet.coef_})
print(coef_df)

# (Optional) Inspect correlations to understand redundancy / why some coefficients are zero
print("\nFeature correlation matrix:")
print(X_train.corr())


Best alpha: 0.1
Best l1_ratio: 0.5
Intercept: 0.5
Coefficients: [0.20814529 0.15021085]
         Feature  Coefficient
0     StudyHours     0.208145
1  PrevExamScore     0.150211

Feature correlation matrix:
               StudyHours  PrevExamScore
StudyHours       1.000000       0.993809
PrevExamScore    0.993809       1.000000


## Interpretation of the standardized Elastic Net results

- **Best alpha = 0.1**  
  Mild overall regularization; coefficients are shrunk but not heavily.

- **Best l1_ratio = 0.5**  
  Balanced Elastic Net (L1 + L2).  
  → L1 gives some sparsity pressure, L2 stabilizes weights and shares them across correlated features.

- **Intercept ≈ 0.5**  
  With **X standardized (mean=0)**, the intercept is the model’s baseline prediction when features are at their means  
  (≈ mean of *y* for regression, or the base positive rate if *y* is 0/1).

- **Coefficients (comparable after scaling)**  
  - `StudyHours ≈ 0.208`  
  - `PrevExamScore ≈ 0.150`  
  Interpretation: **+1 standard deviation** in a feature increases the prediction by the shown amount (in *y*’s units).  
  Both are positive; **StudyHours is slightly stronger**.

- **Feature correlation ≈ 0.994 (very high)**  
  The two features carry almost the same information.  
  With L2 in Elastic Net, the model **shares weight** between them rather than dropping one (as pure LASSO might).

> **If you want a sparser model:** increase `l1_ratio` (e.g., 0.8–0.9) and/or `alpha`.  
> **If you want stability with correlated features:** keep some L2 (current setting is reasonable).
