In [18]:
# compare_linear_regularization_boston_or_california_with_accuracy.py
# Linear vs Polynomial Linear vs Ridge (L2) vs Lasso (L1) vs Elastic-Net (L1+L2)
# Reports TRAIN/TEST "Accuracy" as R²×100, plus Test RMSE/MAE.

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# -----------------------------
# Data loader (Boston -> fallback to California)
# -----------------------------
def load_boston_or_california():
    try:
        from sklearn.datasets import fetch_openml
        boston = fetch_openml(name="boston", version=1, as_frame=False)
        X = boston.data
        y = boston.target.astype(np.float64)
        feature_names = boston.feature_names
        return X, y, feature_names, "Boston (OpenML)"
    except Exception:
        from sklearn.datasets import fetch_california_housing
        cal = fetch_california_housing(as_frame=False)
        X = cal.data
        y = cal.target.astype(np.float64)
        feature_names = cal.feature_names
        return X, y, feature_names, "California Housing (fallback)"

X, y, feature_names, dataset_name = load_boston_or_california()
print(f"Loaded dataset: {dataset_name} | X shape: {X.shape}, y shape: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# -----------------------------
# Helpers
# -----------------------------
def test_metrics(y_true, y_pred):
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae  = float(mean_absolute_error(y_true, y_pred))
    return rmse, mae

def add_result(name, model, fit=True, store=[]):
    if fit:
        model.fit(X_train, y_train)
    # "Accuracy" for regression = R² × 100
    train_acc = model.score(X_train, y_train) * 100.0
    test_acc  = model.score(X_test,  y_test)  * 100.0
    y_te = model.predict(X_test)
    te_rmse, te_mae = test_metrics(y_test, y_te)
    store.append({
        "model": name,
        "train_acc(%)": round(train_acc, 2),
        "test_acc(%)":  round(test_acc,  2),
        "test_RMSE":    round(te_rmse,   4),
        "test_MAE":     round(te_mae,    4),
    })
    return store

rows = []

# -----------------------------
# 1) Basic Linear Regression (no scaling, no poly)
# -----------------------------
lin_basic = LinearRegression()
rows = add_result("Linear", lin_basic, store=rows)

# -----------------------------
# 2) Polynomial Linear Regression (deg=2)
# -----------------------------
poly_lin = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", LinearRegression())
])
rows = add_result("Polynomial (deg=2) + Linear", poly_lin, store=rows)

# -----------------------------
# 3) Ridge (L2) + Polynomial(deg=2), CV on alpha
# -----------------------------
ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", Ridge())
])
ridge_grid = GridSearchCV(
    ridge_pipe,
    param_grid={"model__alpha": np.logspace(-3, 3, 25)},
    scoring="neg_mean_squared_error",
    cv=5, n_jobs=-1, refit=True
)
ridge_grid.fit(X_train, y_train)
print("\n[Ridge] best params:", ridge_grid.best_params_)
rows = add_result("Polynomial (deg=2) + Ridge (CV)", ridge_grid.best_estimator_, fit=False, store=rows)

# -----------------------------
# 4) Lasso (L1) + Polynomial(deg=2), CV on alpha
# -----------------------------
lasso_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", Lasso(max_iter=10000))
])
lasso_grid = GridSearchCV(
    lasso_pipe,
    param_grid={"model__alpha": np.logspace(-3, 1, 25)},
    scoring="neg_mean_squared_error",
    cv=5, n_jobs=-1, refit=True
)
lasso_grid.fit(X_train, y_train)
print("[Lasso] best params:", lasso_grid.best_params_)
rows = add_result("Polynomial (deg=2) + Lasso (CV)", lasso_grid.best_estimator_, fit=False, store=rows)

# -----------------------------
# 5) Elastic-Net + Polynomial(deg=2), CV on alpha & l1_ratio
# -----------------------------
enet_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", ElasticNet(max_iter=10000))
])
enet_grid = GridSearchCV(
    enet_pipe,
    param_grid={
        "model__alpha": np.logspace(-3, 1, 15),
        "model__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
    },
    scoring="neg_mean_squared_error",
    cv=5, n_jobs=-1, refit=True
)
enet_grid.fit(X_train, y_train)
print("[ElasticNet] best params:", enet_grid.best_params_)
rows = add_result("Polynomial (deg=2) + ElasticNet (CV)", enet_grid.best_estimator_, fit=False, store=rows)

# -----------------------------
# Results table (sorted by Test Accuracy)
# -----------------------------
res = pd.DataFrame(rows).sort_values("test_acc(%)", ascending=False)
print("\n=== Comparison —", dataset_name, "===\n")
print(res.to_string(index=False))


Loaded dataset: Boston (OpenML) | X shape: (506, 13), y shape: (506,)

[Ridge] best params: {'model__alpha': np.float64(17.78279410038923)}
[Lasso] best params: {'model__alpha': np.float64(0.03162277660168379)}
[ElasticNet] best params: {'model__alpha': np.float64(0.0517947467923121), 'model__l1_ratio': 0.3}

=== Comparison — Boston (OpenML) ===

                               model  train_acc(%)  test_acc(%)  test_RMSE  test_MAE
Polynomial (deg=2) + ElasticNet (CV)         93.08        80.42     4.0380    2.6187
     Polynomial (deg=2) + Ridge (CV)         93.24        79.99     4.0815    2.6498
     Polynomial (deg=2) + Lasso (CV)         93.42        79.26     4.1555    2.6486
                              Linear         76.45        67.34     5.2150    3.6099
         Polynomial (deg=2) + Linear         95.17        64.87     5.4086    3.0520


In [19]:
# compare_linear_regularization_california.py
# Linear vs Polynomial Linear vs Ridge (L2) vs Lasso (L1) vs Elastic-Net (L1+L2)
# Dataset: California Housing
# Reports TRAIN/TEST "Accuracy" as R²×100, plus Test RMSE/MAE.

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# -----------------------------
# Data: California Housing
# -----------------------------
cal = fetch_california_housing(as_frame=False)
X = cal.data
y = cal.target.astype(np.float64)
feature_names = cal.feature_names
dataset_name = "California Housing"
print(f"Loaded dataset: {dataset_name} | X shape: {X.shape}, y shape: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# -----------------------------
# Helpers
# -----------------------------
def test_metrics(y_true, y_pred):
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae  = float(mean_absolute_error(y_true, y_pred))
    return rmse, mae

def add_result(name, model, fit=True, store=[]):
    if fit:
        model.fit(X_train, y_train)
    # "Accuracy" for regression = R² × 100
    train_acc = model.score(X_train, y_train) * 100.0
    test_acc  = model.score(X_test,  y_test)  * 100.0
    y_te = model.predict(X_test)
    te_rmse, te_mae = test_metrics(y_test, y_te)
    store.append({
        "model": name,
        "train_acc(%)": round(train_acc, 2),
        "test_acc(%)":  round(test_acc,  2),
        "test_RMSE":    round(te_rmse,   4),
        "test_MAE":     round(te_mae,    4),
    })
    return store

rows = []

# -----------------------------
# 1) Basic Linear Regression (no scaling, no poly)
# -----------------------------
lin_basic = LinearRegression()
rows = add_result("Linear", lin_basic, store=rows)

# -----------------------------
# 2) Polynomial Linear Regression (deg=2)
# -----------------------------
poly_lin = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", LinearRegression())
])
rows = add_result("Polynomial (deg=2) + Linear", poly_lin, store=rows)

# -----------------------------
# 3) Ridge (L2) + Polynomial(deg=2), CV on alpha
# -----------------------------
ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", Ridge())
])
ridge_grid = GridSearchCV(
    ridge_pipe,
    param_grid={"model__alpha": np.logspace(-3, 3, 25)},
    scoring="neg_mean_squared_error",
    cv=5, n_jobs=-1, refit=True
)
ridge_grid.fit(X_train, y_train)
print("\n[Ridge] best params:", ridge_grid.best_params_)
rows = add_result("Polynomial (deg=2) + Ridge (CV)", ridge_grid.best_estimator_, fit=False, store=rows)

# -----------------------------
# 4) Lasso (L1) + Polynomial(deg=2), CV on alpha
# -----------------------------
lasso_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", Lasso(max_iter=10000))
])
lasso_grid = GridSearchCV(
    lasso_pipe,
    param_grid={"model__alpha": np.logspace(-3, 1, 25)},
    scoring="neg_mean_squared_error",
    cv=5, n_jobs=-1, refit=True
)
lasso_grid.fit(X_train, y_train)
print("[Lasso] best params:", lasso_grid.best_params_)
rows = add_result("Polynomial (deg=2) + Lasso (CV)", lasso_grid.best_estimator_, fit=False, store=rows)

# -----------------------------
# 5) Elastic-Net + Polynomial(deg=2), CV on alpha & l1_ratio
# -----------------------------
enet_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", ElasticNet(max_iter=10000))
])
enet_grid = GridSearchCV(
    enet_pipe,
    param_grid={
        "model__alpha": np.logspace(-3, 1, 15),
        "model__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
    },
    scoring="neg_mean_squared_error",
    cv=5, n_jobs=-1, refit=True
)
enet_grid.fit(X_train, y_train)
print("[ElasticNet] best params:", enet_grid.best_params_)
rows = add_result("Polynomial (deg=2) + ElasticNet (CV)", enet_grid.best_estimator_, fit=False, store=rows)

# -----------------------------
# Results table (sorted by Test Accuracy)
# -----------------------------
res = pd.DataFrame(rows).sort_values("test_acc(%)", ascending=False)
print("\n=== Comparison — California Housing ===\n")
print(res.to_string(index=False))


Loaded dataset: California Housing | X shape: (20640, 8), y shape: (20640,)

[Ridge] best params: {'model__alpha': np.float64(177.82794100389228)}
[Lasso] best params: {'model__alpha': np.float64(0.01467799267622069)}
[ElasticNet] best params: {'model__alpha': np.float64(0.026826957952797246), 'model__l1_ratio': 0.1}

=== Comparison — California Housing ===

                               model  train_acc(%)  test_acc(%)  test_RMSE  test_MAE
     Polynomial (deg=2) + Ridge (CV)         67.07        64.62     0.6868    0.4823
                              Linear         61.13        59.26     0.7370    0.5362
Polynomial (deg=2) + ElasticNet (CV)         64.58        45.69     0.8510    0.5139
     Polynomial (deg=2) + Lasso (CV)         62.90        -8.28     1.2015    0.5370
         Polynomial (deg=2) + Linear         68.62       -60.29     1.4618    0.4791


In [20]:
# l1_vs_l2_20newsgroups.py     L1 vs L2 on 20 Newsgroups (Logistic Regression)
# Compare L1 vs L2 regularization on high-dimensional sparse text
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# 1) Load data
train = fetch_20newsgroups(subset="train", remove=("headers","footers","quotes"))
test  = fetch_20newsgroups(subset="test",  remove=("headers","footers","quotes"))

# 2) Pipeline: TF-IDF -> LogisticRegression(saga)
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9)),
    ("clf",   LogisticRegression(solver="saga", max_iter=5000, n_jobs=-1))
])

# 3) Hyperparameters: L2 vs L1 and strength C
Cgrid = np.logspace(-3, 2, 8)
param_grid = [
    {"clf__penalty": ["l2"], "clf__C": Cgrid},
    {"clf__penalty": ["l1"], "clf__C": Cgrid},
]

gs = GridSearchCV(
    pipe, param_grid, cv=5, n_jobs=-1, scoring="accuracy", refit=True, verbose=0
)
gs.fit(train.data, train.target)

best = gs.best_estimator_
y_pred = best.predict(test.data)
acc = accuracy_score(test.target, y_pred)

print("Best params:", gs.best_params_)
print(f"Test accuracy: {acc:.4f}")

# 4) Coefficient sparsity (meaningful for L1)
clf = best.named_steps["clf"]
is_l1 = clf.penalty == "l1"
try:
    # number of features equals length of vectorizer vocab
    n_feats = len(best.named_steps["tfidf"].get_feature_names_out())
except Exception:
    n_feats = clf.coef_.shape[1]

coefs = clf.coef_.ravel()
n_zeros = int(np.isclose(coefs, 0.0).sum())
print(f"Penalty: {clf.penalty.upper()} | Non-zero coefficients: {coefs.size - n_zeros}/{coefs.size}")

# 5) Quick side-by-side: refit best C under the *other* penalty for comparison
def score_for(penalty):
    # choose the best C found for that penalty; if missing, reuse gs.best_params_['clf__C']
    C_candidates = [p["clf__C"] for p in gs.cv_results_["params"] if p["clf__penalty"] == penalty]
    C_use = gs.best_params_["clf__C"] if penalty == gs.best_params_["clf__penalty"] else (C_candidates[0] if C_candidates else 1.0)
    alt = Pipeline([
        ("tfidf", best.named_steps["tfidf"]),
        ("clf",   LogisticRegression(solver="saga", max_iter=5000, n_jobs=-1, penalty=penalty, C=C_use))
    ]).fit(train.data, train.target)
    pred = alt.predict(test.data)
    acc  = accuracy_score(test.target, pred)
    co   = alt.named_steps["clf"].coef_.ravel()
    zeros = int(np.isclose(co, 0.0).sum())
    print(f"{penalty.upper()} → C={C_use} | Test acc={acc:.4f} | Non-zero coefs={co.size - zeros}/{co.size}")

score_for("l2")
score_for("l1")


Best params: {'clf__C': np.float64(100.0), 'clf__penalty': 'l2'}
Test accuracy: 0.6764
Penalty: L2 | Non-zero coefficients: 2459531/2461460
L2 → C=100.0 | Test acc=0.6763 | Non-zero coefs=2460936/2461460
L1 → C=0.001 | Test acc=0.0524 | Non-zero coefs=0/2461460
