In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import warnings

# Assume these are your own implementations
from models import LinRegLasso, LinRegFusedLasso

# Seed NumPy's global RNG so the np.random draws used below are reproducible
# from one run of this cell to the next.
np.random.seed(42)

def generate_data(n_samples, n_features, beta_type="sparse", noise_std=1.0):
    """Draw a synthetic linear-regression problem ``y = X @ beta + noise``.

    All randomness comes from NumPy's global RNG (``np.random``), so seed it
    beforehand for reproducible output.

    Parameters
    ----------
    n_samples : int
        Number of rows of ``X`` / entries of ``y``.
    n_features : int
        Number of columns of ``X`` / entries of ``beta``.
    beta_type : {"sparse", "piecewise", "smooth", "correlated"}
        Structure of the true coefficient vector:

        * ``"sparse"``     - about one fifth of the entries (at least one when
          ``n_features >= 1``) are standard-normal, the rest are zero.
        * ``"piecewise"``  - the entries are split into five contiguous blocks
          of near-equal size; every block shares one standard-normal value.
        * ``"smooth"``     - a scaled cumulative sum of standard-normal steps.
        * ``"correlated"`` - dense standard-normal ``beta``; ``X`` is redrawn
          with unit variances and pairwise covariance 0.8 between features.
    noise_std : float
        Standard deviation of the Gaussian noise added to ``X @ beta``.

    Returns
    -------
    X : ndarray of shape (n_samples, n_features)
    y : ndarray of shape (n_samples,)
    beta : ndarray of shape (n_features,)

    Raises
    ------
    ValueError
        If ``beta_type`` is not one of the supported names.
    AssertionError
        If the generated ``y`` contains NaN or Inf.
    """
    X = np.random.randn(n_samples, n_features)

    if beta_type == "sparse":
        beta = np.zeros(n_features)
        # ``n_features // 5`` is 0 for fewer than five features, which used to
        # yield an all-zero beta; keep at least one active coefficient.
        n_nonzero = min(n_features, max(1, n_features // 5))
        nonzero_idx = np.random.choice(n_features, size=n_nonzero, replace=False)
        beta[nonzero_idx] = np.random.randn(len(nonzero_idx))

    elif beta_type == "piecewise":
        beta = np.zeros(n_features)
        # array_split covers every index even when n_features is not a
        # multiple of 5 (fixed-size blocks left the tail at zero, or produced
        # empty blocks for n_features < 5). One value is drawn per block, empty
        # or not, so the RNG stream matches the evenly divisible case.
        for block in np.array_split(np.arange(n_features), 5):
            beta[block] = np.random.randn()  # same value in block

    elif beta_type == "smooth":
        beta = np.cumsum(np.random.randn(n_features)) / 5  # slow varying

    elif beta_type == "correlated":
        # Replace the independent design drawn above with a correlated one.
        # The first draw is intentionally kept (and discarded) so the sequence
        # of random numbers consumed by this branch does not change.
        X = np.random.multivariate_normal(
            np.zeros(n_features),
            cov=0.8 * np.ones((n_features, n_features)) + 0.2 * np.eye(n_features),
            size=n_samples,
        )
        beta = np.random.randn(n_features)

    else:
        raise ValueError("Invalid beta_type")

    # Numerical warnings raised while forming y are silenced here; the
    # explicit finiteness check below is what reports a bad result.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        y = X @ beta + np.random.randn(n_samples) * noise_std

    # Raised explicitly (not via ``assert``) so the check survives ``python -O``
    # while keeping the exception type existing callers would see.
    if not np.all(np.isfinite(y)):
        raise AssertionError("y contains NaN or Inf values!")

    return X, y, beta

def zero_prediction_accuracy(beta_true, beta_pred, tol=1e-5):
    """Return the share of truly-zero coefficients that were also estimated as zero.

    A coefficient counts as zero when its absolute value is below ``tol``.

    Parameters
    ----------
    beta_true : array-like
        Ground-truth coefficients.
    beta_pred : array-like
        Estimated coefficients, compared element-wise with ``beta_true``.
    tol : float
        Magnitude threshold below which a coefficient is treated as zero.

    Returns
    -------
    float
        ``recovered zeros / true zeros``, or ``np.nan`` when ``beta_true``
        has no zero entries (the ratio is undefined in that case).
    """
    truth_is_zero = np.abs(beta_true) < tol
    estimate_is_zero = np.abs(beta_pred) < tol

    n_recovered = np.sum(np.logical_and(truth_is_zero, estimate_is_zero))
    n_true_zeros = np.sum(truth_is_zero)

    if n_true_zeros > 0:
        return n_recovered / n_true_zeros
    return np.nan

def _require_finite(values, label):
    """Raise ``ValueError`` if *values* contains NaN or Inf; *label* names it in the message."""
    if np.isnan(values).any():
        raise ValueError(f"NaNs found in {label}")
    if np.isinf(values).any():
        raise ValueError(f"Infs found in {label}")


def _fitted_coefficients(model):
    """Return the coefficient vector stored on a fitted *model*, or ``None`` if none is found."""
    coef = getattr(model, "coef_", None)
    if coef is not None:
        return coef
    # A private attribute written as ``self.__beta_tilde`` inside a class body
    # is name-mangled to ``_ClassName__beta_tilde``, so a plain
    # ``getattr(model, '__beta_tilde')`` can never find it; match on the suffix.
    # NOTE(review): presumably the custom models keep their coefficients in
    # such an attribute - confirm against the ``models`` module.
    for attr_name, value in getattr(model, "__dict__", {}).items():
        if attr_name.endswith("__beta_tilde"):
            return value
    return None


def run_experiments_synthetic_data(n_samples=1000, n_features=5, n_splits=5):
    """Cross-validate every model on every synthetic regime and tabulate the results.

    For each ``(beta_type, noise_std)`` regime one dataset is generated with
    ``generate_data``. Each model is then evaluated with shuffled K-fold
    cross-validation: features are standardised and the target is centred
    using training-fold statistics only, the model is fitted, and the test
    MSE and ``zero_prediction_accuracy`` of its coefficients are recorded.

    Parameters
    ----------
    n_samples : int
        Rows generated per regime (default 1000).
    n_features : int
        Features generated per regime (default 5).
    n_splits : int
        Number of cross-validation folds (default 5).

    Returns
    -------
    pandas.DataFrame
        One row per (model, regime) with columns ``Model``, ``Regime``,
        ``Noise``, ``MSE`` and ``Zero Accuracy`` (both averaged over folds).
        Rows for runs that raised carry ``None`` metrics and the message in
        an ``Error`` column.
    """
    results = []
    models = {
        "Sklearn-Lasso": Lasso(),
        "Sklearn-LinReg": LinearRegression(),
        "MyLasso": LinRegLasso(),
        "MyFusedLasso": LinRegFusedLasso()
    }

    regimes = [
        ("sparse", 1.0),
        ("piecewise", 1.0),
        ("smooth", 1.0),
        ("correlated", 1.0),
        ("sparse", 3.0),  # high noise
        ("piecewise", 3.0),
    ]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Added to the per-feature std so a constant column cannot divide by zero.
    epsilon = 1e-8

    for regime, noise_std in regimes:
        X, y, beta_true = generate_data(
            n_samples, n_features, beta_type=regime, noise_std=noise_std
        )

        for name, model in models.items():
            print(f'\nmodel: {name}, regime {regime}, noise_std {noise_std}')
            # One failing model must not abort the whole sweep, so any error
            # is reported and stored in the results table instead of raised.
            try:
                mse_scores = []
                zero_accs = []

                for train_idx, test_idx in kf.split(X):
                    X_train, X_test = X[train_idx], X[test_idx]
                    y_train, y_test = y[train_idx], y[test_idx]

                    # Scaling statistics come from the training fold only, so
                    # nothing about the test fold leaks into the fit.
                    X_train_mean = X_train.mean(axis=0)
                    X_train_scale = X_train.std(axis=0) + epsilon
                    X_train_standardized = (X_train - X_train_mean) / X_train_scale
                    X_test_standardized = (X_test - X_train_mean) / X_train_scale

                    y_train_mean = y_train.mean()
                    y_train_centered = y_train - y_train_mean

                    _require_finite(X_train_standardized, "X_train_standarized")
                    _require_finite(X_test_standardized, "X_test_standardized")
                    _require_finite(y_train_centered, "y_train_centered")

                    # Warnings emitted while fitting/predicting are silenced;
                    # non-finite predictions are caught explicitly below.
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        model.fit(X_train_standardized, y_train_centered)
                        # Undo the target centring before scoring.
                        y_pred = model.predict(X_test_standardized) + y_train_mean

                    _require_finite(y_pred, "y_pred")

                    mse_scores.append(mean_squared_error(y_test, y_pred))

                    beta_pred = _fitted_coefficients(model)
                    if beta_pred is None:
                        # No coefficients exposed: keep the valid MSE and mark
                        # the zero-recovery score as unavailable.
                        zero_accs.append(np.nan)
                    else:
                        zero_accs.append(zero_prediction_accuracy(beta_true, beta_pred))

                results.append({
                    "Model": name,
                    "Regime": regime,
                    "Noise": noise_std,
                    "MSE": np.mean(mse_scores),
                    "Zero Accuracy": np.mean(zero_accs)
                })

            except Exception as e:
                print('Error: ', e)
                results.append({
                    "Model": name,
                    "Regime": regime,
                    "Noise": noise_std,
                    "MSE": None,
                    "Zero Accuracy": None,
                    "Error": str(e)
                })

    return pd.DataFrame(results)


In [93]:
# Run the synthetic benchmark; the bare name on the last line makes the
# notebook render the returned DataFrame as the cell output.
df_results = run_experiments_synthetic_data()
df_results


model: Sklearn-Lasso, regime sparse, noise_std 1.0

model: Sklearn-LinReg, regime sparse, noise_std 1.0

model: MyLasso, regime sparse, noise_std 1.0
algorithm did not converge
Error:  NaNs found in y_pred

model: MyFusedLasso, regime sparse, noise_std 1.0
algorithm did not converge
Error:  NaNs found in y_pred

model: Sklearn-Lasso, regime piecewise, noise_std 1.0

model: Sklearn-LinReg, regime piecewise, noise_std 1.0

model: MyLasso, regime piecewise, noise_std 1.0
algorithm did not converge
Error:  NaNs found in y_pred

model: MyFusedLasso, regime piecewise, noise_std 1.0
algorithm did not converge
Error:  NaNs found in y_pred

model: Sklearn-Lasso, regime smooth, noise_std 1.0

model: Sklearn-LinReg, regime smooth, noise_std 1.0

model: MyLasso, regime smooth, noise_std 1.0
algorithm did not converge
Error:  NaNs found in y_pred

model: MyFusedLasso, regime smooth, noise_std 1.0
algorithm did not converge
Error:  NaNs found in y_pred

model: Sklearn-Lasso, regime correlated, nois

Unnamed: 0,Model,Regime,Noise,MSE,Zero Accuracy,Error
0,Sklearn-Lasso,sparse,1.0,1.133198,1.0,
1,Sklearn-LinReg,sparse,1.0,1.040611,0.0,
2,MyLasso,sparse,1.0,,,NaNs found in y_pred
3,MyFusedLasso,sparse,1.0,,,NaNs found in y_pred
4,Sklearn-Lasso,piecewise,1.0,1.805487,,
5,Sklearn-LinReg,piecewise,1.0,1.086176,,
6,MyLasso,piecewise,1.0,,,NaNs found in y_pred
7,MyFusedLasso,piecewise,1.0,,,NaNs found in y_pred
8,Sklearn-Lasso,smooth,1.0,1.994529,,
9,Sklearn-LinReg,smooth,1.0,1.012576,,
