In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, FunctionTransformer, StandardScaler

# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
np.random.seed(seed=123)

In [2]:
# Dataset name: selects the input file ../data/<dataset>.csv and names the
# results file ../results/<dataset>.csv written at the end of the notebook.
dataset = "synthetic"

In [3]:
# --- Load dataset ---
# Read ../data/<dataset>.csv. Every column other than the "subset" grouping
# label and the target "y" is treated as a feature.
df = pd.read_csv(f"../data/{dataset}.csv")
feature_cols = [col for col in df.columns if col not in ("subset", "y")]

# Standardize the feature columns in place; the fitted scaler stays available.
# NOTE(review): the scaler is fitted on every row, i.e. before the CV split
# performed later in this notebook, so held-out rows contribute to the scaling
# statistics -- confirm this is acceptable for the reported metrics.
feature_scaler = StandardScaler().fit(df[feature_cols])
df[feature_cols] = feature_scaler.transform(df[feature_cols])

# Standardize the target the same way (double brackets pass it as a 2-D frame).
# The errors recorded later are therefore in standardized-target units.
target_scaler = StandardScaler().fit(df[["y"]])
df["y"] = target_scaler.transform(df[["y"]])

In [4]:
# --- Models: each takes (X_train, X_test, y_train, y_test) and returns (mse, mae); collected in model_dict ---
def featureless_model(X_train, X_test, y_train, y_test):
    """Baseline that ignores the features and predicts the training-target mean.

    Args:
        X_train: Unused; accepted so all models share one call signature.
        X_test: Unused; accepted so all models share one call signature.
        y_train: Training targets; only their mean is used.
        y_test: Held-out targets the constant prediction is scored against.

    Returns:
        Tuple ``(mse, mae)`` of the constant prediction on ``y_test``.
    """
    baseline = y_train.mean()
    # One copy of the training mean per held-out row.
    y_pred = np.full(len(y_test), baseline)
    return mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

def cv_gam_model(X_train, X_test, y_train, y_test):
    """Fit polynomial features -> standardization -> cross-validated Lasso, then score it.

    The three steps run as a single sklearn ``Pipeline`` fitted on the training
    data only. ``PolynomialFeatures`` and ``StandardScaler`` use their default
    settings; ``LassoCV`` is configured with ``cv=4`` and ``n_jobs=-1``.

    Args:
        X_train: Training feature matrix.
        X_test: Held-out feature matrix to predict on.
        y_train: Training targets.
        y_test: Held-out targets the predictions are scored against.

    Returns:
        Tuple ``(mse, mae)`` of the fitted pipeline's predictions on the test set.
    """
    steps = [
        ('poly', PolynomialFeatures()),
        ('scaler', StandardScaler()),
        ('lasso', LassoCV(cv=4, n_jobs=-1)),
    ]
    fitted = Pipeline(steps).fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    return mse, mae

# Registry mapping a model's name to its evaluation function. Each function is
# called as f(X_train, X_test, y_train, y_test) and returns (mse, mae).
model_dict = dict(
    featureless=featureless_model,
    cv_gam=cv_gam_model,
)

In [5]:
# --- record results ---
# For every subset, run 5-fold CV over that subset's rows. Each held-out fold
# is scored by every model in `model_dict`, trained on three alternative
# training sets:
#   "same"  - the remaining folds of the same subset
#   "other" - all rows belonging to any other subset
#   "all"   - the union of the two
# One record (subset, category, test_fold, model, mse, mae) is appended per run.
results = []

# Materialise the feature matrix and target once: they do not change inside
# the loops, so rebuilding them from `df` for every model call is wasted work.
X_all = df[feature_cols].values
y_all = df["y"].values

# KFold without shuffling holds no state, so one splitter serves every subset.
# NOTE(review): folds are contiguous blocks in row order -- confirm the rows
# within a subset are not sorted in a way that would bias the folds.
kf = KFold(n_splits=5)

for subset in np.unique(df["subset"]):
    in_subset = df["subset"] == subset
    subset_idx = np.where(in_subset)[0]
    # Depends only on the subset, not on the fold, so compute it once here.
    other_indices = np.where(~in_subset)[0]

    # kf.split yields positions *within* subset_idx; map them back to row
    # positions in the full frame before indexing X_all / y_all.
    for fold, (train_idx, test_idx) in enumerate(kf.split(subset_idx)):
        test_indices = subset_idx[test_idx]
        same_indices = subset_idx[train_idx]
        all_indices = np.concatenate([same_indices, other_indices])

        train_dict = {
            "same":  same_indices,
            "other": other_indices,
            "all":   all_indices
        }

        for category, train_indices in train_dict.items():
            for model, model_fn in model_dict.items():
                mse, mae = model_fn(
                    X_train=X_all[train_indices],
                    X_test=X_all[test_indices],
                    y_train=y_all[train_indices],
                    y_test=y_all[test_indices],
                )

                results.append({
                    "subset":    subset,
                    "category":  category,
                    "test_fold": fold + 1,  # report folds 1-based
                    "model":     model,
                    "mse":       mse,
                    "mae":       mae
                })

results_df = pd.DataFrame(results)

# Create the output directory first so a long CV run is not lost to a missing
# ../results folder when writing the CSV.
results_path = Path(f"../results/{dataset}.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(results_path, index=False)