In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
SEED = 123
np.random.seed(SEED)

In [2]:
# --- Load dataset ---
# `dataset` selects the CSV read from ../data; the same name is reused for the
# results file written at the end of the notebook.
dataset = "test"
df = pd.read_csv(f"../data/{dataset}.csv")

# Every column other than the subset label and the target "y" is used as a feature.
NON_FEATURE_COLS = ("subset", "y")
feature_cols = [col for col in df.columns if col not in NON_FEATURE_COLS]

In [3]:
# --- Model functions and the name -> function dictionary ---
def featureless_model(X_train, X_test, y_train, y_test):
    """Baseline that ignores the features and always predicts the training mean.

    Parameters
    ----------
    X_train, X_test
        Never read; accepted so the signature matches the other model
        functions.
    y_train
        Training targets; their mean is the constant prediction.
    y_test
        Held-out targets the constant prediction is scored against.

    Returns
    -------
    tuple
        ``(mse, mae)`` of the constant prediction on ``y_test``.
    """
    baseline = y_train.mean()
    # One copy of the training mean per test row.
    constant_pred = np.repeat(baseline, len(y_test))
    return (
        mean_squared_error(y_test, constant_pred),
        mean_absolute_error(y_test, constant_pred),
    )


def cv_gam_model(X_train, X_test, y_train, y_test):
    """Fit a cross-validated Lasso on expanded features and score it on the test set.

    The pipeline expands the inputs with ``PolynomialFeatures`` (default
    settings), appends the element-wise exponential of every expanded column,
    standardises the result and fits a ``Lasso``. The Lasso ``alpha`` is chosen
    by grid search with ``cv=5`` on the training data, scored by negative MSE.

    Parameters
    ----------
    X_train, y_train
        Training features and targets used for the grid search and final fit.
    X_test, y_test
        Held-out features and targets used only for scoring.

    Returns
    -------
    tuple
        ``(mse, mae)`` of the selected model's predictions on the test set.
    """
    def with_exponentials(features):
        # Keep the incoming columns and append exp() of each one.
        # NOTE(review): this runs after the polynomial expansion, so exp() is
        # also taken of squared/interaction columns — confirm that is intended.
        return np.hstack([features, np.exp(features)])

    # Ten candidate L1 penalties: 0.01 scaled by successive powers of 1.2.
    alpha_candidates = [0.01 * (1.2 ** step) for step in range(10)]

    steps = [
        ('poly', PolynomialFeatures()),
        ('exp', FunctionTransformer(with_exponentials)),
        ('scaler', StandardScaler()),
        ('lasso', Lasso(max_iter=10000)),
    ]
    search = GridSearchCV(
        estimator=Pipeline(steps),
        param_grid={'lasso__alpha': alpha_candidates},
        cv=5,
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, y_train)

    # Score the grid search's predictions on the held-out rows.
    predictions = search.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )


# Maps the model name recorded in the results to the function that fits and
# scores it; each function takes (X_train, X_test, y_train, y_test) and
# returns (mse, mae).
model_dict = dict(
    featureless=featureless_model,
    cv_gam=cv_gam_model,
)

In [4]:
# Evaluate every model on every (subset, test fold, training category) combination.
# For each subset, its rows are split into 5 folds; each fold in turn is the test
# set, and each model is trained on one of three row sets:
#   "same"  - the remaining rows of the same subset,
#   "other" - all rows of the other subsets,
#   "all"   - the union of the two.
results = []

# Build the feature matrix and target vector once instead of re-deriving them
# from the frame for every single model fit inside the loops.
X_all = df[feature_cols].values
y_all = df["y"].values

for subset in np.unique(df["subset"]):
    subset_idx = np.where(df["subset"] == subset)[0]
    # Rows outside this subset do not depend on the fold, so compute them once per subset.
    other_indices = np.where(df["subset"] != subset)[0]

    # NOTE(review): KFold is used without shuffle, so folds are contiguous blocks
    # in row order and the NumPy seed has no effect on the split — confirm the
    # rows are not ordered within a subset.
    kf = KFold(n_splits=5)
    for fold, (train_idx, test_idx) in enumerate(kf.split(subset_idx)):
        # kf.split yields positions within subset_idx; map them back to row positions in df.
        test_indices = subset_idx[test_idx]
        same_indices = subset_idx[train_idx]
        all_indices = np.concatenate([same_indices, other_indices])

        print(f"test size: {len(test_idx)} - same size: {len(train_idx)} - other size: {len(other_indices)} - all size: {len(all_indices)}")

        train_dict = {
            "same": same_indices,
            "other": other_indices,
            "all": all_indices
        }

        for category, train_indices in train_dict.items():
            # Iterate the registry itself so a model added to model_dict is
            # evaluated without also having to edit a hard-coded name list.
            for model, fit_and_score in model_dict.items():
                mse, mae = fit_and_score(
                    X_train=X_all[train_indices],
                    X_test=X_all[test_indices],
                    y_train=y_all[train_indices],
                    y_test=y_all[test_indices])

                results.append({
                    "subset": subset,
                    "category": category,
                    "test_fold": fold + 1,  # report folds 1-based
                    "model": model,
                    "mse": mse,
                    "mae": mae
                })

test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900
test size: 100 - same size: 400 - other size: 500 - all size: 900


In [5]:
# Save results
# Turn the list of per-run metric records into a table and write it, without
# the row index, to a CSV named after the dataset.
results_df = pd.DataFrame(data=results)
output_path = f"../results/{dataset}.csv"
results_df.to_csv(output_path, index=False)