In [1]:

import numpy as np
from sklearn.linear_model import ElasticNet
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

##### Data loading

In [2]:
# Path to the data files
data_path_1 = '../data/case1Data.csv'

# Load the data into a numpy array
data_np = np.loadtxt(data_path_1, delimiter=',', skiprows=1)

# Create a pandas dataframe and use the first row as the column names
data_pd = pd.read_csv(data_path_1, sep=',', header=0)


# Splitting the data into features and target
X = data_pd.iloc[:, 1:]
y = data_pd.iloc[:, 0]

print("X: ", X.shape)
print("y: ", y.shape)


# Define which columns are continuous and categorical
# Example: assuming all features are continuous
# If you have categorical columns, update accordingly
categorical_cols = list(range(X.shape[1]-5, X.shape[1]))  # Last 5 columns as categorical
continuous_cols = list(range(X.shape[1] - 5))  # All columns before the last 5 as continuous

X:  (100, 100)
y:  (100,)


#### Here we specify the preprocesses that needs to be done

##### For continous variables
- Impute using KNN with k = 5 nearest neighbours
- Apply standardscaler this does for each of the inner splits apply:
  - fit_transform on the X_train
  - transform on the y_train




##### For categorical variables
- impute missing values by choosing the most common in the column
- 1-hot encode 

In [3]:
# Define preprocessing steps for continuous and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', KNNImputer(n_neighbors=5)),  # KNN Imputer for continuous features
            ('scaler', StandardScaler())  # Standardization of continuous features
        ]), continuous_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute categorical missing values
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categorical features
        ]), categorical_cols)
    ]
)


#### We here define the pipeline for what is being done in the inner split. 1. preprocess 2. apply regressor

In [4]:

# Define the classifier (example with RandomForest)
regressor = ElasticNet()

# Create a pipeline that first applies preprocessing and then fits the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor)
])

In [7]:
# Nested Cross-Validation (Outer loop for model evaluation, Inner loop for hyperparameter tuning)
# Inner loop for hyperparameter tuning
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization strength
    'regressor__l1_ratio': [.1, .5, .7, .9, .95, .99, 1]  # Mix between Lasso and Ridge
}

# Outer loop for cross-validation (use KFold for regression)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Inner loop for GridSearchCV (hyperparameter tuning)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Evaluate with cross_val_score (outer loop)
nested_score = cross_val_score(grid_search, X, y, cv=outer_cv, n_jobs=1, scoring='neg_mean_squared_error')

# Flip the negative sign to get positive MSE and then take the square root to get RMSE
nested_score_rmse = np.sqrt(-nested_score)

# Print the results
print(f"Nested CV RMSE: {nested_score_rmse.mean()} ± {nested_score_rmse.std()}")
# Print hyperparameters of the best model
print(f"Best hyperparameters: {grid_search.best_params_}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Nested CV RMSE: 27.775401231080156 ± 3.2739866888748668


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [10]:
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline


# Define hyperparameter grid
param_grid = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  
    'regressor__l1_ratio': [.1, .5, .7, .9, .95, .99, 1]  
}

# Outer CV for model evaluation
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Store results
nested_scores = []
best_params_list = []

for train_idx, test_idx in outer_cv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] 
    # Inner CV for hyperparameter tuning
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    
    # Fit GridSearchCV on training data
    grid_search.fit(X_train, y_train)
    
    # Store best parameters
    best_params_list.append(grid_search.best_params_)
    
    # Evaluate best model on test data
    best_model = grid_search.best_estimator_
    test_score = -grid_search.score(X_test, y_test)  # Flip sign since scoring is neg MSE
    nested_scores.append(np.sqrt(test_score))  # Convert to RMSE

# Print final results
print(f"Nested CV RMSE: {np.mean(nested_scores):.4f} ± {np.std(nested_scores):.4f}")

# Print best parameters for each fold
for i, params in enumerate(best_params_list, 1):
    print(f"Best hyperparameters in fold {i}: {params}")


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Nested CV RMSE: 27.3947 ± 3.2507
Best hyperparameters in fold 1: {'regressor__alpha': 1, 'regressor__l1_ratio': 1}
Best hyperparameters in fold 2: {'regressor__alpha': 1, 'regressor__l1_ratio': 1}
Best hyperparameters in fold 3: {'regressor__alpha': 1, 'regressor__l1_ratio': 1}
Best hyperparameters in fold 4: {'regressor__alpha': 1, 'regressor__l1_ratio': 1}
Best hyperparameters in fold 5: {'regressor__alpha': 1, 'regressor__l1_ratio': 1}
