Explain here

In [22]:
import pandas as pd
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import catboost
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LassoCV
import joblib  

In [20]:
df_unscaled = pd.read_csv('data_regular.csv')
df_scaled = pd.read_csv('data_scaled.csv')

In [None]:
#Defining target and features. 
y = df_regular['bandgap_transformed'].values
y_scaled = df_scaled['bandgap_transformed'].values
X_scaled = df_scaled.drop(columns=['formula','bandgap_transformed']).values
X_unscaled = df_unscaled.drop(columns=['formula','bandgap_transformed']).values

print(f"Loaded data. Samples: {len(y)}")
print(f"Unscaled Features shape: {X_unscaled.shape}, Scaled Features shape : {X_scaled.shape}")

Loaded data. Samples: 4604
Unscaled Features shape: (4604, 115), Scaled Features shape : (4604, 115)


In [27]:
test_reg = df_unscaled.drop(columns=['formula','bandgap_transformed'])
test_scaled = df_scaled.drop(columns=['formula','bandgap_transformed'])
test_reg.to_csv("test_regular.csv",index=False)
test_scaled.to_csv("test_scaled.csv",index=False)

In [None]:
#Define the three models. 

# Model A: CatBoost. These hyperparameters were obtained via fine-tuning
cat_model = CatBoostRegressor(
    iterations=1556,
    learning_rate=0.11039,
    depth=7,
    l2_leaf_reg=8,
    random_strength=4.48,
    bagging_temperature=0.6,
    loss_function='RMSE',
    verbose=False,
    allow_writing_files=False
)

# Model B: Kernel Ridge Regression. Hyperparameters obtained via fine-tuning
krr_model = KernelRidge(
    alpha=0.0439, 
    kernel='rbf', 
    gamma=0.008535
)

# Model C: Multi Layer Perceptron
mlp_model = MLPRegressor(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=6.9168e-05,
    batch_size=32,
    learning_rate_init=0.0011,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42
)

In [None]:
# Lets tune our hyperparameters for each model first.
# Model 1: CATBOOST
def tune_catboost(trial):
    # 1. Define the search space
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        
        # Fixed settings
        'loss_function': 'RMSE',
        'verbose': False,
        'random_state': 42
    }
    
    # 2. Setup the model with these params
    model = catboost.CatBoostRegressor(**params)
    
    # 3. Evaluate with Cross-Validation (Match your OOF folds!)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # We use negative MSE because Optuna wants to MAXIMIZE by default
    # or we can tell Optuna to MINIMIZE and return positive MSE.
    # Here we calculate RMSE manually.
    scores = cross_val_score(model, X_unscaled, y, cv=kf, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scores.mean())
    
    return rmse

# 4. Run the optimization
study = optuna.create_study(direction='minimize')
study.optimize(tune_catboost, n_trials=10) # Run 10 experiments

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)


[I 2026-01-07 13:32:23,898] A new study created in memory with name: no-name-dc9e6e94-6a8d-46f3-99dc-e8a2adced5bf
[I 2026-01-07 13:33:23,189] Trial 0 finished with value: 0.3164717427988849 and parameters: {'iterations': 834, 'depth': 9, 'learning_rate': 0.01925965306763024, 'l2_leaf_reg': 3.5129738106965727, 'random_strength': 0.12725395375330315, 'bagging_temperature': 0.7223388407811896}. Best is trial 0 with value: 0.3164717427988849.
[I 2026-01-07 13:35:15,104] Trial 1 finished with value: 0.3074623933731757 and parameters: {'iterations': 1635, 'depth': 9, 'learning_rate': 0.13710936575035773, 'l2_leaf_reg': 7.51892183105149, 'random_strength': 0.9298783749496169, 'bagging_temperature': 0.4825748907083077}. Best is trial 1 with value: 0.3074623933731757.
[I 2026-01-07 13:35:59,452] Trial 2 finished with value: 0.3196598649121201 and parameters: {'iterations': 1890, 'depth': 7, 'learning_rate': 0.014152941114918143, 'l2_leaf_reg': 6.497382799829499, 'random_strength': 0.00448455987

Best RMSE: 0.3062658917545854
Best Params: {'iterations': 1556, 'depth': 7, 'learning_rate': 0.1103922005542335, 'l2_leaf_reg': 8.003181868648774, 'random_strength': 4.480410045372684e-08, 'bagging_temperature': 0.6000398690450359}


In [17]:
# Fine tune the KRR model
# Define CV Strategy (Must match your final Stacking strategy)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
print("\n--- Starting KRR Optimization ---")

def objective_krr(trial):
    # KRR Search Space
    # Alpha: Regularization (Small = fit closer to data, Large = smoother)
    alpha = trial.suggest_float('alpha', 1e-4, 10.0, log=True)
    
    # Gamma: Kernel coefficient (Low = broad influence, High = tight influence)
    gamma = trial.suggest_float('gamma', 1e-4, 10.0, log=True)
    
    # Define model
    model = KernelRidge(
        kernel='rbf', 
        alpha=alpha, 
        gamma=gamma
    )
    
    # Evaluate
    # We use negative MSE because Optuna maximizes by default
    scores = cross_val_score(model, X_scaled, y_scaled, cv=kf, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scores.mean())
    return rmse

# Run KRR Study
study_krr = optuna.create_study(direction='minimize')
study_krr.optimize(objective_krr, n_trials=25) # 25 trials is usually enough for KRR

print(f"Best KRR RMSE: {study_krr.best_value:.4f}")
print(f"Best KRR Params: {study_krr.best_params}")

[I 2026-01-07 13:49:10,394] A new study created in memory with name: no-name-791c62ce-c27e-4512-bfd1-4286d9ad5403



--- Starting KRR Optimization ---


[I 2026-01-07 13:49:14,448] Trial 0 finished with value: 0.6064623723030624 and parameters: {'alpha': 0.0006833302185876645, 'gamma': 0.000892843032345799}. Best is trial 0 with value: 0.6064623723030624.
[I 2026-01-07 13:49:18,267] Trial 1 finished with value: 0.6893230474753267 and parameters: {'alpha': 0.0009019537193413744, 'gamma': 0.010877454796272269}. Best is trial 0 with value: 0.6064623723030624.
[I 2026-01-07 13:49:21,947] Trial 2 finished with value: 0.9674810422764966 and parameters: {'alpha': 0.0034159798723400426, 'gamma': 2.83525982924391}. Best is trial 0 with value: 0.6064623723030624.
[I 2026-01-07 13:49:25,886] Trial 3 finished with value: 0.7725117567216324 and parameters: {'alpha': 0.00018242601610643337, 'gamma': 0.010188470144849882}. Best is trial 0 with value: 0.6064623723030624.
[I 2026-01-07 13:49:29,623] Trial 4 finished with value: 0.6552353777881657 and parameters: {'alpha': 0.08855414321565064, 'gamma': 0.0010174818226046471}. Best is trial 0 with value:

Best KRR RMSE: 0.5705
Best KRR Params: {'alpha': 0.043967340094409654, 'gamma': 0.008535174416681082}


In [18]:
#Finally, lets optimize the MLP
print("\n--- Starting MLP Optimization ---")

def objective_mlp(trial):
    # MLP Search Space
    # Hidden Layers: Try different depths and widths
    layers = trial.suggest_categorical('hidden_layer_sizes', [
        (64, 32),
        (128, 64),
        (128, 64, 32),
        (256, 128)
    ])
    
    # Alpha: L2 Regularization (Crucial for preventing overfitting in neural nets)
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    
    # Learning Rate: How big the steps are
    lr_init = trial.suggest_float('learning_rate_init', 1e-4, 1e-2, log=True)
    
    # Activation: ReLU is standard, but Tanh sometimes works for physics
    activation = trial.suggest_categorical('activation', ['relu', 'tanh'])

    # Define model
    model = MLPRegressor(
        hidden_layer_sizes=layers,
        alpha=alpha,
        learning_rate_init=lr_init,
        activation=activation,
        solver='adam',
        max_iter=500,           # Give it enough epochs to converge
        early_stopping=True,    # Stop if validation score stops improving
        n_iter_no_change=20,    # Patience
        random_state=42
    )
    
    # Evaluate
    scores = cross_val_score(model, X_scaled, y_scaled, cv=kf, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scores.mean())
    return rmse

# Run MLP Study
# Note: This takes longer than KRR. Adjust n_trials if your computer is slow.
study_mlp = optuna.create_study(direction='minimize')
study_mlp.optimize(objective_mlp, n_trials=15) 

print(f"Best MLP RMSE: {study_mlp.best_value:.4f}")
print(f"Best MLP Params: {study_mlp.best_params}")

[I 2026-01-07 14:00:38,673] A new study created in memory with name: no-name-e8a3a405-3fdf-437d-bf23-b9f99fdd80f0



--- Starting MLP Optimization ---


[I 2026-01-07 14:00:50,108] Trial 0 finished with value: 0.603840578548358 and parameters: {'hidden_layer_sizes': (128, 64, 32), 'alpha': 0.0004939754382258558, 'learning_rate_init': 0.003278069254867378, 'activation': 'relu'}. Best is trial 0 with value: 0.603840578548358.
[I 2026-01-07 14:01:16,053] Trial 1 finished with value: 0.6011178107688402 and parameters: {'hidden_layer_sizes': (256, 128), 'alpha': 2.2104708189624165e-05, 'learning_rate_init': 0.005359893431294426, 'activation': 'relu'}. Best is trial 1 with value: 0.6011178107688402.
[I 2026-01-07 14:01:34,865] Trial 2 finished with value: 0.6267020308399325 and parameters: {'hidden_layer_sizes': (64, 32), 'alpha': 0.002327719747572693, 'learning_rate_init': 0.0003401198954243955, 'activation': 'relu'}. Best is trial 1 with value: 0.6011178107688402.
[I 2026-01-07 14:01:41,503] Trial 3 finished with value: 0.6259498474191206 and parameters: {'hidden_layer_sizes': (64, 32), 'alpha': 1.6905676647098464e-05, 'learning_rate_init'

Best MLP RMSE: 0.5970
Best MLP Params: {'hidden_layer_sizes': (128, 64), 'alpha': 6.916803706597267e-05, 'learning_rate_init': 0.0011251266305436149, 'activation': 'relu'}


In [21]:
# Prepare array to store OOF predictions
# Rows = n_samples, Cols = 3 models
oof_preds = np.zeros((X_unscaled.shape[0], 3))

# Define K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

print("\nStarting Out-of-Fold Prediction Loop...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_unscaled, y)):
    print(f"Processing Fold {fold + 1}...")
    
    # --- Split Data ---
    # Targets are the same for all
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Tree Data (Raw)
    X_tree_train, X_tree_val = X_unscaled[train_idx], X_unscaled[val_idx]
    
    # Scaled data
    X_scaled_train, X_scaled_val = X_scaled[train_idx], X_scaled[val_idx]
    
    # --- Train & Predict Model A (CatBoost) ---
    cat_model.fit(X_tree_train, y_train)
    p_cat = cat_model.predict(X_tree_val)
    oof_preds[val_idx, 0] = p_cat
    
    # --- Train & Predict Model B (KRR) ---
    krr_model.fit(X_scaled_train, y_train)
    p_krr = krr_model.predict(X_scaled_val)
    oof_preds[val_idx, 1] = p_krr
    
    # --- Train & Predict Model C (MLP) ---
    mlp_model.fit(X_scaled_train, y_train)
    p_mlp = mlp_model.predict(X_scaled_val)
    oof_preds[val_idx, 2] = p_mlp
    
    # Optional: Print fold RMSE for sanity check
    fold_rmse = np.sqrt(mean_squared_error(y_val, p_cat))
    print(f"  > CatBoost Fold RMSE: {fold_rmse:.4f}")

# SAVE OUTPUT


# Create a DataFrame for the Meta-Learner
df_oof = pd.DataFrame(oof_preds, columns=['pred_cat', 'pred_krr', 'pred_mlp'])
# Add the actual target for easy training of the Meta-Learner
df_oof['true_target'] = y

# Save
df_oof.to_csv('oof_predictions.csv', index=False)
print("\nSuccess! OOF predictions saved to 'oof_predictions.csv'.")
print("You can now train the Lasso Meta-Learner on this file.")


Starting Out-of-Fold Prediction Loop...
Processing Fold 1...
  > CatBoost Fold RMSE: 0.3148
Processing Fold 2...
  > CatBoost Fold RMSE: 0.2956
Processing Fold 3...
  > CatBoost Fold RMSE: 0.3396
Processing Fold 4...
  > CatBoost Fold RMSE: 0.2982
Processing Fold 5...
  > CatBoost Fold RMSE: 0.3086

Success! OOF predictions saved to 'oof_predictions.csv'.
You can now train the Lasso Meta-Learner on this file.


In [23]:
# 1. LOAD OOF DATA
df_oof = pd.read_csv('oof_predictions.csv')

# X_meta: The predictions from your 3 base models
X_meta = df_oof[['pred_cat', 'pred_krr', 'pred_mlp']].values

# y_meta: The actual log-transformed bandgaps
y_meta = df_oof['true_target'].values

print(f"Loaded OOF Data. Shape: {X_meta.shape}")

# Check correlations between models (Just to see how similar they are)
print("\nCorrelation between base models:")
print(df_oof[['pred_cat', 'pred_krr', 'pred_mlp']].corr())

# ==========================================
# 2. TRAIN LASSO BLENDER
# ==========================================
# We use LassoCV, which automatically finds the best 'alpha' (regularization strength)
# positive=True enforces that weights must be >= 0 (No negative contributions)
meta_model = LassoCV(cv=5, random_state=42, positive=True)

print("\nTraining Meta-Learner...")
meta_model.fit(X_meta, y_meta)

# ==========================================
# 3. INSPECT RESULTS
# ==========================================
# The Coefficients tell you how much the stack "trusts" each model
coefs = meta_model.coef_
intercept = meta_model.intercept_

print("\n--- Meta-Learner Weights ---")
print(f"CatBoost Weight : {coefs[0]:.4f}")
print(f"KRR Weight      : {coefs[1]:.4f}")
print(f"MLP Weight      : {coefs[2]:.4f}")
print(f"Intercept       : {intercept:.4f}")

# Sanity Check: RMSE of the Stack vs. The Best Single Model
stack_preds = meta_model.predict(X_meta)
stack_rmse = np.sqrt(mean_squared_error(y_meta, stack_preds))

print(f"\nStack RMSE (CV) : {stack_rmse:.4f}")

# ==========================================
# 4. SAVE THE META-LEARNER
# ==========================================
joblib.dump(meta_model, 'final_lasso_meta_model.pkl')
print("\nMeta-learner saved as 'final_lasso_meta_model.pkl'")

Loaded OOF Data. Shape: (4604, 3)

Correlation between base models:
          pred_cat  pred_krr  pred_mlp
pred_cat  1.000000  0.936243  0.899331
pred_krr  0.936243  1.000000  0.898431
pred_mlp  0.899331  0.898431  1.000000

Training Meta-Learner...

--- Meta-Learner Weights ---
CatBoost Weight : 0.9006
KRR Weight      : 0.0000
MLP Weight      : 0.1215
Intercept       : -0.0179

Stack RMSE (CV) : 0.3104

Meta-learner saved as 'final_lasso_meta_model.pkl'
