In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor

In [2]:
# 1. LOAD ALL DATA
# Reads every CSV the notebook needs; files are read in the same order as listed.
print("Loading datasets...")

# Training frames, used to refit the base models.
# The "regular" frame feeds the tree model, the "scaled" frame feeds KRR / MLP.
df_train_reg, df_train_scl = (
    pd.read_csv('data_regular.csv'),
    pd.read_csv('data_scaled.csv'),
)

# Leaderboard (test) frames.
# IMPORTANT: these are expected to carry the same columns as the training
# frames, minus the target column.
df_test_reg, df_test_scl = (
    pd.read_csv('test_regular.csv'),
    pd.read_csv('test_scaled.csv'),
)

# Out-of-fold base-model predictions, used to fit the meta-learner.
df_oof = pd.read_csv('oof_predictions.csv')

Loading datasets...


In [4]:
# Define Target (Log-Transformed Bandgap)
TARGET_COL = 'bandgap_transformed'
NON_FEATURE_COLS = ['formula', TARGET_COL]

y_train = df_train_reg[TARGET_COL].values

# Define Training Features.
# The feature DataFrames are kept (not just their arrays) so that the exact
# training column order can be reused to build the test matrices below.
train_tree_features = df_train_reg.drop(columns=NON_FEATURE_COLS)
train_dense_features = df_train_scl.drop(columns=NON_FEATURE_COLS)
X_train_tree = train_tree_features.values
X_train_dense = train_dense_features.values


def _align_test_features(test_df, train_columns, label):
    """Return the test feature matrix restricted to the training columns.

    Selecting by name (instead of taking ``test_df.values`` wholesale) drops
    'formula', the target, or any other extra column the test file carries,
    and guarantees the columns are in the order the models were fitted on.

    Parameters
    ----------
    test_df : pd.DataFrame
        Raw test frame as loaded from disk.
    train_columns : pd.Index
        Feature columns (in order) of the matching training frame.
    label : str
        Name used in the error message to identify the offending file.

    Returns
    -------
    np.ndarray
        Array of shape (n_test_rows, len(train_columns)).

    Raises
    ------
    ValueError
        If any training feature column is absent from ``test_df``.
    """
    missing = [col for col in train_columns if col not in test_df.columns]
    if missing:
        raise ValueError(
            f"{label} is missing {len(missing)} training feature column(s): {missing[:10]}"
        )
    return test_df[list(train_columns)].values


# Define Test Features ('formula' and any stray target column are excluded
# because only the training feature columns are selected).
X_test_tree = _align_test_features(df_test_reg, train_tree_features.columns, 'test_regular.csv')
X_test_dense = _align_test_features(df_test_scl, train_dense_features.columns, 'test_scaled.csv')

print(f"Training on {len(y_train)} samples.")
print(f"Predicting on {X_test_tree.shape[0]} test samples.")

Training on 4604 samples.
Predicting on 4604 test samples.


In [5]:
# 2. RETRAIN BASE MODELS (Full Dataset)
# Each base model is rebuilt from a hyperparameter dict and refit on the
# full training arrays: tree features for CatBoost, scaled features for KRR / MLP.
print("\n--- Retraining Base Models on Full Data ---")

# --- Hyperparameters ---
cat_params = dict(
    iterations=1556,
    learning_rate=0.11039,
    depth=7,
    l2_leaf_reg=8,
    random_strength=4.48,
    bagging_temperature=0.6,
    loss_function='RMSE',
    verbose=False,
)
krr_params = dict(
    alpha=0.0439,
    kernel='rbf',
    gamma=0.008535,
)
mlp_params = dict(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    alpha=6.9168e-05,
    batch_size=32,
    learning_rate_init=0.0011,
    max_iter=500,
    # With early_stopping=True, sklearn holds out validation_fraction of the
    # rows passed to fit() for the stopping criterion.
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42,
)

# --- Model A: CatBoost ---
cat_model = CatBoostRegressor(**cat_params)
print("Training CatBoost...")
cat_model.fit(X_train_tree, y_train)

# --- Model B: KRR ---
krr_model = KernelRidge(**krr_params)
print("Training KRR...")
krr_model.fit(X_train_dense, y_train)

# --- Model C: MLP ---
mlp_model = MLPRegressor(**mlp_params)
print("Training MLP...")
mlp_model.fit(X_train_dense, y_train)


--- Retraining Base Models on Full Data ---
Training CatBoost...
Training KRR...
Training MLP...


In [6]:
# 3. GENERATE TEST PREDICTIONS
# Run each refit base model on the test features that match its training input.
print("\n--- Generating Base Predictions for Test Set ---")

pred_test_cat, pred_test_krr, pred_test_mlp = (
    cat_model.predict(X_test_tree),
    krr_model.predict(X_test_dense),
    mlp_model.predict(X_test_dense),
)

# Meta-feature matrix: one row per test sample, one column per base model,
# in the order (CatBoost, KRR, MLP).
base_test_predictions = [pred_test_cat, pred_test_krr, pred_test_mlp]
X_test_meta = np.column_stack(base_test_predictions)


--- Generating Base Predictions for Test Set ---


In [7]:
# 4. TRAIN META-LEARNER & PREDICT
print("\n--- Training Meta-Learner & Finalizing ---")

# Meta-training data comes from the OOF file; the column order here must
# match the column order used to build X_test_meta (cat, krr, mlp).
meta_feature_cols = ['pred_cat', 'pred_krr', 'pred_mlp']
X_train_meta = df_oof[meta_feature_cols].values
y_train_meta = df_oof['true_target'].values

# Lasso with cross-validated alpha; positive=True constrains the blend
# weights to be non-negative.
meta_model = LassoCV(cv=5, random_state=42, positive=True).fit(X_train_meta, y_train_meta)

# Blend the stacked test predictions (output is still in log space).
final_log_pred = meta_model.predict(X_test_meta)


--- Training Meta-Learner & Finalizing ---


In [9]:
# 5. INVERSE TRANSFORM & SAVE
# Convert from Log-Space back to eV (expm1 is the inverse of log1p;
# assumes the target was built with log1p -- TODO confirm against the
# preprocessing notebook).
final_pred_ev = np.expm1(final_log_pred)

# Sanity Check: Clip negative values to 0
final_pred_ev = np.maximum(final_pred_ev, 0)

# Identify each prediction by the TEST row it belongs to. Using the training
# frame's 'formula' column here would only run when train and test happen to
# have the same number of rows, and would pair every prediction with an
# unrelated training compound.
if 'formula' in df_test_reg.columns:
    id_column = 'formula'
    id_values = df_test_reg['formula'].values
else:
    # No formulas in the test file: fall back to the test row index so the
    # submission is still unambiguous rather than mislabeled.
    print("WARNING: 'formula' column not found in test data; using test row index as 'row_id'.")
    id_column = 'row_id'
    id_values = df_test_reg.index.values

# Guard against stale notebook state (e.g. test frame reloaded after predicting).
if len(id_values) != len(final_pred_ev):
    raise ValueError(
        f"Test frame has {len(id_values)} rows but {len(final_pred_ev)} predictions were generated."
    )

# Create Submission DataFrame
submission = pd.DataFrame({
    id_column: id_values,
    'bandgap_predicted': final_pred_ev
})

submission.to_csv('submission_final.csv', index=False)

print("\nDONE! Submission saved to 'submission_final.csv'")
print("Model Weights used:")
print(f"Cat: {meta_model.coef_[0]:.2f}, KRR: {meta_model.coef_[1]:.2f}, MLP: {meta_model.coef_[2]:.2f}")


DONE! Submission saved to 'submission_final.csv'
Model Weights used:
Cat: 0.90, KRR: 0.00, MLP: 0.12


In [10]:
# Persist the fitted meta-learner; left as a bare expression so the cell
# still displays the list of written file names.
joblib.dump(value=meta_model, filename="final_lasso_meta_model.pkl")

['final_lasso_meta_model.pkl']