In [1]:
import pandas as pd

import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


# Load train and test sets
X_train_final = pd.read_csv("X_train_final.csv")
X_test_final = pd.read_csv("X_test_final.csv")
y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

# Optional: check shapes and first few rows
print("Train shape:", X_train_final.shape)
print("Test shape:", X_test_final.shape)
X_train_final.head()


Train shape: (1022, 253)
Test shape: (438, 253)


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.769036,0.524097,0.192386,0.487952,1.0,-0.038646,-0.586579,1.016985,-0.87039,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.764812,-1.871658,-1.528533,-0.512048,0.0,0.773486,0.378167,0.969814,0.069493,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.128164,0.105124,-0.23421,0.487952,0.0,0.887117,0.537425,0.0,-0.465322,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.769036,0.732724,0.493162,1.4064,0.0,0.801443,0.41681,1.018377,-0.87039,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.128164,-1.437438,0.256074,0.487952,1.0,0.534806,0.062763,0.0,-0.002587,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import (
    LinearRegression, Ridge, ElasticNet, HuberRegressor,
    RANSACRegressor, TheilSenRegressor, LassoCV, QuantileRegressor,
    TweedieRegressor, OrthogonalMatchingPursuit, SGDRegressor
)
from sklearn.cross_decomposition import PLSRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge
import os
from datetime import datetime

# =============================================================================
# ULTIMATE BENCHMARK — WITH TRAIN SCORE + TRACKS BEST TEST SCORE EVER
# =============================================================================

# 1. Target
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)
rmse_scorer = make_scorer(mean_squared_error, squared=False, greater_is_better=False)

# 2. File to save best score
BEST_SCORE_FILE = "best_test_rmse.txt"

def load_best_score():
    if os.path.exists(BEST_SCORE_FILE):
        with open(BEST_SCORE_FILE, 'r') as f:
            return float(f.read().strip())
    return float('inf')

def save_best_score(score):
    with open(BEST_SCORE_FILE, 'w') as f:
        f.write(str(score))

previous_best = load_best_score()

# History
HIST_FILE = "model_history.csv"

# Load previous history if exists
if os.path.exists(HIST_FILE):
    history_df = pd.read_csv(HIST_FILE, index_col="Model")
else:
    history_df = pd.DataFrame(columns=["Model", "Prev_Test_RMSE"]).set_index("Model")


# 3. ALL YOUR MODELS — one line each
'''
Model Name : (estimator_object, hyperparameter_grid, scaled_flag),
scaled_flag = True → model expects scaled input features (e.g., Ridge, Lasso, ElasticNet).
None or empty {} → no GridSearch; fit directly.

'''
MODELS = {
    "LinearRegression":   (LinearRegression(),                                      {},                                                False),
    "SGDRegressor": (SGDRegressor(max_iter=5000, tol=1e-3, random_state=42), {
          "alpha": [1e-4, 1e-3, 1e-2],        # Regularization strength
        "learning_rate": ["constant", "invscaling", "adaptive"],
        "eta0": [0.01, 0.1, 0.5],           # Initial learning rate
        "loss": ["squared_error", "huber"]  # Loss function
    }, True),
    "Ridge":              (Ridge(random_state=42),                                  {"alpha": np.logspace(-3, 3, 10)},                 True),
    "LassoCV":            (LassoCV(alphas=np.logspace(-4, 1, 20), cv=5, max_iter=10000, random_state=42, n_jobs=-1), None, True),
    "ElasticNet":         (ElasticNet(max_iter=5000, random_state=42),               {"alpha": np.logspace(-3, 2, 10),
                                                                                      "l1_ratio": np.linspace(0.1, 1.0, 6)},            True),
    #"HuberRegressor":     (HuberRegressor(max_iter=2000),                            {"epsilon": [1.2, 1.35, 1.5, 1.8],"alpha": [1e-4, 1e-3, 1e-2, 0.1]},True),
    #"QuantileRegressor": (QuantileRegressor(quantile=0.5, solver='highs'), {"alpha": [0.0, 0.01, 0.1, 1.0],
     #                                                                   "quantile": [0.25, 0.5, 0.75]}, True),
    "TweedieRegressor": (TweedieRegressor(), {"power": [0, 1, 1.5, 2],"alpha": [0.0, 0.01, 0.1, 1.0]}, True),
    #"OrthogonalMatchingPursuit": (OrthogonalMatchingPursuit(), {"n_nonzero_coefs": [5, 10, 15, 20, None]}, True),
    #"RANSAC_Huber":       (RANSACRegressor(estimator=HuberRegressor(max_iter=2000),min_samples=0.7, residual_threshold=1.5, random_state=42),{"min_samples": [0.5, 0.7, 0.9],"residual_threshold": [0.5, 1.0, 1.5, 2.0]},       True),
    #"TheilSen":           (TheilSenRegressor(random_state=42, n_jobs=-1),           {"max_subpopulation": [1000, 3000, 5000]},         True),
    #"PLSRegression":      (PLSRegression(),                                         {"n_components": [2, 5, 10, 20, min(40, X_train.shape[1])]}, True),
    #"KNeighborsRegressor":(KNeighborsRegressor(),                                    {"n_neighbors": [3, 5, 7, 10, 15],
    #                                                                                  "weights": ["uniform", "distance"],
    #                                                                                  "p": [1, 2]},                                      True),
    #"RandomForest": (RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1),
    #                 {"max_depth": [None, 5, 10, 20]}, False),
    #"GradientBoosting": (GradientBoostingRegressor(random_state=42),
    #                     {"n_estimators": [100, 200], "learning_rate": [0.05, 0.1], "max_depth": [3,5]}, False),
    #"XGBoost": (XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    #            {"n_estimators": [100, 200], "learning_rate": [0.05, 0.1], "max_depth": [3,5]}, False),
    "BayesianRidge": (BayesianRidge(), {}, True),
    #"GPR": (GaussianProcessRegressor(), {}, True) # not Completed
}

print("ULTIMATE BENCHMARK WITH TRAIN SCORE & BEST TRACKING".center(100, "="))
print(f"Previous Best Test RMSE: {previous_best:.5f} (or never set)\n")

results = []
current_best = float('inf')

"""
For each model:

1. Uses scaled or unscaled features based on the scaled flag.

2. If hyperparameters exist, runs GridSearchCV with 5-fold CV to find the best.

3. Otherwise, fits the model directly.
"""

for name, (estimator, params, scaled) in MODELS.items():
    # Needs special Attention Latter
    X_tr = X_train_final if scaled else X_train_final
    X_te = X_test_final  if scaled else X_test_final
    
    
    print(f"\n{name:18} → ", end="")
    
    if params is None or len(params) == 0:
        model = estimator.fit(X_tr, y_train_log)
        cv_rmse = "N/A" if name != "LassoCV" else "built-in"
        best_params = "default" if name != "LassoCV" else f"α={model.alpha_:.2e}"
        print("No tuning", end=" → ")
    else:
        grid = GridSearchCV(estimator, params, scoring=rmse_scorer, cv=5, n_jobs=-1, refit=True)
        grid.fit(X_tr, y_train_log)
        model = grid.best_estimator_
        cv_rmse = -grid.best_score_
        best_params = grid.best_params_
        print(f"CV: {cv_rmse:.5f}", end=" → ")
    
    # Train RMSE
    train_pred = model.predict(X_tr)
    train_rmse = np.sqrt(mean_squared_error(y_train_log, train_pred))
    
    # Test RMSE
    test_pred = model.predict(X_te)
    test_rmse = np.sqrt(mean_squared_error(y_test_log, test_pred))
    
    print(f"TRAIN: {train_rmse} | TEST: {test_rmse}")
    
    if test_rmse < current_best:
        current_best = test_rmse
    
    # Previous RMSE
    prev_rmse = history_df.loc[name, "Prev_Test_RMSE"] if name in history_df.index else np.nan
    
    results.append({
        "Model": name,
        "Train RMSE": train_rmse,
        "Test RMSE": test_rmse,
        "CV RMSE": cv_rmse,
        "Prev Test RMSE": prev_rmse,
        "Scaled": "Yes" if scaled else "No",
        "Best Params": best_params
    })
    # Update history
    history_df.loc[name, "Prev_Test_RMSE"] = test_rmse

# FINAL LEADERBOARD
df = pd.DataFrame(results)
df = df.sort_values("Test RMSE").reset_index(drop=True)
df.index = df.index + 1
df.index.name = "Rank"

print("\n" + "═" * 110)
print("FINAL LEADERBOARD — TRAIN + TEST + TRACKING".center(110))
print("═" * 110)
print(df[["Model", "Train RMSE", "Test RMSE", "Prev Test RMSE", "CV RMSE"]].round(5).to_string())

# Check if we beat previous best
winner = df.iloc[0]
improved = winner["Test RMSE"] < previous_best

print("\n" + "█" * 110)
print(f"WINNER → {winner['Model']}")
print(f"Test RMSE = {winner['Test RMSE']:.5f} | Train RMSE = {winner['Train RMSE']:.5f}")
if improved:
    print(f"NEW BEST! Beat previous {previous_best:.5f} by {(previous_best - winner['Test RMSE']):.5f}")
    save_best_score(winner["Test RMSE"])
else:
    print(f"No improvement (previous best: {previous_best:.5f})")
print("█" * 110)

# Save full results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
df.to_csv(f"benchmark_results_{timestamp}.csv")
print(f"\nFull results saved: benchmark_results_{timestamp}.csv")
history_df.to_csv(HIST_FILE)


Previous Best Test RMSE: 0.11673 (or never set)


LinearRegression   → No tuning → TRAIN: 0.09256315651264356 | TEST: 0.12821133826217523

SGDRegressor       → CV: nan → TRAIN: 0.14195047063879782 | TEST: 0.15784602539395737

Ridge              → CV: nan → TRAIN: 0.09256317798727669 | TEST: 0.1281924859324048

LassoCV            → No tuning → TRAIN: 0.10797901533833022 | TEST: 0.11757852714449415

ElasticNet         → CV: nan → TRAIN: 0.09819074490177672 | TEST: 0.11943526828379644

TweedieRegressor   → CV: nan → TRAIN: 0.09672155719486764 | TEST: 0.12946181020651357

BayesianRidge      → No tuning → TRAIN: 0.10426242973015354 | TEST: 0.12160578500410768

══════════════════════════════════════════════════════════════════════════════════════════════════════════════
                                 FINAL LEADERBOARD — TRAIN + TEST + TRACKING                                  
══════════════════════════════════════════════════════════════════════════════════════════════════════════════
   

## SGDRegressor


In [10]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Log-transform target
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

# RMSE scorer
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# Pipeline: Scaling + SGD Regressor
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("sgd", SGDRegressor(max_iter=5000, tol=1e-3, random_state=42))
])

# Hyperparameter grid
param_grid = {
    "sgd__alpha": [1e-4, 1e-3, 1e-2],        # Regularization strength
    "sgd__learning_rate": ["constant", "invscaling", "adaptive"],
    "sgd__eta0": [0.01, 0.1, 0.5],           # Initial learning rate
    "sgd__loss": ["squared_error", "huber"]  # Loss function
}

# GridSearchCV
grid = GridSearchCV(pipe, param_grid, scoring=rmse_scorer, cv=5, n_jobs=-1)
grid.fit(X_train_final, y_train_log)

print("Best SGDRegressor Params:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# Evaluate on test set
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("Test RMSE:", test_rmse)


Best SGDRegressor Params: {'sgd__alpha': 0.01, 'sgd__eta0': 0.1, 'sgd__learning_rate': 'adaptive', 'sgd__loss': 'huber'}
Best CV RMSE: 0.13423956433482814
Test RMSE: 0.1239647010302294


## Spline regression

In [11]:
from sklearn.preprocessing import SplineTransformer, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Log-transform target if needed
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

# RMSE scorer
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
                           greater_is_better=False)

# Pipeline: Spline transformation + Linear Regression
pipe = Pipeline([
    ("spline", SplineTransformer()),
    ("lr", LinearRegression())
])

# Hyperparameter grid
param_grid = {
    "spline__degree": [3],           # cubic spline
    "spline__n_knots": [5, 10, 15], # number of knots
    "spline__include_bias": [False] # don't include constant term
}

# Grid search
grid = GridSearchCV(pipe, param_grid, scoring=rmse_scorer, cv=5, n_jobs=-1)
grid.fit(X_train_final, y_train_log)

print("Best Spline Params:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# Evaluate on test set
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("Test RMSE:", test_rmse)


Best Spline Params: {'spline__degree': 3, 'spline__include_bias': False, 'spline__n_knots': 5}
Best CV RMSE: 0.23676842402469864
Test RMSE: 0.4101220648690916


## Poisson Regression

In [12]:
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# ========== Target ==========
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# ========== Pipeline ==========
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poisson", PoissonRegressor(max_iter=5000))
])

# ========== Hyperparameters ==========
param_grid = {
    "poisson__alpha": np.logspace(-4, 1, 10),     # regularization
    "poisson__fit_intercept": [True, False]
}

# ========== RMSE scorer ==========
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# ========== GridSearch ==========
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1
)

# ⚠️ FIT ON RAW y, NOT LOG
grid.fit(X_train_final, y_train)

print("Best Params:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# ========== Evaluate ==========
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, np.log1p(y_pred_test)))

print("Test RMSE:", test_rmse)


Best Params: {'poisson__alpha': 10.0, 'poisson__fit_intercept': True}
Best CV RMSE: 32313.8184125024
Test RMSE: 0.12757021955405573


## Neural Network Regression (MLPRegressor)

In [13]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Log-transform target
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

# RMSE scorer
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# Base MLP model
mlp = MLPRegressor(
    max_iter=5000,
    random_state=42
)

# Hyperparameter grid
param_grid = {
    "hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64)],
    "activation": ["relu", "tanh"],
    "alpha": [1e-5, 1e-4, 1e-3],
    "learning_rate_init": [0.001, 0.01],
}

# GridSearchCV
grid = GridSearchCV(
    mlp,
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1
)

# Fit
grid.fit(X_train_final, y_train_log)

print("Best Params:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# Test evaluation
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("Test RMSE:", test_rmse)


Best Params: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (128,), 'learning_rate_init': 0.01}
Best CV RMSE: 0.13625146233540686
Test RMSE: 0.1392120628979391


## Kernel Ridge Regression (KRR)

In [14]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Log-transform target
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

# RMSE scorer
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# Pipeline: scaling + kernel ridge regression
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("krr", KernelRidge())
])

# Hyperparameter grid
param_grid = {
    "krr__alpha": [1e-2, 1e-1, 1, 10],            # regularization
    "krr__kernel": ["linear", "polynomial", "rbf"], 
    "krr__degree": [2, 3, 4],                     # only for polynomial kernel
    "krr__gamma": [0.01, 0.1, 1, 10]             # only for rbf/poly kernel
}

# GridSearchCV
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
    refit=True
)

# Fit
grid.fit(X_train_final, y_train_log)

print("Best Hyperparameters:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# Evaluate on test set
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("Test RMSE:", test_rmse)


Best Hyperparameters: {'krr__alpha': 0.1, 'krr__degree': 2, 'krr__gamma': 0.01, 'krr__kernel': 'polynomial'}
Best CV RMSE: 0.1649557616593605
Test RMSE: 0.13911764097684137


## Polynomial Regression

In [15]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Log-transform target
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Pipeline: Polynomial features + scaling + linear regression
pipe = Pipeline([
    ("poly", PolynomialFeatures(include_bias=False)),
    ("lr", LinearRegression())
])

# Hyperparameter grid: degree of polynomial
param_grid = {
    "poly__degree": [2],  # try 2nd, 3rd, 4th degree
}

# RMSE scorer
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# GridSearchCV
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1
)

# Fit
grid.fit(X_train_final, y_train_log)

print("Best Polynomial Degree:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# Evaluate on test set
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("Test RMSE:", test_rmse)


Best Polynomial Degree: {'poly__degree': 2}
Best CV RMSE: 0.15299423296688647
Test RMSE: 0.15548587765040164


## SVR

In [16]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Log-transform target
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Pipeline: scale + SVR
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svr", SVR())
])

# Hyperparameter grid for tuning
param_grid = {
    "svr__C": [0.1, 1, 10, 100],
    "svr__epsilon": [0.05, 0.1, 0.2],
    "svr__gamma": ["scale", "auto"]
}

# RMSE scorer
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# Grid search with 5-fold CV
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1
)

# Fit SVR model
grid.fit(X_train_final, y_train_log)

print("Best Params:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# Evaluate on test set
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("Test RMSE:", test_rmse)

Best Params: {'svr__C': 1, 'svr__epsilon': 0.05, 'svr__gamma': 'auto'}
Best CV RMSE: 0.18884353563650363
Test RMSE: 0.1878527824521924


In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Log-transform target
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Scale + SVR pipeline
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svr", SVR())
])

# Heuristic for RBF gamma: midpoint between 10th and 90th percentile of squared distances
from sklearn.metrics.pairwise import euclidean_distances

dists = euclidean_distances(X_train_final, X_train_final)
dists_squared = dists ** 2
gamma_rbf_guess = 1 / np.median(dists_squared)  # rough estimate (can refine)

# Hyperparameter grid
param_grid = [
    {
        "svr__kernel": ["linear"],
        "svr__C": [0.1, 1, 10, 100],
        "svr__epsilon": [0.05, 0.1, 0.2]
    },
    {
        "svr__kernel": ["rbf"],
        "svr__C": [1, 10, 100],
        "svr__epsilon": [0.05, 0.1],
        "svr__gamma": [gamma_rbf_guess / 2, gamma_rbf_guess, gamma_rbf_guess * 2]
    },
    {
        "svr__kernel": ["poly"],
        "svr__C": [1, 10],
        "svr__epsilon": [0.05, 0.1],
        "svr__degree": [2, 3, 4],
        "svr__gamma": ["scale", "auto"]
    },
    {
        "svr__kernel": ["sigmoid"],
        "svr__C": [1, 10],
        "svr__epsilon": [0.05, 0.1],
        "svr__gamma": ["scale", "auto"]
    }
]

# RMSE scorer
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# Grid search
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1
)

# Fit SVR
grid.fit(X_train_final, y_train_log)

# Best parameters
print("Best SVR Params:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# Evaluate on test set
y_pred_test = grid.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("Test RMSE:", test_rmse)

Best SVR Params: {'svr__C': 100, 'svr__epsilon': 0.05, 'svr__kernel': 'linear'}
Best CV RMSE: 0.1340834593893518
Test RMSE: 0.12638908333891602


## XGBoost

In [19]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# -----------------------------
# Log-transform target
# -----------------------------
y_train_log = np.log1p(y_train.squeeze())
y_test_log = np.log1p(y_test.squeeze())

# -----------------------------
# Hyperparameters for small dataset
# -----------------------------
params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "gamma": 0,
    "n_estimators": 2000,   # fixed boosting rounds
    "verbosity": 1,
    "tree_method": "hist",  # fast
    "random_state": 42
    # You can add monotone_constraints=[...] if needed
}

# -----------------------------
# K-Fold CV
# -----------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []
fold_rmse = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_final)):
    print(f"\n===== Fold {fold+1} =====")
    X_tr, X_val = X_train_final.iloc[train_idx], X_train_final.iloc[val_idx]
    y_tr, y_val = y_train_log.iloc[train_idx], y_train_log.iloc[val_idx]

    model = xgb.XGBRegressor(**params)

    # No early stopping
    model.fit(X_tr, y_tr, verbose=50)

    models.append(model)
    preds_val = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds_val))
    fold_rmse.append(rmse)
    print(f"Fold RMSE: {rmse:.4f}")

print("\nAverage CV RMSE:", np.mean(fold_rmse))

# -----------------------------
# Predict test set using average of fold models
# -----------------------------
test_pred_log = np.mean([m.predict(X_test_final) for m in models], axis=0)
test_rmse_log = np.sqrt(mean_squared_error(y_test_log, test_pred_log))
print("Test RMSE (log scale):", test_rmse_log)

# Convert back to original scale
test_pred = np.expm1(test_pred_log)
test_rmse_orig = np.sqrt(mean_squared_error(y_test.squeeze(), test_pred))
print("Test RMSE (original scale):", test_rmse_orig)



===== Fold 1 =====
Fold RMSE: 0.1368

===== Fold 2 =====
Fold RMSE: 0.1159

===== Fold 3 =====
Fold RMSE: 0.1290

===== Fold 4 =====
Fold RMSE: 0.1352

===== Fold 5 =====
Fold RMSE: 0.1599

Average CV RMSE: 0.13535158127786515
Test RMSE (log scale): 0.12734955094874753
Test RMSE (original scale): 26458.972920353503


#### XGBoost Tweedie Objective

### XGBoost Fair Loss

### LightGbm

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np
import pandas as pd

# -----------------------------
# Log-transform target
# -----------------------------
y_train_log = np.log1p(y_train.squeeze())
y_test_log = np.log1p(y_test.squeeze())

# -----------------------------
# Categorical columns as category dtype
# -----------------------------
for col in X_train_final.select_dtypes('object').columns:
    X_train_final[col] = X_train_final[col].astype('category')
    X_test_final[col] = X_test_final[col].astype('category')

# -----------------------------
# RMSE scorer
# -----------------------------
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# -----------------------------
# LightGBM regressor
# -----------------------------
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    n_jobs=-1,
    random_state=42,
    verbose=-1
)

# -----------------------------
# Hyperparameter grid (small dataset guidance)
# -----------------------------
param_distributions = {
    "num_leaves": [20, 31, 40, 50],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.05, 0.08, 0.1],
    "n_estimators": [50, 100, 150, 200],
    "min_child_samples": [50, 75, 100],
    "lambda_l1": [1, 5, 10],
    "lambda_l2": [1, 5, 10],
    "feature_fraction": [0.8, 0.9, 1.0],
    "bagging_fraction": [0.8, 0.9, 1.0],
    "bagging_freq": [1]  # enable bagging
}

# -----------------------------
# Randomized search with 5-fold CV
# -----------------------------
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_distributions,
    n_iter=50,  # number of random combinations to try
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# -----------------------------
# Fit model
# -----------------------------
random_search.fit(X_train_final, y_train_log)

# -----------------------------
# Best parameters and CV RMSE
# -----------------------------
print("Best LightGBM Params:", random_search.best_params_)
print("Best CV RMSE:", -random_search.best_score_)

# -----------------------------
# Evaluate on test set
# -----------------------------
y_pred_test_log = random_search.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test_log))
print("Test RMSE (log scale):", test_rmse)

# Convert predictions back to original scale
y_pred_test = np.expm1(y_pred_test_log)
test_rmse_orig = np.sqrt(mean_squared_error(y_test.squeeze(), y_pred_test))
print("Test RMSE (original scale):", test_rmse_orig)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best LightGBM Params: {'num_leaves': 31, 'n_estimators': 200, 'min_child_samples': 50, 'max_depth': 3, 'learning_rate': 0.1, 'lambda_l2': 10, 'lambda_l1': 1, 'feature_fraction': 1.0, 'bagging_freq': 1, 'bagging_fraction': 1.0}
Best CV RMSE: 0.13935702876975375
Test RMSE (log scale): 0.1304644670756598
Test RMSE (original scale): 27807.402577616966


## RandomForestReg

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(n_estimators=500, random_state=42)
cv_scores = -cross_val_score(rf, X_train_final, y_train_log, cv=5,
                             scoring='neg_root_mean_squared_error', n_jobs=-1)
print("RF CV RMSE:", cv_scores.mean())

rf.fit(X_train_final, y_train_log)
y_pred_test = rf.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("RF Test RMSE:", test_rmse)



## GBR

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                               max_depth=4, random_state=42)
cv_scores = -cross_val_score(gb, X_train_scaled, y_train_log, cv=5,
                             scoring='neg_root_mean_squared_error', n_jobs=-1)
print("GBM CV RMSE:", cv_scores.mean())

gb.fit(X_train_scaled, y_train_log)
y_pred_test = gb.predict(X_test_scaled)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
print("GBM Test RMSE:", test_rmse)


In [11]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np
import pandas as pd

# -----------------------------
# Log-transform target
# -----------------------------
y_train_log = np.log1p(y_train.squeeze())
y_test_log = np.log1p(y_test.squeeze())

# -----------------------------
# Categorical columns as category dtype
# -----------------------------
for col in X_train_final.select_dtypes('object').columns:
    X_train_final[col] = X_train_final[col].astype('category')
    X_test_final[col] = X_test_final[col].astype('category')

# -----------------------------
# RMSE scorer
# -----------------------------
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

# -----------------------------
# LightGBM regressor
# -----------------------------
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    n_jobs=-1,
    random_state=42,
    verbose=-1
)

# -----------------------------
# Hyperparameter grid (small dataset guidance)
# -----------------------------
param_distributions = {
    "num_leaves": [20, 31, 40, 50],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.05, 0.08, 0.1],
    "n_estimators": [50, 100, 150, 200],
    "min_child_samples": [50, 75, 100],
    "lambda_l1": [1, 5, 10],
    "lambda_l2": [1, 5, 10],
    "feature_fraction": [0.8, 0.9, 1.0],
    "bagging_fraction": [0.8, 0.9, 1.0],
    "bagging_freq": [1]  # enable bagging
}

# -----------------------------
# Randomized search with 5-fold CV
# -----------------------------
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_distributions,
    n_iter=50,  # number of random combinations to try
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# -----------------------------
# Fit model
# -----------------------------
random_search.fit(X_train_final, y_train_log)

# -----------------------------
# Best parameters and CV RMSE
# -----------------------------
print("Best LightGBM Params:", random_search.best_params_)
print("Best CV RMSE:", -random_search.best_score_)

# -----------------------------
# Evaluate on test set
# -----------------------------
y_pred_test_log = random_search.predict(X_test_final)
test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test_log))
print("Test RMSE (log scale):", test_rmse)

# Convert predictions back to original scale
y_pred_test = np.expm1(y_pred_test_log)
test_rmse_orig = np.sqrt(mean_squared_error(y_test.squeeze(), y_pred_test))
print("Test RMSE (original scale):", test_rmse_orig)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best LightGBM Params: {'num_leaves': 31, 'n_estimators': 200, 'min_child_samples': 50, 'max_depth': 3, 'learning_rate': 0.1, 'lambda_l2': 10, 'lambda_l1': 1, 'feature_fraction': 1.0, 'bagging_freq': 1, 'bagging_fraction': 1.0}
Best CV RMSE: 0.13935702876975375
Test RMSE (log scale): 0.1304644670756598
Test RMSE (original scale): 27807.402577616966


In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# -----------------------------
# Log-transform target
# -----------------------------
y_train_log = np.log1p(y_train.squeeze())
y_test_log = np.log1p(y_test.squeeze())

# -----------------------------
# Categorical columns as category dtype
# -----------------------------
for col in X_train_final.select_dtypes('object').columns:
    X_train_final[col] = X_train_final[col].astype('category')
    X_test_final[col] = X_test_final[col].astype('category')

# -----------------------------
# Hyperparameters for small dataset with DART
# -----------------------------
params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "dart",   # DART boosting
    "num_leaves": 31,
    "max_depth": 5,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_child_samples": 75,
    "lambda_l1": 1,
    "lambda_l2": 1,
    "drop_rate": 0.1,          # fraction of trees dropped each iteration
    "verbose": -1,
    "seed": 42
}

# -----------------------------
# Number of boosting rounds
# -----------------------------
num_boost_round = 2000

# -----------------------------
# K-Fold CV
# -----------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []
fold_rmse = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_final)):
    print(f"\n===== Fold {fold+1} =====")
    X_tr, X_val = X_train_final.iloc[train_idx], X_train_final.iloc[val_idx]
    y_tr, y_val = y_train_log.iloc[train_idx], y_train_log.iloc[val_idx]

    train_set = lgb.Dataset(X_tr, y_tr)
    valid_set = lgb.Dataset(X_val, y_val, reference=train_set)

    model = lgb.train(
        params,
        train_set,
        num_boost_round=num_boost_round,
        valid_sets=[valid_set],
        callbacks=[lgb.log_evaluation(period=50)]  # Prints every 50 iterations
    )

    models.append(model)
    preds_val = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds_val))
    fold_rmse.append(rmse)
    print(f"Fold RMSE: {rmse:.4f}")

print("\nAverage CV RMSE:", np.mean(fold_rmse))

# -----------------------------
# Predict test set using average of fold models
# -----------------------------
test_pred_log = np.mean([m.predict(X_test_final) for m in models], axis=0)
test_rmse_log = np.sqrt(mean_squared_error(y_test_log, test_pred_log))
print("Test RMSE (log scale):", test_rmse_log)

# Convert back to original scale
test_pred = np.expm1(test_pred_log)
test_rmse_orig = np.sqrt(mean_squared_error(y_test.squeeze(), test_pred))
print("Test RMSE (original scale):", test_rmse_orig)



===== Fold 1 =====
[100]	valid_0's rmse: 10.1088
[200]	valid_0's rmse: 10.3726
[300]	valid_0's rmse: 12.1261
[400]	valid_0's rmse: 10.6488
[500]	valid_0's rmse: 11.3268
[600]	valid_0's rmse: 12.7434
[700]	valid_0's rmse: 11.9515
[800]	valid_0's rmse: 11.4003
[900]	valid_0's rmse: 11.7801
[1000]	valid_0's rmse: 11.9442
[1100]	valid_0's rmse: 12.1784
[1200]	valid_0's rmse: 12.0705
[1300]	valid_0's rmse: 11.6512
[1400]	valid_0's rmse: 11.6988
[1500]	valid_0's rmse: 12.1663
[1600]	valid_0's rmse: 12.4506
[1700]	valid_0's rmse: 12.4331
[1800]	valid_0's rmse: 12.3822
[1900]	valid_0's rmse: 12.2581
[2000]	valid_0's rmse: 12.2286
Fold RMSE: 13.0569

===== Fold 2 =====
[100]	valid_0's rmse: 10.1182
[200]	valid_0's rmse: 10.8871
[300]	valid_0's rmse: 12.6559
[400]	valid_0's rmse: 12.5192
[500]	valid_0's rmse: 12.7685
[600]	valid_0's rmse: 12.7934
[700]	valid_0's rmse: 12.6599
[800]	valid_0's rmse: 12.4095
[900]	valid_0's rmse: 11.8762
[1000]	valid_0's rmse: 12.0884
[1100]	valid_0's rmse: 12.464

# Full Pipeline Setup

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Numeric columns to transform
to_transform = X_train.select_dtypes(include=['int64','float64']).columns

# Define models
models = {
    "Lasso": LassoCV(alphas=np.logspace(-4, 1, 10), cv=5, max_iter=5000),
    "RandomForest": RandomForestRegressor(n_estimators=500, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=500, learning_rate=0.05,
                                                 max_depth=4, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=4,
                                subsample=0.8, colsample_bytree=0.8, random_state=42)
}

results = []

for name, model in models.items():
    # Pipeline: Yeo-Johnson + StandardScaler + Model
    pipe = Pipeline([
        ("yeojohnson", PowerTransformer(method='yeo-johnson')),
        ("scaler", StandardScaler()),
        ("model", model)
    ])
    
    # 5-fold CV RMSE
    cv_rmse = -cross_val_score(pipe, X_train, y_train_log, cv=5,
                               scoring='neg_root_mean_squared_error', n_jobs=-1).mean()
    
    # Fit on full training
    pipe.fit(X_train, y_train_log)
    y_pred_test = pipe.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_test))
    
    results.append({
        "Model": name,
        "CV_RMSE": cv_rmse,
        "Test_RMSE": test_rmse
    })

# Compare
results_df = pd.DataFrame(results).sort_values(by='Test_RMSE')
results_df
