# 1. Import Libraries and Data

The molecular descriptors and fingerprints data was previously created in the `PI_data_loading.ipynb` notebook and has been stored in the data folder as csv file.

In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Models for scaled data
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Tree based models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
pi = pd.read_csv('../data/pi_df_clean.csv')
print(pi.shape)
pi.head()

(834, 13)


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_value,pchembl_value,assay_chembl_id,assay_description,target_chembl_id,target_pref_name,target_organism,standard_units_norm,IC50_nM,pIC50
0,CHEMBL1627209,O=C(N[C@@H]1c2ccccc2C[C@@H]1O)[C@@H](Cc1ccccc1...,IC50,21.1,7.68,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,21.1,7.675718
1,CHEMBL1627287,N=[S+]([O-])([C@H](Cc1ccccc1)C(=O)N[C@H]1c2ccc...,IC50,153.9,6.81,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,153.9,6.812761
2,CHEMBL1627235,N=[S+]([O-])([C@H](Cc1ccccc1)C(=O)N[C@H]1c2ccc...,IC50,37.3,7.43,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,37.3,7.428291
3,CHEMBL1627210,N=[S+]([O-])(C[C@H](Cc1ccccc1)C(=O)N[C@@H]1c2c...,IC50,2.5,8.6,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,2.5,8.60206
4,CHEMBL396814,O=C(N[C@H]1c2ccccc2C[C@H]1O)[C@@H](Cc1ccccc1)C...,IC50,10000.0,,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,10000.0,5.0


# Preprocessing

In [3]:
df = pd.read_csv('../data/pi_qsar_features.csv')
print("rows, cols:", df.shape)

rows, cols: (834, 523)


In [4]:
df.head()

Unnamed: 0,MolWt,MolLogP,MolMR,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,NumAromaticRings,HeavyAtomCount,FractionCSP3,...,Morgan_512_503,Morgan_512_504,Morgan_512_505,Morgan_512_506,Morgan_512_507,Morgan_512_508,Morgan_512_509,Morgan_512_510,Morgan_512_511,pIC50
0,636.814,4.0019,179.2214,121.72,4,5,12,4,46,0.315789,...,0,0,0,0,1,0,0,0,0,7.675718
1,623.775,3.80477,172.8031,145.57,5,6,10,4,45,0.277778,...,0,0,0,0,1,0,0,0,0,6.812761
2,623.775,3.80477,172.8031,145.57,5,6,10,4,45,0.277778,...,0,0,0,0,1,0,0,0,0,7.428291
3,651.829,4.29997,181.9411,145.57,5,6,12,4,47,0.315789,...,0,0,0,0,1,0,0,0,0,8.60206
4,636.814,4.0019,179.2214,121.72,4,5,12,4,46,0.315789,...,0,0,0,0,1,0,0,0,0,5.0


In [5]:
# Prepare X and y
y = df['pIC50']
X = df.drop(columns=['pIC50'])
print('Feature count: ', X.shape[1])

Feature count:  522


In [6]:
# ------------------------------------------------------------------------------
# Quick feature cleanup
# ------------------------------------------------------------------------------

# a. remove near-constant features

# Fit the selector
vt = VarianceThreshold(threshold=1e-6)
X_v = vt.fit_transform(X)
# Features kept
kept_features = X.columns[vt.get_support()].tolist()
# Features removed
removed_features = [col for col in X.columns if col not in kept_features]
# Create the reduced DataFrame
X = pd.DataFrame(X_v, columns=kept_features)

print(f"Features after variance filter: {len(kept_features)}")
print(f"Features removed: {len(removed_features)}")
# Optional: display or save the removed features
print("\nRemoved features (first 20):")
print(removed_features[:20])

Features after variance filter: 521
Features removed: 1

Removed features (first 20):
['Morgan_512_499']


In [7]:
# b. (optional) remove extremely collinear features (simple correlation filter)

corr_thresh = 0.98
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > corr_thresh)]
if to_drop:
    X.drop(columns=to_drop, inplace=True)
    print("Dropped highly correlated features:", to_drop)

Dropped highly correlated features: ['MolMR', 'HeavyAtomCount']


In [8]:
X.shape

(834, 519)

In [9]:
# ------------------------------------------------------------------------------
# Define descriptor columns to scale and fingerprint columns to pass through
# ------------------------------------------------------------------------------
descriptors = ['MolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors',
    'NumRotatableBonds', 'NumAromaticRings', 'FractionCSP3']
fingerprints = [c for c in X.columns if c not in descriptors]

# ColumnTransformer: scale descriptors, pass fingerprints through
preprocessor = ColumnTransformer([
    ("desc", StandardScaler(), descriptors),
    ("fp", "passthrough", fingerprints)
])

In [10]:
X.head()

Unnamed: 0,MolWt,MolLogP,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,NumAromaticRings,FractionCSP3,Morgan_512_0,Morgan_512_1,...,Morgan_512_502,Morgan_512_503,Morgan_512_504,Morgan_512_505,Morgan_512_506,Morgan_512_507,Morgan_512_508,Morgan_512_509,Morgan_512_510,Morgan_512_511
0,636.814,4.0019,121.72,4.0,5.0,12.0,4.0,0.315789,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,623.775,3.80477,145.57,5.0,6.0,10.0,4.0,0.277778,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,623.775,3.80477,145.57,5.0,6.0,10.0,4.0,0.277778,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,651.829,4.29997,145.57,5.0,6.0,12.0,4.0,0.315789,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,636.814,4.0019,121.72,4.0,5.0,12.0,4.0,0.315789,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
import os

SAVE_DIR = "../artifacts"
os.makedirs(SAVE_DIR, exist_ok=True)

# 1. Save final feature column list (after filters)
final_features = X.columns.tolist()
pd.Series(final_features).to_csv(os.path.join(SAVE_DIR, "protease_features.csv"), index=False)

In [12]:
len(final_features)

519

In [13]:
# ---------------------------
# k-fold OOF trainer
# ---------------------------
def kfold_train_predict(model, X_df, y_arr, n_splits=5, random_state=42):
    """
    model: estimator or pipeline (should accept fit/predict)
    X_df: pandas DataFrame (full dataset)
    y_arr: numpy array or Series
    returns: oof_preds (np.array same length as X_df), mean_r2, mean_rmse
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(X_df))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_df, y_arr), 1):
        X_train, X_val = X_df.iloc[train_idx], X_df.iloc[val_idx]
        y_train, y_val = y_arr[train_idx], y_arr[val_idx]

        # fit on train fold, predict on val fold
        model.fit(X_train, y_train)
        preds = model.predict(X_val)

        oof_preds[val_idx] = preds

        fold_r2 = r2_score(y_val, preds)
        fold_rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_scores.append((fold_r2, fold_rmse))
        print(f"Fold {fold}: R2 = {fold_r2:.3f}, RMSE = {fold_rmse:.3f}")

    mean_r2 = np.mean([s[0] for s in fold_scores])
    mean_rmse = np.mean([s[1] for s in fold_scores])
    print(f"\nMean CV R2 = {mean_r2:.3f}, Mean RMSE = {mean_rmse:.3f}")
    return oof_preds, mean_r2, mean_rmse

In [14]:
# ---------------------------
# Define models and pipelines
# ---------------------------
models = {}

# Linear models (need scaling) -> include preprocessor
models['Linear'] = Pipeline([("preproc", preprocessor), ("model", LinearRegression())])
models['ElasticNet'] = Pipeline([("preproc", preprocessor), ("model", ElasticNet(random_state=42, max_iter=5000))])

# SVR (needs scaling)
models['SVM'] = Pipeline([("preproc", preprocessor), ("model", SVR())])

# KNN
models['KNN'] = Pipeline([("preproc", preprocessor), ("model", KNeighborsRegressor())])

# MLP
models['MLP'] = Pipeline([("preproc", preprocessor), ("model", MLPRegressor(max_iter=2000, random_state=42))])

# Random Forest (trees don't need scaling, but pipeline is fine)
models['Random Forest'] = Pipeline([("preproc", preprocessor), ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])

# XGBoost
models['XGBoost'] = Pipeline([("preproc", preprocessor), ("model", XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1, objective='reg:squarederror'))])

In [15]:
# ---------------------------
# Run k-fold for each model
# ---------------------------
results = {}
oof_dict = {}

for name, model in models.items():
    print(f"\n=== Training model: {name} ===")
    oof_preds, mean_r2, mean_rmse = kfold_train_predict(model, X, y, n_splits=5, random_state=42)
    results[name] = {"r2": mean_r2, "rmse": mean_rmse}
    oof_dict[name] = oof_preds

# Summary
print("\nSummary (mean CV results):")
for name, res in results.items():
    print(f"{name}: R2 = {res['r2']:.4f}, RMSE = {res['rmse']:.4f}")


=== Training model: Linear ===
Fold 1: R2 = -1.433, RMSE = 2.736
Fold 2: R2 = -0.521, RMSE = 2.214
Fold 3: R2 = -1.164, RMSE = 2.549
Fold 4: R2 = -0.456, RMSE = 2.226
Fold 5: R2 = -0.955, RMSE = 2.571

Mean CV R2 = -0.906, Mean RMSE = 2.459

=== Training model: ElasticNet ===
Fold 1: R2 = 0.121, RMSE = 1.644
Fold 2: R2 = 0.085, RMSE = 1.717
Fold 3: R2 = 0.099, RMSE = 1.644
Fold 4: R2 = 0.118, RMSE = 1.732
Fold 5: R2 = 0.116, RMSE = 1.728

Mean CV R2 = 0.108, Mean RMSE = 1.693

=== Training model: SVM ===
Fold 1: R2 = 0.785, RMSE = 0.814
Fold 2: R2 = 0.777, RMSE = 0.849
Fold 3: R2 = 0.773, RMSE = 0.826
Fold 4: R2 = 0.768, RMSE = 0.889
Fold 5: R2 = 0.744, RMSE = 0.930

Mean CV R2 = 0.769, Mean RMSE = 0.862

=== Training model: KNN ===
Fold 1: R2 = 0.783, RMSE = 0.817
Fold 2: R2 = 0.720, RMSE = 0.950
Fold 3: R2 = 0.772, RMSE = 0.828
Fold 4: R2 = 0.781, RMSE = 0.863
Fold 5: R2 = 0.746, RMSE = 0.926

Mean CV R2 = 0.760, Mean RMSE = 0.877

=== Training model: MLP ===
Fold 1: R2 = 0.760, RMS

# 6. Tune the Best Models

### Support Vector Machine (SVR),
### K-Nearest Neighbours (KNN),
### Random Forest (RF)

In [16]:
import optuna
import joblib
from sklearn.decomposition import PCA

In [17]:
# PCA variant of the model
n_fp = len(fingerprints)
print(f"Using {len(descriptors)} descriptor columns and {n_fp} fingerprint columns.")

if n_fp == 0:
    raise ValueError("No fingerprint columns detected. Make sure your X contains fingerprint bit columns.")

# ---------- CV settings ----------
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

Using 8 descriptor columns and 511 fingerprint columns.


In [18]:
# ----------------------- SVM ---------------------------------
def run_optuna_svm(n_trials=80):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        # SVR hyperparameters (log-uniform sampling where appropriate)
        kernel = trial.suggest_categorical("kernel", ["rbf", "poly", "sigmoid"])
        C = trial.suggest_loguniform("C", 1e-3, 1e3)
        epsilon = trial.suggest_loguniform("epsilon", 1e-4, 1.0)
        gamma_choice = trial.suggest_categorical("gamma_choice", ["scale", "auto", "numeric"])
        if gamma_choice == "numeric":
            gamma = trial.suggest_loguniform("gamma", 1e-5, 1e1)
        else:
            gamma = gamma_choice
        # degree and coef0 for polynomial kernel
        if kernel == "poly":
            degree = trial.suggest_int("degree", 2, 5)
            coef0 = trial.suggest_float("coef0", 0.0, 1.0)
        else:
            degree = 3
            coef0 = 0.0

        svr = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, degree=degree, coef0=coef0, max_iter=100000)
        pipe = Pipeline([("preproc", preprocessor), ("svr", svr)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    # reconstruct SVR
    best_kernel = best.get("kernel", "rbf")
    best_gamma = best.get("gamma", best.get("gamma_choice", "scale"))
    if best_gamma == "numeric":
        best_gamma = best.get("gamma")
    best_degree = best.get("degree", 3)
    best_coef0 = best.get("coef0", 0.0)

    best_svm = SVR(
        kernel=best_kernel,
        C=best.get("C"),
        epsilon=best.get("epsilon"),
        gamma=best_gamma,
        degree=best_degree,
        coef0=best_coef0,
        max_iter=100000
    )
    best_pipe = Pipeline([("preproc", preprocessor_best), ("svr", best_svm)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/pi_svm.joblib")
    print("SVM best R2:", study.best_value)
    print("SVM best params:", study.best_params)
    return study, best_pipe

In [19]:
# ------------------ KNN -----------------------
def run_optuna_knn(n_trials=50):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        n_neighbors = trial.suggest_int("n_neighbors", 1, 30)
        weights = trial.suggest_categorical("weights", ["uniform", "distance"])
        p = trial.suggest_int("p", 1, 2)  # 1 = manhattan, 2 = euclidean

        knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=p)
        pipe = Pipeline([("preproc", preprocessor), ("knn", knn)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    best_knn = KNeighborsRegressor(n_neighbors=best["n_neighbors"], weights=best["weights"], p=best["p"])
    best_pipe = Pipeline([("preproc", preprocessor_best), ("knn", best_knn)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/pi_knn.joblib")
    print("KNN best R2:", study.best_value)
    print("KNN best params:", study.best_params)
    return study, best_pipe

In [20]:
# -------------------- Random Forest --------------------------------
def run_optuna_rf(n_trials=50):
    def objective(trial):
        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", "passthrough", fingerprints)
        ])

        n_estimators = trial.suggest_int("n_estimators", 100, 500)
        max_depth = trial.suggest_int("max_depth", 3, 50)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
        max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", 0.3, 0.5, 0.8])

        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                   max_features=max_features, n_jobs=-1, random_state=42)
        pipe = Pipeline([("preproc", preprocessor), ("rf", rf)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    best = study.best_params
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", "passthrough", fingerprints)
    ])
    best_rf = RandomForestRegressor(n_estimators=best["n_estimators"], max_depth=best["max_depth"],
                                    min_samples_split=best["min_samples_split"],
                                    min_samples_leaf=best["min_samples_leaf"],
                                    max_features=best["max_features"], n_jobs=-1, random_state=42)
    best_pipe = Pipeline([("preproc", preprocessor_best), ("rf", best_rf)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/pi_rf.joblib")
    print("RF best R2:", study.best_value)
    print("RF best params:", study.best_params)
    return study, best_pipe

In [21]:
# ----------------------------
# Example: run all studies sequentially (adjust n_trials)
# ----------------------------
svm_study, svm_pipe = run_optuna_svm(n_trials=50)

[I 2025-10-07 23:01:18,821] A new study created in memory with name: no-name-54c724f9-bd88-40eb-ad75-4c1b49e77415
Best trial: 0. Best value: 0.333167:   2%|▏         | 1/50 [00:11<09:23, 11.50s/it]

[I 2025-10-07 23:01:30,315] Trial 0 finished with value: 0.33316702744851867 and parameters: {'use_pca': True, 'pca_n_components': 203, 'kernel': 'poly', 'C': 0.144984185718093, 'epsilon': 0.0005468826676351325, 'gamma_choice': 'auto', 'degree': 3, 'coef0': 0.33945995115873917}. Best is trial 0 with value: 0.33316702744851867.


Best trial: 0. Best value: 0.333167:   4%|▍         | 2/50 [00:20<07:48,  9.75s/it]

[I 2025-10-07 23:01:38,841] Trial 1 finished with value: 0.07098982482087728 and parameters: {'use_pca': True, 'pca_n_components': 131, 'kernel': 'rbf', 'C': 0.04051205306627068, 'epsilon': 0.009937489409736737, 'gamma_choice': 'numeric', 'gamma': 0.000588086340924328}. Best is trial 0 with value: 0.33316702744851867.


Best trial: 0. Best value: 0.333167:   6%|▌         | 3/50 [00:20<04:26,  5.67s/it]

[I 2025-10-07 23:01:39,648] Trial 2 finished with value: 0.019193277052792147 and parameters: {'use_pca': True, 'pca_n_components': 142, 'kernel': 'rbf', 'C': 0.026173252201609147, 'epsilon': 0.06965734708026083, 'gamma_choice': 'numeric', 'gamma': 0.0003539407748846822}. Best is trial 0 with value: 0.33316702744851867.


Best trial: 0. Best value: 0.333167:   8%|▊         | 4/50 [00:21<02:48,  3.66s/it]

[I 2025-10-07 23:01:40,229] Trial 3 finished with value: -1.008950886614687 and parameters: {'use_pca': True, 'pca_n_components': 53, 'kernel': 'sigmoid', 'C': 1.83377304114063, 'epsilon': 0.0021513354301928014, 'gamma_choice': 'scale'}. Best is trial 0 with value: 0.33316702744851867.


Best trial: 4. Best value: 0.672905:  10%|█         | 5/50 [00:24<02:33,  3.41s/it]

[I 2025-10-07 23:01:43,206] Trial 4 finished with value: 0.6729047237519261 and parameters: {'use_pca': True, 'pca_n_components': 32, 'kernel': 'poly', 'C': 0.01090078964967063, 'epsilon': 0.0012977844943093202, 'gamma_choice': 'numeric', 'gamma': 2.9076262272038047, 'degree': 2, 'coef0': 0.3599735512613764}. Best is trial 4 with value: 0.6729047237519261.


Best trial: 5. Best value: 0.717002:  12%|█▏        | 6/50 [00:25<01:49,  2.50s/it]

[I 2025-10-07 23:01:43,931] Trial 5 finished with value: 0.7170022018720559 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 13.163702697079021, 'epsilon': 0.031130756008527515, 'gamma_choice': 'auto', 'degree': 5, 'coef0': 0.35445552171870265}. Best is trial 5 with value: 0.7170022018720559.


Best trial: 5. Best value: 0.717002:  14%|█▍        | 7/50 [00:25<01:22,  1.91s/it]

[I 2025-10-07 23:01:44,630] Trial 6 finished with value: -0.017434606429448163 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 0.01708175886340897, 'epsilon': 0.31422253243875825, 'gamma_choice': 'numeric', 'gamma': 4.22397608667676}. Best is trial 5 with value: 0.7170022018720559.


Best trial: 7. Best value: 0.758075:  16%|█▌        | 8/50 [00:27<01:14,  1.76s/it]

[I 2025-10-07 23:01:46,079] Trial 7 finished with value: 0.7580747929708265 and parameters: {'use_pca': True, 'pca_n_components': 246, 'kernel': 'poly', 'C': 567.8183437573593, 'epsilon': 0.00029323574308138167, 'gamma_choice': 'numeric', 'gamma': 0.0003035451937079993, 'degree': 5, 'coef0': 0.8172077228733875}. Best is trial 7 with value: 0.7580747929708265.


Best trial: 7. Best value: 0.758075:  18%|█▊        | 9/50 [00:29<01:13,  1.78s/it]

[I 2025-10-07 23:01:47,898] Trial 8 finished with value: 0.6367424462857741 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 340.47319254410735, 'epsilon': 0.041327568040439355, 'gamma_choice': 'auto'}. Best is trial 7 with value: 0.7580747929708265.


Best trial: 7. Best value: 0.758075:  20%|██        | 10/50 [00:30<01:01,  1.54s/it]

[I 2025-10-07 23:01:48,901] Trial 9 finished with value: 0.22081711989375927 and parameters: {'use_pca': True, 'pca_n_components': 258, 'kernel': 'sigmoid', 'C': 0.008231146234089437, 'epsilon': 0.0010614046667260253, 'gamma_choice': 'scale'}. Best is trial 7 with value: 0.7580747929708265.


Best trial: 7. Best value: 0.758075:  22%|██▏       | 11/50 [00:30<00:51,  1.33s/it]

[I 2025-10-07 23:01:49,746] Trial 10 finished with value: 0.7548896302608108 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 815.403889559941, 'epsilon': 0.00010579099247821183, 'gamma_choice': 'numeric', 'gamma': 1.012554409534924e-05, 'degree': 5, 'coef0': 0.9695144579386619}. Best is trial 7 with value: 0.7580747929708265.


Best trial: 7. Best value: 0.758075:  24%|██▍       | 12/50 [00:31<00:45,  1.18s/it]

[I 2025-10-07 23:01:50,602] Trial 11 finished with value: 0.7542392767132715 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 927.7809313800011, 'epsilon': 0.00012633974939104713, 'gamma_choice': 'numeric', 'gamma': 1.0608077724461657e-05, 'degree': 5, 'coef0': 0.9978934076551094}. Best is trial 7 with value: 0.7580747929708265.


Best trial: 7. Best value: 0.758075:  26%|██▌       | 13/50 [00:32<00:39,  1.06s/it]

[I 2025-10-07 23:01:51,391] Trial 12 finished with value: 0.6368042410239286 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 55.60822118290568, 'epsilon': 0.00011091044142910736, 'gamma_choice': 'numeric', 'gamma': 1.2419315716259243e-05, 'degree': 5, 'coef0': 0.9426367879587397}. Best is trial 7 with value: 0.7580747929708265.


Best trial: 13. Best value: 0.760595:  28%|██▊       | 14/50 [00:33<00:35,  1.00it/s]

[I 2025-10-07 23:01:52,239] Trial 13 finished with value: 0.7605951869133601 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 48.78343124016288, 'epsilon': 0.0004134366960635198, 'gamma_choice': 'numeric', 'gamma': 0.0004731337241592208, 'degree': 4, 'coef0': 0.7248775327461312}. Best is trial 13 with value: 0.7605951869133601.


Best trial: 14. Best value: 0.762537:  30%|███       | 15/50 [00:34<00:39,  1.12s/it]

[I 2025-10-07 23:01:53,621] Trial 14 finished with value: 0.7625372719450851 and parameters: {'use_pca': True, 'pca_n_components': 285, 'kernel': 'poly', 'C': 43.86879630156964, 'epsilon': 0.0054787889974277845, 'gamma_choice': 'numeric', 'gamma': 0.004131403956885738, 'degree': 4, 'coef0': 0.6811656666794897}. Best is trial 14 with value: 0.7625372719450851.


Best trial: 14. Best value: 0.762537:  32%|███▏      | 16/50 [00:36<00:40,  1.18s/it]

[I 2025-10-07 23:01:54,947] Trial 15 finished with value: 0.744464969470626 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 8.939804011580922, 'epsilon': 0.0055565919475393105, 'gamma_choice': 'scale', 'degree': 4, 'coef0': 0.6556742642836981}. Best is trial 14 with value: 0.7625372719450851.


Best trial: 14. Best value: 0.762537:  34%|███▍      | 17/50 [00:37<00:37,  1.14s/it]

[I 2025-10-07 23:01:56,000] Trial 16 finished with value: 0.740851717919089 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 51.79219234290182, 'epsilon': 0.005955878116000091, 'gamma_choice': 'numeric', 'gamma': 0.029318840040567166}. Best is trial 14 with value: 0.7625372719450851.


Best trial: 17. Best value: 0.774754:  36%|███▌      | 18/50 [00:38<00:37,  1.16s/it]

[I 2025-10-07 23:01:57,196] Trial 17 finished with value: 0.7747542271449313 and parameters: {'use_pca': True, 'pca_n_components': 281, 'kernel': 'poly', 'C': 0.7557449968385085, 'epsilon': 0.002828582842916999, 'gamma_choice': 'numeric', 'gamma': 0.0176216414493264, 'degree': 4, 'coef0': 0.6440746615307898}. Best is trial 17 with value: 0.7747542271449313.


Best trial: 17. Best value: 0.774754:  38%|███▊      | 19/50 [00:39<00:35,  1.15s/it]

[I 2025-10-07 23:01:58,341] Trial 18 finished with value: 0.0365633857328725 and parameters: {'use_pca': True, 'pca_n_components': 293, 'kernel': 'poly', 'C': 0.44608990818694744, 'epsilon': 0.0029458198879505977, 'gamma_choice': 'auto', 'degree': 3, 'coef0': 0.03469029537604146}. Best is trial 17 with value: 0.7747542271449313.


Best trial: 19. Best value: 0.775242:  40%|████      | 20/50 [00:40<00:35,  1.19s/it]

[I 2025-10-07 23:01:59,627] Trial 19 finished with value: 0.7752418238476378 and parameters: {'use_pca': True, 'pca_n_components': 290, 'kernel': 'rbf', 'C': 1.5573418212482555, 'epsilon': 0.018384290563234776, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  42%|████▏     | 21/50 [00:41<00:32,  1.13s/it]

[I 2025-10-07 23:02:00,610] Trial 20 finished with value: 0.01627276247773979 and parameters: {'use_pca': True, 'pca_n_components': 212, 'kernel': 'rbf', 'C': 0.0018601553828214551, 'epsilon': 0.21422725652796787, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  44%|████▍     | 22/50 [00:43<00:34,  1.22s/it]

[I 2025-10-07 23:02:02,047] Trial 21 finished with value: 0.7743758223209805 and parameters: {'use_pca': True, 'pca_n_components': 297, 'kernel': 'rbf', 'C': 3.8382746029439314, 'epsilon': 0.018277638007212778, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  46%|████▌     | 23/50 [00:44<00:34,  1.26s/it]

[I 2025-10-07 23:02:03,389] Trial 22 finished with value: 0.7744691661588019 and parameters: {'use_pca': True, 'pca_n_components': 299, 'kernel': 'rbf', 'C': 1.4497424983495988, 'epsilon': 0.02176846898293579, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  48%|████▊     | 24/50 [00:45<00:31,  1.20s/it]

[I 2025-10-07 23:02:04,443] Trial 23 finished with value: 0.7337257987717404 and parameters: {'use_pca': True, 'pca_n_components': 237, 'kernel': 'rbf', 'C': 0.3862869477450005, 'epsilon': 0.10309237976430233, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  50%|█████     | 25/50 [00:46<00:26,  1.07s/it]

[I 2025-10-07 23:02:05,215] Trial 24 finished with value: 0.7091243744199243 and parameters: {'use_pca': True, 'pca_n_components': 188, 'kernel': 'rbf', 'C': 0.9072266449043132, 'epsilon': 0.8804456643438189, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  52%|█████▏    | 26/50 [00:47<00:26,  1.11s/it]

[I 2025-10-07 23:02:06,417] Trial 25 finished with value: 0.5468813626383546 and parameters: {'use_pca': True, 'pca_n_components': 263, 'kernel': 'rbf', 'C': 0.09941674169143751, 'epsilon': 0.012649249528576214, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  54%|█████▍    | 27/50 [00:48<00:26,  1.17s/it]

[I 2025-10-07 23:02:07,739] Trial 26 finished with value: 0.7744504956108855 and parameters: {'use_pca': True, 'pca_n_components': 299, 'kernel': 'rbf', 'C': 3.9139868305617886, 'epsilon': 0.02435262035293425, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  56%|█████▌    | 28/50 [00:49<00:22,  1.03s/it]

[I 2025-10-07 23:02:08,436] Trial 27 finished with value: 0.6830672454085203 and parameters: {'use_pca': True, 'pca_n_components': 108, 'kernel': 'rbf', 'C': 0.199634065161188, 'epsilon': 0.06486088596368497, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  58%|█████▊    | 29/50 [00:50<00:21,  1.04s/it]

[I 2025-10-07 23:02:09,499] Trial 28 finished with value: 0.7728967356373437 and parameters: {'use_pca': True, 'pca_n_components': 227, 'kernel': 'rbf', 'C': 1.251357513917136, 'epsilon': 0.0028999632998182225, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  60%|██████    | 30/50 [00:51<00:19,  1.01it/s]

[I 2025-10-07 23:02:10,378] Trial 29 finished with value: 0.47819685248684357 and parameters: {'use_pca': True, 'pca_n_components': 177, 'kernel': 'rbf', 'C': 0.10771039549298791, 'epsilon': 0.01036705375277184, 'gamma_choice': 'auto'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 19. Best value: 0.775242:  62%|██████▏   | 31/50 [00:52<00:19,  1.03s/it]

[I 2025-10-07 23:02:11,511] Trial 30 finished with value: -14.571669500151984 and parameters: {'use_pca': True, 'pca_n_components': 260, 'kernel': 'sigmoid', 'C': 10.292658125374043, 'epsilon': 0.0012629410507429073, 'gamma_choice': 'scale'}. Best is trial 19 with value: 0.7752418238476378.


Best trial: 31. Best value: 0.775542:  64%|██████▍   | 32/50 [00:53<00:19,  1.10s/it]

[I 2025-10-07 23:02:12,778] Trial 31 finished with value: 0.7755418909965857 and parameters: {'use_pca': True, 'pca_n_components': 299, 'kernel': 'rbf', 'C': 3.298628958528029, 'epsilon': 0.023675742757736085, 'gamma_choice': 'scale'}. Best is trial 31 with value: 0.7755418909965857.


Best trial: 32. Best value: 0.776292:  66%|██████▌   | 33/50 [00:55<00:19,  1.16s/it]

[I 2025-10-07 23:02:14,061] Trial 32 finished with value: 0.7762921639146796 and parameters: {'use_pca': True, 'pca_n_components': 276, 'kernel': 'rbf', 'C': 2.968304768439402, 'epsilon': 0.016989681066338858, 'gamma_choice': 'scale'}. Best is trial 32 with value: 0.7762921639146796.


Best trial: 32. Best value: 0.776292:  68%|██████▊   | 34/50 [00:56<00:19,  1.19s/it]

[I 2025-10-07 23:02:15,335] Trial 33 finished with value: 0.7747373356475236 and parameters: {'use_pca': True, 'pca_n_components': 269, 'kernel': 'rbf', 'C': 3.324750783808802, 'epsilon': 0.008180859507501708, 'gamma_choice': 'scale'}. Best is trial 32 with value: 0.7762921639146796.


Best trial: 32. Best value: 0.776292:  70%|███████   | 35/50 [00:57<00:18,  1.23s/it]

[I 2025-10-07 23:02:16,638] Trial 34 finished with value: 0.7570101480279553 and parameters: {'use_pca': True, 'pca_n_components': 273, 'kernel': 'rbf', 'C': 0.6181778362201018, 'epsilon': 0.04134471459212577, 'gamma_choice': 'scale'}. Best is trial 32 with value: 0.7762921639146796.


Best trial: 32. Best value: 0.776292:  72%|███████▏  | 36/50 [00:58<00:16,  1.17s/it]

[I 2025-10-07 23:02:17,670] Trial 35 finished with value: 0.762307809819921 and parameters: {'use_pca': True, 'pca_n_components': 228, 'kernel': 'rbf', 'C': 19.45502401762342, 'epsilon': 0.09911649556795267, 'gamma_choice': 'scale'}. Best is trial 32 with value: 0.7762921639146796.


Best trial: 32. Best value: 0.776292:  74%|███████▍  | 37/50 [01:00<00:15,  1.17s/it]

[I 2025-10-07 23:02:18,835] Trial 36 finished with value: 0.5518792669955366 and parameters: {'use_pca': True, 'pca_n_components': 278, 'kernel': 'rbf', 'C': 0.23923017459224352, 'epsilon': 0.014943739833419121, 'gamma_choice': 'auto'}. Best is trial 32 with value: 0.7762921639146796.


Best trial: 37. Best value: 0.776525:  76%|███████▌  | 38/50 [01:01<00:13,  1.15s/it]

[I 2025-10-07 23:02:19,943] Trial 37 finished with value: 0.7765254438224104 and parameters: {'use_pca': True, 'pca_n_components': 250, 'kernel': 'rbf', 'C': 2.1888423528442957, 'epsilon': 0.0033195680527641947, 'gamma_choice': 'scale'}. Best is trial 37 with value: 0.7765254438224104.


Best trial: 37. Best value: 0.776525:  78%|███████▊  | 39/50 [01:02<00:12,  1.17s/it]

[I 2025-10-07 23:02:21,162] Trial 38 finished with value: 0.7541108660617171 and parameters: {'use_pca': True, 'pca_n_components': 247, 'kernel': 'rbf', 'C': 153.14720796314518, 'epsilon': 0.04928208443849989, 'gamma_choice': 'scale'}. Best is trial 37 with value: 0.7765254438224104.


Best trial: 39. Best value: 0.782866:  80%|████████  | 40/50 [01:03<00:11,  1.11s/it]

[I 2025-10-07 23:02:22,118] Trial 39 finished with value: 0.782865839428826 and parameters: {'use_pca': True, 'pca_n_components': 212, 'kernel': 'rbf', 'C': 2.224704276478787, 'epsilon': 0.19669744579688375, 'gamma_choice': 'scale'}. Best is trial 39 with value: 0.782865839428826.


Best trial: 39. Best value: 0.782866:  82%|████████▏ | 41/50 [01:04<00:09,  1.10s/it]

[I 2025-10-07 23:02:23,222] Trial 40 finished with value: 0.781412454737469 and parameters: {'use_pca': True, 'pca_n_components': 204, 'kernel': 'rbf', 'C': 6.014938042067777, 'epsilon': 0.2191099514946115, 'gamma_choice': 'scale'}. Best is trial 39 with value: 0.782865839428826.


Best trial: 41. Best value: 0.783533:  84%|████████▍ | 42/50 [01:05<00:08,  1.04s/it]

[I 2025-10-07 23:02:24,109] Trial 41 finished with value: 0.7835327289742409 and parameters: {'use_pca': True, 'pca_n_components': 208, 'kernel': 'rbf', 'C': 6.004472924269693, 'epsilon': 0.3815127542035473, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533:  86%|████████▌ | 43/50 [01:06<00:06,  1.02it/s]

[I 2025-10-07 23:02:24,969] Trial 42 finished with value: 0.776888800313368 and parameters: {'use_pca': True, 'pca_n_components': 203, 'kernel': 'rbf', 'C': 6.407665371125777, 'epsilon': 0.5643859564693117, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533:  88%|████████▊ | 44/50 [01:06<00:05,  1.06it/s]

[I 2025-10-07 23:02:25,808] Trial 43 finished with value: 0.7236788955372399 and parameters: {'use_pca': True, 'pca_n_components': 203, 'kernel': 'rbf', 'C': 19.5744329248043, 'epsilon': 0.9444193984651594, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533:  90%|█████████ | 45/50 [01:07<00:04,  1.08it/s]

[I 2025-10-07 23:02:26,706] Trial 44 finished with value: -12.213403991379923 and parameters: {'use_pca': True, 'pca_n_components': 159, 'kernel': 'sigmoid', 'C': 9.30035583905653, 'epsilon': 0.4743029903956596, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533:  92%|█████████▏| 46/50 [01:08<00:03,  1.18it/s]

[I 2025-10-07 23:02:27,377] Trial 45 finished with value: 0.7803913681535279 and parameters: {'use_pca': True, 'pca_n_components': 188, 'kernel': 'rbf', 'C': 5.9308637086606915, 'epsilon': 0.20329542133752704, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533:  94%|█████████▍| 47/50 [01:09<00:02,  1.31it/s]

[I 2025-10-07 23:02:27,948] Trial 46 finished with value: 0.7780797191343589 and parameters: {'use_pca': True, 'pca_n_components': 172, 'kernel': 'rbf', 'C': 5.800204281504729, 'epsilon': 0.16793229435045245, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533:  96%|█████████▌| 48/50 [01:09<00:01,  1.50it/s]

[I 2025-10-07 23:02:28,385] Trial 47 finished with value: 0.6428220916118009 and parameters: {'use_pca': True, 'pca_n_components': 122, 'kernel': 'sigmoid', 'C': 19.66218158042845, 'epsilon': 0.20800865815049707, 'gamma_choice': 'auto'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533:  98%|█████████▊| 49/50 [01:10<00:00,  1.63it/s]

[I 2025-10-07 23:02:28,873] Trial 48 finished with value: 0.7738341491527824 and parameters: {'use_pca': True, 'pca_n_components': 160, 'kernel': 'rbf', 'C': 5.8122729809054015, 'epsilon': 0.11846051281345038, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.


Best trial: 41. Best value: 0.783533: 100%|██████████| 50/50 [01:10<00:00,  1.41s/it]


[I 2025-10-07 23:02:29,346] Trial 49 finished with value: 0.7611497716610789 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 95.23408607528143, 'epsilon': 0.22523215814978376, 'gamma_choice': 'scale'}. Best is trial 41 with value: 0.7835327289742409.
SVM best R2: 0.7835327289742409
SVM best params: {'use_pca': True, 'pca_n_components': 208, 'kernel': 'rbf', 'C': 6.004472924269693, 'epsilon': 0.3815127542035473, 'gamma_choice': 'scale'}


In [22]:
knn_study, knn_pipe = run_optuna_knn(n_trials=50)

[I 2025-10-07 23:02:29,696] A new study created in memory with name: no-name-9b948db5-60b9-4067-9b45-8c160f07746b
Best trial: 0. Best value: 0.638474:   2%|▏         | 1/50 [00:00<00:31,  1.58it/s]

[I 2025-10-07 23:02:30,325] Trial 0 finished with value: 0.6384743341301162 and parameters: {'use_pca': True, 'pca_n_components': 274, 'n_neighbors': 21, 'weights': 'uniform', 'p': 1}. Best is trial 0 with value: 0.6384743341301162.


Best trial: 1. Best value: 0.718032:   4%|▍         | 2/50 [00:01<00:34,  1.38it/s]

[I 2025-10-07 23:02:31,113] Trial 1 finished with value: 0.7180319955484974 and parameters: {'use_pca': False, 'n_neighbors': 16, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.7180319955484974.


Best trial: 1. Best value: 0.718032:   6%|▌         | 3/50 [00:01<00:27,  1.71it/s]

[I 2025-10-07 23:02:31,534] Trial 2 finished with value: 0.6978048028974643 and parameters: {'use_pca': False, 'n_neighbors': 28, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.7180319955484974.


Best trial: 1. Best value: 0.718032:   8%|▊         | 4/50 [00:02<00:26,  1.76it/s]

[I 2025-10-07 23:02:32,076] Trial 3 finished with value: 0.6978048028974643 and parameters: {'use_pca': False, 'n_neighbors': 28, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.7180319955484974.


Best trial: 1. Best value: 0.718032:  10%|█         | 5/50 [00:03<00:29,  1.54it/s]

[I 2025-10-07 23:02:32,868] Trial 4 finished with value: 0.6164052752730969 and parameters: {'use_pca': True, 'pca_n_components': 254, 'n_neighbors': 29, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.7180319955484974.


Best trial: 5. Best value: 0.737878:  12%|█▏        | 6/50 [00:03<00:27,  1.62it/s]

[I 2025-10-07 23:02:33,423] Trial 5 finished with value: 0.7378779945307459 and parameters: {'use_pca': True, 'pca_n_components': 195, 'n_neighbors': 11, 'weights': 'uniform', 'p': 1}. Best is trial 5 with value: 0.7378779945307459.


Best trial: 5. Best value: 0.737878:  14%|█▍        | 7/50 [00:03<00:20,  2.08it/s]

[I 2025-10-07 23:02:33,616] Trial 6 finished with value: 0.6982768239547892 and parameters: {'use_pca': False, 'n_neighbors': 21, 'weights': 'uniform', 'p': 2}. Best is trial 5 with value: 0.7378779945307459.


Best trial: 5. Best value: 0.737878:  16%|█▌        | 8/50 [00:04<00:18,  2.27it/s]

[I 2025-10-07 23:02:33,976] Trial 7 finished with value: 0.7083836900224166 and parameters: {'use_pca': True, 'pca_n_components': 11, 'n_neighbors': 11, 'weights': 'distance', 'p': 2}. Best is trial 5 with value: 0.7378779945307459.


Best trial: 5. Best value: 0.737878:  18%|█▊        | 9/50 [00:04<00:16,  2.47it/s]

[I 2025-10-07 23:02:34,300] Trial 8 finished with value: 0.7284530423860461 and parameters: {'use_pca': False, 'n_neighbors': 11, 'weights': 'distance', 'p': 1}. Best is trial 5 with value: 0.7378779945307459.


Best trial: 9. Best value: 0.757454:  20%|██        | 10/50 [00:04<00:15,  2.56it/s]

[I 2025-10-07 23:02:34,658] Trial 9 finished with value: 0.7574536053552073 and parameters: {'use_pca': False, 'n_neighbors': 7, 'weights': 'uniform', 'p': 1}. Best is trial 9 with value: 0.7574536053552073.


Best trial: 9. Best value: 0.757454:  22%|██▏       | 11/50 [00:05<00:13,  2.85it/s]

[I 2025-10-07 23:02:34,925] Trial 10 finished with value: 0.5801756294432494 and parameters: {'use_pca': False, 'n_neighbors': 1, 'weights': 'uniform', 'p': 2}. Best is trial 9 with value: 0.7574536053552073.


Best trial: 11. Best value: 0.762927:  24%|██▍       | 12/50 [00:05<00:14,  2.71it/s]

[I 2025-10-07 23:02:35,335] Trial 11 finished with value: 0.7629274653765751 and parameters: {'use_pca': True, 'pca_n_components': 133, 'n_neighbors': 4, 'weights': 'uniform', 'p': 1}. Best is trial 11 with value: 0.7629274653765751.


Best trial: 11. Best value: 0.762927:  26%|██▌       | 13/50 [00:05<00:12,  2.87it/s]

[I 2025-10-07 23:02:35,630] Trial 12 finished with value: 0.7443776922376971 and parameters: {'use_pca': True, 'pca_n_components': 89, 'n_neighbors': 2, 'weights': 'uniform', 'p': 1}. Best is trial 11 with value: 0.7629274653765751.


Best trial: 13. Best value: 0.763979:  28%|██▊       | 14/50 [00:06<00:12,  2.78it/s]

[I 2025-10-07 23:02:36,026] Trial 13 finished with value: 0.7639785430603963 and parameters: {'use_pca': True, 'pca_n_components': 130, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  30%|███       | 15/50 [00:06<00:12,  2.86it/s]

[I 2025-10-07 23:02:36,346] Trial 14 finished with value: 0.7569686921187058 and parameters: {'use_pca': True, 'pca_n_components': 132, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  32%|███▏      | 16/50 [00:07<00:12,  2.75it/s]

[I 2025-10-07 23:02:36,714] Trial 15 finished with value: 0.7579311509787434 and parameters: {'use_pca': True, 'pca_n_components': 120, 'n_neighbors': 5, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  34%|███▍      | 17/50 [00:07<00:12,  2.55it/s]

[I 2025-10-07 23:02:37,206] Trial 16 finished with value: 0.7051180305920146 and parameters: {'use_pca': True, 'pca_n_components': 188, 'n_neighbors': 15, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  36%|███▌      | 18/50 [00:07<00:11,  2.74it/s]

[I 2025-10-07 23:02:37,499] Trial 17 finished with value: 0.7605820475511759 and parameters: {'use_pca': True, 'pca_n_components': 60, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  38%|███▊      | 19/50 [00:08<00:11,  2.62it/s]

[I 2025-10-07 23:02:37,922] Trial 18 finished with value: 0.7540597816718846 and parameters: {'use_pca': True, 'pca_n_components': 178, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  40%|████      | 20/50 [00:08<00:10,  2.75it/s]

[I 2025-10-07 23:02:38,245] Trial 19 finished with value: 0.7348472047357608 and parameters: {'use_pca': True, 'pca_n_components': 83, 'n_neighbors': 15, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  42%|████▏     | 21/50 [00:09<00:11,  2.52it/s]

[I 2025-10-07 23:02:38,723] Trial 20 finished with value: 0.7522948254323114 and parameters: {'use_pca': True, 'pca_n_components': 226, 'n_neighbors': 9, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  44%|████▍     | 22/50 [00:09<00:10,  2.63it/s]

[I 2025-10-07 23:02:39,059] Trial 21 finished with value: 0.7417225979487994 and parameters: {'use_pca': True, 'pca_n_components': 32, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  46%|████▌     | 23/50 [00:09<00:09,  2.79it/s]

[I 2025-10-07 23:02:39,372] Trial 22 finished with value: 0.7517842214959306 and parameters: {'use_pca': True, 'pca_n_components': 64, 'n_neighbors': 3, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  48%|████▊     | 24/50 [00:10<00:09,  2.83it/s]

[I 2025-10-07 23:02:39,713] Trial 23 finished with value: 0.586494144631075 and parameters: {'use_pca': True, 'pca_n_components': 131, 'n_neighbors': 1, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  50%|█████     | 25/50 [00:10<00:08,  2.88it/s]

[I 2025-10-07 23:02:40,046] Trial 24 finished with value: 0.7571679147685837 and parameters: {'use_pca': True, 'pca_n_components': 103, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  52%|█████▏    | 26/50 [00:10<00:08,  2.68it/s]

[I 2025-10-07 23:02:40,468] Trial 25 finished with value: 0.756854726764818 and parameters: {'use_pca': True, 'pca_n_components': 157, 'n_neighbors': 9, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  54%|█████▍    | 27/50 [00:11<00:08,  2.76it/s]

[I 2025-10-07 23:02:40,804] Trial 26 finished with value: 0.7318331700913827 and parameters: {'use_pca': True, 'pca_n_components': 50, 'n_neighbors': 13, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  56%|█████▌    | 28/50 [00:11<00:09,  2.22it/s]

[I 2025-10-07 23:02:41,474] Trial 27 finished with value: 0.748716023988705 and parameters: {'use_pca': True, 'pca_n_components': 159, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  58%|█████▊    | 29/50 [00:12<00:08,  2.47it/s]

[I 2025-10-07 23:02:41,770] Trial 28 finished with value: 0.757333166418945 and parameters: {'use_pca': True, 'pca_n_components': 74, 'n_neighbors': 7, 'weights': 'uniform', 'p': 2}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  60%|██████    | 30/50 [00:12<00:07,  2.55it/s]

[I 2025-10-07 23:02:42,133] Trial 29 finished with value: 0.7003893490290134 and parameters: {'use_pca': True, 'pca_n_components': 109, 'n_neighbors': 19, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  62%|██████▏   | 31/50 [00:12<00:06,  2.76it/s]

[I 2025-10-07 23:02:42,423] Trial 30 finished with value: 0.745667177300349 and parameters: {'use_pca': True, 'pca_n_components': 43, 'n_neighbors': 4, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 13. Best value: 0.763979:  64%|██████▍   | 32/50 [00:13<00:07,  2.47it/s]

[I 2025-10-07 23:02:42,924] Trial 31 finished with value: 0.7585645495053235 and parameters: {'use_pca': True, 'pca_n_components': 126, 'n_neighbors': 5, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7639785430603963.


Best trial: 32. Best value: 0.767903:  66%|██████▌   | 33/50 [00:13<00:07,  2.13it/s]

[I 2025-10-07 23:02:43,549] Trial 32 finished with value: 0.767903022513214 and parameters: {'use_pca': True, 'pca_n_components': 145, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  68%|██████▊   | 34/50 [00:14<00:08,  1.96it/s]

[I 2025-10-07 23:02:44,156] Trial 33 finished with value: 0.7548502519907337 and parameters: {'use_pca': True, 'pca_n_components': 156, 'n_neighbors': 9, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  70%|███████   | 35/50 [00:15<00:07,  1.92it/s]

[I 2025-10-07 23:02:44,698] Trial 34 finished with value: 0.5527981722602782 and parameters: {'use_pca': True, 'pca_n_components': 220, 'n_neighbors': 1, 'weights': 'distance', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  72%|███████▏  | 36/50 [00:15<00:06,  2.07it/s]

[I 2025-10-07 23:02:45,093] Trial 35 finished with value: 0.6711977246427525 and parameters: {'use_pca': True, 'pca_n_components': 137, 'n_neighbors': 25, 'weights': 'distance', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  74%|███████▍  | 37/50 [00:15<00:05,  2.24it/s]

[I 2025-10-07 23:02:45,451] Trial 36 finished with value: 0.7427459656779221 and parameters: {'use_pca': False, 'n_neighbors': 13, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  76%|███████▌  | 38/50 [00:16<00:05,  2.35it/s]

[I 2025-10-07 23:02:45,831] Trial 37 finished with value: 0.7643660017354206 and parameters: {'use_pca': True, 'pca_n_components': 98, 'n_neighbors': 7, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  78%|███████▊  | 39/50 [00:16<00:05,  2.00it/s]

[I 2025-10-07 23:02:46,503] Trial 38 finished with value: 0.6598816071510171 and parameters: {'use_pca': True, 'pca_n_components': 299, 'n_neighbors': 18, 'weights': 'distance', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  80%|████████  | 40/50 [00:17<00:04,  2.16it/s]

[I 2025-10-07 23:02:46,882] Trial 39 finished with value: 0.7574536053552073 and parameters: {'use_pca': False, 'n_neighbors': 7, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  82%|████████▏ | 41/50 [00:17<00:04,  2.23it/s]

[I 2025-10-07 23:02:47,297] Trial 40 finished with value: 0.7357852991273848 and parameters: {'use_pca': True, 'pca_n_components': 95, 'n_neighbors': 9, 'weights': 'distance', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  84%|████████▍ | 42/50 [00:17<00:03,  2.38it/s]

[I 2025-10-07 23:02:47,648] Trial 41 finished with value: 0.756962425484226 and parameters: {'use_pca': True, 'pca_n_components': 110, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  86%|████████▌ | 43/50 [00:18<00:03,  2.22it/s]

[I 2025-10-07 23:02:48,166] Trial 42 finished with value: 0.7633583088549061 and parameters: {'use_pca': True, 'pca_n_components': 144, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  88%|████████▊ | 44/50 [00:18<00:02,  2.15it/s]

[I 2025-10-07 23:02:48,665] Trial 43 finished with value: 0.7355624892539663 and parameters: {'use_pca': True, 'pca_n_components': 173, 'n_neighbors': 12, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  90%|█████████ | 45/50 [00:19<00:02,  2.33it/s]

[I 2025-10-07 23:02:49,012] Trial 44 finished with value: 0.7578624946800685 and parameters: {'use_pca': False, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  92%|█████████▏| 46/50 [00:19<00:01,  2.47it/s]

[I 2025-10-07 23:02:49,357] Trial 45 finished with value: 0.7639888611552795 and parameters: {'use_pca': True, 'pca_n_components': 143, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  94%|█████████▍| 47/50 [00:20<00:01,  2.38it/s]

[I 2025-10-07 23:02:49,817] Trial 46 finished with value: 0.7341353370437479 and parameters: {'use_pca': True, 'pca_n_components': 206, 'n_neighbors': 10, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  96%|█████████▌| 48/50 [00:20<00:00,  2.31it/s]

[I 2025-10-07 23:02:50,281] Trial 47 finished with value: 0.7591178040306743 and parameters: {'use_pca': True, 'pca_n_components': 149, 'n_neighbors': 7, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903:  98%|█████████▊| 49/50 [00:21<00:00,  2.17it/s]

[I 2025-10-07 23:02:50,802] Trial 48 finished with value: 0.7284530423860461 and parameters: {'use_pca': False, 'n_neighbors': 11, 'weights': 'distance', 'p': 1}. Best is trial 32 with value: 0.767903022513214.


Best trial: 32. Best value: 0.767903: 100%|██████████| 50/50 [00:21<00:00,  2.31it/s]

[I 2025-10-07 23:02:51,334] Trial 49 finished with value: 0.6613473431411337 and parameters: {'use_pca': True, 'pca_n_components': 143, 'n_neighbors': 24, 'weights': 'uniform', 'p': 1}. Best is trial 32 with value: 0.767903022513214.
KNN best R2: 0.767903022513214
KNN best params: {'use_pca': True, 'pca_n_components': 145, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}





In [23]:
rf_study, rf_pipe = run_optuna_rf(n_trials=50)

[I 2025-10-07 23:02:51,492] A new study created in memory with name: no-name-2b6f612e-d25b-4792-8020-d146262764f9
Best trial: 0. Best value: 0.76546:   2%|▏         | 1/50 [00:04<03:59,  4.89s/it]

[I 2025-10-07 23:02:56,380] Trial 0 finished with value: 0.765460251629412 and parameters: {'n_estimators': 261, 'max_depth': 11, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 0.5}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:   4%|▍         | 2/50 [00:07<02:59,  3.74s/it]

[I 2025-10-07 23:02:59,310] Trial 1 finished with value: 0.7561891101741871 and parameters: {'n_estimators': 268, 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_features': 0.3}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:   6%|▌         | 3/50 [00:09<02:12,  2.83s/it]

[I 2025-10-07 23:03:01,056] Trial 2 finished with value: 0.7328408724575306 and parameters: {'n_estimators': 235, 'max_depth': 15, 'min_samples_split': 9, 'min_samples_leaf': 12, 'max_features': 0.3}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:   8%|▊         | 4/50 [00:13<02:22,  3.10s/it]

[I 2025-10-07 23:03:04,573] Trial 3 finished with value: 0.7311302891942713 and parameters: {'n_estimators': 365, 'max_depth': 15, 'min_samples_split': 11, 'min_samples_leaf': 12, 'max_features': 0.5}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:  10%|█         | 5/50 [00:13<01:40,  2.24s/it]

[I 2025-10-07 23:03:05,277] Trial 4 finished with value: 0.6446340485745835 and parameters: {'n_estimators': 153, 'max_depth': 11, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:  12%|█▏        | 6/50 [00:17<01:55,  2.62s/it]

[I 2025-10-07 23:03:08,628] Trial 5 finished with value: 0.7247231128026035 and parameters: {'n_estimators': 372, 'max_depth': 12, 'min_samples_split': 12, 'min_samples_leaf': 14, 'max_features': 0.5}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:  14%|█▍        | 7/50 [00:19<01:53,  2.64s/it]

[I 2025-10-07 23:03:11,329] Trial 6 finished with value: 0.7506984464070274 and parameters: {'n_estimators': 146, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 6, 'max_features': 0.8}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:  16%|█▌        | 8/50 [00:22<01:48,  2.59s/it]

[I 2025-10-07 23:03:13,797] Trial 7 finished with value: 0.7243699499781341 and parameters: {'n_estimators': 234, 'max_depth': 24, 'min_samples_split': 15, 'min_samples_leaf': 14, 'max_features': 0.5}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:  18%|█▊        | 9/50 [00:23<01:31,  2.23s/it]

[I 2025-10-07 23:03:15,237] Trial 8 finished with value: 0.5353717059953355 and parameters: {'n_estimators': 406, 'max_depth': 49, 'min_samples_split': 18, 'min_samples_leaf': 19, 'max_features': 'log2'}. Best is trial 0 with value: 0.765460251629412.


Best trial: 0. Best value: 0.76546:  20%|██        | 10/50 [00:25<01:23,  2.09s/it]

[I 2025-10-07 23:03:17,014] Trial 9 finished with value: 0.5337561691534923 and parameters: {'n_estimators': 461, 'max_depth': 33, 'min_samples_split': 18, 'min_samples_leaf': 19, 'max_features': 'log2'}. Best is trial 0 with value: 0.765460251629412.


Best trial: 10. Best value: 0.772017:  22%|██▏       | 11/50 [00:27<01:18,  2.01s/it]

[I 2025-10-07 23:03:18,844] Trial 10 finished with value: 0.7720170127500345 and parameters: {'n_estimators': 309, 'max_depth': 32, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  24%|██▍       | 12/50 [00:28<01:10,  1.86s/it]

[I 2025-10-07 23:03:20,374] Trial 11 finished with value: 0.7720140618918098 and parameters: {'n_estimators': 316, 'max_depth': 34, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  26%|██▌       | 13/50 [00:30<01:03,  1.72s/it]

[I 2025-10-07 23:03:21,755] Trial 12 finished with value: 0.7719177813789673 and parameters: {'n_estimators': 323, 'max_depth': 35, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  28%|██▊       | 14/50 [00:31<00:59,  1.65s/it]

[I 2025-10-07 23:03:23,233] Trial 13 finished with value: 0.7689920238987817 and parameters: {'n_estimators': 322, 'max_depth': 38, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  30%|███       | 15/50 [00:33<01:02,  1.79s/it]

[I 2025-10-07 23:03:25,365] Trial 14 finished with value: 0.7523214791853029 and parameters: {'n_estimators': 500, 'max_depth': 44, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  32%|███▏      | 16/50 [00:34<00:51,  1.52s/it]

[I 2025-10-07 23:03:26,256] Trial 15 finished with value: 0.7527126575578577 and parameters: {'n_estimators': 192, 'max_depth': 27, 'min_samples_split': 14, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  34%|███▍      | 17/50 [00:35<00:42,  1.27s/it]

[I 2025-10-07 23:03:26,956] Trial 16 finished with value: 0.7023520373701704 and parameters: {'n_estimators': 105, 'max_depth': 29, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  36%|███▌      | 18/50 [00:44<01:59,  3.72s/it]

[I 2025-10-07 23:03:36,370] Trial 17 finished with value: 0.7569412382095744 and parameters: {'n_estimators': 410, 'max_depth': 41, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.8}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  38%|███▊      | 19/50 [00:46<01:32,  2.99s/it]

[I 2025-10-07 23:03:37,672] Trial 18 finished with value: 0.5970188083790792 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 14, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  40%|████      | 20/50 [00:47<01:19,  2.63s/it]

[I 2025-10-07 23:03:39,470] Trial 19 finished with value: 0.7615588999636886 and parameters: {'n_estimators': 354, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  42%|████▏     | 21/50 [00:49<01:06,  2.28s/it]

[I 2025-10-07 23:03:40,926] Trial 20 finished with value: 0.6927413451264615 and parameters: {'n_estimators': 290, 'max_depth': 22, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  44%|████▍     | 22/50 [00:51<01:03,  2.26s/it]

[I 2025-10-07 23:03:43,133] Trial 21 finished with value: 0.7700035286459708 and parameters: {'n_estimators': 331, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  46%|████▌     | 23/50 [00:54<01:03,  2.34s/it]

[I 2025-10-07 23:03:45,675] Trial 22 finished with value: 0.7688507576970369 and parameters: {'n_estimators': 404, 'max_depth': 36, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  48%|████▊     | 24/50 [00:55<00:54,  2.09s/it]

[I 2025-10-07 23:03:47,158] Trial 23 finished with value: 0.7317122713171339 and parameters: {'n_estimators': 325, 'max_depth': 44, 'min_samples_split': 8, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  50%|█████     | 25/50 [00:56<00:44,  1.80s/it]

[I 2025-10-07 23:03:48,287] Trial 24 finished with value: 0.7526046970811665 and parameters: {'n_estimators': 212, 'max_depth': 32, 'min_samples_split': 12, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  52%|█████▏    | 26/50 [01:04<01:27,  3.64s/it]

[I 2025-10-07 23:03:56,209] Trial 25 finished with value: 0.753090365695045 and parameters: {'n_estimators': 277, 'max_depth': 38, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.8}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  54%|█████▍    | 27/50 [01:08<01:27,  3.80s/it]

[I 2025-10-07 23:04:00,399] Trial 26 finished with value: 0.7638888758196332 and parameters: {'n_estimators': 341, 'max_depth': 41, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 0.3}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  56%|█████▌    | 28/50 [01:11<01:15,  3.44s/it]

[I 2025-10-07 23:04:02,998] Trial 27 finished with value: 0.7612553183133187 and parameters: {'n_estimators': 308, 'max_depth': 26, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  58%|█████▊    | 29/50 [01:13<01:03,  3.03s/it]

[I 2025-10-07 23:04:05,060] Trial 28 finished with value: 0.7499129795032451 and parameters: {'n_estimators': 375, 'max_depth': 50, 'min_samples_split': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  60%|██████    | 30/50 [01:15<00:53,  2.68s/it]

[I 2025-10-07 23:04:06,939] Trial 29 finished with value: 0.7417244546735482 and parameters: {'n_estimators': 249, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  62%|██████▏   | 31/50 [01:18<00:51,  2.70s/it]

[I 2025-10-07 23:04:09,668] Trial 30 finished with value: 0.5613723396856201 and parameters: {'n_estimators': 450, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 16, 'max_features': 'log2'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  64%|██████▍   | 32/50 [01:20<00:44,  2.46s/it]

[I 2025-10-07 23:04:11,574] Trial 31 finished with value: 0.7715900765038031 and parameters: {'n_estimators': 345, 'max_depth': 34, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  66%|██████▌   | 33/50 [01:21<00:36,  2.17s/it]

[I 2025-10-07 23:04:13,067] Trial 32 finished with value: 0.7676956652211541 and parameters: {'n_estimators': 271, 'max_depth': 36, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  68%|██████▊   | 34/50 [01:24<00:39,  2.50s/it]

[I 2025-10-07 23:04:16,330] Trial 33 finished with value: 0.7692691904551296 and parameters: {'n_estimators': 300, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  70%|███████   | 35/50 [01:26<00:33,  2.21s/it]

[I 2025-10-07 23:04:17,856] Trial 34 finished with value: 0.7216707434269288 and parameters: {'n_estimators': 383, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  72%|███████▏  | 36/50 [01:31<00:41,  2.99s/it]

[I 2025-10-07 23:04:22,673] Trial 35 finished with value: 0.7646706051306527 and parameters: {'n_estimators': 342, 'max_depth': 44, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  74%|███████▍  | 37/50 [01:33<00:36,  2.81s/it]

[I 2025-10-07 23:04:25,067] Trial 36 finished with value: 0.7674913625798896 and parameters: {'n_estimators': 249, 'max_depth': 27, 'min_samples_split': 11, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  76%|███████▌  | 38/50 [01:35<00:28,  2.40s/it]

[I 2025-10-07 23:04:26,517] Trial 37 finished with value: 0.7702253676017323 and parameters: {'n_estimators': 276, 'max_depth': 32, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  78%|███████▊  | 39/50 [01:40<00:36,  3.29s/it]

[I 2025-10-07 23:04:31,864] Trial 38 finished with value: 0.7440406643369023 and parameters: {'n_estimators': 355, 'max_depth': 37, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 0.8}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  80%|████████  | 40/50 [01:43<00:33,  3.37s/it]

[I 2025-10-07 23:04:35,438] Trial 39 finished with value: 0.7578626658347059 and parameters: {'n_estimators': 313, 'max_depth': 47, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': 0.5}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  82%|████████▏ | 41/50 [01:45<00:25,  2.79s/it]

[I 2025-10-07 23:04:36,883] Trial 40 finished with value: 0.6756875560665975 and parameters: {'n_estimators': 395, 'max_depth': 23, 'min_samples_split': 11, 'min_samples_leaf': 12, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  84%|████████▍ | 42/50 [01:47<00:19,  2.47s/it]

[I 2025-10-07 23:04:38,587] Trial 41 finished with value: 0.7656943638563839 and parameters: {'n_estimators': 287, 'max_depth': 32, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  86%|████████▌ | 43/50 [01:48<00:14,  2.05s/it]

[I 2025-10-07 23:04:39,673] Trial 42 finished with value: 0.7692782209450206 and parameters: {'n_estimators': 220, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  88%|████████▊ | 44/50 [01:49<00:10,  1.73s/it]

[I 2025-10-07 23:04:40,651] Trial 43 finished with value: 0.7602305392738008 and parameters: {'n_estimators': 268, 'max_depth': 35, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  90%|█████████ | 45/50 [01:50<00:07,  1.54s/it]

[I 2025-10-07 23:04:41,738] Trial 44 finished with value: 0.7604736923202362 and parameters: {'n_estimators': 254, 'max_depth': 40, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  92%|█████████▏| 46/50 [01:51<00:06,  1.55s/it]

[I 2025-10-07 23:04:43,331] Trial 45 finished with value: 0.7692912886097071 and parameters: {'n_estimators': 355, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  94%|█████████▍| 47/50 [01:53<00:04,  1.62s/it]

[I 2025-10-07 23:04:45,099] Trial 46 finished with value: 0.7526364301508195 and parameters: {'n_estimators': 425, 'max_depth': 32, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  96%|█████████▌| 48/50 [02:00<00:06,  3.21s/it]

[I 2025-10-07 23:04:52,017] Trial 47 finished with value: 0.7546626950680835 and parameters: {'n_estimators': 319, 'max_depth': 29, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017:  98%|█████████▊| 49/50 [02:01<00:02,  2.62s/it]

[I 2025-10-07 23:04:53,280] Trial 48 finished with value: 0.758752024426 and parameters: {'n_estimators': 288, 'max_depth': 39, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.7720170127500345.


Best trial: 10. Best value: 0.772017: 100%|██████████| 50/50 [02:03<00:00,  2.48s/it]


[I 2025-10-07 23:04:55,471] Trial 49 finished with value: 0.6951140722826321 and parameters: {'n_estimators': 186, 'max_depth': 33, 'min_samples_split': 4, 'min_samples_leaf': 20, 'max_features': 0.8}. Best is trial 10 with value: 0.7720170127500345.
RF best R2: 0.7720170127500345
RF best params: {'n_estimators': 309, 'max_depth': 32, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt'}


# 7. Try a meta model or ensembler of the best models

In [24]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

# reproducible folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 1) Get OOF predictions for each base model (these preds are made by models
#    that were trained without the corresponding sample — no leakage)
print("Generating OOF preds (this may take time)...")
oof_knn = cross_val_predict(knn_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_svm = cross_val_predict(svm_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_rf  = cross_val_predict(rf_pipe,  X, y, cv=kf, n_jobs=-1, method='predict')

# Stack OOF predictions (n_samples x n_models)
stack_oof = np.vstack([oof_knn, oof_svm, oof_rf]).T

# 2) Simple average ensemble
ens_mean = stack_oof.mean(axis=1)
r2_mean = r2_score(y, ens_mean)
rmse_mean = np.sqrt(mean_squared_error(y, ens_mean))
print(f"Simple average ensemble -> R2: {r2_mean:.4f}, RMSE: {rmse_mean:.4f}")

Generating OOF preds (this may take time)...
Simple average ensemble -> R2: 0.7882, RMSE: 0.8263


In [25]:
# 3) Stacking: train a Ridge meta-learner on the OOF stack
meta = Ridge(alpha=1.0)
meta.fit(stack_oof, y)                 # training on OOF preds is OK (no leakage)
ens_stack = meta.predict(stack_oof)    # predictions on the same OOF matrix
r2_stack = r2_score(y, ens_stack)
rmse_stack = np.sqrt(mean_squared_error(y, ens_stack))
print(f"Stacking (Ridge) ensemble -> R2: {r2_stack:.4f}, RMSE: {rmse_stack:.4f}")

# Show meta-learner weights
print("Meta-learner coefficients (weights):", meta.coef_)
print("Meta-learner intercept:", meta.intercept_)

Stacking (Ridge) ensemble -> R2: 0.7926, RMSE: 0.8177
Meta-learner coefficients (weights): [ 0.38985534  0.66950896 -0.03331666]
Meta-learner intercept: -0.2121222716352733


In [26]:
joblib.dump(meta, "../models/pi_meta.joblib")
print("Saved final base pipelines and meta-learner.")

Saved final base pipelines and meta-learner.
