# 1. Import Libraries and Data

The molecular descriptors and fingerprints data was previously created in the `PI_data_loading.ipynb` notebook and has been stored in the data folder as csv file.

In [1]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Models for scaled data
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Tree based models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
pi = pd.read_csv('../data/pi_df_clean.csv')
print(pi.shape)
pi.head()

(834, 13)


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_value,pchembl_value,assay_chembl_id,assay_description,target_chembl_id,target_pref_name,target_organism,standard_units_norm,IC50_nM,pIC50
0,CHEMBL1627209,O=C(N[C@@H]1c2ccccc2C[C@@H]1O)[C@@H](Cc1ccccc1...,IC50,21.1,7.68,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,21.1,7.675718
1,CHEMBL1627287,N=[S+]([O-])([C@H](Cc1ccccc1)C(=O)N[C@H]1c2ccc...,IC50,153.9,6.81,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,153.9,6.812761
2,CHEMBL1627235,N=[S+]([O-])([C@H](Cc1ccccc1)C(=O)N[C@H]1c2ccc...,IC50,37.3,7.43,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,37.3,7.428291
3,CHEMBL1627210,N=[S+]([O-])(C[C@H](Cc1ccccc1)C(=O)N[C@@H]1c2c...,IC50,2.5,8.6,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,2.5,8.60206
4,CHEMBL396814,O=C(N[C@H]1c2ccccc2C[C@H]1O)[C@@H](Cc1ccccc1)C...,IC50,10000.0,,CHEMBL899796,Inhibition of HIV1 recombinant protease,CHEMBL2366517,Protease,Human immunodeficiency virus 1,nM,10000.0,5.0


# Training QSAR Models

In [3]:
df = pd.read_csv('../data/pi_qsar_features.csv')
print("rows, cols:", df.shape)

rows, cols: (834, 523)


In [4]:
df.head()

Unnamed: 0,MolWt,MolLogP,MolMR,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,NumAromaticRings,HeavyAtomCount,FractionCSP3,...,Morgan_512_503,Morgan_512_504,Morgan_512_505,Morgan_512_506,Morgan_512_507,Morgan_512_508,Morgan_512_509,Morgan_512_510,Morgan_512_511,pIC50
0,636.814,4.0019,179.2214,121.72,4,5,12,4,46,0.315789,...,0,0,0,0,1,0,0,0,0,7.675718
1,623.775,3.80477,172.8031,145.57,5,6,10,4,45,0.277778,...,0,0,0,0,1,0,0,0,0,6.812761
2,623.775,3.80477,172.8031,145.57,5,6,10,4,45,0.277778,...,0,0,0,0,1,0,0,0,0,7.428291
3,651.829,4.29997,181.9411,145.57,5,6,12,4,47,0.315789,...,0,0,0,0,1,0,0,0,0,8.60206
4,636.814,4.0019,179.2214,121.72,4,5,12,4,46,0.315789,...,0,0,0,0,1,0,0,0,0,5.0


In [5]:
# Prepare X and y
y = df['pIC50']
X = df.drop(columns=['pIC50'])
print('Feature count: ', X.shape[1])

Feature count:  522


In [6]:
# ------------------------------------------------------------------------------
# Quick feature cleanup
# ------------------------------------------------------------------------------

# a. remove near-constant features

# Fit the selector
vt = VarianceThreshold(threshold=1e-6)
X_v = vt.fit_transform(X)
# Features kept
kept_features = X.columns[vt.get_support()].tolist()
# Features removed
removed_features = [col for col in X.columns if col not in kept_features]
# Create the reduced DataFrame
X = pd.DataFrame(X_v, columns=kept_features)

print(f"Features after variance filter: {len(kept_features)}")
print(f"Features removed: {len(removed_features)}")
# Optional: display or save the removed features
print("\nRemoved features (first 20):")
print(removed_features[:20])

Features after variance filter: 521
Features removed: 1

Removed features (first 20):
['Morgan_512_499']


In [7]:
# b. (optional) remove extremely collinear features (simple correlation filter)

corr_thresh = 0.98
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > corr_thresh)]
if to_drop:
    X.drop(columns=to_drop, inplace=True)
    print("Dropped highly correlated features:", to_drop)

Dropped highly correlated features: ['MolMR', 'HeavyAtomCount']


In [8]:
X.shape

(834, 519)

In [9]:
# ------------------------------------------------------------------------------
# Define descriptor columns to scale and fingerprint columns to pass through
# ------------------------------------------------------------------------------
descriptors = ['MolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors',
    'NumRotatableBonds', 'NumAromaticRings', 'FractionCSP3']
fingerprints = [c for c in X.columns if c not in descriptors]

# ColumnTransformer: scale descriptors, pass fingerprints through
preprocessor = ColumnTransformer([
    ("desc", StandardScaler(), descriptors),
    ("fp", "passthrough", fingerprints)
])

In [10]:
# ---------------------------
# k-fold OOF trainer
# ---------------------------
def kfold_train_predict(model, X_df, y_arr, n_splits=5, random_state=42):
    """
    model: estimator or pipeline (should accept fit/predict)
    X_df: pandas DataFrame (full dataset)
    y_arr: numpy array or Series
    returns: oof_preds (np.array same length as X_df), mean_r2, mean_rmse
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(X_df))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_df, y_arr), 1):
        X_train, X_val = X_df.iloc[train_idx], X_df.iloc[val_idx]
        y_train, y_val = y_arr[train_idx], y_arr[val_idx]

        # fit on train fold, predict on val fold
        model.fit(X_train, y_train)
        preds = model.predict(X_val)

        oof_preds[val_idx] = preds

        fold_r2 = r2_score(y_val, preds)
        fold_rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_scores.append((fold_r2, fold_rmse))
        print(f"Fold {fold}: R2 = {fold_r2:.3f}, RMSE = {fold_rmse:.3f}")

    mean_r2 = np.mean([s[0] for s in fold_scores])
    mean_rmse = np.mean([s[1] for s in fold_scores])
    print(f"\nMean CV R2 = {mean_r2:.3f}, Mean RMSE = {mean_rmse:.3f}")
    return oof_preds, mean_r2, mean_rmse

In [11]:
# ---------------------------
# Define models and pipelines
# ---------------------------
models = {}

# Linear models (need scaling) -> include preprocessor
models['Linear'] = Pipeline([("preproc", preprocessor), ("model", LinearRegression())])
models['ElasticNet'] = Pipeline([("preproc", preprocessor), ("model", ElasticNet(random_state=42, max_iter=5000))])

# SVR (needs scaling)
models['SVM'] = Pipeline([("preproc", preprocessor), ("model", SVR())])

# KNN
models['KNN'] = Pipeline([("preproc", preprocessor), ("model", KNeighborsRegressor())])

# MLP
models['MLP'] = Pipeline([("preproc", preprocessor), ("model", MLPRegressor(max_iter=2000, random_state=42))])

# Random Forest (trees don't need scaling, but pipeline is fine)
models['Random Forest'] = Pipeline([("preproc", preprocessor), ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])

# XGBoost
models['XGBoost'] = Pipeline([("preproc", preprocessor), ("model", XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1, objective='reg:squarederror'))])

In [12]:
# ---------------------------
# Run k-fold for each model
# ---------------------------
results = {}
oof_dict = {}

for name, model in models.items():
    print(f"\n=== Training model: {name} ===")
    oof_preds, mean_r2, mean_rmse = kfold_train_predict(model, X, y, n_splits=5, random_state=42)
    results[name] = {"r2": mean_r2, "rmse": mean_rmse}
    oof_dict[name] = oof_preds

# Summary
print("\nSummary (mean CV results):")
for name, res in results.items():
    print(f"{name}: R2 = {res['r2']:.4f}, RMSE = {res['rmse']:.4f}")


=== Training model: Linear ===
Fold 1: R2 = -1.433, RMSE = 2.736
Fold 2: R2 = -0.521, RMSE = 2.214
Fold 3: R2 = -1.164, RMSE = 2.549
Fold 4: R2 = -0.456, RMSE = 2.226
Fold 5: R2 = -0.955, RMSE = 2.571

Mean CV R2 = -0.906, Mean RMSE = 2.459

=== Training model: ElasticNet ===
Fold 1: R2 = 0.121, RMSE = 1.644
Fold 2: R2 = 0.085, RMSE = 1.717
Fold 3: R2 = 0.099, RMSE = 1.644
Fold 4: R2 = 0.118, RMSE = 1.732
Fold 5: R2 = 0.116, RMSE = 1.728

Mean CV R2 = 0.108, Mean RMSE = 1.693

=== Training model: SVM ===
Fold 1: R2 = 0.785, RMSE = 0.814
Fold 2: R2 = 0.777, RMSE = 0.849
Fold 3: R2 = 0.773, RMSE = 0.826
Fold 4: R2 = 0.768, RMSE = 0.889
Fold 5: R2 = 0.744, RMSE = 0.930

Mean CV R2 = 0.769, Mean RMSE = 0.862

=== Training model: KNN ===
Fold 1: R2 = 0.783, RMSE = 0.817
Fold 2: R2 = 0.720, RMSE = 0.950
Fold 3: R2 = 0.772, RMSE = 0.828
Fold 4: R2 = 0.781, RMSE = 0.863
Fold 5: R2 = 0.746, RMSE = 0.926

Mean CV R2 = 0.760, Mean RMSE = 0.877

=== Training model: MLP ===
Fold 1: R2 = 0.760, RMS

# 6. Tune the Best Models

### Support Vector Machine (SVR),
### K-Nearest Neighbours (KNN),
### Random Forest (RF)

In [13]:
import optuna
import joblib
from sklearn.decomposition import PCA

In [14]:
# PCA variant of the model
n_fp = len(fingerprints)
print(f"Using {len(descriptors)} descriptor columns and {n_fp} fingerprint columns.")

if n_fp == 0:
    raise ValueError("No fingerprint columns detected. Make sure your X contains fingerprint bit columns.")

# ---------- CV settings ----------
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

Using 8 descriptor columns and 511 fingerprint columns.


In [15]:
# ----------------------- SVM ---------------------------------
def run_optuna_svm(n_trials=80):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        # SVR hyperparameters (log-uniform sampling where appropriate)
        kernel = trial.suggest_categorical("kernel", ["rbf", "poly", "sigmoid"])
        C = trial.suggest_loguniform("C", 1e-3, 1e3)
        epsilon = trial.suggest_loguniform("epsilon", 1e-4, 1.0)
        gamma_choice = trial.suggest_categorical("gamma_choice", ["scale", "auto", "numeric"])
        if gamma_choice == "numeric":
            gamma = trial.suggest_loguniform("gamma", 1e-5, 1e1)
        else:
            gamma = gamma_choice
        # degree and coef0 for polynomial kernel
        if kernel == "poly":
            degree = trial.suggest_int("degree", 2, 5)
            coef0 = trial.suggest_float("coef0", 0.0, 1.0)
        else:
            degree = 3
            coef0 = 0.0

        svr = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, degree=degree, coef0=coef0, max_iter=100000)
        pipe = Pipeline([("preproc", preprocessor), ("svr", svr)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    # reconstruct SVR
    best_kernel = best.get("kernel", "rbf")
    best_gamma = best.get("gamma", best.get("gamma_choice", "scale"))
    if best_gamma == "numeric":
        best_gamma = best.get("gamma")
    best_degree = best.get("degree", 3)
    best_coef0 = best.get("coef0", 0.0)

    best_svm = SVR(
        kernel=best_kernel,
        C=best.get("C"),
        epsilon=best.get("epsilon"),
        gamma=best_gamma,
        degree=best_degree,
        coef0=best_coef0,
        max_iter=100000
    )
    best_pipe = Pipeline([("preproc", preprocessor_best), ("svr", best_svm)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/pi_svm.joblib")
    print("SVM best R2:", study.best_value)
    print("SVM best params:", study.best_params)
    return study, best_pipe

In [16]:
# ------------------ KNN -----------------------
def run_optuna_knn(n_trials=50):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        n_neighbors = trial.suggest_int("n_neighbors", 1, 30)
        weights = trial.suggest_categorical("weights", ["uniform", "distance"])
        p = trial.suggest_int("p", 1, 2)  # 1 = manhattan, 2 = euclidean

        knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=p)
        pipe = Pipeline([("preproc", preprocessor), ("knn", knn)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    best_knn = KNeighborsRegressor(n_neighbors=best["n_neighbors"], weights=best["weights"], p=best["p"])
    best_pipe = Pipeline([("preproc", preprocessor_best), ("knn", best_knn)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/pi_knn.joblib")
    print("KNN best R2:", study.best_value)
    print("KNN best params:", study.best_params)
    return study, best_pipe

In [17]:
# -------------------- Random Forest --------------------------------
def run_optuna_rf(n_trials=50):
    def objective(trial):
        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", "passthrough", fingerprints)
        ])

        n_estimators = trial.suggest_int("n_estimators", 100, 500)
        max_depth = trial.suggest_int("max_depth", 3, 50)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
        max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", 0.3, 0.5, 0.8])

        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                   max_features=max_features, n_jobs=-1, random_state=42)
        pipe = Pipeline([("preproc", preprocessor), ("rf", rf)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    best = study.best_params
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", "passthrough", fingerprints)
    ])
    best_rf = RandomForestRegressor(n_estimators=best["n_estimators"], max_depth=best["max_depth"],
                                    min_samples_split=best["min_samples_split"],
                                    min_samples_leaf=best["min_samples_leaf"],
                                    max_features=best["max_features"], n_jobs=-1, random_state=42)
    best_pipe = Pipeline([("preproc", preprocessor_best), ("rf", best_rf)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/pi_rf.joblib")
    print("RF best R2:", study.best_value)
    print("RF best params:", study.best_params)
    return study, best_pipe

In [18]:
# ----------------------------
# Example: run all studies sequentially (adjust n_trials)
# ----------------------------
svm_study, svm_pipe = run_optuna_svm(n_trials=50)

[I 2025-10-07 08:43:50,993] A new study created in memory with name: no-name-e524b7f5-bed2-4444-9f40-ca9cd4eba564
Best trial: 0. Best value: 0.745879:   2%|▏         | 1/50 [00:05<04:47,  5.88s/it]

[I 2025-10-07 08:43:56,873] Trial 0 finished with value: 0.7458786566524909 and parameters: {'use_pca': True, 'pca_n_components': 128, 'kernel': 'rbf', 'C': 0.45487253388891086, 'epsilon': 0.0006808829161516052, 'gamma_choice': 'scale'}. Best is trial 0 with value: 0.7458786566524909.


Best trial: 0. Best value: 0.745879:   4%|▍         | 2/50 [00:09<03:37,  4.53s/it]

[I 2025-10-07 08:44:00,447] Trial 1 finished with value: 0.7193074734075713 and parameters: {'use_pca': True, 'pca_n_components': 122, 'kernel': 'sigmoid', 'C': 8.129760396229347, 'epsilon': 0.0257863509804728, 'gamma_choice': 'auto'}. Best is trial 0 with value: 0.7458786566524909.


Best trial: 0. Best value: 0.745879:   6%|▌         | 3/50 [00:10<02:08,  2.73s/it]

[I 2025-10-07 08:44:01,025] Trial 2 finished with value: 0.6560874231488429 and parameters: {'use_pca': True, 'pca_n_components': 255, 'kernel': 'poly', 'C': 0.024820682757779634, 'epsilon': 0.13768719691129538, 'gamma_choice': 'numeric', 'gamma': 0.05733447017970653, 'degree': 2, 'coef0': 0.015788330756968505}. Best is trial 0 with value: 0.7458786566524909.


Best trial: 0. Best value: 0.745879:   8%|▊         | 4/50 [00:10<01:26,  1.89s/it]

[I 2025-10-07 08:44:01,634] Trial 3 finished with value: 0.07036960200126578 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.013667422416356094, 'epsilon': 0.12653072214120345, 'gamma_choice': 'auto'}. Best is trial 0 with value: 0.7458786566524909.


Best trial: 0. Best value: 0.745879:  10%|█         | 5/50 [00:11<01:08,  1.53s/it]

[I 2025-10-07 08:44:02,533] Trial 4 finished with value: 0.7248769408255535 and parameters: {'use_pca': True, 'pca_n_components': 105, 'kernel': 'rbf', 'C': 404.6698053351755, 'epsilon': 0.00020041858067053095, 'gamma_choice': 'auto'}. Best is trial 0 with value: 0.7458786566524909.


Best trial: 0. Best value: 0.745879:  12%|█▏        | 6/50 [00:12<00:52,  1.19s/it]

[I 2025-10-07 08:44:03,067] Trial 5 finished with value: -48.18490775614117 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 12.812624016109309, 'epsilon': 0.0001362731648746042, 'gamma_choice': 'scale'}. Best is trial 0 with value: 0.7458786566524909.


Best trial: 0. Best value: 0.745879:  14%|█▍        | 7/50 [00:12<00:41,  1.05it/s]

[I 2025-10-07 08:44:03,535] Trial 6 finished with value: 0.5937428149914612 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 0.01483486866012648, 'epsilon': 0.01564279297890732, 'gamma_choice': 'scale', 'degree': 3, 'coef0': 0.8926775688481913}. Best is trial 0 with value: 0.7458786566524909.


Best trial: 7. Best value: 0.749273:  16%|█▌        | 8/50 [00:13<00:34,  1.21it/s]

[I 2025-10-07 08:44:04,080] Trial 7 finished with value: 0.7492732366786947 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 948.0209351261364, 'epsilon': 0.0005101602286257455, 'gamma_choice': 'numeric', 'gamma': 6.143134561658655e-05}. Best is trial 7 with value: 0.7492732366786947.


Best trial: 8. Best value: 0.76942:  18%|█▊        | 9/50 [00:13<00:32,  1.27it/s] 

[I 2025-10-07 08:44:04,781] Trial 8 finished with value: 0.769420153246829 and parameters: {'use_pca': True, 'pca_n_components': 300, 'kernel': 'rbf', 'C': 6.154429759213691, 'epsilon': 0.00012808794549754938, 'gamma_choice': 'scale'}. Best is trial 8 with value: 0.769420153246829.


Best trial: 8. Best value: 0.76942:  20%|██        | 10/50 [00:14<00:29,  1.36it/s]

[I 2025-10-07 08:44:05,398] Trial 9 finished with value: -3776.9582850271095 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 110.74242064596804, 'epsilon': 0.0009732301528139723, 'gamma_choice': 'scale'}. Best is trial 8 with value: 0.769420153246829.


Best trial: 8. Best value: 0.76942:  22%|██▏       | 11/50 [00:15<00:27,  1.42it/s]

[I 2025-10-07 08:44:06,034] Trial 10 finished with value: 0.7494312440918285 and parameters: {'use_pca': True, 'pca_n_components': 245, 'kernel': 'rbf', 'C': 0.5207523261889803, 'epsilon': 0.003346656861370676, 'gamma_choice': 'scale'}. Best is trial 8 with value: 0.769420153246829.


Best trial: 8. Best value: 0.76942:  24%|██▍       | 12/50 [00:15<00:26,  1.42it/s]

[I 2025-10-07 08:44:06,739] Trial 11 finished with value: 0.6913839985232269 and parameters: {'use_pca': True, 'pca_n_components': 292, 'kernel': 'rbf', 'C': 0.25490717899801224, 'epsilon': 0.004149905393875103, 'gamma_choice': 'scale'}. Best is trial 8 with value: 0.769420153246829.


Best trial: 8. Best value: 0.76942:  26%|██▌       | 13/50 [00:16<00:23,  1.57it/s]

[I 2025-10-07 08:44:07,228] Trial 12 finished with value: 0.7395437308986276 and parameters: {'use_pca': True, 'pca_n_components': 229, 'kernel': 'rbf', 'C': 2.9396362211862286, 'epsilon': 0.8290868898131225, 'gamma_choice': 'scale'}. Best is trial 8 with value: 0.769420153246829.


Best trial: 8. Best value: 0.76942:  28%|██▊       | 14/50 [00:16<00:22,  1.61it/s]

[I 2025-10-07 08:44:07,811] Trial 13 finished with value: 0.005322309137302384 and parameters: {'use_pca': True, 'pca_n_components': 206, 'kernel': 'rbf', 'C': 0.001072196641987945, 'epsilon': 0.004737952124083347, 'gamma_choice': 'scale'}. Best is trial 8 with value: 0.769420153246829.


Best trial: 8. Best value: 0.76942:  30%|███       | 15/50 [00:17<00:21,  1.62it/s]

[I 2025-10-07 08:44:08,415] Trial 14 finished with value: 0.08733345474734773 and parameters: {'use_pca': True, 'pca_n_components': 288, 'kernel': 'rbf', 'C': 37.48578282086194, 'epsilon': 0.0020413567257748354, 'gamma_choice': 'numeric', 'gamma': 2.0502079448358645}. Best is trial 8 with value: 0.769420153246829.


Best trial: 8. Best value: 0.76942:  32%|███▏      | 16/50 [00:17<00:18,  1.87it/s]

[I 2025-10-07 08:44:08,761] Trial 15 finished with value: 0.6934041934297814 and parameters: {'use_pca': True, 'pca_n_components': 49, 'kernel': 'poly', 'C': 0.13105916553266725, 'epsilon': 0.00010755342733644668, 'gamma_choice': 'scale', 'degree': 5, 'coef0': 0.3266332806204615}. Best is trial 8 with value: 0.769420153246829.


Best trial: 16. Best value: 0.777775:  34%|███▍      | 17/50 [00:18<00:17,  1.88it/s]

[I 2025-10-07 08:44:09,284] Trial 16 finished with value: 0.7777751369483401 and parameters: {'use_pca': True, 'pca_n_components': 185, 'kernel': 'rbf', 'C': 2.017469266590996, 'epsilon': 0.04858394294432953, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  36%|███▌      | 18/50 [00:18<00:16,  1.93it/s]

[I 2025-10-07 08:44:09,773] Trial 17 finished with value: 0.7752805427433981 and parameters: {'use_pca': True, 'pca_n_components': 177, 'kernel': 'rbf', 'C': 3.369248447398757, 'epsilon': 0.04988483537408194, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  38%|███▊      | 19/50 [00:19<00:15,  2.01it/s]

[I 2025-10-07 08:44:10,227] Trial 18 finished with value: 0.16043096995350034 and parameters: {'use_pca': True, 'pca_n_components': 182, 'kernel': 'poly', 'C': 1.833026843016615, 'epsilon': 0.07560947225225478, 'gamma_choice': 'numeric', 'gamma': 1.1916541833684981e-05, 'degree': 5, 'coef0': 0.991249853521522}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  40%|████      | 20/50 [00:19<00:15,  1.97it/s]

[I 2025-10-07 08:44:10,756] Trial 19 finished with value: 0.7071506198675552 and parameters: {'use_pca': True, 'pca_n_components': 178, 'kernel': 'sigmoid', 'C': 48.89117861027763, 'epsilon': 0.556674135928693, 'gamma_choice': 'auto'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  42%|████▏     | 21/50 [00:20<00:13,  2.17it/s]

[I 2025-10-07 08:44:11,103] Trial 20 finished with value: 0.5847461956973286 and parameters: {'use_pca': True, 'pca_n_components': 76, 'kernel': 'rbf', 'C': 0.09595064302520323, 'epsilon': 0.038985650379152, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  44%|████▍     | 22/50 [00:20<00:12,  2.30it/s]

[I 2025-10-07 08:44:11,476] Trial 21 finished with value: 0.7559875093341957 and parameters: {'use_pca': True, 'pca_n_components': 15, 'kernel': 'rbf', 'C': 5.482028477811914, 'epsilon': 0.33124117902319766, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  46%|████▌     | 23/50 [00:21<00:12,  2.12it/s]

[I 2025-10-07 08:44:12,036] Trial 22 finished with value: 0.7716471987701363 and parameters: {'use_pca': True, 'pca_n_components': 159, 'kernel': 'rbf', 'C': 1.1571926074097822, 'epsilon': 0.0077814240701965885, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  48%|████▊     | 24/50 [00:21<00:11,  2.17it/s]

[I 2025-10-07 08:44:12,471] Trial 23 finished with value: 0.7731745179549803 and parameters: {'use_pca': True, 'pca_n_components': 158, 'kernel': 'rbf', 'C': 1.3338588310973942, 'epsilon': 0.01005879576504265, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  50%|█████     | 25/50 [00:21<00:11,  2.11it/s]

[I 2025-10-07 08:44:12,975] Trial 24 finished with value: 0.7611980018961215 and parameters: {'use_pca': True, 'pca_n_components': 201, 'kernel': 'rbf', 'C': 16.894543138302037, 'epsilon': 0.047181440481455, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  52%|█████▏    | 26/50 [00:22<00:11,  2.14it/s]

[I 2025-10-07 08:44:13,427] Trial 25 finished with value: 0.7727139922387652 and parameters: {'use_pca': True, 'pca_n_components': 148, 'kernel': 'rbf', 'C': 1.5181053466824823, 'epsilon': 0.015424603555734624, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  54%|█████▍    | 27/50 [00:22<00:10,  2.16it/s]

[I 2025-10-07 08:44:13,883] Trial 26 finished with value: 0.37928659897346007 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.043967306420317, 'epsilon': 0.23813210029966786, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  56%|█████▌    | 28/50 [00:23<00:10,  2.17it/s]

[I 2025-10-07 08:44:14,337] Trial 27 finished with value: 0.767582566545782 and parameters: {'use_pca': True, 'pca_n_components': 157, 'kernel': 'rbf', 'C': 0.8694417176832063, 'epsilon': 0.010586049666469458, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  58%|█████▊    | 29/50 [00:23<00:10,  2.04it/s]

[I 2025-10-07 08:44:14,895] Trial 28 finished with value: 0.7682990042873187 and parameters: {'use_pca': True, 'pca_n_components': 208, 'kernel': 'poly', 'C': 120.06801163525226, 'epsilon': 0.07262659883851759, 'gamma_choice': 'auto', 'degree': 2, 'coef0': 0.5534885500855767}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  60%|██████    | 30/50 [00:24<00:09,  2.12it/s]

[I 2025-10-07 08:44:15,323] Trial 29 finished with value: 0.5121219294962994 and parameters: {'use_pca': True, 'pca_n_components': 181, 'kernel': 'sigmoid', 'C': 0.3828858754233489, 'epsilon': 0.025908583608766238, 'gamma_choice': 'numeric', 'gamma': 0.0028705804357504593}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  62%|██████▏   | 31/50 [00:24<00:08,  2.23it/s]

[I 2025-10-07 08:44:15,720] Trial 30 finished with value: 0.7695468719151168 and parameters: {'use_pca': True, 'pca_n_components': 102, 'kernel': 'rbf', 'C': 3.173747821685351, 'epsilon': 0.0015216321987930378, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  64%|██████▍   | 32/50 [00:25<00:07,  2.28it/s]

[I 2025-10-07 08:44:16,133] Trial 31 finished with value: 0.770626100233937 and parameters: {'use_pca': True, 'pca_n_components': 149, 'kernel': 'rbf', 'C': 1.0985041090539482, 'epsilon': 0.016667896193165378, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  66%|██████▌   | 33/50 [00:25<00:07,  2.30it/s]

[I 2025-10-07 08:44:16,560] Trial 32 finished with value: 0.77363079111254 and parameters: {'use_pca': True, 'pca_n_components': 142, 'kernel': 'rbf', 'C': 2.597924133446453, 'epsilon': 0.007695665716751057, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  68%|██████▊   | 34/50 [00:26<00:07,  2.28it/s]

[I 2025-10-07 08:44:17,005] Trial 33 finished with value: 0.7497882766094923 and parameters: {'use_pca': True, 'pca_n_components': 124, 'kernel': 'rbf', 'C': 28.470148486866638, 'epsilon': 0.006611143658863259, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  70%|███████   | 35/50 [00:26<00:07,  2.11it/s]

[I 2025-10-07 08:44:17,566] Trial 34 finished with value: 0.7722140390075918 and parameters: {'use_pca': True, 'pca_n_components': 171, 'kernel': 'rbf', 'C': 4.438743437214909, 'epsilon': 0.031082771394702037, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  72%|███████▏  | 36/50 [00:27<00:06,  2.14it/s]

[I 2025-10-07 08:44:18,017] Trial 35 finished with value: 0.7772110899283389 and parameters: {'use_pca': True, 'pca_n_components': 131, 'kernel': 'rbf', 'C': 10.222568898436206, 'epsilon': 0.1313061182404721, 'gamma_choice': 'auto'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  74%|███████▍  | 37/50 [00:27<00:05,  2.25it/s]

[I 2025-10-07 08:44:18,411] Trial 36 finished with value: 0.424095117155464 and parameters: {'use_pca': True, 'pca_n_components': 85, 'kernel': 'sigmoid', 'C': 10.168075635458722, 'epsilon': 0.13887806365702077, 'gamma_choice': 'auto'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  76%|███████▌  | 38/50 [00:27<00:05,  2.16it/s]

[I 2025-10-07 08:44:18,914] Trial 37 finished with value: 0.7715642941483929 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 17.403528462327476, 'epsilon': 0.074520448002695, 'gamma_choice': 'auto'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  78%|███████▊  | 39/50 [00:28<00:05,  2.18it/s]

[I 2025-10-07 08:44:19,365] Trial 38 finished with value: 0.7615216945893634 and parameters: {'use_pca': True, 'pca_n_components': 133, 'kernel': 'poly', 'C': 93.18782521369883, 'epsilon': 0.21621418536981882, 'gamma_choice': 'auto', 'degree': 4, 'coef0': 0.6101127978025463}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  80%|████████  | 40/50 [00:28<00:04,  2.12it/s]

[I 2025-10-07 08:44:19,864] Trial 39 finished with value: 0.7669371980011596 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 7.779229684254186, 'epsilon': 0.11918041108130484, 'gamma_choice': 'auto'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  82%|████████▏ | 41/50 [00:29<00:04,  2.15it/s]

[I 2025-10-07 08:44:20,314] Trial 40 finished with value: 0.6160283830048447 and parameters: {'use_pca': True, 'pca_n_components': 134, 'kernel': 'rbf', 'C': 0.2047130579067467, 'epsilon': 0.0512238558714341, 'gamma_choice': 'auto'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  84%|████████▍ | 42/50 [00:29<00:03,  2.13it/s]

[I 2025-10-07 08:44:20,791] Trial 41 finished with value: 0.7767496829674737 and parameters: {'use_pca': True, 'pca_n_components': 194, 'kernel': 'rbf', 'C': 2.2304322641782726, 'epsilon': 0.019984022585839523, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  86%|████████▌ | 43/50 [00:30<00:03,  2.09it/s]

[I 2025-10-07 08:44:21,290] Trial 42 finished with value: 0.04533561063734026 and parameters: {'use_pca': True, 'pca_n_components': 198, 'kernel': 'rbf', 'C': 0.5916638444928212, 'epsilon': 0.018386651690105366, 'gamma_choice': 'numeric', 'gamma': 3.5326264162860865}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 16. Best value: 0.777775:  88%|████████▊ | 44/50 [00:30<00:03,  1.98it/s]

[I 2025-10-07 08:44:21,864] Trial 43 finished with value: 0.7766605708758665 and parameters: {'use_pca': True, 'pca_n_components': 227, 'kernel': 'rbf', 'C': 2.683150447328483, 'epsilon': 0.02552084151840713, 'gamma_choice': 'scale'}. Best is trial 16 with value: 0.7777751369483401.


Best trial: 44. Best value: 0.782451:  90%|█████████ | 45/50 [00:31<00:02,  1.95it/s]

[I 2025-10-07 08:44:22,387] Trial 44 finished with value: 0.7824509015948344 and parameters: {'use_pca': True, 'pca_n_components': 220, 'kernel': 'rbf', 'C': 10.629746378833838, 'epsilon': 0.10229560218392462, 'gamma_choice': 'auto'}. Best is trial 44 with value: 0.7824509015948344.


Best trial: 44. Best value: 0.782451:  92%|█████████▏| 46/50 [00:31<00:01,  2.07it/s]

[I 2025-10-07 08:44:22,802] Trial 45 finished with value: 0.7544856996855821 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 13.550012050203637, 'epsilon': 0.12030849949411335, 'gamma_choice': 'auto'}. Best is trial 44 with value: 0.7824509015948344.


Best trial: 44. Best value: 0.782451:  94%|█████████▍| 47/50 [00:32<00:01,  1.98it/s]

[I 2025-10-07 08:44:23,361] Trial 46 finished with value: 0.7580934642424977 and parameters: {'use_pca': True, 'pca_n_components': 227, 'kernel': 'rbf', 'C': 348.5666746675904, 'epsilon': 0.3895367312319191, 'gamma_choice': 'auto'}. Best is trial 44 with value: 0.7824509015948344.


Best trial: 44. Best value: 0.782451:  96%|█████████▌| 48/50 [00:32<00:01,  1.89it/s]

[I 2025-10-07 08:44:23,944] Trial 47 finished with value: 0.7786121818830714 and parameters: {'use_pca': True, 'pca_n_components': 263, 'kernel': 'rbf', 'C': 26.07236136884119, 'epsilon': 0.18687299567064683, 'gamma_choice': 'auto'}. Best is trial 44 with value: 0.7824509015948344.


Best trial: 44. Best value: 0.782451:  98%|█████████▊| 49/50 [00:33<00:00,  1.87it/s]

[I 2025-10-07 08:44:24,490] Trial 48 finished with value: 0.7724517237101943 and parameters: {'use_pca': True, 'pca_n_components': 264, 'kernel': 'rbf', 'C': 52.14914333768239, 'epsilon': 0.15716963821242436, 'gamma_choice': 'auto'}. Best is trial 44 with value: 0.7824509015948344.


Best trial: 44. Best value: 0.782451: 100%|██████████| 50/50 [00:33<00:00,  1.47it/s]


[I 2025-10-07 08:44:24,980] Trial 49 finished with value: 0.6716591784959041 and parameters: {'use_pca': True, 'pca_n_components': 265, 'kernel': 'poly', 'C': 25.043559470483444, 'epsilon': 0.8057782247721512, 'gamma_choice': 'auto', 'degree': 3, 'coef0': 0.07562095708048178}. Best is trial 44 with value: 0.7824509015948344.
SVM best R2: 0.7824509015948344
SVM best params: {'use_pca': True, 'pca_n_components': 220, 'kernel': 'rbf', 'C': 10.629746378833838, 'epsilon': 0.10229560218392462, 'gamma_choice': 'auto'}


In [19]:
knn_study, knn_pipe = run_optuna_knn(n_trials=50)

[I 2025-10-07 08:44:25,373] A new study created in memory with name: no-name-04233f45-8a87-4354-abe9-8f968a4d25ba
Best trial: 0. Best value: 0.713689:   2%|▏         | 1/50 [00:00<00:17,  2.74it/s]

[I 2025-10-07 08:44:25,733] Trial 0 finished with value: 0.713689233868753 and parameters: {'use_pca': True, 'pca_n_components': 28, 'n_neighbors': 2, 'weights': 'distance', 'p': 2}. Best is trial 0 with value: 0.713689233868753.


Best trial: 1. Best value: 0.736402:   4%|▍         | 2/50 [00:00<00:22,  2.12it/s]

[I 2025-10-07 08:44:26,276] Trial 1 finished with value: 0.7364020842677571 and parameters: {'use_pca': True, 'pca_n_components': 227, 'n_neighbors': 7, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:   6%|▌         | 3/50 [00:01<00:17,  2.76it/s]

[I 2025-10-07 08:44:26,508] Trial 2 finished with value: 0.7265386294904894 and parameters: {'use_pca': False, 'n_neighbors': 9, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:   8%|▊         | 4/50 [00:01<00:15,  3.00it/s]

[I 2025-10-07 08:44:26,801] Trial 3 finished with value: 0.7073279006592864 and parameters: {'use_pca': True, 'pca_n_components': 55, 'n_neighbors': 25, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  10%|█         | 5/50 [00:01<00:13,  3.31it/s]

[I 2025-10-07 08:44:27,046] Trial 4 finished with value: 0.6825447451585891 and parameters: {'use_pca': False, 'n_neighbors': 22, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  12%|█▏        | 6/50 [00:02<00:14,  3.02it/s]

[I 2025-10-07 08:44:27,435] Trial 5 finished with value: 0.6925644540739772 and parameters: {'use_pca': True, 'pca_n_components': 190, 'n_neighbors': 23, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  14%|█▍        | 7/50 [00:02<00:14,  2.95it/s]

[I 2025-10-07 08:44:27,788] Trial 6 finished with value: 0.6902682452897138 and parameters: {'use_pca': False, 'n_neighbors': 28, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  16%|█▌        | 8/50 [00:02<00:16,  2.60it/s]

[I 2025-10-07 08:44:28,273] Trial 7 finished with value: 0.6867127069006249 and parameters: {'use_pca': True, 'pca_n_components': 258, 'n_neighbors': 23, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  18%|█▊        | 9/50 [00:03<00:13,  3.01it/s]

[I 2025-10-07 08:44:28,488] Trial 8 finished with value: 0.6601590327060892 and parameters: {'use_pca': False, 'n_neighbors': 29, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  20%|██        | 10/50 [00:03<00:13,  3.01it/s]

[I 2025-10-07 08:44:28,815] Trial 9 finished with value: 0.7143986223557672 and parameters: {'use_pca': True, 'pca_n_components': 133, 'n_neighbors': 18, 'weights': 'uniform', 'p': 2}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  22%|██▏       | 11/50 [00:04<00:15,  2.45it/s]

[I 2025-10-07 08:44:29,402] Trial 10 finished with value: 0.7137971262931091 and parameters: {'use_pca': True, 'pca_n_components': 300, 'n_neighbors': 11, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  24%|██▍       | 12/50 [00:04<00:14,  2.54it/s]

[I 2025-10-07 08:44:29,759] Trial 11 finished with value: 0.7354008166013106 and parameters: {'use_pca': False, 'n_neighbors': 7, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  26%|██▌       | 13/50 [00:04<00:13,  2.71it/s]

[I 2025-10-07 08:44:30,074] Trial 12 finished with value: 0.71796419760872 and parameters: {'use_pca': False, 'n_neighbors': 2, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  28%|██▊       | 14/50 [00:05<00:13,  2.72it/s]

[I 2025-10-07 08:44:30,439] Trial 13 finished with value: 0.7356029182257698 and parameters: {'use_pca': False, 'n_neighbors': 9, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 1. Best value: 0.736402:  30%|███       | 15/50 [00:05<00:13,  2.62it/s]

[I 2025-10-07 08:44:30,853] Trial 14 finished with value: 0.7198304846231796 and parameters: {'use_pca': True, 'pca_n_components': 193, 'n_neighbors': 13, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 0.7364020842677571.


Best trial: 15. Best value: 0.737201:  32%|███▏      | 16/50 [00:05<00:12,  2.66it/s]

[I 2025-10-07 08:44:31,216] Trial 15 finished with value: 0.7372014293322435 and parameters: {'use_pca': False, 'n_neighbors': 6, 'weights': 'distance', 'p': 1}. Best is trial 15 with value: 0.7372014293322435.


Best trial: 15. Best value: 0.737201:  34%|███▍      | 17/50 [00:06<00:12,  2.74it/s]

[I 2025-10-07 08:44:31,558] Trial 16 finished with value: 0.7361310529821781 and parameters: {'use_pca': True, 'pca_n_components': 115, 'n_neighbors': 5, 'weights': 'distance', 'p': 1}. Best is trial 15 with value: 0.7372014293322435.


Best trial: 15. Best value: 0.737201:  36%|███▌      | 18/50 [00:06<00:11,  2.81it/s]

[I 2025-10-07 08:44:31,889] Trial 17 finished with value: 0.7208608818891589 and parameters: {'use_pca': False, 'n_neighbors': 15, 'weights': 'distance', 'p': 1}. Best is trial 15 with value: 0.7372014293322435.


Best trial: 18. Best value: 0.762042:  38%|███▊      | 19/50 [00:06<00:11,  2.64it/s]

[I 2025-10-07 08:44:32,323] Trial 18 finished with value: 0.7620417404695455 and parameters: {'use_pca': True, 'pca_n_components': 240, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 18 with value: 0.7620417404695455.


Best trial: 19. Best value: 0.765898:  40%|████      | 20/50 [00:07<00:10,  2.75it/s]

[I 2025-10-07 08:44:32,652] Trial 19 finished with value: 0.7658979483041399 and parameters: {'use_pca': False, 'n_neighbors': 5, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  42%|████▏     | 21/50 [00:07<00:10,  2.77it/s]

[I 2025-10-07 08:44:33,005] Trial 20 finished with value: 0.7427459656779221 and parameters: {'use_pca': False, 'n_neighbors': 13, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  44%|████▍     | 22/50 [00:07<00:10,  2.78it/s]

[I 2025-10-07 08:44:33,363] Trial 21 finished with value: 0.7250721763688895 and parameters: {'use_pca': False, 'n_neighbors': 17, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  46%|████▌     | 23/50 [00:08<00:09,  2.77it/s]

[I 2025-10-07 08:44:33,728] Trial 22 finished with value: 0.582900849993041 and parameters: {'use_pca': False, 'n_neighbors': 1, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  48%|████▊     | 24/50 [00:08<00:09,  2.78it/s]

[I 2025-10-07 08:44:34,084] Trial 23 finished with value: 0.7488824738800499 and parameters: {'use_pca': False, 'n_neighbors': 12, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  50%|█████     | 25/50 [00:09<00:08,  2.80it/s]

[I 2025-10-07 08:44:34,437] Trial 24 finished with value: 0.7590450338653796 and parameters: {'use_pca': False, 'n_neighbors': 4, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  52%|█████▏    | 26/50 [00:09<00:08,  2.86it/s]

[I 2025-10-07 08:44:34,767] Trial 25 finished with value: 0.7590450338653796 and parameters: {'use_pca': False, 'n_neighbors': 4, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  54%|█████▍    | 27/50 [00:09<00:09,  2.53it/s]

[I 2025-10-07 08:44:35,266] Trial 26 finished with value: 0.755321897513636 and parameters: {'use_pca': True, 'pca_n_components': 284, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  56%|█████▌    | 28/50 [00:10<00:08,  2.56it/s]

[I 2025-10-07 08:44:35,641] Trial 27 finished with value: 0.7566547869579198 and parameters: {'use_pca': False, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  58%|█████▊    | 29/50 [00:10<00:07,  2.68it/s]

[I 2025-10-07 08:44:35,978] Trial 28 finished with value: 0.7562478299947513 and parameters: {'use_pca': True, 'pca_n_components': 91, 'n_neighbors': 3, 'weights': 'uniform', 'p': 2}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  60%|██████    | 30/50 [00:11<00:07,  2.64it/s]

[I 2025-10-07 08:44:36,371] Trial 29 finished with value: 0.5622067759975604 and parameters: {'use_pca': True, 'pca_n_components': 181, 'n_neighbors': 1, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  62%|██████▏   | 31/50 [00:11<00:06,  3.04it/s]

[I 2025-10-07 08:44:36,582] Trial 30 finished with value: 0.7471209570944437 and parameters: {'use_pca': False, 'n_neighbors': 10, 'weights': 'uniform', 'p': 2}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  64%|██████▍   | 32/50 [00:11<00:05,  3.08it/s]

[I 2025-10-07 08:44:36,900] Trial 31 finished with value: 0.7590450338653796 and parameters: {'use_pca': False, 'n_neighbors': 4, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 19. Best value: 0.765898:  66%|██████▌   | 33/50 [00:11<00:05,  3.16it/s]

[I 2025-10-07 08:44:37,195] Trial 32 finished with value: 0.7578624946800685 and parameters: {'use_pca': False, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}. Best is trial 19 with value: 0.7658979483041399.


Best trial: 33. Best value: 0.765973:  68%|██████▊   | 34/50 [00:12<00:05,  3.18it/s]

[I 2025-10-07 08:44:37,507] Trial 33 finished with value: 0.7659725043609967 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  70%|███████   | 35/50 [00:12<00:04,  3.12it/s]

[I 2025-10-07 08:44:37,840] Trial 34 finished with value: 0.7659725043609967 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  72%|███████▏  | 36/50 [00:12<00:04,  2.94it/s]

[I 2025-10-07 08:44:38,224] Trial 35 finished with value: 0.7574536053552073 and parameters: {'use_pca': False, 'n_neighbors': 7, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  74%|███████▍  | 37/50 [00:13<00:04,  2.77it/s]

[I 2025-10-07 08:44:38,633] Trial 36 finished with value: 0.7415428122299964 and parameters: {'use_pca': True, 'pca_n_components': 238, 'n_neighbors': 2, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  76%|███████▌  | 38/50 [00:13<00:03,  3.08it/s]

[I 2025-10-07 08:44:38,876] Trial 37 finished with value: 0.7526133098704693 and parameters: {'use_pca': False, 'n_neighbors': 8, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  78%|███████▊  | 39/50 [00:13<00:03,  3.00it/s]

[I 2025-10-07 08:44:39,230] Trial 38 finished with value: 0.5661425665278089 and parameters: {'use_pca': True, 'pca_n_components': 163, 'n_neighbors': 1, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  80%|████████  | 40/50 [00:14<00:03,  3.30it/s]

[I 2025-10-07 08:44:39,464] Trial 39 finished with value: 0.7573195145761333 and parameters: {'use_pca': False, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  82%|████████▏ | 41/50 [00:14<00:02,  3.29it/s]

[I 2025-10-07 08:44:39,767] Trial 40 finished with value: 0.7186426051954287 and parameters: {'use_pca': False, 'n_neighbors': 19, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  84%|████████▍ | 42/50 [00:14<00:02,  3.29it/s]

[I 2025-10-07 08:44:40,073] Trial 41 finished with value: 0.7659725043609967 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  86%|████████▌ | 43/50 [00:15<00:02,  3.28it/s]

[I 2025-10-07 08:44:40,380] Trial 42 finished with value: 0.7659725043609967 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  88%|████████▊ | 44/50 [00:15<00:01,  3.28it/s]

[I 2025-10-07 08:44:40,685] Trial 43 finished with value: 0.7659725043609967 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  90%|█████████ | 45/50 [00:15<00:01,  3.29it/s]

[I 2025-10-07 08:44:40,987] Trial 44 finished with value: 0.7659725043609967 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  92%|█████████▏| 46/50 [00:15<00:01,  3.16it/s]

[I 2025-10-07 08:44:41,330] Trial 45 finished with value: 0.7659725043609967 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  94%|█████████▍| 47/50 [00:16<00:00,  3.07it/s]

[I 2025-10-07 08:44:41,680] Trial 46 finished with value: 0.7343877380477967 and parameters: {'use_pca': False, 'n_neighbors': 2, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  96%|█████████▌| 48/50 [00:16<00:00,  3.10it/s]

[I 2025-10-07 08:44:41,995] Trial 47 finished with value: 0.6920749209522636 and parameters: {'use_pca': False, 'n_neighbors': 27, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973:  98%|█████████▊| 49/50 [00:16<00:00,  3.21it/s]

[I 2025-10-07 08:44:42,281] Trial 48 finished with value: 0.7602692173276665 and parameters: {'use_pca': False, 'n_neighbors': 9, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.


Best trial: 33. Best value: 0.765973: 100%|██████████| 50/50 [00:17<00:00,  2.90it/s]

[I 2025-10-07 08:44:42,597] Trial 49 finished with value: 0.7142793449704775 and parameters: {'use_pca': False, 'n_neighbors': 21, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7659725043609967.
KNN best R2: 0.7659725043609967
KNN best params: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}





In [20]:
rf_study, rf_pipe = run_optuna_rf(n_trials=50)

[I 2025-10-07 08:44:42,675] A new study created in memory with name: no-name-0267da4e-dad1-4765-8812-ac60547e6861
Best trial: 0. Best value: 0.727811:   2%|▏         | 1/50 [00:02<02:22,  2.90s/it]

[I 2025-10-07 08:44:45,572] Trial 0 finished with value: 0.7278113040569016 and parameters: {'n_estimators': 253, 'max_depth': 26, 'min_samples_split': 17, 'min_samples_leaf': 12, 'max_features': 0.8}. Best is trial 0 with value: 0.7278113040569016.


Best trial: 0. Best value: 0.727811:   4%|▍         | 2/50 [00:05<02:00,  2.50s/it]

[I 2025-10-07 08:44:47,796] Trial 1 finished with value: 0.7030440733361011 and parameters: {'n_estimators': 434, 'max_depth': 47, 'min_samples_split': 2, 'min_samples_leaf': 18, 'max_features': 0.3}. Best is trial 0 with value: 0.7278113040569016.


Best trial: 2. Best value: 0.748577:   6%|▌         | 3/50 [00:09<02:35,  3.32s/it]

[I 2025-10-07 08:44:52,084] Trial 2 finished with value: 0.7485770262584359 and parameters: {'n_estimators': 284, 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 0.8}. Best is trial 2 with value: 0.7485770262584359.


Best trial: 2. Best value: 0.748577:   8%|▊         | 4/50 [00:12<02:30,  3.28s/it]

[I 2025-10-07 08:44:55,295] Trial 3 finished with value: 0.7440497577865605 and parameters: {'n_estimators': 153, 'max_depth': 25, 'min_samples_split': 6, 'min_samples_leaf': 8, 'max_features': 0.8}. Best is trial 2 with value: 0.7485770262584359.


Best trial: 2. Best value: 0.748577:  10%|█         | 5/50 [00:13<01:48,  2.40s/it]

[I 2025-10-07 08:44:56,156] Trial 4 finished with value: 0.6227018276608598 and parameters: {'n_estimators': 218, 'max_depth': 22, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 2 with value: 0.7485770262584359.


Best trial: 2. Best value: 0.748577:  12%|█▏        | 6/50 [00:15<01:33,  2.13s/it]

[I 2025-10-07 08:44:57,741] Trial 5 finished with value: 0.7164382840817598 and parameters: {'n_estimators': 376, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 2 with value: 0.7485770262584359.


Best trial: 2. Best value: 0.748577:  14%|█▍        | 7/50 [00:15<01:12,  1.69s/it]

[I 2025-10-07 08:44:58,529] Trial 6 finished with value: 0.6553929734757882 and parameters: {'n_estimators': 149, 'max_depth': 37, 'min_samples_split': 8, 'min_samples_leaf': 14, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7485770262584359.


Best trial: 2. Best value: 0.748577:  16%|█▌        | 8/50 [00:17<01:08,  1.63s/it]

[I 2025-10-07 08:45:00,015] Trial 7 finished with value: 0.6460238698352476 and parameters: {'n_estimators': 417, 'max_depth': 26, 'min_samples_split': 8, 'min_samples_leaf': 16, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7485770262584359.


Best trial: 2. Best value: 0.748577:  18%|█▊        | 9/50 [00:18<01:03,  1.54s/it]

[I 2025-10-07 08:45:01,334] Trial 8 finished with value: 0.638859989835293 and parameters: {'n_estimators': 343, 'max_depth': 43, 'min_samples_split': 18, 'min_samples_leaf': 17, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7485770262584359.


Best trial: 9. Best value: 0.75167:  20%|██        | 10/50 [00:23<01:37,  2.45s/it]

[I 2025-10-07 08:45:05,840] Trial 9 finished with value: 0.751669956028459 and parameters: {'n_estimators': 276, 'max_depth': 11, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 0.8}. Best is trial 9 with value: 0.751669956028459.


Best trial: 9. Best value: 0.75167:  22%|██▏       | 11/50 [00:27<02:02,  3.13s/it]

[I 2025-10-07 08:45:10,526] Trial 10 finished with value: 0.7399469891446564 and parameters: {'n_estimators': 495, 'max_depth': 6, 'min_samples_split': 12, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 9 with value: 0.751669956028459.


Best trial: 9. Best value: 0.75167:  24%|██▍       | 12/50 [00:32<02:14,  3.54s/it]

[I 2025-10-07 08:45:14,991] Trial 11 finished with value: 0.7473980965983616 and parameters: {'n_estimators': 292, 'max_depth': 11, 'min_samples_split': 12, 'min_samples_leaf': 7, 'max_features': 0.8}. Best is trial 9 with value: 0.751669956028459.


Best trial: 12. Best value: 0.753295:  26%|██▌       | 13/50 [00:36<02:14,  3.65s/it]

[I 2025-10-07 08:45:18,893] Trial 12 finished with value: 0.7532954932640534 and parameters: {'n_estimators': 219, 'max_depth': 16, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 0.8}. Best is trial 12 with value: 0.7532954932640534.


Best trial: 13. Best value: 0.756968:  28%|██▊       | 14/50 [00:40<02:18,  3.84s/it]

[I 2025-10-07 08:45:23,191] Trial 13 finished with value: 0.7569683675702414 and parameters: {'n_estimators': 208, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 0.8}. Best is trial 13 with value: 0.7569683675702414.


Best trial: 14. Best value: 0.761415:  30%|███       | 15/50 [00:43<02:04,  3.56s/it]

[I 2025-10-07 08:45:26,104] Trial 14 finished with value: 0.7614152639722053 and parameters: {'n_estimators': 195, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 14 with value: 0.7614152639722053.


Best trial: 14. Best value: 0.761415:  32%|███▏      | 16/50 [00:44<01:31,  2.68s/it]

[I 2025-10-07 08:45:26,741] Trial 15 finished with value: 0.6056802791024445 and parameters: {'n_estimators': 104, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 14 with value: 0.7614152639722053.


Best trial: 16. Best value: 0.770854:  34%|███▍      | 17/50 [00:45<01:19,  2.41s/it]

[I 2025-10-07 08:45:28,517] Trial 16 finished with value: 0.770854164311388 and parameters: {'n_estimators': 183, 'max_depth': 18, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 0.3}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  36%|███▌      | 18/50 [00:46<01:04,  2.03s/it]

[I 2025-10-07 08:45:29,661] Trial 17 finished with value: 0.6943361793029232 and parameters: {'n_estimators': 166, 'max_depth': 33, 'min_samples_split': 15, 'min_samples_leaf': 20, 'max_features': 0.3}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  38%|███▊      | 19/50 [00:48<00:54,  1.76s/it]

[I 2025-10-07 08:45:30,801] Trial 18 finished with value: 0.7684736849815177 and parameters: {'n_estimators': 111, 'max_depth': 17, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 0.3}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  40%|████      | 20/50 [00:49<00:46,  1.55s/it]

[I 2025-10-07 08:45:31,867] Trial 19 finished with value: 0.7454763408550045 and parameters: {'n_estimators': 106, 'max_depth': 31, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 0.3}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  42%|████▏     | 21/50 [00:51<00:48,  1.68s/it]

[I 2025-10-07 08:45:33,839] Trial 20 finished with value: 0.7659608769752626 and parameters: {'n_estimators': 132, 'max_depth': 21, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 0.5}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  44%|████▍     | 22/50 [00:53<00:48,  1.74s/it]

[I 2025-10-07 08:45:35,719] Trial 21 finished with value: 0.7665002608990366 and parameters: {'n_estimators': 136, 'max_depth': 19, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 0.5}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  46%|████▌     | 23/50 [00:55<00:52,  1.94s/it]

[I 2025-10-07 08:45:38,143] Trial 22 finished with value: 0.7645899754491328 and parameters: {'n_estimators': 178, 'max_depth': 19, 'min_samples_split': 13, 'min_samples_leaf': 3, 'max_features': 0.5}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  48%|████▊     | 24/50 [00:57<00:47,  1.84s/it]

[I 2025-10-07 08:45:39,743] Trial 23 finished with value: 0.7602978800195703 and parameters: {'n_estimators': 126, 'max_depth': 12, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 0.5}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  50%|█████     | 25/50 [00:58<00:45,  1.81s/it]

[I 2025-10-07 08:45:41,467] Trial 24 finished with value: 0.7373582688312055 and parameters: {'n_estimators': 240, 'max_depth': 18, 'min_samples_split': 14, 'min_samples_leaf': 11, 'max_features': 0.3}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  52%|█████▏    | 26/50 [01:00<00:42,  1.77s/it]

[I 2025-10-07 08:45:43,172] Trial 25 finished with value: 0.7650301592654841 and parameters: {'n_estimators': 178, 'max_depth': 31, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 0.3}. Best is trial 16 with value: 0.770854164311388.


Best trial: 16. Best value: 0.770854:  54%|█████▍    | 27/50 [01:02<00:40,  1.75s/it]

[I 2025-10-07 08:45:44,873] Trial 26 finished with value: 0.755514006764175 and parameters: {'n_estimators': 130, 'max_depth': 8, 'min_samples_split': 11, 'min_samples_leaf': 5, 'max_features': 0.5}. Best is trial 16 with value: 0.770854164311388.


Best trial: 27. Best value: 0.772526:  56%|█████▌    | 28/50 [01:05<00:51,  2.35s/it]

[I 2025-10-07 08:45:48,622] Trial 27 finished with value: 0.772525745003823 and parameters: {'n_estimators': 333, 'max_depth': 19, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 27 with value: 0.772525745003823.


Best trial: 27. Best value: 0.772526:  58%|█████▊    | 29/50 [01:09<00:53,  2.56s/it]

[I 2025-10-07 08:45:51,680] Trial 28 finished with value: 0.7546128719908963 and parameters: {'n_estimators': 342, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 27 with value: 0.772525745003823.


Best trial: 27. Best value: 0.772526:  60%|██████    | 30/50 [01:11<00:51,  2.60s/it]

[I 2025-10-07 08:45:54,366] Trial 29 finished with value: 0.726581328855727 and parameters: {'n_estimators': 334, 'max_depth': 28, 'min_samples_split': 16, 'min_samples_leaf': 13, 'max_features': 0.3}. Best is trial 27 with value: 0.772525745003823.


Best trial: 27. Best value: 0.772526:  62%|██████▏   | 31/50 [01:14<00:48,  2.55s/it]

[I 2025-10-07 08:45:56,794] Trial 30 finished with value: 0.7561833762374526 and parameters: {'n_estimators': 256, 'max_depth': 14, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 0.3}. Best is trial 27 with value: 0.772525745003823.


Best trial: 27. Best value: 0.772526:  64%|██████▍   | 32/50 [01:15<00:39,  2.19s/it]

[I 2025-10-07 08:45:58,129] Trial 31 finished with value: 0.74098727910688 and parameters: {'n_estimators': 322, 'max_depth': 19, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 27 with value: 0.772525745003823.


Best trial: 27. Best value: 0.772526:  66%|██████▌   | 33/50 [01:19<00:45,  2.65s/it]

[I 2025-10-07 08:46:01,875] Trial 32 finished with value: 0.7666758421317554 and parameters: {'n_estimators': 377, 'max_depth': 21, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 27 with value: 0.772525745003823.


Best trial: 27. Best value: 0.772526:  68%|██████▊   | 34/50 [01:22<00:47,  2.98s/it]

[I 2025-10-07 08:46:05,625] Trial 33 finished with value: 0.7688648498461941 and parameters: {'n_estimators': 390, 'max_depth': 22, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 27 with value: 0.772525745003823.


Best trial: 27. Best value: 0.772526:  70%|███████   | 35/50 [01:26<00:47,  3.19s/it]

[I 2025-10-07 08:46:09,289] Trial 34 finished with value: 0.7595090134735918 and parameters: {'n_estimators': 424, 'max_depth': 28, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': 0.3}. Best is trial 27 with value: 0.772525745003823.


Best trial: 35. Best value: 0.773078:  72%|███████▏  | 36/50 [01:30<00:48,  3.43s/it]

[I 2025-10-07 08:46:13,283] Trial 35 finished with value: 0.7730782056657596 and parameters: {'n_estimators': 380, 'max_depth': 23, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  74%|███████▍  | 37/50 [01:34<00:47,  3.68s/it]

[I 2025-10-07 08:46:17,535] Trial 36 finished with value: 0.770890436646592 and parameters: {'n_estimators': 393, 'max_depth': 23, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  76%|███████▌  | 38/50 [01:39<00:49,  4.10s/it]

[I 2025-10-07 08:46:22,619] Trial 37 finished with value: 0.7709913880943791 and parameters: {'n_estimators': 455, 'max_depth': 24, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  78%|███████▊  | 39/50 [01:42<00:40,  3.70s/it]

[I 2025-10-07 08:46:25,382] Trial 38 finished with value: 0.7524070997368246 and parameters: {'n_estimators': 490, 'max_depth': 35, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  80%|████████  | 40/50 [01:45<00:34,  3.44s/it]

[I 2025-10-07 08:46:28,225] Trial 39 finished with value: 0.7688564694867879 and parameters: {'n_estimators': 462, 'max_depth': 24, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  82%|████████▏ | 41/50 [01:49<00:32,  3.64s/it]

[I 2025-10-07 08:46:32,321] Trial 40 finished with value: 0.7471894342046049 and parameters: {'n_estimators': 452, 'max_depth': 28, 'min_samples_split': 7, 'min_samples_leaf': 9, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  84%|████████▍ | 42/50 [01:54<00:32,  4.04s/it]

[I 2025-10-07 08:46:37,282] Trial 41 finished with value: 0.769389305357605 and parameters: {'n_estimators': 391, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  86%|████████▌ | 43/50 [01:58<00:27,  3.93s/it]

[I 2025-10-07 08:46:40,950] Trial 42 finished with value: 0.7686554793647283 and parameters: {'n_estimators': 366, 'max_depth': 40, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  88%|████████▊ | 44/50 [02:01<00:21,  3.64s/it]

[I 2025-10-07 08:46:43,921] Trial 43 finished with value: 0.7596391418421586 and parameters: {'n_estimators': 314, 'max_depth': 21, 'min_samples_split': 7, 'min_samples_leaf': 6, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  90%|█████████ | 45/50 [02:06<00:20,  4.06s/it]

[I 2025-10-07 08:46:48,952] Trial 44 finished with value: 0.7652913687946856 and parameters: {'n_estimators': 403, 'max_depth': 49, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  92%|█████████▏| 46/50 [02:08<00:13,  3.38s/it]

[I 2025-10-07 08:46:50,761] Trial 45 finished with value: 0.6517279775603289 and parameters: {'n_estimators': 447, 'max_depth': 23, 'min_samples_split': 9, 'min_samples_leaf': 15, 'max_features': 'sqrt'}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  94%|█████████▍| 47/50 [02:09<00:08,  2.79s/it]

[I 2025-10-07 08:46:52,165] Trial 46 finished with value: 0.6980656247245822 and parameters: {'n_estimators': 361, 'max_depth': 27, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  96%|█████████▌| 48/50 [02:15<00:07,  3.61s/it]

[I 2025-10-07 08:46:57,694] Trial 47 finished with value: 0.7720725097645643 and parameters: {'n_estimators': 469, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078:  98%|█████████▊| 49/50 [02:21<00:04,  4.41s/it]

[I 2025-10-07 08:47:03,969] Trial 48 finished with value: 0.7721923904804227 and parameters: {'n_estimators': 481, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7730782056657596.


Best trial: 35. Best value: 0.773078: 100%|██████████| 50/50 [02:46<00:00,  3.33s/it]


[I 2025-10-07 08:47:29,018] Trial 49 finished with value: 0.7547027128762174 and parameters: {'n_estimators': 477, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 0.8}. Best is trial 35 with value: 0.7730782056657596.
RF best R2: 0.7730782056657596
RF best params: {'n_estimators': 380, 'max_depth': 23, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 0.3}


# 7. Try a meta model or ensembler of the best models

In [21]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

# reproducible folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 1) Get OOF predictions for each base model (these preds are made by models
#    that were trained without the corresponding sample — no leakage)
print("Generating OOF preds (this may take time)...")
oof_knn = cross_val_predict(knn_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_svm = cross_val_predict(svm_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_rf  = cross_val_predict(rf_pipe,  X, y, cv=kf, n_jobs=-1, method='predict')

# Stack OOF predictions (n_samples x n_models)
stack_oof = np.vstack([oof_knn, oof_svm, oof_rf]).T

# 2) Simple average ensemble
ens_mean = stack_oof.mean(axis=1)
r2_mean = r2_score(y, ens_mean)
rmse_mean = np.sqrt(mean_squared_error(y, ens_mean))
print(f"Simple average ensemble -> R2: {r2_mean:.4f}, RMSE: {rmse_mean:.4f}")

Generating OOF preds (this may take time)...
Simple average ensemble -> R2: 0.7925, RMSE: 0.8179


In [22]:
# 3) Stacking: train a Ridge meta-learner on the OOF stack
meta = Ridge(alpha=1.0)
meta.fit(stack_oof, y)                 # training on OOF preds is OK (no leakage)
ens_stack = meta.predict(stack_oof)    # predictions on the same OOF matrix
r2_stack = r2_score(y, ens_stack)
rmse_stack = np.sqrt(mean_squared_error(y, ens_stack))
print(f"Stacking (Ridge) ensemble -> R2: {r2_stack:.4f}, RMSE: {rmse_stack:.4f}")

# Show meta-learner weights
print("Meta-learner coefficients (weights):", meta.coef_)
print("Meta-learner intercept:", meta.intercept_)

Stacking (Ridge) ensemble -> R2: 0.7945, RMSE: 0.8141
Meta-learner coefficients (weights): [0.34952213 0.54508747 0.0954718 ]
Meta-learner intercept: 0.057185963824212926


In [23]:
joblib.dump(meta, "../models/pi_meta.joblib")
print("Saved final base pipelines and meta-learner.")

Saved final base pipelines and meta-learner.
