# Import Libraries and Tools and Create Helper Functions

In [1]:
# Requirements: pandas, numpy, scikit-learn, xgboost, joblib, matplotlib
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Models for scaled data
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Tree based models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
integrase = pd.read_csv('../data/ii_df_clean.csv')
integrase.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_value,pchembl_value,assay_chembl_id,assay_description,target_chembl_id,target_pref_name,target_organism,standard_units_norm,IC50_nM,pIC50
0,CHEMBL304722,O=C(CC(O)CCc1ccc(O)c(O)c1)O[C@H]1Cc2cc(O)c(O)c...,IC50,1400.0,5.85,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1400.0,5.853872
1,CHEMBL304722,O=C(CC(O)CCc1ccc(O)c(O)c1)O[C@H]1Cc2cc(O)c(O)c...,IC50,1000.0,6.0,CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1000.0,6.0
2,CHEMBL67076,O=C(CCc1ccc(O)c(O)c1)c1ccc(O)c(O)c1O,IC50,1700.0,5.77,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1700.0,5.769551
3,CHEMBL67076,O=C(CCc1ccc(O)c(O)c1)c1ccc(O)c(O)c1O,IC50,1000.0,6.0,CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1000.0,6.0
4,CHEMBL177126,O=C(/C=C/c1ccc(O)c(O)c1)O[C@H]1[C@H](O)C[C@](O...,IC50,250.0,6.6,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,250.0,6.60206


# Data Preprocessing

We have appended the morgan fingerprints and molecular descriptors into a single dataframe

In [3]:
df = pd.read_csv('../data/ii_qsar_features.csv')
print("rows, cols:", df.shape)

rows, cols: (6386, 523)


In [4]:
df.head()

Unnamed: 0,MolWt,MolLogP,MolMR,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,NumAromaticRings,HeavyAtomCount,FractionCSP3,...,Morgan_512_503,Morgan_512_504,Morgan_512_505,Morgan_512_506,Morgan_512_507,Morgan_512_508,Morgan_512_509,Morgan_512_510,Morgan_512_511,pIC50
0,498.484,2.892,125.2126,177.14,7,10,7,3,36,0.269231,...,0,1,0,0,1,0,0,0,0,5.853872
1,498.484,2.892,125.2126,177.14,7,10,7,3,36,0.269231,...,0,1,0,0,1,0,0,0,0,6.0
2,290.271,2.0301,73.8595,118.22,5,6,4,2,21,0.133333,...,0,0,0,0,0,0,0,0,0,5.769551
3,290.271,2.0301,73.8595,118.22,5,6,4,2,21,0.133333,...,0,0,0,0,0,0,0,0,0,6.0
4,516.455,1.0296,125.1976,211.28,7,11,7,2,37,0.24,...,0,0,1,0,1,0,0,0,0,6.60206


In [5]:
# Prepare X and y
y = df['pIC50']
X = df.drop(columns=['pIC50'])
print('Feature count: ', X.shape[1])

Feature count:  522


In [6]:
y.isna().sum()

np.int64(0)

In [7]:
# ------------------------------------------------------------------------------
# Quick feature cleanup
# ------------------------------------------------------------------------------

# a. remove near-constant features

# Fit the selector
vt = VarianceThreshold(threshold=1e-6)
X_v = vt.fit_transform(X)
# Features kept
kept_features = X.columns[vt.get_support()].tolist()
# Features removed
removed_features = [col for col in X.columns if col not in kept_features]
# Create the reduced DataFrame
X = pd.DataFrame(X_v, columns=kept_features)

print(f"Features after variance filter: {len(kept_features)}")
print(f"Features removed: {len(removed_features)}")
# Optional: display or save the removed features
print("\nRemoved features (first 20):")
print(removed_features[:20])

Features after variance filter: 522
Features removed: 0

Removed features (first 20):
[]


In [8]:
# b. (optional) remove extremely collinear features (simple correlation filter)

corr_thresh = 0.98
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > corr_thresh)]
if to_drop:
    X.drop(columns=to_drop, inplace=True)
    print("Dropped highly correlated features:", to_drop)

Dropped highly correlated features: ['MolMR', 'HeavyAtomCount']


In [9]:
X.shape

(6386, 520)

In [10]:
# ------------------------------------------------------------------------------
# Define descriptor columns to scale and fingerprint columns to pass through
# ------------------------------------------------------------------------------
descriptors = ['MolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors',
    'NumRotatableBonds', 'NumAromaticRings', 'FractionCSP3']
fingerprints = [c for c in X.columns if c not in descriptors]

# ColumnTransformer: scale descriptors, pass fingerprints through
preprocessor = ColumnTransformer([
    ("desc", StandardScaler(), descriptors),
    ("fp", "passthrough", fingerprints)
])

In [11]:
import os

SAVE_DIR = "../artifacts"
os.makedirs(SAVE_DIR, exist_ok=True)

# 1. Save final feature column list (after filters)
final_features = X.columns.tolist()
pd.Series(final_features).to_csv(os.path.join(SAVE_DIR, "integrase_features.csv"), index=False)

# QSAR Model Training

In [12]:
# ---------------------------
# k-fold OOF trainer
# ---------------------------
def kfold_train_predict(model, X_df, y_arr, n_splits=5, random_state=42):
    """
    model: estimator or pipeline (should accept fit/predict)
    X_df: pandas DataFrame (full dataset)
    y_arr: numpy array or Series
    returns: oof_preds (np.array same length as X_df), mean_r2, mean_rmse
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(X_df))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_df, y_arr), 1):
        X_train, X_val = X_df.iloc[train_idx], X_df.iloc[val_idx]
        y_train, y_val = y_arr[train_idx], y_arr[val_idx]

        # fit on train fold, predict on val fold
        model.fit(X_train, y_train)
        preds = model.predict(X_val)

        oof_preds[val_idx] = preds

        fold_r2 = r2_score(y_val, preds)
        fold_rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_scores.append((fold_r2, fold_rmse))
        print(f"Fold {fold}: R2 = {fold_r2:.3f}, RMSE = {fold_rmse:.3f}")

    mean_r2 = np.mean([s[0] for s in fold_scores])
    mean_rmse = np.mean([s[1] for s in fold_scores])
    print(f"\nMean CV R2 = {mean_r2:.3f}, Mean RMSE = {mean_rmse:.3f}")
    return oof_preds, mean_r2, mean_rmse

In [13]:
# ---------------------------
# Define models and pipelines
# ---------------------------
models = {}

# Linear models (need scaling) -> include preprocessor
models['Linear'] = Pipeline([("preproc", preprocessor), ("model", LinearRegression())])
models['ElasticNet'] = Pipeline([("preproc", preprocessor), ("model", ElasticNet(random_state=42, max_iter=5000))])

# SVR (needs scaling)
models['SVM'] = Pipeline([("preproc", preprocessor), ("model", SVR())])

# KNN
models['KNN'] = Pipeline([("preproc", preprocessor), ("model", KNeighborsRegressor())])

# MLP
models['MLP'] = Pipeline([("preproc", preprocessor), ("model", MLPRegressor(max_iter=2000, random_state=42))])

# Random Forest (trees don't need scaling, but pipeline is fine)
models['Random Forest'] = Pipeline([("preproc", preprocessor), ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])

# XGBoost
models['XGBoost'] = Pipeline([("preproc", preprocessor), ("model", XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1, objective='reg:squarederror'))])

In [14]:
# ---------------------------
# Run k-fold for each model
# ---------------------------
results = {}
oof_dict = {}

for name, model in models.items():
    print(f"\n=== Training model: {name} ===")
    oof_preds, mean_r2, mean_rmse = kfold_train_predict(model, X, y, n_splits=5, random_state=42)
    results[name] = {"r2": mean_r2, "rmse": mean_rmse}
    oof_dict[name] = oof_preds

# Summary
print("\nSummary (mean CV results):")
for name, res in results.items():
    print(f"{name}: R2 = {res['r2']:.4f}, RMSE = {res['rmse']:.4f}")


=== Training model: Linear ===
Fold 1: R2 = 0.632, RMSE = 0.811
Fold 2: R2 = 0.650, RMSE = 0.818
Fold 3: R2 = 0.631, RMSE = 0.838
Fold 4: R2 = 0.618, RMSE = 0.836
Fold 5: R2 = 0.653, RMSE = 0.797

Mean CV R2 = 0.637, Mean RMSE = 0.820

=== Training model: ElasticNet ===
Fold 1: R2 = -0.001, RMSE = 1.337
Fold 2: R2 = -0.001, RMSE = 1.382
Fold 3: R2 = -0.001, RMSE = 1.380
Fold 4: R2 = -0.000, RMSE = 1.352
Fold 5: R2 = -0.000, RMSE = 1.352

Mean CV R2 = -0.001, Mean RMSE = 1.361

=== Training model: SVM ===
Fold 1: R2 = 0.732, RMSE = 0.692
Fold 2: R2 = 0.763, RMSE = 0.672
Fold 3: R2 = 0.745, RMSE = 0.697
Fold 4: R2 = 0.739, RMSE = 0.691
Fold 5: R2 = 0.761, RMSE = 0.661

Mean CV R2 = 0.748, Mean RMSE = 0.683

=== Training model: KNN ===
Fold 1: R2 = 0.709, RMSE = 0.721
Fold 2: R2 = 0.752, RMSE = 0.688
Fold 3: R2 = 0.740, RMSE = 0.703
Fold 4: R2 = 0.715, RMSE = 0.721
Fold 5: R2 = 0.763, RMSE = 0.658

Mean CV R2 = 0.736, Mean RMSE = 0.698

=== Training model: MLP ===
Fold 1: R2 = 0.643, RMS

# Tune the Best Models

### Support Vector Machine (SVR),
### K-Nearest Neighbours (KNN),
### Random Forest (RF)

In [15]:
import optuna
import joblib
from sklearn.decomposition import PCA

In [16]:
# PCA variant of the model
n_fp = len(fingerprints)
print(f"Using {len(descriptors)} descriptor columns and {n_fp} fingerprint columns.")

if n_fp == 0:
    raise ValueError("No fingerprint columns detected. Make sure your X contains fingerprint bit columns.")

# ---------- CV settings ----------
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

Using 8 descriptor columns and 512 fingerprint columns.


In [17]:
# ----------------------- SVM ---------------------------------
def run_optuna_svm(n_trials=80):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        # SVR hyperparameters (log-uniform sampling where appropriate)
        kernel = trial.suggest_categorical("kernel", ["rbf", "poly", "sigmoid"])
        C = trial.suggest_loguniform("C", 1e-3, 1e3)
        epsilon = trial.suggest_loguniform("epsilon", 1e-4, 1.0)
        gamma_choice = trial.suggest_categorical("gamma_choice", ["scale", "auto", "numeric"])
        if gamma_choice == "numeric":
            gamma = trial.suggest_loguniform("gamma", 1e-5, 1e1)
        else:
            gamma = gamma_choice
        # degree and coef0 for polynomial kernel
        if kernel == "poly":
            degree = trial.suggest_int("degree", 2, 5)
            coef0 = trial.suggest_float("coef0", 0.0, 1.0)
        else:
            degree = 3
            coef0 = 0.0

        svr = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, degree=degree, coef0=coef0, max_iter=100000)
        pipe = Pipeline([("preproc", preprocessor), ("svr", svr)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    # reconstruct SVR
    best_kernel = best.get("kernel", "rbf")
    best_gamma = best.get("gamma", best.get("gamma_choice", "scale"))
    if best_gamma == "numeric":
        best_gamma = best.get("gamma")
    best_degree = best.get("degree", 3)
    best_coef0 = best.get("coef0", 0.0)

    best_svm = SVR(
        kernel=best_kernel,
        C=best.get("C"),
        epsilon=best.get("epsilon"),
        gamma=best_gamma,
        degree=best_degree,
        coef0=best_coef0,
        max_iter=100000
    )
    best_pipe = Pipeline([("preproc", preprocessor_best), ("svr", best_svm)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/ii_svm.joblib")
    print("SVM best R2:", study.best_value)
    print("SVM best params:", study.best_params)
    return study, best_pipe

In [18]:
# ------------------ KNN -----------------------
def run_optuna_knn(n_trials=50):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        n_neighbors = trial.suggest_int("n_neighbors", 1, 30)
        weights = trial.suggest_categorical("weights", ["uniform", "distance"])
        p = trial.suggest_int("p", 1, 2)  # 1 = manhattan, 2 = euclidean

        knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=p)
        pipe = Pipeline([("preproc", preprocessor), ("knn", knn)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    best_knn = KNeighborsRegressor(n_neighbors=best["n_neighbors"], weights=best["weights"], p=best["p"])
    best_pipe = Pipeline([("preproc", preprocessor_best), ("knn", best_knn)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/ii_knn.joblib")
    print("KNN best R2:", study.best_value)
    print("KNN best params:", study.best_params)
    return study, best_pipe

In [19]:
# -------------------- Random Forest --------------------------------
def run_optuna_rf(n_trials=50):
    def objective(trial):
        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", "passthrough", fingerprints)
        ])

        n_estimators = trial.suggest_int("n_estimators", 100, 500)
        max_depth = trial.suggest_int("max_depth", 3, 50)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
        max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", 0.3, 0.5, 0.8])

        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                   max_features=max_features, n_jobs=-1, random_state=42)
        pipe = Pipeline([("preproc", preprocessor), ("rf", rf)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    best = study.best_params
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", "passthrough", fingerprints)
    ])
    best_rf = RandomForestRegressor(n_estimators=best["n_estimators"], max_depth=best["max_depth"],
                                    min_samples_split=best["min_samples_split"],
                                    min_samples_leaf=best["min_samples_leaf"],
                                    max_features=best["max_features"], n_jobs=-1, random_state=42)
    best_pipe = Pipeline([("preproc", preprocessor_best), ("rf", best_rf)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/ii_rf.joblib")
    print("RF best R2:", study.best_value)
    print("RF best params:", study.best_params)
    return study, best_pipe

In [20]:
# ----------------------------
# Example: run all studies sequentially (adjust n_trials)
# ----------------------------
svm_study, svm_pipe = run_optuna_svm(n_trials=50)

[I 2025-10-07 23:14:18,200] A new study created in memory with name: no-name-d0522039-98ec-40c8-b41e-dd23de756aac
Best trial: 0. Best value: 0.673067:   2%|▏         | 1/50 [00:26<21:20, 26.13s/it]

[I 2025-10-07 23:14:44,315] Trial 0 finished with value: 0.6730669192851183 and parameters: {'use_pca': True, 'pca_n_components': 60, 'kernel': 'poly', 'C': 17.363418028138508, 'epsilon': 0.06440092769435873, 'gamma_choice': 'numeric', 'gamma': 0.037223405208821315, 'degree': 3, 'coef0': 0.009404400250143774}. Best is trial 0 with value: 0.6730669192851183.


Best trial: 1. Best value: 0.678432:   4%|▍         | 2/50 [00:36<13:30, 16.88s/it]

[I 2025-10-07 23:14:54,739] Trial 1 finished with value: 0.678432362256722 and parameters: {'use_pca': True, 'pca_n_components': 87, 'kernel': 'poly', 'C': 0.0972913694709314, 'epsilon': 0.00037184377096643903, 'gamma_choice': 'scale', 'degree': 3, 'coef0': 0.49563748458983936}. Best is trial 1 with value: 0.678432362256722.


Best trial: 1. Best value: 0.678432:   6%|▌         | 3/50 [01:03<16:44, 21.37s/it]

[I 2025-10-07 23:15:21,437] Trial 2 finished with value: 0.6620489787398148 and parameters: {'use_pca': True, 'pca_n_components': 153, 'kernel': 'poly', 'C': 33.38386967372764, 'epsilon': 0.001173099155496379, 'gamma_choice': 'scale', 'degree': 4, 'coef0': 0.14705325334751262}. Best is trial 1 with value: 0.678432362256722.


Best trial: 1. Best value: 0.678432:   8%|▊         | 4/50 [01:16<13:56, 18.18s/it]

[I 2025-10-07 23:15:34,726] Trial 3 finished with value: 0.49172685905862845 and parameters: {'use_pca': True, 'pca_n_components': 222, 'kernel': 'sigmoid', 'C': 20.5241350773133, 'epsilon': 0.00044904233538537976, 'gamma_choice': 'numeric', 'gamma': 2.639790530347574e-05}. Best is trial 1 with value: 0.678432362256722.


Best trial: 4. Best value: 0.713985:  10%|█         | 5/50 [01:40<15:15, 20.34s/it]

[I 2025-10-07 23:15:58,910] Trial 4 finished with value: 0.7139853680275894 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 11.48028908378086, 'epsilon': 0.00013934361419037434, 'gamma_choice': 'scale'}. Best is trial 4 with value: 0.7139853680275894.


Best trial: 4. Best value: 0.713985:  12%|█▏        | 6/50 [01:46<11:21, 15.49s/it]

[I 2025-10-07 23:16:04,992] Trial 5 finished with value: -0.2714377147285928 and parameters: {'use_pca': True, 'pca_n_components': 51, 'kernel': 'sigmoid', 'C': 0.09862544254667456, 'epsilon': 0.03613856173613164, 'gamma_choice': 'auto'}. Best is trial 4 with value: 0.7139853680275894.


Best trial: 4. Best value: 0.713985:  14%|█▍        | 7/50 [02:15<14:07, 19.72s/it]

[I 2025-10-07 23:16:33,402] Trial 6 finished with value: 0.16347133718237275 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.007024863492133858, 'epsilon': 0.02987252958448687, 'gamma_choice': 'scale'}. Best is trial 4 with value: 0.7139853680275894.


Best trial: 4. Best value: 0.713985:  16%|█▌        | 8/50 [02:36<14:03, 20.09s/it]

[I 2025-10-07 23:16:54,299] Trial 7 finished with value: 0.6986005230611801 and parameters: {'use_pca': True, 'pca_n_components': 262, 'kernel': 'poly', 'C': 19.298077540447903, 'epsilon': 0.019233947150984986, 'gamma_choice': 'scale', 'degree': 2, 'coef0': 0.14331664805717137}. Best is trial 4 with value: 0.7139853680275894.


Best trial: 4. Best value: 0.713985:  18%|█▊        | 9/50 [03:03<15:18, 22.40s/it]

[I 2025-10-07 23:17:21,778] Trial 8 finished with value: 0.7012630970061029 and parameters: {'use_pca': True, 'pca_n_components': 213, 'kernel': 'poly', 'C': 150.20047664065896, 'epsilon': 0.0018335223804339588, 'gamma_choice': 'auto', 'degree': 4, 'coef0': 0.623404035020824}. Best is trial 4 with value: 0.7139853680275894.


Best trial: 9. Best value: 0.737823:  20%|██        | 10/50 [03:42<18:24, 27.62s/it]

[I 2025-10-07 23:18:01,093] Trial 9 finished with value: 0.7378225939922901 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 115.60123107453246, 'epsilon': 0.0012485480178562128, 'gamma_choice': 'auto'}. Best is trial 9 with value: 0.7378225939922901.


Best trial: 9. Best value: 0.737823:  22%|██▏       | 11/50 [03:53<14:33, 22.41s/it]

[I 2025-10-07 23:18:11,672] Trial 10 finished with value: 0.6994957506301331 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 893.9605931209983, 'epsilon': 0.6591878452023706, 'gamma_choice': 'auto'}. Best is trial 9 with value: 0.7378225939922901.


Best trial: 9. Best value: 0.737823:  24%|██▍       | 12/50 [04:24<15:49, 24.98s/it]

[I 2025-10-07 23:18:42,550] Trial 11 finished with value: 0.625841470675297 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 1.3820680064317028, 'epsilon': 0.0001169018694295463, 'gamma_choice': 'auto'}. Best is trial 9 with value: 0.7378225939922901.


Best trial: 12. Best value: 0.751874:  26%|██▌       | 13/50 [05:01<17:38, 28.61s/it]

[I 2025-10-07 23:19:19,503] Trial 12 finished with value: 0.7518740256365078 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 1.63042364442979, 'epsilon': 0.004752743559448874, 'gamma_choice': 'scale'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  28%|██▊       | 14/50 [05:28<16:57, 28.26s/it]

[I 2025-10-07 23:19:46,940] Trial 13 finished with value: 0.6129484633225577 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.9949212568013882, 'epsilon': 0.003871734461200115, 'gamma_choice': 'auto'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  30%|███       | 15/50 [05:51<15:34, 26.70s/it]

[I 2025-10-07 23:20:10,040] Trial 14 finished with value: -0.0986304956982984 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.0010365882841199294, 'epsilon': 0.006508315888807574, 'gamma_choice': 'numeric', 'gamma': 9.589496316829043}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  32%|███▏      | 16/50 [06:11<13:57, 24.64s/it]

[I 2025-10-07 23:20:29,887] Trial 15 finished with value: 0.7513902985143985 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 1.177537376848737, 'epsilon': 0.10375152588918894, 'gamma_choice': 'scale'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  34%|███▍      | 17/50 [06:26<11:55, 21.69s/it]

[I 2025-10-07 23:20:44,734] Trial 16 finished with value: 0.7490205730392088 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 1.0661425667487487, 'epsilon': 0.20413242603586088, 'gamma_choice': 'scale'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  36%|███▌      | 18/50 [06:45<11:06, 20.83s/it]

[I 2025-10-07 23:21:03,543] Trial 17 finished with value: -4.2633201699397585 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 0.15485906040459443, 'epsilon': 0.12375805329333134, 'gamma_choice': 'scale'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  38%|███▊      | 19/50 [06:55<09:01, 17.48s/it]

[I 2025-10-07 23:21:13,223] Trial 18 finished with value: 0.7430729016093919 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 2.8805719957297673, 'epsilon': 0.5008842310999057, 'gamma_choice': 'scale'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  40%|████      | 20/50 [07:15<09:14, 18.48s/it]

[I 2025-10-07 23:21:34,047] Trial 19 finished with value: 0.4384175155475277 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.024499227197231344, 'epsilon': 0.01364110851874755, 'gamma_choice': 'scale'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 12. Best value: 0.751874:  42%|████▏     | 21/50 [07:34<08:57, 18.52s/it]

[I 2025-10-07 23:21:52,645] Trial 20 finished with value: -26.23616410156321 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 0.28280049624411735, 'epsilon': 0.23145988722476363, 'gamma_choice': 'scale'}. Best is trial 12 with value: 0.7518740256365078.


Best trial: 21. Best value: 0.755531:  44%|████▍     | 22/50 [07:54<08:52, 19.03s/it]

[I 2025-10-07 23:22:12,863] Trial 21 finished with value: 0.7555308555060253 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 3.0444233154083693, 'epsilon': 0.20083304628553855, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  46%|████▌     | 23/50 [08:13<08:31, 18.94s/it]

[I 2025-10-07 23:22:31,600] Trial 22 finished with value: 0.7479804186524334 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 3.9706745789294584, 'epsilon': 0.07514849918295878, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  48%|████▊     | 24/50 [08:34<08:30, 19.64s/it]

[I 2025-10-07 23:22:52,855] Trial 23 finished with value: 0.7031685432416219 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.33134006051200154, 'epsilon': 0.005836898483603137, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  50%|█████     | 25/50 [08:47<07:18, 17.55s/it]

[I 2025-10-07 23:23:05,544] Trial 24 finished with value: 0.7500644132439641 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 3.986386913719145, 'epsilon': 0.34505938062047603, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  52%|█████▏    | 26/50 [09:12<07:53, 19.72s/it]

[I 2025-10-07 23:23:30,336] Trial 25 finished with value: -0.029429671509376386 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.5041927614349265, 'epsilon': 0.0706812269723218, 'gamma_choice': 'numeric', 'gamma': 1.6671490490283707e-05}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  54%|█████▍    | 27/50 [09:28<07:07, 18.59s/it]

[I 2025-10-07 23:23:46,294] Trial 26 finished with value: 0.7403613361142657 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 6.226825749050434, 'epsilon': 0.14274731628051981, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  56%|█████▌    | 28/50 [09:50<07:12, 19.64s/it]

[I 2025-10-07 23:24:08,383] Trial 27 finished with value: 0.7005206118621402 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 95.39165063170135, 'epsilon': 0.002670188627409326, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  58%|█████▊    | 29/50 [10:10<06:55, 19.78s/it]

[I 2025-10-07 23:24:28,488] Trial 28 finished with value: 0.5067963399223612 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.036350571122337895, 'epsilon': 0.03234888764767035, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  60%|██████    | 30/50 [10:21<05:43, 17.19s/it]

[I 2025-10-07 23:24:39,621] Trial 29 finished with value: -46.746472288410885 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 0.9276737799738549, 'epsilon': 0.9648508637202076, 'gamma_choice': 'numeric', 'gamma': 0.010663122114636726}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  62%|██████▏   | 31/50 [11:04<07:55, 25.02s/it]

[I 2025-10-07 23:25:22,881] Trial 30 finished with value: 0.6794762004680541 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 1.9218594429422686, 'epsilon': 0.0074857397534662, 'gamma_choice': 'scale', 'degree': 5, 'coef0': 0.9915713658516269}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  64%|██████▍   | 32/50 [11:23<06:56, 23.13s/it]

[I 2025-10-07 23:25:41,643] Trial 31 finished with value: 0.7359871100543877 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 7.790068588321997, 'epsilon': 0.46874486931001696, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  66%|██████▌   | 33/50 [11:45<06:28, 22.85s/it]

[I 2025-10-07 23:26:03,840] Trial 32 finished with value: 0.7540290107301385 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 3.292377748710696, 'epsilon': 0.2880661492791071, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  68%|██████▊   | 34/50 [12:15<06:38, 24.88s/it]

[I 2025-10-07 23:26:33,405] Trial 33 finished with value: 0.721320492497582 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.46135266334791636, 'epsilon': 0.12223231909242538, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  70%|███████   | 35/50 [12:36<05:54, 23.65s/it]

[I 2025-10-07 23:26:54,222] Trial 34 finished with value: 0.7326229246584547 and parameters: {'use_pca': True, 'pca_n_components': 297, 'kernel': 'rbf', 'C': 32.96581462800826, 'epsilon': 0.32069737692305933, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  72%|███████▏  | 36/50 [13:20<06:58, 29.91s/it]

[I 2025-10-07 23:27:38,745] Trial 35 finished with value: -0.04949965868022828 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 8.888746876261703, 'epsilon': 0.06498503591839852, 'gamma_choice': 'numeric', 'gamma': 4.085607752893736, 'degree': 2, 'coef0': 0.9784882571487401}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  74%|███████▍  | 37/50 [13:32<05:20, 24.67s/it]

[I 2025-10-07 23:27:51,187] Trial 36 finished with value: 0.7491241306065467 and parameters: {'use_pca': True, 'pca_n_components': 130, 'kernel': 'rbf', 'C': 2.559316708488883, 'epsilon': 0.015342542797157289, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  76%|███████▌  | 38/50 [14:01<05:10, 25.86s/it]

[I 2025-10-07 23:28:19,809] Trial 37 finished with value: 0.409405625325443 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 0.04869145451094986, 'epsilon': 0.00033647179170393025, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  78%|███████▊  | 39/50 [14:19<04:19, 23.56s/it]

[I 2025-10-07 23:28:38,017] Trial 38 finished with value: 0.6991035993678218 and parameters: {'use_pca': True, 'pca_n_components': 15, 'kernel': 'rbf', 'C': 51.77007773617777, 'epsilon': 0.04295513447359211, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  80%|████████  | 40/50 [15:08<05:11, 31.13s/it]

[I 2025-10-07 23:29:26,733] Trial 39 finished with value: 0.6726130001710324 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 14.797760358321392, 'epsilon': 0.0007481982458178334, 'gamma_choice': 'scale', 'degree': 5, 'coef0': 0.48915559038386464}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  82%|████████▏ | 41/50 [15:28<04:09, 27.77s/it]

[I 2025-10-07 23:29:46,752] Trial 40 finished with value: 0.2340973960243356 and parameters: {'use_pca': True, 'pca_n_components': 199, 'kernel': 'rbf', 'C': 0.12096882695619934, 'epsilon': 0.021710052140552754, 'gamma_choice': 'numeric', 'gamma': 0.0005386398862232089}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  84%|████████▍ | 42/50 [15:46<03:18, 24.80s/it]

[I 2025-10-07 23:30:04,622] Trial 41 finished with value: 0.7459314252297249 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 5.433943763003666, 'epsilon': 0.30555982467907766, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  86%|████████▌ | 43/50 [15:54<02:18, 19.79s/it]

[I 2025-10-07 23:30:12,736] Trial 42 finished with value: 0.6747812813122933 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 3.6611901195199055, 'epsilon': 0.8810178001887914, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 21. Best value: 0.755531:  88%|████████▊ | 44/50 [16:10<01:51, 18.58s/it]

[I 2025-10-07 23:30:28,489] Trial 43 finished with value: 0.7347677315054832 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.7820329788511083, 'epsilon': 0.3835965490380577, 'gamma_choice': 'scale'}. Best is trial 21 with value: 0.7555308555060253.


Best trial: 44. Best value: 0.757186:  90%|█████████ | 45/50 [16:30<01:35, 19.12s/it]

[I 2025-10-07 23:30:48,864] Trial 44 finished with value: 0.7571860637964287 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 1.9741986621252687, 'epsilon': 0.1683259141820452, 'gamma_choice': 'scale'}. Best is trial 44 with value: 0.7571860637964287.


Best trial: 44. Best value: 0.757186:  92%|█████████▏| 46/50 [16:50<01:16, 19.22s/it]

[I 2025-10-07 23:31:08,316] Trial 45 finished with value: 0.6349616718452181 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 1.6147854874086374, 'epsilon': 0.19294330460068684, 'gamma_choice': 'auto'}. Best is trial 44 with value: 0.7571860637964287.


Best trial: 44. Best value: 0.757186:  94%|█████████▍| 47/50 [17:17<01:04, 21.57s/it]

[I 2025-10-07 23:31:35,370] Trial 46 finished with value: 0.7169529748152069 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 26.643713478825156, 'epsilon': 0.10174074336050064, 'gamma_choice': 'scale'}. Best is trial 44 with value: 0.7571860637964287.


Best trial: 44. Best value: 0.757186:  96%|█████████▌| 48/50 [17:43<00:46, 23.03s/it]

[I 2025-10-07 23:32:01,796] Trial 47 finished with value: 0.7332095555513304 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 0.2284913475018331, 'epsilon': 0.04977444945057876, 'gamma_choice': 'scale', 'degree': 5, 'coef0': 0.7761012983832327}. Best is trial 44 with value: 0.7571860637964287.


Best trial: 44. Best value: 0.757186:  98%|█████████▊| 49/50 [17:55<00:19, 19.61s/it]

[I 2025-10-07 23:32:13,430] Trial 48 finished with value: 0.5805236940837432 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.6335018392059615, 'epsilon': 0.633813780310462, 'gamma_choice': 'auto'}. Best is trial 44 with value: 0.7571860637964287.


Best trial: 44. Best value: 0.757186: 100%|██████████| 50/50 [18:03<00:00, 21.67s/it]


[I 2025-10-07 23:32:21,672] Trial 49 finished with value: -107012818.3257521 and parameters: {'use_pca': True, 'pca_n_components': 131, 'kernel': 'sigmoid', 'C': 359.18102877306717, 'epsilon': 0.17463835870650532, 'gamma_choice': 'scale'}. Best is trial 44 with value: 0.7571860637964287.
SVM best R2: 0.7571860637964287
SVM best params: {'use_pca': False, 'kernel': 'rbf', 'C': 1.9741986621252687, 'epsilon': 0.1683259141820452, 'gamma_choice': 'scale'}


In [21]:
knn_study, knn_pipe = run_optuna_knn(n_trials=50)

[I 2025-10-07 23:32:29,196] A new study created in memory with name: no-name-33fa74a9-fd5b-4639-8d61-bb0c272c51c7
Best trial: 0. Best value: 0.717428:   2%|▏         | 1/50 [00:03<02:33,  3.14s/it]

[I 2025-10-07 23:32:32,334] Trial 0 finished with value: 0.7174278616172077 and parameters: {'use_pca': True, 'pca_n_components': 223, 'n_neighbors': 5, 'weights': 'distance', 'p': 2}. Best is trial 0 with value: 0.7174278616172077.


Best trial: 0. Best value: 0.717428:   4%|▍         | 2/50 [00:04<01:32,  1.93s/it]

[I 2025-10-07 23:32:33,417] Trial 1 finished with value: 0.6827277421159136 and parameters: {'use_pca': False, 'n_neighbors': 29, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 0.7174278616172077.


Best trial: 2. Best value: 0.722721:   6%|▌         | 3/50 [00:06<01:40,  2.14s/it]

[I 2025-10-07 23:32:35,809] Trial 2 finished with value: 0.7227212274928296 and parameters: {'use_pca': True, 'pca_n_components': 110, 'n_neighbors': 13, 'weights': 'uniform', 'p': 1}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:   8%|▊         | 4/50 [00:12<02:48,  3.67s/it]

[I 2025-10-07 23:32:41,826] Trial 3 finished with value: 0.7177789464739528 and parameters: {'use_pca': False, 'n_neighbors': 13, 'weights': 'uniform', 'p': 1}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:  10%|█         | 5/50 [00:14<02:13,  2.97s/it]

[I 2025-10-07 23:32:43,546] Trial 4 finished with value: 0.6943074799236003 and parameters: {'use_pca': True, 'pca_n_components': 202, 'n_neighbors': 25, 'weights': 'uniform', 'p': 2}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:  12%|█▏        | 6/50 [00:16<01:58,  2.69s/it]

[I 2025-10-07 23:32:45,693] Trial 5 finished with value: 0.6448732558886936 and parameters: {'use_pca': True, 'pca_n_components': 111, 'n_neighbors': 1, 'weights': 'distance', 'p': 1}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:  14%|█▍        | 7/50 [00:22<02:36,  3.65s/it]

[I 2025-10-07 23:32:51,324] Trial 6 finished with value: 0.6997922935876764 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'distance', 'p': 1}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:  16%|█▌        | 8/50 [00:27<02:59,  4.27s/it]

[I 2025-10-07 23:32:56,932] Trial 7 finished with value: 0.6944614980717609 and parameters: {'use_pca': False, 'n_neighbors': 14, 'weights': 'distance', 'p': 1}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:  18%|█▊        | 9/50 [00:33<03:12,  4.69s/it]

[I 2025-10-07 23:33:02,525] Trial 8 finished with value: 0.6914801803344633 and parameters: {'use_pca': False, 'n_neighbors': 17, 'weights': 'distance', 'p': 1}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:  20%|██        | 10/50 [00:33<02:17,  3.44s/it]

[I 2025-10-07 23:33:03,174] Trial 9 finished with value: 0.7140275421324149 and parameters: {'use_pca': False, 'n_neighbors': 14, 'weights': 'uniform', 'p': 2}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 2. Best value: 0.722721:  22%|██▏       | 11/50 [00:34<01:41,  2.60s/it]

[I 2025-10-07 23:33:03,856] Trial 10 finished with value: 0.6647931716842591 and parameters: {'use_pca': True, 'pca_n_components': 12, 'n_neighbors': 22, 'weights': 'uniform', 'p': 1}. Best is trial 2 with value: 0.7227212274928296.


Best trial: 11. Best value: 0.730001:  24%|██▍       | 12/50 [00:36<01:29,  2.36s/it]

[I 2025-10-07 23:33:05,687] Trial 11 finished with value: 0.7300014996305897 and parameters: {'use_pca': True, 'pca_n_components': 92, 'n_neighbors': 10, 'weights': 'uniform', 'p': 1}. Best is trial 11 with value: 0.7300014996305897.


Best trial: 12. Best value: 0.734998:  26%|██▌       | 13/50 [00:38<01:18,  2.13s/it]

[I 2025-10-07 23:33:07,285] Trial 12 finished with value: 0.7349979882159416 and parameters: {'use_pca': True, 'pca_n_components': 80, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 12 with value: 0.7349979882159416.


Best trial: 13. Best value: 0.735711:  28%|██▊       | 14/50 [00:39<01:07,  1.87s/it]

[I 2025-10-07 23:33:08,558] Trial 13 finished with value: 0.7357107974247529 and parameters: {'use_pca': True, 'pca_n_components': 53, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7357107974247529.


Best trial: 13. Best value: 0.735711:  30%|███       | 15/50 [00:40<00:54,  1.56s/it]

[I 2025-10-07 23:33:09,379] Trial 14 finished with value: 0.7267211331899694 and parameters: {'use_pca': True, 'pca_n_components': 18, 'n_neighbors': 7, 'weights': 'uniform', 'p': 1}. Best is trial 13 with value: 0.7357107974247529.


Best trial: 15. Best value: 0.736617:  32%|███▏      | 16/50 [00:41<00:50,  1.49s/it]

[I 2025-10-07 23:33:10,733] Trial 15 finished with value: 0.7366171878957575 and parameters: {'use_pca': True, 'pca_n_components': 70, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 15 with value: 0.7366171878957575.


Best trial: 15. Best value: 0.736617:  34%|███▍      | 17/50 [00:46<01:24,  2.56s/it]

[I 2025-10-07 23:33:15,783] Trial 16 finished with value: 0.699310870818799 and parameters: {'use_pca': True, 'pca_n_components': 278, 'n_neighbors': 19, 'weights': 'uniform', 'p': 1}. Best is trial 15 with value: 0.7366171878957575.


Best trial: 15. Best value: 0.736617:  36%|███▌      | 18/50 [00:47<01:04,  2.01s/it]

[I 2025-10-07 23:33:16,508] Trial 17 finished with value: 0.7330844368530973 and parameters: {'use_pca': True, 'pca_n_components': 58, 'n_neighbors': 10, 'weights': 'uniform', 'p': 2}. Best is trial 15 with value: 0.7366171878957575.


Best trial: 18. Best value: 0.742748:  38%|███▊      | 19/50 [00:50<01:09,  2.26s/it]

[I 2025-10-07 23:33:19,338] Trial 18 finished with value: 0.742747634970383 and parameters: {'use_pca': True, 'pca_n_components': 157, 'n_neighbors': 5, 'weights': 'uniform', 'p': 1}. Best is trial 18 with value: 0.742747634970383.


Best trial: 18. Best value: 0.742748:  40%|████      | 20/50 [00:52<01:10,  2.36s/it]

[I 2025-10-07 23:33:21,945] Trial 19 finished with value: 0.6436722785427013 and parameters: {'use_pca': True, 'pca_n_components': 154, 'n_neighbors': 1, 'weights': 'uniform', 'p': 1}. Best is trial 18 with value: 0.742747634970383.


Best trial: 18. Best value: 0.742748:  42%|████▏     | 21/50 [00:54<01:00,  2.10s/it]

[I 2025-10-07 23:33:23,443] Trial 20 finished with value: 0.7424438413852227 and parameters: {'use_pca': True, 'pca_n_components': 171, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 18 with value: 0.742747634970383.


Best trial: 21. Best value: 0.743319:  44%|████▍     | 22/50 [00:55<00:51,  1.83s/it]

[I 2025-10-07 23:33:24,645] Trial 21 finished with value: 0.7433186359426733 and parameters: {'use_pca': True, 'pca_n_components': 165, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  46%|████▌     | 23/50 [00:56<00:44,  1.65s/it]

[I 2025-10-07 23:33:25,882] Trial 22 finished with value: 0.7423398068530525 and parameters: {'use_pca': True, 'pca_n_components': 163, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  48%|████▊     | 24/50 [00:57<00:40,  1.54s/it]

[I 2025-10-07 23:33:27,160] Trial 23 finished with value: 0.7427658600230294 and parameters: {'use_pca': True, 'pca_n_components': 164, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  50%|█████     | 25/50 [00:59<00:35,  1.43s/it]

[I 2025-10-07 23:33:28,339] Trial 24 finished with value: 0.7160915207908541 and parameters: {'use_pca': True, 'pca_n_components': 135, 'n_neighbors': 2, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  52%|█████▏    | 26/50 [01:00<00:35,  1.48s/it]

[I 2025-10-07 23:33:29,946] Trial 25 finished with value: 0.7394667145771349 and parameters: {'use_pca': True, 'pca_n_components': 204, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  54%|█████▍    | 27/50 [01:02<00:37,  1.61s/it]

[I 2025-10-07 23:33:31,856] Trial 26 finished with value: 0.7188544621305808 and parameters: {'use_pca': True, 'pca_n_components': 245, 'n_neighbors': 11, 'weights': 'distance', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  56%|█████▌    | 28/50 [01:04<00:35,  1.61s/it]

[I 2025-10-07 23:33:33,475] Trial 27 finished with value: 0.7412708426966462 and parameters: {'use_pca': True, 'pca_n_components': 182, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  58%|█████▊    | 29/50 [01:05<00:31,  1.49s/it]

[I 2025-10-07 23:33:34,661] Trial 28 finished with value: 0.7331225565901921 and parameters: {'use_pca': True, 'pca_n_components': 125, 'n_neighbors': 3, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  60%|██████    | 30/50 [01:06<00:28,  1.45s/it]

[I 2025-10-07 23:33:36,027] Trial 29 finished with value: 0.7261723225982148 and parameters: {'use_pca': True, 'pca_n_components': 136, 'n_neighbors': 11, 'weights': 'distance', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  62%|██████▏   | 31/50 [01:08<00:28,  1.50s/it]

[I 2025-10-07 23:33:37,647] Trial 30 finished with value: 0.7423092153552522 and parameters: {'use_pca': True, 'pca_n_components': 188, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  64%|██████▍   | 32/50 [01:09<00:26,  1.46s/it]

[I 2025-10-07 23:33:39,012] Trial 31 finished with value: 0.7424573742681665 and parameters: {'use_pca': True, 'pca_n_components': 164, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  66%|██████▌   | 33/50 [01:11<00:23,  1.41s/it]

[I 2025-10-07 23:33:40,305] Trial 32 finished with value: 0.7367259062734647 and parameters: {'use_pca': True, 'pca_n_components': 150, 'n_neighbors': 7, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  68%|██████▊   | 34/50 [01:12<00:24,  1.50s/it]

[I 2025-10-07 23:33:42,024] Trial 33 finished with value: 0.6859299840093893 and parameters: {'use_pca': True, 'pca_n_components': 225, 'n_neighbors': 29, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  70%|███████   | 35/50 [01:13<00:19,  1.27s/it]

[I 2025-10-07 23:33:42,760] Trial 34 finished with value: 0.7273253670746336 and parameters: {'use_pca': False, 'n_neighbors': 3, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  72%|███████▏  | 36/50 [01:15<00:19,  1.41s/it]

[I 2025-10-07 23:33:44,483] Trial 35 finished with value: 0.7333718877925903 and parameters: {'use_pca': True, 'pca_n_components': 173, 'n_neighbors': 9, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  74%|███████▍  | 37/50 [01:17<00:21,  1.62s/it]

[I 2025-10-07 23:33:46,588] Trial 36 finished with value: 0.6513768490320841 and parameters: {'use_pca': True, 'pca_n_components': 199, 'n_neighbors': 1, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  76%|███████▌  | 38/50 [01:19<00:21,  1.77s/it]

[I 2025-10-07 23:33:48,705] Trial 37 finished with value: 0.7394669498058783 and parameters: {'use_pca': True, 'pca_n_components': 234, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  78%|███████▊  | 39/50 [01:20<00:16,  1.52s/it]

[I 2025-10-07 23:33:49,648] Trial 38 finished with value: 0.6991309680702751 and parameters: {'use_pca': False, 'n_neighbors': 4, 'weights': 'distance', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  80%|████████  | 40/50 [01:21<00:14,  1.47s/it]

[I 2025-10-07 23:33:51,009] Trial 39 finished with value: 0.709663580809772 and parameters: {'use_pca': True, 'pca_n_components': 145, 'n_neighbors': 18, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  82%|████████▏ | 41/50 [01:22<00:11,  1.27s/it]

[I 2025-10-07 23:33:51,819] Trial 40 finished with value: 0.6890644469027347 and parameters: {'use_pca': False, 'n_neighbors': 12, 'weights': 'distance', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  84%|████████▍ | 42/50 [01:24<00:10,  1.36s/it]

[I 2025-10-07 23:33:53,382] Trial 41 finished with value: 0.7410769981962728 and parameters: {'use_pca': True, 'pca_n_components': 169, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  86%|████████▌ | 43/50 [01:25<00:10,  1.49s/it]

[I 2025-10-07 23:33:55,159] Trial 42 finished with value: 0.6942861154358595 and parameters: {'use_pca': True, 'pca_n_components': 167, 'n_neighbors': 26, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  88%|████████▊ | 44/50 [01:27<00:08,  1.38s/it]

[I 2025-10-07 23:33:56,281] Trial 43 finished with value: 0.7337741979210305 and parameters: {'use_pca': True, 'pca_n_components': 117, 'n_neighbors': 3, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  90%|█████████ | 45/50 [01:28<00:07,  1.43s/it]

[I 2025-10-07 23:33:57,841] Trial 44 finished with value: 0.7395087661178968 and parameters: {'use_pca': True, 'pca_n_components': 209, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  92%|█████████▏| 46/50 [01:29<00:05,  1.30s/it]

[I 2025-10-07 23:33:58,824] Trial 45 finished with value: 0.7181756950765227 and parameters: {'use_pca': True, 'pca_n_components': 100, 'n_neighbors': 2, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  94%|█████████▍| 47/50 [01:30<00:03,  1.09s/it]

[I 2025-10-07 23:33:59,417] Trial 46 finished with value: 0.7312804673210919 and parameters: {'use_pca': False, 'n_neighbors': 7, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 21. Best value: 0.743319:  96%|█████████▌| 48/50 [01:33<00:03,  1.76s/it]

[I 2025-10-07 23:34:02,767] Trial 47 finished with value: 0.730641930064712 and parameters: {'use_pca': True, 'pca_n_components': 189, 'n_neighbors': 9, 'weights': 'uniform', 'p': 1}. Best is trial 21 with value: 0.7433186359426733.


Best trial: 48. Best value: 0.743577:  98%|█████████▊| 49/50 [01:34<00:01,  1.55s/it]

[I 2025-10-07 23:34:03,826] Trial 48 finished with value: 0.7435774151106533 and parameters: {'use_pca': True, 'pca_n_components': 137, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 48 with value: 0.7435774151106533.


Best trial: 48. Best value: 0.743577: 100%|██████████| 50/50 [01:37<00:00,  1.94s/it]


[I 2025-10-07 23:34:06,435] Trial 49 finished with value: 0.7181139387977344 and parameters: {'use_pca': True, 'pca_n_components': 138, 'n_neighbors': 13, 'weights': 'distance', 'p': 1}. Best is trial 48 with value: 0.7435774151106533.
KNN best R2: 0.7435774151106533
KNN best params: {'use_pca': True, 'pca_n_components': 137, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}


In [22]:
rf_study, rf_pipe = run_optuna_rf(n_trials=50)

[I 2025-10-07 23:34:06,654] A new study created in memory with name: no-name-c7539331-b983-41dd-801b-01f971345083
Best trial: 0. Best value: 0.705659:   2%|▏         | 1/50 [00:24<19:54, 24.37s/it]

[I 2025-10-07 23:34:31,021] Trial 0 finished with value: 0.7056591119187683 and parameters: {'n_estimators': 357, 'max_depth': 24, 'min_samples_split': 16, 'min_samples_leaf': 13, 'max_features': 0.3}. Best is trial 0 with value: 0.7056591119187683.


Best trial: 1. Best value: 0.716178:   4%|▍         | 2/50 [00:26<08:56, 11.18s/it]

[I 2025-10-07 23:34:32,973] Trial 1 finished with value: 0.7161779591510615 and parameters: {'n_estimators': 123, 'max_depth': 45, 'min_samples_split': 13, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 1 with value: 0.7161779591510615.


Best trial: 1. Best value: 0.716178:   6%|▌         | 3/50 [00:51<13:44, 17.54s/it]

[I 2025-10-07 23:34:58,087] Trial 2 finished with value: 0.7081816275943615 and parameters: {'n_estimators': 221, 'max_depth': 50, 'min_samples_split': 18, 'min_samples_leaf': 13, 'max_features': 0.5}. Best is trial 1 with value: 0.7161779591510615.


Best trial: 1. Best value: 0.716178:   8%|▊         | 4/50 [01:10<13:58, 18.22s/it]

[I 2025-10-07 23:35:17,337] Trial 3 finished with value: 0.6921981945858124 and parameters: {'n_estimators': 311, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 16, 'max_features': 0.3}. Best is trial 1 with value: 0.7161779591510615.


Best trial: 1. Best value: 0.716178:  10%|█         | 5/50 [01:15<10:02, 13.40s/it]

[I 2025-10-07 23:35:22,188] Trial 4 finished with value: 0.601613424754515 and parameters: {'n_estimators': 448, 'max_depth': 9, 'min_samples_split': 18, 'min_samples_leaf': 12, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7161779591510615.


Best trial: 1. Best value: 0.716178:  12%|█▏        | 6/50 [01:17<07:04,  9.65s/it]

[I 2025-10-07 23:35:24,561] Trial 5 finished with value: 0.5524334442179315 and parameters: {'n_estimators': 306, 'max_depth': 33, 'min_samples_split': 5, 'min_samples_leaf': 18, 'max_features': 'log2'}. Best is trial 1 with value: 0.7161779591510615.


Best trial: 6. Best value: 0.752726:  14%|█▍        | 7/50 [02:26<20:38, 28.79s/it]

[I 2025-10-07 23:36:32,765] Trial 6 finished with value: 0.7527257093037459 and parameters: {'n_estimators': 378, 'max_depth': 28, 'min_samples_split': 13, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  16%|█▌        | 8/50 [02:29<14:24, 20.59s/it]

[I 2025-10-07 23:36:35,786] Trial 7 finished with value: 0.6592143850574833 and parameters: {'n_estimators': 180, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  18%|█▊        | 9/50 [03:04<17:10, 25.14s/it]

[I 2025-10-07 23:37:10,937] Trial 8 finished with value: 0.7514552762641278 and parameters: {'n_estimators': 170, 'max_depth': 33, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  20%|██        | 10/50 [03:07<12:19, 18.50s/it]

[I 2025-10-07 23:37:14,554] Trial 9 finished with value: 0.5811806116790487 and parameters: {'n_estimators': 294, 'max_depth': 21, 'min_samples_split': 4, 'min_samples_leaf': 13, 'max_features': 'log2'}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  22%|██▏       | 11/50 [03:48<16:19, 25.11s/it]

[I 2025-10-07 23:37:54,652] Trial 10 finished with value: 0.6065194869362278 and parameters: {'n_estimators': 481, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 7, 'max_features': 0.8}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  24%|██▍       | 12/50 [04:46<22:22, 35.34s/it]

[I 2025-10-07 23:38:53,389] Trial 11 finished with value: 0.7386819380765955 and parameters: {'n_estimators': 403, 'max_depth': 31, 'min_samples_split': 8, 'min_samples_leaf': 7, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  26%|██▌       | 13/50 [05:23<21:57, 35.62s/it]

[I 2025-10-07 23:39:29,658] Trial 12 finished with value: 0.7411344791751271 and parameters: {'n_estimators': 226, 'max_depth': 35, 'min_samples_split': 14, 'min_samples_leaf': 6, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  28%|██▊       | 14/50 [05:42<18:24, 30.67s/it]

[I 2025-10-07 23:39:48,898] Trial 13 finished with value: 0.7509769269433432 and parameters: {'n_estimators': 111, 'max_depth': 26, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  30%|███       | 15/50 [06:47<24:03, 41.23s/it]

[I 2025-10-07 23:40:54,595] Trial 14 finished with value: 0.7424913175673478 and parameters: {'n_estimators': 391, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  32%|███▏      | 16/50 [08:16<31:28, 55.56s/it]

[I 2025-10-07 23:42:23,421] Trial 15 finished with value: 0.7478509354640765 and parameters: {'n_estimators': 256, 'max_depth': 39, 'min_samples_split': 13, 'min_samples_leaf': 1, 'max_features': 0.8}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  34%|███▍      | 17/50 [08:20<22:00, 40.01s/it]

[I 2025-10-07 23:42:27,289] Trial 16 finished with value: 0.6775627741819686 and parameters: {'n_estimators': 165, 'max_depth': 29, 'min_samples_split': 11, 'min_samples_leaf': 9, 'max_features': 'sqrt'}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  36%|███▌      | 18/50 [09:07<22:24, 42.02s/it]

[I 2025-10-07 23:43:13,961] Trial 17 finished with value: 0.7277648673248116 and parameters: {'n_estimators': 356, 'max_depth': 14, 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  38%|███▊      | 19/50 [10:16<25:56, 50.22s/it]

[I 2025-10-07 23:44:23,308] Trial 18 finished with value: 0.728257642177338 and parameters: {'n_estimators': 416, 'max_depth': 39, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  40%|████      | 20/50 [11:34<29:17, 58.60s/it]

[I 2025-10-07 23:45:41,415] Trial 19 finished with value: 0.7466744669181171 and parameters: {'n_estimators': 349, 'max_depth': 24, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  42%|████▏     | 21/50 [12:05<24:15, 50.18s/it]

[I 2025-10-07 23:46:11,986] Trial 20 finished with value: 0.7470761015919367 and parameters: {'n_estimators': 247, 'max_depth': 44, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  44%|████▍     | 22/50 [12:29<19:43, 42.26s/it]

[I 2025-10-07 23:46:35,783] Trial 21 finished with value: 0.7501654204982752 and parameters: {'n_estimators': 112, 'max_depth': 28, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  46%|████▌     | 23/50 [13:03<17:53, 39.77s/it]

[I 2025-10-07 23:47:09,724] Trial 22 finished with value: 0.7510382331266795 and parameters: {'n_estimators': 156, 'max_depth': 25, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  48%|████▊     | 24/50 [13:38<16:37, 38.35s/it]

[I 2025-10-07 23:47:44,787] Trial 23 finished with value: 0.7523771026321009 and parameters: {'n_estimators': 165, 'max_depth': 34, 'min_samples_split': 11, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  50%|█████     | 25/50 [14:09<15:07, 36.31s/it]

[I 2025-10-07 23:48:16,327] Trial 24 finished with value: 0.7456663472366518 and parameters: {'n_estimators': 194, 'max_depth': 35, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  52%|█████▏    | 26/50 [14:47<14:43, 36.83s/it]

[I 2025-10-07 23:48:54,380] Trial 25 finished with value: 0.7442022576835073 and parameters: {'n_estimators': 141, 'max_depth': 35, 'min_samples_split': 11, 'min_samples_leaf': 3, 'max_features': 0.8}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  54%|█████▍    | 27/50 [14:52<10:27, 27.27s/it]

[I 2025-10-07 23:48:59,327] Trial 26 finished with value: 0.6873665479302091 and parameters: {'n_estimators': 271, 'max_depth': 43, 'min_samples_split': 16, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  56%|█████▌    | 28/50 [15:26<10:42, 29.20s/it]

[I 2025-10-07 23:49:33,054] Trial 27 finished with value: 0.7422112598263154 and parameters: {'n_estimators': 198, 'max_depth': 30, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  58%|█████▊    | 29/50 [16:15<12:16, 35.05s/it]

[I 2025-10-07 23:50:21,752] Trial 28 finished with value: 0.6813217036636552 and parameters: {'n_estimators': 450, 'max_depth': 21, 'min_samples_split': 12, 'min_samples_leaf': 20, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  60%|██████    | 30/50 [16:44<11:08, 33.43s/it]

[I 2025-10-07 23:50:51,392] Trial 29 finished with value: 0.7141239435318002 and parameters: {'n_estimators': 330, 'max_depth': 21, 'min_samples_split': 15, 'min_samples_leaf': 11, 'max_features': 0.3}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  62%|██████▏   | 31/50 [17:23<11:06, 35.09s/it]

[I 2025-10-07 23:51:30,368] Trial 30 finished with value: 0.7007530287255506 and parameters: {'n_estimators': 373, 'max_depth': 37, 'min_samples_split': 10, 'min_samples_leaf': 15, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  64%|██████▍   | 32/50 [17:50<09:44, 32.50s/it]

[I 2025-10-07 23:51:56,803] Trial 31 finished with value: 0.7513794547612228 and parameters: {'n_estimators': 164, 'max_depth': 26, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  66%|██████▌   | 33/50 [18:13<08:25, 29.76s/it]

[I 2025-10-07 23:52:20,186] Trial 32 finished with value: 0.7520690103230699 and parameters: {'n_estimators': 141, 'max_depth': 32, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  68%|██████▊   | 34/50 [18:33<07:09, 26.87s/it]

[I 2025-10-07 23:52:40,319] Trial 33 finished with value: 0.7512246130434239 and parameters: {'n_estimators': 131, 'max_depth': 32, 'min_samples_split': 13, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 6. Best value: 0.752726:  70%|███████   | 35/50 [19:11<07:34, 30.29s/it]

[I 2025-10-07 23:53:18,582] Trial 34 finished with value: 0.7509320833592825 and parameters: {'n_estimators': 210, 'max_depth': 49, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 0.5}. Best is trial 6 with value: 0.7527257093037459.


Best trial: 35. Best value: 0.75506:  72%|███████▏  | 36/50 [19:28<06:06, 26.16s/it]

[I 2025-10-07 23:53:35,099] Trial 35 finished with value: 0.7550601667384464 and parameters: {'n_estimators': 141, 'max_depth': 41, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  74%|███████▍  | 37/50 [19:49<05:21, 24.72s/it]

[I 2025-10-07 23:53:56,423] Trial 36 finished with value: 0.7482257443922202 and parameters: {'n_estimators': 141, 'max_depth': 41, 'min_samples_split': 17, 'min_samples_leaf': 3, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  76%|███████▌  | 38/50 [20:13<04:52, 24.38s/it]

[I 2025-10-07 23:54:20,020] Trial 37 finished with value: 0.7454829194339728 and parameters: {'n_estimators': 108, 'max_depth': 48, 'min_samples_split': 12, 'min_samples_leaf': 5, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  78%|███████▊  | 39/50 [20:47<05:00, 27.29s/it]

[I 2025-10-07 23:54:54,109] Trial 38 finished with value: 0.7542429028630795 and parameters: {'n_estimators': 223, 'max_depth': 46, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  80%|████████  | 40/50 [21:14<04:31, 27.17s/it]

[I 2025-10-07 23:55:20,988] Trial 39 finished with value: 0.7510291797006302 and parameters: {'n_estimators': 230, 'max_depth': 47, 'min_samples_split': 14, 'min_samples_leaf': 3, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  82%|████████▏ | 41/50 [21:50<04:27, 29.78s/it]

[I 2025-10-07 23:55:56,861] Trial 40 finished with value: 0.7544197741184735 and parameters: {'n_estimators': 280, 'max_depth': 46, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  84%|████████▍ | 42/50 [22:23<04:07, 30.92s/it]

[I 2025-10-07 23:56:30,461] Trial 41 finished with value: 0.7543717723040156 and parameters: {'n_estimators': 288, 'max_depth': 46, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  86%|████████▌ | 43/50 [22:55<03:37, 31.11s/it]

[I 2025-10-07 23:57:02,017] Trial 42 finished with value: 0.7543275619185765 and parameters: {'n_estimators': 290, 'max_depth': 46, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  88%|████████▊ | 44/50 [23:27<03:08, 31.34s/it]

[I 2025-10-07 23:57:33,904] Trial 43 finished with value: 0.7473135847396898 and parameters: {'n_estimators': 283, 'max_depth': 46, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  90%|█████████ | 45/50 [24:02<02:42, 32.47s/it]

[I 2025-10-07 23:58:09,005] Trial 44 finished with value: 0.7517671085284474 and parameters: {'n_estimators': 302, 'max_depth': 50, 'min_samples_split': 18, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  92%|█████████▏| 46/50 [24:30<02:05, 31.27s/it]

[I 2025-10-07 23:58:37,472] Trial 45 finished with value: 0.7416445789618088 and parameters: {'n_estimators': 320, 'max_depth': 42, 'min_samples_split': 14, 'min_samples_leaf': 6, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  94%|█████████▍| 47/50 [24:57<01:29, 29.93s/it]

[I 2025-10-07 23:59:04,284] Trial 46 finished with value: 0.7535094433356967 and parameters: {'n_estimators': 264, 'max_depth': 45, 'min_samples_split': 16, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  96%|█████████▌| 48/50 [25:25<00:58, 29.44s/it]

[I 2025-10-07 23:59:32,585] Trial 47 finished with value: 0.7485247544960054 and parameters: {'n_estimators': 240, 'max_depth': 41, 'min_samples_split': 17, 'min_samples_leaf': 3, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506:  98%|█████████▊| 49/50 [26:00<00:31, 31.03s/it]

[I 2025-10-08 00:00:07,314] Trial 48 finished with value: 0.7462443462985651 and parameters: {'n_estimators': 333, 'max_depth': 46, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 0.3}. Best is trial 35 with value: 0.7550601667384464.


Best trial: 35. Best value: 0.75506: 100%|██████████| 50/50 [26:05<00:00, 31.31s/it]


[I 2025-10-08 00:00:12,194] Trial 49 finished with value: 0.7137117673114867 and parameters: {'n_estimators': 289, 'max_depth': 39, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 35 with value: 0.7550601667384464.
RF best R2: 0.7550601667384464
RF best params: {'n_estimators': 141, 'max_depth': 41, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 0.3}


# Try a meta learner or ensemble of the best models

In [23]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

# reproducible folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 1) Get OOF predictions for each base model (these preds are made by models
#    that were trained without the corresponding sample — no leakage)
print("Generating OOF preds (this may take time)...")
oof_knn = cross_val_predict(knn_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_svm = cross_val_predict(svm_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_rf  = cross_val_predict(rf_pipe,  X, y, cv=kf, n_jobs=-1, method='predict')

# Stack OOF predictions (n_samples x n_models)
stack_oof = np.vstack([oof_knn, oof_svm, oof_rf]).T

# 2) Simple average ensemble
ens_mean = stack_oof.mean(axis=1)
r2_mean = r2_score(y, ens_mean)
rmse_mean = np.sqrt(mean_squared_error(y, ens_mean))
print(f"Simple average ensemble -> R2: {r2_mean:.4f}, RMSE: {rmse_mean:.4f}")

Generating OOF preds (this may take time)...
Simple average ensemble -> R2: 0.7710, RMSE: 0.6512


In [24]:
# 3) Stacking: train a Ridge meta-learner on the OOF stack
meta = Ridge(alpha=1.0)
meta.fit(stack_oof, y)                 # training on OOF preds is OK (no leakage)
ens_stack = meta.predict(stack_oof)    # predictions on the same OOF matrix
r2_stack = r2_score(y, ens_stack)
rmse_stack = np.sqrt(mean_squared_error(y, ens_stack))
print(f"Stacking (Ridge) ensemble -> R2: {r2_stack:.4f}, RMSE: {rmse_stack:.4f}")

# Show meta-learner weights
print("Meta-learner coefficients (weights):", meta.coef_)
print("Meta-learner intercept:", meta.intercept_)

Stacking (Ridge) ensemble -> R2: 0.7715, RMSE: 0.6505
Meta-learner coefficients (weights): [0.32157995 0.36259839 0.33838847]
Meta-learner intercept: -0.1357436691919176


In [25]:
joblib.dump(meta, "../models/ii_meta.joblib")
print("Saved final base pipelines and meta-learner.")

Saved final base pipelines and meta-learner.
