# Import Libraries and Tools and Create Helper Functions

In [1]:
# Requirements: pandas, numpy, scikit-learn, xgboost, joblib, matplotlib
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Models for scaled data
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Tree based models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
integrase = pd.read_csv('../data/ii_df_clean.csv')
integrase.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_value,pchembl_value,assay_chembl_id,assay_description,target_chembl_id,target_pref_name,target_organism,standard_units_norm,IC50_nM,pIC50
0,CHEMBL304722,O=C(CC(O)CCc1ccc(O)c(O)c1)O[C@H]1Cc2cc(O)c(O)c...,IC50,1400.0,5.85,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1400.0,5.853872
1,CHEMBL304722,O=C(CC(O)CCc1ccc(O)c(O)c1)O[C@H]1Cc2cc(O)c(O)c...,IC50,1000.0,6.0,CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1000.0,6.0
2,CHEMBL67076,O=C(CCc1ccc(O)c(O)c1)c1ccc(O)c(O)c1O,IC50,1700.0,5.77,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1700.0,5.769551
3,CHEMBL67076,O=C(CCc1ccc(O)c(O)c1)c1ccc(O)c(O)c1O,IC50,1000.0,6.0,CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,1000.0,6.0
4,CHEMBL177126,O=C(/C=C/c1ccc(O)c(O)c1)O[C@H]1[C@H](O)C[C@](O...,IC50,250.0,6.6,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,nM,250.0,6.60206


# 5. QSAR Model for integrase inhibitor pIC50 value

We have appended the morgan fingerprints and molecular descriptors into a single dataframe

In [4]:
df = pd.read_csv('../data/ii_qsar_features.csv')
print("rows, cols:", df.shape)

rows, cols: (6386, 523)


In [5]:
df.head()

Unnamed: 0,MolWt,MolLogP,MolMR,TPSA,NumHDonors,NumHAcceptors,NumRotatableBonds,NumAromaticRings,HeavyAtomCount,FractionCSP3,...,Morgan_512_503,Morgan_512_504,Morgan_512_505,Morgan_512_506,Morgan_512_507,Morgan_512_508,Morgan_512_509,Morgan_512_510,Morgan_512_511,pIC50
0,498.484,2.892,125.2126,177.14,7,10,7,3,36,0.269231,...,0,1,0,0,1,0,0,0,0,5.853872
1,498.484,2.892,125.2126,177.14,7,10,7,3,36,0.269231,...,0,1,0,0,1,0,0,0,0,6.0
2,290.271,2.0301,73.8595,118.22,5,6,4,2,21,0.133333,...,0,0,0,0,0,0,0,0,0,5.769551
3,290.271,2.0301,73.8595,118.22,5,6,4,2,21,0.133333,...,0,0,0,0,0,0,0,0,0,6.0
4,516.455,1.0296,125.1976,211.28,7,11,7,2,37,0.24,...,0,0,1,0,1,0,0,0,0,6.60206


In [6]:
# Prepare X and y
y = df['pIC50']
X = df.drop(columns=['pIC50'])
print('Feature count: ', X.shape[1])

Feature count:  522


In [7]:
y.isna().sum()

np.int64(0)

In [8]:
# ------------------------------------------------------------------------------
# Quick feature cleanup
# ------------------------------------------------------------------------------

# a. remove near-constant features

# Fit the selector
vt = VarianceThreshold(threshold=1e-6)
X_v = vt.fit_transform(X)
# Features kept
kept_features = X.columns[vt.get_support()].tolist()
# Features removed
removed_features = [col for col in X.columns if col not in kept_features]
# Create the reduced DataFrame
X = pd.DataFrame(X_v, columns=kept_features)

print(f"Features after variance filter: {len(kept_features)}")
print(f"Features removed: {len(removed_features)}")
# Optional: display or save the removed features
print("\nRemoved features (first 20):")
print(removed_features[:20])

Features after variance filter: 522
Features removed: 0

Removed features (first 20):
[]


In [9]:
# b. (optional) remove extremely collinear features (simple correlation filter)

corr_thresh = 0.98
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > corr_thresh)]
if to_drop:
    X.drop(columns=to_drop, inplace=True)
    print("Dropped highly correlated features:", to_drop)

Dropped highly correlated features: ['MolMR', 'HeavyAtomCount']


In [10]:
X.shape

(6386, 520)

In [11]:
# ------------------------------------------------------------------------------
# Define descriptor columns to scale and fingerprint columns to pass through
# ------------------------------------------------------------------------------
descriptors = ['MolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors',
    'NumRotatableBonds', 'NumAromaticRings', 'FractionCSP3']
fingerprints = [c for c in X.columns if c not in descriptors]

# ColumnTransformer: scale descriptors, pass fingerprints through
preprocessor = ColumnTransformer([
    ("desc", StandardScaler(), descriptors),
    ("fp", "passthrough", fingerprints)
])

In [12]:
# ---------------------------
# k-fold OOF trainer
# ---------------------------
def kfold_train_predict(model, X_df, y_arr, n_splits=5, random_state=42):
    """
    model: estimator or pipeline (should accept fit/predict)
    X_df: pandas DataFrame (full dataset)
    y_arr: numpy array or Series
    returns: oof_preds (np.array same length as X_df), mean_r2, mean_rmse
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(X_df))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_df, y_arr), 1):
        X_train, X_val = X_df.iloc[train_idx], X_df.iloc[val_idx]
        y_train, y_val = y_arr[train_idx], y_arr[val_idx]

        # fit on train fold, predict on val fold
        model.fit(X_train, y_train)
        preds = model.predict(X_val)

        oof_preds[val_idx] = preds

        fold_r2 = r2_score(y_val, preds)
        fold_rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_scores.append((fold_r2, fold_rmse))
        print(f"Fold {fold}: R2 = {fold_r2:.3f}, RMSE = {fold_rmse:.3f}")

    mean_r2 = np.mean([s[0] for s in fold_scores])
    mean_rmse = np.mean([s[1] for s in fold_scores])
    print(f"\nMean CV R2 = {mean_r2:.3f}, Mean RMSE = {mean_rmse:.3f}")
    return oof_preds, mean_r2, mean_rmse

In [13]:
# ---------------------------
# Define models and pipelines
# ---------------------------
models = {}

# Linear models (need scaling) -> include preprocessor
models['Linear'] = Pipeline([("preproc", preprocessor), ("model", LinearRegression())])
models['ElasticNet'] = Pipeline([("preproc", preprocessor), ("model", ElasticNet(random_state=42, max_iter=5000))])

# SVR (needs scaling)
models['SVM'] = Pipeline([("preproc", preprocessor), ("model", SVR())])

# KNN
models['KNN'] = Pipeline([("preproc", preprocessor), ("model", KNeighborsRegressor())])

# MLP
models['MLP'] = Pipeline([("preproc", preprocessor), ("model", MLPRegressor(max_iter=2000, random_state=42))])

# Random Forest (trees don't need scaling, but pipeline is fine)
models['Random Forest'] = Pipeline([("preproc", preprocessor), ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])

# XGBoost
models['XGBoost'] = Pipeline([("preproc", preprocessor), ("model", XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1, objective='reg:squarederror'))])

In [14]:
# ---------------------------
# Run k-fold for each model
# ---------------------------
results = {}
oof_dict = {}

for name, model in models.items():
    print(f"\n=== Training model: {name} ===")
    oof_preds, mean_r2, mean_rmse = kfold_train_predict(model, X, y, n_splits=5, random_state=42)
    results[name] = {"r2": mean_r2, "rmse": mean_rmse}
    oof_dict[name] = oof_preds

# Summary
print("\nSummary (mean CV results):")
for name, res in results.items():
    print(f"{name}: R2 = {res['r2']:.4f}, RMSE = {res['rmse']:.4f}")


=== Training model: Linear ===
Fold 1: R2 = 0.632, RMSE = 0.811
Fold 2: R2 = 0.650, RMSE = 0.818
Fold 3: R2 = 0.631, RMSE = 0.838
Fold 4: R2 = 0.618, RMSE = 0.836
Fold 5: R2 = 0.653, RMSE = 0.797

Mean CV R2 = 0.637, Mean RMSE = 0.820

=== Training model: ElasticNet ===
Fold 1: R2 = -0.001, RMSE = 1.337
Fold 2: R2 = -0.001, RMSE = 1.382
Fold 3: R2 = -0.001, RMSE = 1.380
Fold 4: R2 = -0.000, RMSE = 1.352
Fold 5: R2 = -0.000, RMSE = 1.352

Mean CV R2 = -0.001, Mean RMSE = 1.361

=== Training model: SVM ===
Fold 1: R2 = 0.732, RMSE = 0.692
Fold 2: R2 = 0.763, RMSE = 0.672
Fold 3: R2 = 0.745, RMSE = 0.697
Fold 4: R2 = 0.739, RMSE = 0.691
Fold 5: R2 = 0.761, RMSE = 0.661

Mean CV R2 = 0.748, Mean RMSE = 0.683

=== Training model: KNN ===
Fold 1: R2 = 0.709, RMSE = 0.721
Fold 2: R2 = 0.752, RMSE = 0.688
Fold 3: R2 = 0.740, RMSE = 0.703
Fold 4: R2 = 0.715, RMSE = 0.721
Fold 5: R2 = 0.763, RMSE = 0.658

Mean CV R2 = 0.736, Mean RMSE = 0.698

=== Training model: MLP ===
Fold 1: R2 = 0.643, RMS

# 6. Tune the Best Models

### Support Vector Machine (SVR),
### K-Nearest Neighbours (KNN),
### Random Forest (RF)

In [15]:
import optuna
import joblib
from sklearn.decomposition import PCA

In [16]:
# PCA variant of the model
n_fp = len(fingerprints)
print(f"Using {len(descriptors)} descriptor columns and {n_fp} fingerprint columns.")

if n_fp == 0:
    raise ValueError("No fingerprint columns detected. Make sure your X contains fingerprint bit columns.")

# ---------- CV settings ----------
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)

Using 8 descriptor columns and 512 fingerprint columns.


In [17]:
# ----------------------- SVM ---------------------------------
def run_optuna_svm(n_trials=80):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        # SVR hyperparameters (log-uniform sampling where appropriate)
        kernel = trial.suggest_categorical("kernel", ["rbf", "poly", "sigmoid"])
        C = trial.suggest_loguniform("C", 1e-3, 1e3)
        epsilon = trial.suggest_loguniform("epsilon", 1e-4, 1.0)
        gamma_choice = trial.suggest_categorical("gamma_choice", ["scale", "auto", "numeric"])
        if gamma_choice == "numeric":
            gamma = trial.suggest_loguniform("gamma", 1e-5, 1e1)
        else:
            gamma = gamma_choice
        # degree and coef0 for polynomial kernel
        if kernel == "poly":
            degree = trial.suggest_int("degree", 2, 5)
            coef0 = trial.suggest_float("coef0", 0.0, 1.0)
        else:
            degree = 3
            coef0 = 0.0

        svr = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, degree=degree, coef0=coef0, max_iter=100000)
        pipe = Pipeline([("preproc", preprocessor), ("svr", svr)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    # reconstruct SVR
    best_kernel = best.get("kernel", "rbf")
    best_gamma = best.get("gamma", best.get("gamma_choice", "scale"))
    if best_gamma == "numeric":
        best_gamma = best.get("gamma")
    best_degree = best.get("degree", 3)
    best_coef0 = best.get("coef0", 0.0)

    best_svm = SVR(
        kernel=best_kernel,
        C=best.get("C"),
        epsilon=best.get("epsilon"),
        gamma=best_gamma,
        degree=best_degree,
        coef0=best_coef0,
        max_iter=100000
    )
    best_pipe = Pipeline([("preproc", preprocessor_best), ("svr", best_svm)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/ii_svm.joblib")
    print("SVM best R2:", study.best_value)
    print("SVM best params:", study.best_params)
    return study, best_pipe

In [18]:
# ------------------ KNN -----------------------
def run_optuna_knn(n_trials=50):
    def objective(trial):
        use_pca = trial.suggest_categorical("use_pca", [True, False])
        if use_pca:
            max_comp = min(n_fp, 300)
            n_comp = trial.suggest_int("pca_n_components", 10, max(10, max_comp))
            fp_transform = Pipeline([("pca", PCA(n_components=n_comp, svd_solver="randomized", random_state=42))])
        else:
            fp_transform = "passthrough"

        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", fp_transform, fingerprints)
        ])

        n_neighbors = trial.suggest_int("n_neighbors", 1, 30)
        weights = trial.suggest_categorical("weights", ["uniform", "distance"])
        p = trial.suggest_int("p", 1, 2)  # 1 = manhattan, 2 = euclidean

        knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=p)
        pipe = Pipeline([("preproc", preprocessor), ("knn", knn)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    # Rebuild best pipeline and fit on full data
    best = study.best_params
    if best.get("use_pca", False):
        fp_transform = Pipeline([("pca", PCA(n_components=best["pca_n_components"], random_state=42))])
    else:
        fp_transform = "passthrough"
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", fp_transform, fingerprints)
    ])
    best_knn = KNeighborsRegressor(n_neighbors=best["n_neighbors"], weights=best["weights"], p=best["p"])
    best_pipe = Pipeline([("preproc", preprocessor_best), ("knn", best_knn)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/ii_knn.joblib")
    print("KNN best R2:", study.best_value)
    print("KNN best params:", study.best_params)
    return study, best_pipe

In [19]:
# -------------------- Random Forest --------------------------------
def run_optuna_rf(n_trials=50):
    def objective(trial):
        preprocessor = ColumnTransformer([
            ("desc", StandardScaler(), descriptors),
            ("fp", "passthrough", fingerprints)
        ])

        n_estimators = trial.suggest_int("n_estimators", 100, 500)
        max_depth = trial.suggest_int("max_depth", 3, 50)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
        max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", 0.3, 0.5, 0.8])

        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                   max_features=max_features, n_jobs=-1, random_state=42)
        pipe = Pipeline([("preproc", preprocessor), ("rf", rf)])
        scores = cross_val_score(pipe, X, y, cv=cv_outer, scoring="r2", n_jobs=-1)
        return float(np.mean(scores))

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=n_trials, n_jobs=1, show_progress_bar=True)

    best = study.best_params
    preprocessor_best = ColumnTransformer([
        ("desc", StandardScaler(), descriptors),
        ("fp", "passthrough", fingerprints)
    ])
    best_rf = RandomForestRegressor(n_estimators=best["n_estimators"], max_depth=best["max_depth"],
                                    min_samples_split=best["min_samples_split"],
                                    min_samples_leaf=best["min_samples_leaf"],
                                    max_features=best["max_features"], n_jobs=-1, random_state=42)
    best_pipe = Pipeline([("preproc", preprocessor_best), ("rf", best_rf)])
    best_pipe.fit(X, y)
    joblib.dump(best_pipe, "../models/ii_rf.joblib")
    print("RF best R2:", study.best_value)
    print("RF best params:", study.best_params)
    return study, best_pipe

In [20]:
# ----------------------------
# Example: run all studies sequentially (adjust n_trials)
# ----------------------------
svm_study, svm_pipe = run_optuna_svm(n_trials=50)

[I 2025-10-07 09:33:14,474] A new study created in memory with name: no-name-4be66ad6-de0b-4fc1-939b-018d2ecae2e8
Best trial: 0. Best value: 0.493822:   2%|▏         | 1/50 [00:27<22:18, 27.31s/it]

[I 2025-10-07 09:33:41,783] Trial 0 finished with value: 0.49382176062160116 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 0.28486218977786587, 'epsilon': 0.045020786810284145, 'gamma_choice': 'auto'}. Best is trial 0 with value: 0.49382176062160116.


Best trial: 0. Best value: 0.493822:   4%|▍         | 2/50 [00:34<12:32, 15.68s/it]

[I 2025-10-07 09:33:49,311] Trial 1 finished with value: -0.09013309286166415 and parameters: {'use_pca': True, 'pca_n_components': 57, 'kernel': 'sigmoid', 'C': 0.10713066514866337, 'epsilon': 0.2007601635921036, 'gamma_choice': 'numeric', 'gamma': 1.0690461365612158e-05}. Best is trial 0 with value: 0.49382176062160116.


Best trial: 2. Best value: 0.742249:   6%|▌         | 3/50 [01:07<18:18, 23.37s/it]

[I 2025-10-07 09:34:21,826] Trial 2 finished with value: 0.7422494446083656 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 7.307734766708703, 'epsilon': 0.00014012184811066261, 'gamma_choice': 'numeric', 'gamma': 0.0060199338871944254, 'degree': 3, 'coef0': 0.40782778574134493}. Best is trial 2 with value: 0.7422494446083656.


Best trial: 2. Best value: 0.742249:   8%|▊         | 4/50 [01:24<16:07, 21.03s/it]

[I 2025-10-07 09:34:39,284] Trial 3 finished with value: -0.05914526631916508 and parameters: {'use_pca': True, 'pca_n_components': 210, 'kernel': 'sigmoid', 'C': 0.0021107991365853754, 'epsilon': 0.053891673054272504, 'gamma_choice': 'auto'}. Best is trial 2 with value: 0.7422494446083656.


Best trial: 2. Best value: 0.742249:  10%|█         | 5/50 [01:41<14:34, 19.44s/it]

[I 2025-10-07 09:34:55,890] Trial 4 finished with value: 0.6797676604126618 and parameters: {'use_pca': True, 'pca_n_components': 124, 'kernel': 'rbf', 'C': 0.7865352624569372, 'epsilon': 0.0001087447477402026, 'gamma_choice': 'numeric', 'gamma': 0.011789977679857507}. Best is trial 2 with value: 0.7422494446083656.


Best trial: 2. Best value: 0.742249:  12%|█▏        | 6/50 [01:59<14:01, 19.12s/it]

[I 2025-10-07 09:35:14,397] Trial 5 finished with value: 0.6207745233552616 and parameters: {'use_pca': True, 'pca_n_components': 211, 'kernel': 'poly', 'C': 0.07472821282057776, 'epsilon': 0.0028172267652687333, 'gamma_choice': 'scale', 'degree': 5, 'coef0': 0.27853242269971157}. Best is trial 2 with value: 0.7422494446083656.


Best trial: 6. Best value: 0.746953:  14%|█▍        | 7/50 [02:40<18:38, 26.02s/it]

[I 2025-10-07 09:35:54,632] Trial 6 finished with value: 0.7469527676836697 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 4.9541472024773485, 'epsilon': 0.0006504373594656819, 'gamma_choice': 'numeric', 'gamma': 0.014252148546912242}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  16%|█▌        | 8/50 [05:52<55:19, 79.04s/it]

[I 2025-10-07 09:39:07,203] Trial 7 finished with value: 0.46227702629380163 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.11686650100244611, 'epsilon': 0.1016190359128714, 'gamma_choice': 'auto'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  18%|█▊        | 9/50 [06:03<39:20, 57.57s/it]

[I 2025-10-07 09:39:17,543] Trial 8 finished with value: -1651.3676270884032 and parameters: {'use_pca': True, 'pca_n_components': 138, 'kernel': 'sigmoid', 'C': 1.444343477751153, 'epsilon': 0.00033433742123634586, 'gamma_choice': 'scale'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  20%|██        | 10/50 [06:09<27:56, 41.91s/it]

[I 2025-10-07 09:39:24,404] Trial 9 finished with value: 0.20925176673209772 and parameters: {'use_pca': True, 'pca_n_components': 189, 'kernel': 'sigmoid', 'C': 0.004083397034448919, 'epsilon': 0.789966418482619, 'gamma_choice': 'scale'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  22%|██▏       | 11/50 [06:32<23:29, 36.14s/it]

[I 2025-10-07 09:39:47,444] Trial 10 finished with value: 0.27164812139083677 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 908.1125840234736, 'epsilon': 0.0021985272716996316, 'gamma_choice': 'numeric', 'gamma': 7.551874508965239}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  24%|██▍       | 12/50 [07:02<21:34, 34.07s/it]

[I 2025-10-07 09:40:16,805] Trial 11 finished with value: 0.7228849157527124 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 26.43199088240462, 'epsilon': 0.0005925733389821275, 'gamma_choice': 'numeric', 'gamma': 0.009038899184979436, 'degree': 2, 'coef0': 0.8287627110687377}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  26%|██▌       | 13/50 [07:26<19:13, 31.17s/it]

[I 2025-10-07 09:40:41,278] Trial 12 finished with value: 0.7438868806297217 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 16.689811030759028, 'epsilon': 0.00012230119399942848, 'gamma_choice': 'numeric', 'gamma': 0.008973539550924602, 'degree': 3, 'coef0': 0.09486838398706005}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  28%|██▊       | 14/50 [08:04<19:49, 33.04s/it]

[I 2025-10-07 09:41:18,637] Trial 13 finished with value: -2.12656306688306 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 84.61842593563819, 'epsilon': 0.002453088162900407, 'gamma_choice': 'numeric', 'gamma': 0.06824090338459224, 'degree': 4, 'coef0': 0.020475514079013657}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  30%|███       | 15/50 [08:29<17:50, 30.57s/it]

[I 2025-10-07 09:41:43,501] Trial 14 finished with value: 0.6376072867896921 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 182.9040492883249, 'epsilon': 0.0006913987142878796, 'gamma_choice': 'numeric', 'gamma': 0.00017986114219928042}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  32%|███▏      | 16/50 [08:57<16:59, 29.99s/it]

[I 2025-10-07 09:42:12,140] Trial 15 finished with value: 0.6716165475466511 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 4.326271066276375, 'epsilon': 0.010690758344782041, 'gamma_choice': 'numeric', 'gamma': 0.2973554238164173, 'degree': 2, 'coef0': 0.03235707925908447}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  34%|███▍      | 17/50 [09:20<15:18, 27.83s/it]

[I 2025-10-07 09:42:34,958] Trial 16 finished with value: 0.6424090356809132 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 18.3308049050975, 'epsilon': 0.010762825626315082, 'gamma_choice': 'numeric', 'gamma': 0.0005505740464759103}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  36%|███▌      | 18/50 [09:53<15:43, 29.49s/it]

[I 2025-10-07 09:43:08,318] Trial 17 finished with value: 0.6031993045041806 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 0.02017683324113047, 'epsilon': 0.0002962096652362387, 'gamma_choice': 'numeric', 'gamma': 0.4483532119259336, 'degree': 4, 'coef0': 0.7773204898331425}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  38%|███▊      | 19/50 [10:16<14:10, 27.44s/it]

[I 2025-10-07 09:43:30,981] Trial 18 finished with value: 0.6405675579892124 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 2.124757499422458, 'epsilon': 0.0009907985911237055, 'gamma_choice': 'auto'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  40%|████      | 20/50 [10:51<14:48, 29.63s/it]

[I 2025-10-07 09:44:05,691] Trial 19 finished with value: 0.6997336109496998 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 168.79463195028376, 'epsilon': 0.00021715813834806942, 'gamma_choice': 'scale'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  42%|████▏     | 21/50 [11:25<14:58, 30.98s/it]

[I 2025-10-07 09:44:39,827] Trial 20 finished with value: 0.6615842403474081 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 27.268513841763916, 'epsilon': 0.006216884708703227, 'gamma_choice': 'numeric', 'gamma': 0.0007745302753293189, 'degree': 3, 'coef0': 0.6045843934624218}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  44%|████▍     | 22/50 [12:02<15:17, 32.77s/it]

[I 2025-10-07 09:45:16,759] Trial 21 finished with value: 0.7446222331886881 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 8.473613643910948, 'epsilon': 0.00015001318435132772, 'gamma_choice': 'numeric', 'gamma': 0.00630004887893221, 'degree': 3, 'coef0': 0.369964727735586}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  46%|████▌     | 23/50 [1:50:16<13:26:18, 1791.80s/it]

[I 2025-10-07 11:23:31,386] Trial 22 finished with value: 0.6947558174958411 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 6.70242098996058, 'epsilon': 0.00010667535263028312, 'gamma_choice': 'numeric', 'gamma': 0.03744909233288536, 'degree': 3, 'coef0': 0.24388421794080653}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  48%|████▊     | 24/50 [3:01:45<18:21:02, 2540.85s/it]

[I 2025-10-07 12:34:59,527] Trial 23 finished with value: 0.4060876098850212 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 0.5522415763898777, 'epsilon': 0.00041576313715833356, 'gamma_choice': 'numeric', 'gamma': 0.002124401821308686, 'degree': 3, 'coef0': 0.21993719628675668}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  50%|█████     | 25/50 [3:03:01<12:30:39, 1801.57s/it]

[I 2025-10-07 12:36:16,411] Trial 24 finished with value: -2313.217835433061 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 59.803268756748, 'epsilon': 0.0011131417163638834, 'gamma_choice': 'numeric', 'gamma': 0.10524513125133106, 'degree': 4, 'coef0': 0.560626576858636}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  52%|█████▏    | 26/50 [3:03:44<8:29:34, 1273.92s/it] 

[I 2025-10-07 12:36:59,265] Trial 25 finished with value: 0.6316188079781335 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 997.7559734386393, 'epsilon': 0.0002549605389343964, 'gamma_choice': 'numeric', 'gamma': 7.099484909328733e-05, 'degree': 2, 'coef0': 0.41501275474893723}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  54%|█████▍    | 27/50 [3:04:26<5:46:36, 904.18s/it] 

[I 2025-10-07 12:37:40,863] Trial 26 finished with value: 0.6687383166400303 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 2.977923570020278, 'epsilon': 0.0011696672520945506, 'gamma_choice': 'numeric', 'gamma': 0.002675239422600834}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  56%|█████▌    | 28/50 [3:05:11<3:57:02, 646.48s/it]

[I 2025-10-07 12:38:26,081] Trial 27 finished with value: 0.6894404479917722 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 9.98961723925264, 'epsilon': 0.0001866907457249561, 'gamma_choice': 'numeric', 'gamma': 0.02148218074596243, 'degree': 5, 'coef0': 0.1968798364158245}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  58%|█████▊    | 29/50 [3:05:42<2:41:38, 461.82s/it]

[I 2025-10-07 12:38:57,073] Trial 28 finished with value: 0.6942152754750402 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 60.12386040437709, 'epsilon': 0.00044124703003925735, 'gamma_choice': 'scale', 'degree': 3, 'coef0': 0.11066433853803909}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  60%|██████    | 30/50 [3:06:12<1:50:45, 332.27s/it]

[I 2025-10-07 12:39:27,078] Trial 29 finished with value: 0.5543948119693213 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.3186617694121348, 'epsilon': 0.005314196311495404, 'gamma_choice': 'auto'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  62%|██████▏   | 31/50 [3:06:46<1:16:53, 242.79s/it]

[I 2025-10-07 12:40:01,085] Trial 30 finished with value: -4847.839577919121 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 318.0172802162471, 'epsilon': 0.00010363553533842419, 'gamma_choice': 'auto'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  64%|██████▍   | 32/50 [3:07:17<53:44, 179.14s/it]  

[I 2025-10-07 12:40:31,717] Trial 31 finished with value: 0.7268901865283782 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 12.232727445989378, 'epsilon': 0.00018490296226978268, 'gamma_choice': 'numeric', 'gamma': 0.003498195526698459, 'degree': 3, 'coef0': 0.39668429847406933}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  66%|██████▌   | 33/50 [3:07:45<37:56, 133.92s/it]

[I 2025-10-07 12:41:00,109] Trial 32 finished with value: 0.7459210071183294 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 5.924172225317784, 'epsilon': 0.00015844225109461327, 'gamma_choice': 'numeric', 'gamma': 0.007651903127585358, 'degree': 3, 'coef0': 0.3994956257553618}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  68%|██████▊   | 34/50 [3:08:19<27:42, 103.89s/it]

[I 2025-10-07 12:41:33,923] Trial 33 finished with value: 0.6956343429158517 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 4.195116623322836, 'epsilon': 0.0007127770291715195, 'gamma_choice': 'numeric', 'gamma': 0.019379568581212356, 'degree': 4, 'coef0': 0.652487675436423}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  70%|███████   | 35/50 [3:08:39<19:39, 78.62s/it] 

[I 2025-10-07 12:41:53,572] Trial 34 finished with value: 0.5509187980022578 and parameters: {'use_pca': True, 'pca_n_components': 299, 'kernel': 'poly', 'C': 1.553242236923031, 'epsilon': 0.022307725374209993, 'gamma_choice': 'numeric', 'gamma': 0.0009840923497803395, 'degree': 2, 'coef0': 0.3594720316066907}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  72%|███████▏  | 36/50 [3:09:07<14:50, 63.63s/it]

[I 2025-10-07 12:42:22,224] Trial 35 finished with value: -5265820.513636751 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 24.7421933399277, 'epsilon': 0.00016513621612692346, 'gamma_choice': 'numeric', 'gamma': 0.10245834598690734}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  74%|███████▍  | 37/50 [3:09:13<10:01, 46.23s/it]

[I 2025-10-07 12:42:27,873] Trial 36 finished with value: 0.5080108132345293 and parameters: {'use_pca': True, 'pca_n_components': 13, 'kernel': 'poly', 'C': 0.7816536005434915, 'epsilon': 0.0016152559647951883, 'gamma_choice': 'numeric', 'gamma': 0.006684779041624965, 'degree': 3, 'coef0': 0.9708641166382118}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  76%|███████▌  | 38/50 [3:10:01<09:20, 46.69s/it]

[I 2025-10-07 12:43:15,630] Trial 37 finished with value: 0.2480869084600698 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 10.454800516274608, 'epsilon': 0.0004085326731619523, 'gamma_choice': 'numeric', 'gamma': 0.27893443249280786, 'degree': 3, 'coef0': 0.5305062167312641}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  78%|███████▊  | 39/50 [3:10:29<07:34, 41.28s/it]

[I 2025-10-07 12:43:44,298] Trial 38 finished with value: 0.5262925645995322 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 0.20879725365805252, 'epsilon': 0.00010266321077185371, 'gamma_choice': 'auto'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  80%|████████  | 40/50 [3:10:49<05:48, 34.83s/it]

[I 2025-10-07 12:44:04,074] Trial 39 finished with value: -17938.702991730293 and parameters: {'use_pca': True, 'pca_n_components': 300, 'kernel': 'sigmoid', 'C': 5.577308752142624, 'epsilon': 0.0002595630411093102, 'gamma_choice': 'scale'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  82%|████████▏ | 41/50 [3:10:58<04:04, 27.12s/it]

[I 2025-10-07 12:44:13,200] Trial 40 finished with value: 0.6966659763963874 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 50.52914146558253, 'epsilon': 0.735290626435479, 'gamma_choice': 'numeric', 'gamma': 0.03432500678222642, 'degree': 4, 'coef0': 0.14047857104419323}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  84%|████████▍ | 42/50 [3:11:24<03:34, 26.85s/it]

[I 2025-10-07 12:44:39,424] Trial 41 finished with value: 0.7005095271521039 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 2.758905317174222, 'epsilon': 0.00015687990958853523, 'gamma_choice': 'numeric', 'gamma': 0.005171626866126463, 'degree': 3, 'coef0': 0.32117715684723624}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  86%|████████▌ | 43/50 [3:11:50<03:04, 26.34s/it]

[I 2025-10-07 12:45:04,577] Trial 42 finished with value: 0.5761038699496724 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 0.9661416111591902, 'epsilon': 0.0001476445497757162, 'gamma_choice': 'numeric', 'gamma': 0.0018862586902640827, 'degree': 3, 'coef0': 0.45809168175526893}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  88%|████████▊ | 44/50 [3:12:03<02:14, 22.45s/it]

[I 2025-10-07 12:45:17,957] Trial 43 finished with value: 0.7306313780494598 and parameters: {'use_pca': True, 'pca_n_components': 86, 'kernel': 'poly', 'C': 15.649238755534942, 'epsilon': 0.00045353136710233596, 'gamma_choice': 'numeric', 'gamma': 0.009906736943008735, 'degree': 3, 'coef0': 0.4814211874874596}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  90%|█████████ | 45/50 [3:12:28<01:56, 23.24s/it]

[I 2025-10-07 12:45:43,046] Trial 44 finished with value: 0.5442721910923254 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 6.1089819717146705, 'epsilon': 0.0002974608275239559, 'gamma_choice': 'numeric', 'gamma': 0.00025077111622090763, 'degree': 2, 'coef0': 0.3252443754786307}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  92%|█████████▏| 46/50 [3:13:00<01:43, 25.94s/it]

[I 2025-10-07 12:46:15,270] Trial 45 finished with value: 0.7025753804402638 and parameters: {'use_pca': False, 'kernel': 'rbf', 'C': 29.00973170951448, 'epsilon': 0.0007238523432232988, 'gamma_choice': 'numeric', 'gamma': 0.01697518657810917}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  94%|█████████▍| 47/50 [3:13:26<01:17, 25.94s/it]

[I 2025-10-07 12:46:41,203] Trial 46 finished with value: -1302.9676044568314 and parameters: {'use_pca': False, 'kernel': 'sigmoid', 'C': 1.645883026017646, 'epsilon': 0.16729541276495447, 'gamma_choice': 'scale'}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  96%|█████████▌| 48/50 [3:13:51<00:51, 25.67s/it]

[I 2025-10-07 12:47:06,260] Trial 47 finished with value: -0.10124178786702749 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 0.0013072188154196274, 'epsilon': 0.00015319974748938053, 'gamma_choice': 'numeric', 'gamma': 1.572535914418811e-05, 'degree': 3, 'coef0': 0.1276495506427531}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953:  98%|█████████▊| 49/50 [3:14:11<00:23, 23.98s/it]

[I 2025-10-07 12:47:26,298] Trial 48 finished with value: 0.7296441638857482 and parameters: {'use_pca': True, 'pca_n_components': 252, 'kernel': 'rbf', 'C': 8.238447167620915, 'epsilon': 0.00026546079529375373, 'gamma_choice': 'numeric', 'gamma': 0.0048164592244755}. Best is trial 6 with value: 0.7469527676836697.


Best trial: 6. Best value: 0.746953: 100%|██████████| 50/50 [3:14:42<00:00, 233.66s/it]


[I 2025-10-07 12:47:57,408] Trial 49 finished with value: 0.7395079647832297 and parameters: {'use_pca': False, 'kernel': 'poly', 'C': 146.66374254220705, 'epsilon': 0.004043180249250977, 'gamma_choice': 'auto', 'degree': 3, 'coef0': 0.4395039717926551}. Best is trial 6 with value: 0.7469527676836697.
SVM best R2: 0.7469527676836697
SVM best params: {'use_pca': False, 'kernel': 'rbf', 'C': 4.9541472024773485, 'epsilon': 0.0006504373594656819, 'gamma_choice': 'numeric', 'gamma': 0.014252148546912242}


In [21]:
knn_study, knn_pipe = run_optuna_knn(n_trials=50)

[I 2025-10-07 12:48:17,487] A new study created in memory with name: no-name-67de0f85-d0f7-4ba6-9ddc-d107611b51c1


Best trial: 0. Best value: 0.681186:   2%|▏         | 1/50 [00:08<07:08,  8.74s/it]

[I 2025-10-07 12:48:26,229] Trial 0 finished with value: 0.6811862836458603 and parameters: {'use_pca': False, 'n_neighbors': 28, 'weights': 'distance', 'p': 1}. Best is trial 0 with value: 0.6811862836458603.


Best trial: 1. Best value: 0.714801:   4%|▍         | 2/50 [00:10<03:43,  4.65s/it]

[I 2025-10-07 12:48:28,018] Trial 1 finished with value: 0.7148013855644735 and parameters: {'use_pca': True, 'pca_n_components': 27, 'n_neighbors': 15, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 0.7148013855644735.


Best trial: 1. Best value: 0.714801:   6%|▌         | 3/50 [00:11<02:18,  2.94s/it]

[I 2025-10-07 12:48:28,920] Trial 2 finished with value: 0.7059714709453813 and parameters: {'use_pca': True, 'pca_n_components': 25, 'n_neighbors': 22, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 0.7148013855644735.


Best trial: 1. Best value: 0.714801:   8%|▊         | 4/50 [00:12<01:40,  2.18s/it]

[I 2025-10-07 12:48:29,925] Trial 3 finished with value: 0.6916035033469166 and parameters: {'use_pca': True, 'pca_n_components': 57, 'n_neighbors': 25, 'weights': 'uniform', 'p': 2}. Best is trial 1 with value: 0.7148013855644735.


Best trial: 4. Best value: 0.733294:  10%|█         | 5/50 [00:14<01:30,  2.00s/it]

[I 2025-10-07 12:48:31,620] Trial 4 finished with value: 0.7332939617481529 and parameters: {'use_pca': True, 'pca_n_components': 42, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 4 with value: 0.7332939617481529.


Best trial: 4. Best value: 0.733294:  12%|█▏        | 6/50 [00:14<01:10,  1.61s/it]

[I 2025-10-07 12:48:32,474] Trial 5 finished with value: 0.7312232116765285 and parameters: {'use_pca': True, 'pca_n_components': 62, 'n_neighbors': 17, 'weights': 'distance', 'p': 2}. Best is trial 4 with value: 0.7332939617481529.


Best trial: 4. Best value: 0.733294:  14%|█▍        | 7/50 [00:23<02:46,  3.87s/it]

[I 2025-10-07 12:48:40,984] Trial 6 finished with value: 0.6832676017583182 and parameters: {'use_pca': False, 'n_neighbors': 25, 'weights': 'distance', 'p': 1}. Best is trial 4 with value: 0.7332939617481529.


Best trial: 4. Best value: 0.733294:  16%|█▌        | 8/50 [00:24<02:03,  2.94s/it]

[I 2025-10-07 12:48:41,932] Trial 7 finished with value: 0.7082662204481756 and parameters: {'use_pca': False, 'n_neighbors': 16, 'weights': 'uniform', 'p': 2}. Best is trial 4 with value: 0.7332939617481529.


Best trial: 8. Best value: 0.735638:  18%|█▊        | 9/50 [00:32<03:10,  4.65s/it]

[I 2025-10-07 12:48:50,355] Trial 8 finished with value: 0.7356380240665863 and parameters: {'use_pca': False, 'n_neighbors': 6, 'weights': 'uniform', 'p': 1}. Best is trial 8 with value: 0.7356380240665863.


Best trial: 8. Best value: 0.735638:  20%|██        | 10/50 [00:40<03:49,  5.73s/it]

[I 2025-10-07 12:48:58,488] Trial 9 finished with value: 0.6467645106797324 and parameters: {'use_pca': False, 'n_neighbors': 1, 'weights': 'uniform', 'p': 1}. Best is trial 8 with value: 0.7356380240665863.


Best trial: 8. Best value: 0.735638:  22%|██▏       | 11/50 [00:49<04:15,  6.54s/it]

[I 2025-10-07 12:49:06,887] Trial 10 finished with value: 0.7299821749035617 and parameters: {'use_pca': False, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 8 with value: 0.7356380240665863.


Best trial: 8. Best value: 0.735638:  24%|██▍       | 12/50 [00:55<04:03,  6.40s/it]

[I 2025-10-07 12:49:12,956] Trial 11 finished with value: 0.7060642340716852 and parameters: {'use_pca': True, 'pca_n_components': 248, 'n_neighbors': 2, 'weights': 'uniform', 'p': 1}. Best is trial 8 with value: 0.7356380240665863.


Best trial: 8. Best value: 0.735638:  26%|██▌       | 13/50 [00:59<03:26,  5.58s/it]

[I 2025-10-07 12:49:16,664] Trial 12 finished with value: 0.7317901602612744 and parameters: {'use_pca': True, 'pca_n_components': 150, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 8 with value: 0.7356380240665863.


Best trial: 8. Best value: 0.735638:  28%|██▊       | 14/50 [01:07<03:52,  6.46s/it]

[I 2025-10-07 12:49:25,132] Trial 13 finished with value: 0.7329972127848094 and parameters: {'use_pca': False, 'n_neighbors': 7, 'weights': 'uniform', 'p': 1}. Best is trial 8 with value: 0.7356380240665863.


Best trial: 8. Best value: 0.735638:  30%|███       | 15/50 [01:16<04:06,  7.05s/it]

[I 2025-10-07 12:49:33,552] Trial 14 finished with value: 0.7228567872828293 and parameters: {'use_pca': False, 'n_neighbors': 11, 'weights': 'uniform', 'p': 1}. Best is trial 8 with value: 0.7356380240665863.


Best trial: 15. Best value: 0.740968:  32%|███▏      | 16/50 [01:19<03:27,  6.09s/it]

[I 2025-10-07 12:49:37,428] Trial 15 finished with value: 0.7409678142366317 and parameters: {'use_pca': True, 'pca_n_components': 139, 'n_neighbors': 4, 'weights': 'uniform', 'p': 1}. Best is trial 15 with value: 0.7409678142366317.


Best trial: 15. Best value: 0.740968:  34%|███▍      | 17/50 [01:28<03:46,  6.87s/it]

[I 2025-10-07 12:49:46,119] Trial 16 finished with value: 0.720105510878192 and parameters: {'use_pca': False, 'n_neighbors': 12, 'weights': 'uniform', 'p': 1}. Best is trial 15 with value: 0.7409678142366317.


Best trial: 17. Best value: 0.742593:  36%|███▌      | 18/50 [01:30<02:52,  5.39s/it]

[I 2025-10-07 12:49:48,042] Trial 17 finished with value: 0.742592503298043 and parameters: {'use_pca': True, 'pca_n_components': 186, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 17 with value: 0.742592503298043.


Best trial: 17. Best value: 0.742593:  38%|███▊      | 19/50 [01:32<02:13,  4.29s/it]

[I 2025-10-07 12:49:49,786] Trial 18 finished with value: 0.7189746852493936 and parameters: {'use_pca': True, 'pca_n_components': 172, 'n_neighbors': 5, 'weights': 'distance', 'p': 2}. Best is trial 17 with value: 0.742592503298043.


Best trial: 17. Best value: 0.742593:  40%|████      | 20/50 [01:34<01:46,  3.54s/it]

[I 2025-10-07 12:49:51,553] Trial 19 finished with value: 0.725751658650424 and parameters: {'use_pca': True, 'pca_n_components': 162, 'n_neighbors': 12, 'weights': 'uniform', 'p': 2}. Best is trial 17 with value: 0.742592503298043.


Best trial: 17. Best value: 0.742593:  42%|████▏     | 21/50 [01:36<01:32,  3.18s/it]

[I 2025-10-07 12:49:53,913] Trial 20 finished with value: 0.7048324358686171 and parameters: {'use_pca': True, 'pca_n_components': 225, 'n_neighbors': 20, 'weights': 'uniform', 'p': 2}. Best is trial 17 with value: 0.742592503298043.


Best trial: 21. Best value: 0.743032:  44%|████▍     | 22/50 [01:37<01:13,  2.61s/it]

[I 2025-10-07 12:49:55,197] Trial 21 finished with value: 0.7430324978746815 and parameters: {'use_pca': True, 'pca_n_components': 107, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  46%|████▌     | 23/50 [01:39<01:00,  2.22s/it]

[I 2025-10-07 12:49:56,513] Trial 22 finished with value: 0.7425767066857081 and parameters: {'use_pca': True, 'pca_n_components': 116, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  48%|████▊     | 24/50 [01:40<00:49,  1.90s/it]

[I 2025-10-07 12:49:57,644] Trial 23 finished with value: 0.7326296868261024 and parameters: {'use_pca': True, 'pca_n_components': 89, 'n_neighbors': 9, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  50%|█████     | 25/50 [01:41<00:42,  1.72s/it]

[I 2025-10-07 12:49:58,952] Trial 24 finished with value: 0.6491493677621606 and parameters: {'use_pca': True, 'pca_n_components': 106, 'n_neighbors': 1, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  52%|█████▏    | 26/50 [01:43<00:44,  1.85s/it]

[I 2025-10-07 12:50:01,105] Trial 25 finished with value: 0.7330088353126957 and parameters: {'use_pca': True, 'pca_n_components': 204, 'n_neighbors': 10, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  54%|█████▍    | 27/50 [01:44<00:38,  1.69s/it]

[I 2025-10-07 12:50:02,410] Trial 26 finished with value: 0.7275013513931475 and parameters: {'use_pca': True, 'pca_n_components': 111, 'n_neighbors': 14, 'weights': 'distance', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  56%|█████▌    | 28/50 [01:48<00:47,  2.17s/it]

[I 2025-10-07 12:50:05,709] Trial 27 finished with value: 0.7376345986514664 and parameters: {'use_pca': True, 'pca_n_components': 288, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  58%|█████▊    | 29/50 [01:50<00:44,  2.10s/it]

[I 2025-10-07 12:50:07,658] Trial 28 finished with value: 0.7402513041437812 and parameters: {'use_pca': True, 'pca_n_components': 193, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  60%|██████    | 30/50 [01:51<00:37,  1.85s/it]

[I 2025-10-07 12:50:08,924] Trial 29 finished with value: 0.72613798211471 and parameters: {'use_pca': True, 'pca_n_components': 125, 'n_neighbors': 7, 'weights': 'distance', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  62%|██████▏   | 31/50 [01:52<00:31,  1.67s/it]

[I 2025-10-07 12:50:10,176] Trial 30 finished with value: 0.7251158530887498 and parameters: {'use_pca': True, 'pca_n_components': 86, 'n_neighbors': 13, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  64%|██████▍   | 32/50 [01:54<00:29,  1.63s/it]

[I 2025-10-07 12:50:11,694] Trial 31 finished with value: 0.687031441365136 and parameters: {'use_pca': True, 'pca_n_components': 133, 'n_neighbors': 30, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 21. Best value: 0.743032:  66%|██████▌   | 33/50 [01:55<00:28,  1.66s/it]

[I 2025-10-07 12:50:13,415] Trial 32 finished with value: 0.7336902159310775 and parameters: {'use_pca': True, 'pca_n_components': 143, 'n_neighbors': 3, 'weights': 'uniform', 'p': 2}. Best is trial 21 with value: 0.7430324978746815.


Best trial: 33. Best value: 0.74309:  68%|██████▊   | 34/50 [01:57<00:27,  1.69s/it] 

[I 2025-10-07 12:50:15,203] Trial 33 finished with value: 0.7430896340738857 and parameters: {'use_pca': True, 'pca_n_components': 179, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  70%|███████   | 35/50 [01:59<00:26,  1.75s/it]

[I 2025-10-07 12:50:17,071] Trial 34 finished with value: 0.7401584735808773 and parameters: {'use_pca': True, 'pca_n_components': 178, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  72%|███████▏  | 36/50 [02:01<00:26,  1.89s/it]

[I 2025-10-07 12:50:19,286] Trial 35 finished with value: 0.7316759330342404 and parameters: {'use_pca': True, 'pca_n_components': 216, 'n_neighbors': 10, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  74%|███████▍  | 37/50 [02:02<00:21,  1.63s/it]

[I 2025-10-07 12:50:20,328] Trial 36 finished with value: 0.7145447992912921 and parameters: {'use_pca': True, 'pca_n_components': 90, 'n_neighbors': 3, 'weights': 'distance', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  76%|███████▌  | 38/50 [02:05<00:22,  1.85s/it]

[I 2025-10-07 12:50:22,674] Trial 37 finished with value: 0.7061074574777548 and parameters: {'use_pca': True, 'pca_n_components': 235, 'n_neighbors': 19, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  78%|███████▊  | 39/50 [02:07<00:20,  1.85s/it]

[I 2025-10-07 12:50:24,524] Trial 38 finished with value: 0.7335556668139105 and parameters: {'use_pca': True, 'pca_n_components': 183, 'n_neighbors': 9, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  80%|████████  | 40/50 [02:08<00:16,  1.67s/it]

[I 2025-10-07 12:50:25,776] Trial 39 finished with value: 0.7212057992296097 and parameters: {'use_pca': True, 'pca_n_components': 114, 'n_neighbors': 25, 'weights': 'distance', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  82%|████████▏ | 41/50 [02:10<00:17,  1.96s/it]

[I 2025-10-07 12:50:28,430] Trial 40 finished with value: 0.7380212502056258 and parameters: {'use_pca': True, 'pca_n_components': 259, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  84%|████████▍ | 42/50 [02:12<00:14,  1.87s/it]

[I 2025-10-07 12:50:30,088] Trial 41 finished with value: 0.7404102960073098 and parameters: {'use_pca': True, 'pca_n_components': 153, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  86%|████████▌ | 43/50 [02:13<00:12,  1.72s/it]

[I 2025-10-07 12:50:31,456] Trial 42 finished with value: 0.7192761854976688 and parameters: {'use_pca': True, 'pca_n_components': 131, 'n_neighbors': 2, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  88%|████████▊ | 44/50 [02:18<00:15,  2.53s/it]

[I 2025-10-07 12:50:35,866] Trial 43 finished with value: 0.734521749159039 and parameters: {'use_pca': True, 'pca_n_components': 165, 'n_neighbors': 3, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  90%|█████████ | 45/50 [02:20<00:11,  2.35s/it]

[I 2025-10-07 12:50:37,816] Trial 44 finished with value: 0.7398714671967224 and parameters: {'use_pca': True, 'pca_n_components': 198, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  92%|█████████▏| 46/50 [02:24<00:11,  2.79s/it]

[I 2025-10-07 12:50:41,613] Trial 45 finished with value: 0.6460467023329799 and parameters: {'use_pca': True, 'pca_n_components': 146, 'n_neighbors': 1, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  94%|█████████▍| 47/50 [02:25<00:06,  2.24s/it]

[I 2025-10-07 12:50:42,571] Trial 46 finished with value: 0.7396304960994539 and parameters: {'use_pca': True, 'pca_n_components': 59, 'n_neighbors': 7, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  96%|█████████▌| 48/50 [02:33<00:08,  4.14s/it]

[I 2025-10-07 12:50:51,131] Trial 47 finished with value: 0.7299821749035617 and parameters: {'use_pca': False, 'n_neighbors': 8, 'weights': 'uniform', 'p': 1}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309:  98%|█████████▊| 49/50 [02:34<00:03,  3.18s/it]

[I 2025-10-07 12:50:52,089] Trial 48 finished with value: 0.7417036044383717 and parameters: {'use_pca': True, 'pca_n_components': 77, 'n_neighbors': 4, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.


Best trial: 33. Best value: 0.74309: 100%|██████████| 50/50 [02:35<00:00,  3.11s/it]


[I 2025-10-07 12:50:52,978] Trial 49 finished with value: 0.733084273111819 and parameters: {'use_pca': True, 'pca_n_components': 23, 'n_neighbors': 6, 'weights': 'uniform', 'p': 2}. Best is trial 33 with value: 0.7430896340738857.
KNN best R2: 0.7430896340738857
KNN best params: {'use_pca': True, 'pca_n_components': 179, 'n_neighbors': 5, 'weights': 'uniform', 'p': 2}


In [22]:
rf_study, rf_pipe = run_optuna_rf(n_trials=50)

[I 2025-10-07 12:50:53,252] A new study created in memory with name: no-name-9db19126-d4e1-4cfe-81bd-4516ad8ba78e
Best trial: 0. Best value: 0.738181:   2%|▏         | 1/50 [00:49<40:22, 49.44s/it]

[I 2025-10-07 12:51:42,684] Trial 0 finished with value: 0.7381812605308179 and parameters: {'n_estimators': 330, 'max_depth': 35, 'min_samples_split': 12, 'min_samples_leaf': 7, 'max_features': 0.5}. Best is trial 0 with value: 0.7381812605308179.


Best trial: 0. Best value: 0.738181:   4%|▍         | 2/50 [01:25<33:09, 41.44s/it]

[I 2025-10-07 12:52:18,535] Trial 1 finished with value: 0.698292213644846 and parameters: {'n_estimators': 207, 'max_depth': 13, 'min_samples_split': 3, 'min_samples_leaf': 12, 'max_features': 0.8}. Best is trial 0 with value: 0.7381812605308179.


Best trial: 0. Best value: 0.738181:   6%|▌         | 3/50 [01:31<20:00, 25.55s/it]

[I 2025-10-07 12:52:25,169] Trial 2 finished with value: 0.6244280895391272 and parameters: {'n_estimators': 428, 'max_depth': 20, 'min_samples_split': 15, 'min_samples_leaf': 17, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7381812605308179.


Best trial: 3. Best value: 0.750852:   8%|▊         | 4/50 [02:35<31:00, 40.45s/it]

[I 2025-10-07 12:53:28,461] Trial 3 finished with value: 0.7508519859760598 and parameters: {'n_estimators': 310, 'max_depth': 46, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 3. Best value: 0.750852:  10%|█         | 5/50 [02:36<19:50, 26.46s/it]

[I 2025-10-07 12:53:30,114] Trial 4 finished with value: 0.6431468952904847 and parameters: {'n_estimators': 102, 'max_depth': 47, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 3. Best value: 0.750852:  12%|█▏        | 6/50 [03:08<20:38, 28.15s/it]

[I 2025-10-07 12:54:01,545] Trial 5 finished with value: 0.7406215720914674 and parameters: {'n_estimators': 340, 'max_depth': 19, 'min_samples_split': 14, 'min_samples_leaf': 5, 'max_features': 0.3}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 3. Best value: 0.750852:  14%|█▍        | 7/50 [03:12<14:36, 20.38s/it]

[I 2025-10-07 12:54:05,916] Trial 6 finished with value: 0.6299166269598244 and parameters: {'n_estimators': 376, 'max_depth': 27, 'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 3. Best value: 0.750852:  16%|█▌        | 8/50 [03:35<14:47, 21.13s/it]

[I 2025-10-07 12:54:28,657] Trial 7 finished with value: 0.6856950932707904 and parameters: {'n_estimators': 121, 'max_depth': 34, 'min_samples_split': 4, 'min_samples_leaf': 18, 'max_features': 0.8}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 3. Best value: 0.750852:  18%|█▊        | 9/50 [04:02<15:40, 22.94s/it]

[I 2025-10-07 12:54:55,582] Trial 8 finished with value: 0.7319590874685316 and parameters: {'n_estimators': 293, 'max_depth': 37, 'min_samples_split': 13, 'min_samples_leaf': 8, 'max_features': 0.3}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 3. Best value: 0.750852:  20%|██        | 10/50 [04:39<18:15, 27.39s/it]

[I 2025-10-07 12:55:32,923] Trial 9 finished with value: 0.6974567370333055 and parameters: {'n_estimators': 367, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 7, 'max_features': 0.5}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 3. Best value: 0.750852:  22%|██▏       | 11/50 [05:20<20:30, 31.56s/it]

[I 2025-10-07 12:56:13,947] Trial 10 finished with value: 0.7504279637836524 and parameters: {'n_estimators': 234, 'max_depth': 50, 'min_samples_split': 19, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 3 with value: 0.7508519859760598.


Best trial: 11. Best value: 0.751731:  24%|██▍       | 12/50 [06:06<22:44, 35.91s/it]

[I 2025-10-07 12:56:59,802] Trial 11 finished with value: 0.7517306633720165 and parameters: {'n_estimators': 241, 'max_depth': 48, 'min_samples_split': 19, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 11 with value: 0.7517306633720165.


Best trial: 11. Best value: 0.751731:  26%|██▌       | 13/50 [06:53<24:11, 39.23s/it]

[I 2025-10-07 12:57:46,657] Trial 12 finished with value: 0.7516851692828022 and parameters: {'n_estimators': 247, 'max_depth': 42, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 11 with value: 0.7517306633720165.


Best trial: 13. Best value: 0.751986:  28%|██▊       | 14/50 [07:32<23:35, 39.31s/it]

[I 2025-10-07 12:58:26,159] Trial 13 finished with value: 0.7519860289942809 and parameters: {'n_estimators': 203, 'max_depth': 41, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  30%|███       | 15/50 [07:36<16:35, 28.44s/it]

[I 2025-10-07 12:58:29,410] Trial 14 finished with value: 0.6540761877782447 and parameters: {'n_estimators': 172, 'max_depth': 41, 'min_samples_split': 17, 'min_samples_leaf': 12, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  32%|███▏      | 16/50 [08:08<16:44, 29.55s/it]

[I 2025-10-07 12:59:01,521] Trial 15 finished with value: 0.744691020873269 and parameters: {'n_estimators': 183, 'max_depth': 29, 'min_samples_split': 17, 'min_samples_leaf': 4, 'max_features': 0.5}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  34%|███▍      | 17/50 [08:44<17:18, 31.46s/it]

[I 2025-10-07 12:59:37,437] Trial 16 finished with value: 0.7000992467807661 and parameters: {'n_estimators': 271, 'max_depth': 42, 'min_samples_split': 7, 'min_samples_leaf': 15, 'max_features': 0.5}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  36%|███▌      | 18/50 [09:11<16:10, 30.33s/it]

[I 2025-10-07 13:00:05,125] Trial 17 finished with value: 0.743101739356069 and parameters: {'n_estimators': 153, 'max_depth': 49, 'min_samples_split': 18, 'min_samples_leaf': 4, 'max_features': 0.5}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  38%|███▊      | 19/50 [09:23<12:46, 24.74s/it]

[I 2025-10-07 13:00:16,854] Trial 18 finished with value: 0.744310019677745 and parameters: {'n_estimators': 455, 'max_depth': 31, 'min_samples_split': 16, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  40%|████      | 20/50 [09:45<11:54, 23.83s/it]

[I 2025-10-07 13:00:38,558] Trial 19 finished with value: 0.7411152296585721 and parameters: {'n_estimators': 220, 'max_depth': 23, 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  42%|████▏     | 21/50 [10:08<11:21, 23.50s/it]

[I 2025-10-07 13:01:01,276] Trial 20 finished with value: 0.5629512708813542 and parameters: {'n_estimators': 257, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 11, 'max_features': 0.8}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  44%|████▍     | 22/50 [10:54<14:12, 30.46s/it]

[I 2025-10-07 13:01:47,974] Trial 21 finished with value: 0.7517245873609301 and parameters: {'n_estimators': 248, 'max_depth': 42, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 13. Best value: 0.751986:  46%|████▌     | 23/50 [11:33<14:53, 33.08s/it]

[I 2025-10-07 13:02:27,175] Trial 22 finished with value: 0.7455469016836954 and parameters: {'n_estimators': 195, 'max_depth': 38, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': 0.5}. Best is trial 13 with value: 0.7519860289942809.


Best trial: 23. Best value: 0.752726:  48%|████▊     | 24/50 [12:06<14:14, 32.85s/it]

[I 2025-10-07 13:02:59,484] Trial 23 finished with value: 0.7527259026087686 and parameters: {'n_estimators': 142, 'max_depth': 44, 'min_samples_split': 18, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  50%|█████     | 25/50 [12:08<09:54, 23.77s/it]

[I 2025-10-07 13:03:02,059] Trial 24 finished with value: 0.6627308053161739 and parameters: {'n_estimators': 145, 'max_depth': 45, 'min_samples_split': 18, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  52%|█████▏    | 26/50 [12:40<10:24, 26.02s/it]

[I 2025-10-07 13:03:33,330] Trial 25 finished with value: 0.7477859249759049 and parameters: {'n_estimators': 152, 'max_depth': 50, 'min_samples_split': 16, 'min_samples_leaf': 3, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  54%|█████▍    | 27/50 [13:04<09:45, 25.44s/it]

[I 2025-10-07 13:03:57,408] Trial 26 finished with value: 0.7367920079831513 and parameters: {'n_estimators': 134, 'max_depth': 32, 'min_samples_split': 18, 'min_samples_leaf': 6, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  56%|█████▌    | 28/50 [13:34<09:51, 26.90s/it]

[I 2025-10-07 13:04:27,710] Trial 27 finished with value: 0.7228500506355152 and parameters: {'n_estimators': 183, 'max_depth': 39, 'min_samples_split': 15, 'min_samples_leaf': 10, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  58%|█████▊    | 29/50 [14:22<11:37, 33.23s/it]

[I 2025-10-07 13:05:15,726] Trial 28 finished with value: 0.74982317004862 and parameters: {'n_estimators': 219, 'max_depth': 44, 'min_samples_split': 18, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  60%|██████    | 30/50 [14:36<09:09, 27.48s/it]

[I 2025-10-07 13:05:29,769] Trial 29 finished with value: 0.6804225519676038 and parameters: {'n_estimators': 102, 'max_depth': 34, 'min_samples_split': 13, 'min_samples_leaf': 20, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  62%|██████▏   | 31/50 [14:41<06:31, 20.58s/it]

[I 2025-10-07 13:05:34,279] Trial 30 finished with value: 0.6923288858281044 and parameters: {'n_estimators': 279, 'max_depth': 36, 'min_samples_split': 16, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  64%|██████▍   | 32/50 [15:35<09:11, 30.63s/it]

[I 2025-10-07 13:06:28,355] Trial 31 finished with value: 0.7516055565154253 and parameters: {'n_estimators': 245, 'max_depth': 40, 'min_samples_split': 19, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  66%|██████▌   | 33/50 [16:19<09:52, 34.86s/it]

[I 2025-10-07 13:07:13,083] Trial 32 finished with value: 0.7518934433692794 and parameters: {'n_estimators': 205, 'max_depth': 44, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  68%|██████▊   | 34/50 [17:23<11:34, 43.40s/it]

[I 2025-10-07 13:08:16,395] Trial 33 finished with value: 0.7415240569519563 and parameters: {'n_estimators': 211, 'max_depth': 47, 'min_samples_split': 19, 'min_samples_leaf': 3, 'max_features': 0.8}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  70%|███████   | 35/50 [17:27<07:56, 31.76s/it]

[I 2025-10-07 13:08:21,004] Trial 34 finished with value: 0.7108568410805839 and parameters: {'n_estimators': 163, 'max_depth': 45, 'min_samples_split': 17, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  72%|███████▏  | 36/50 [18:07<07:58, 34.21s/it]

[I 2025-10-07 13:09:00,940] Trial 35 finished with value: 0.7502459748054133 and parameters: {'n_estimators': 199, 'max_depth': 48, 'min_samples_split': 19, 'min_samples_leaf': 2, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 23. Best value: 0.752726:  74%|███████▍  | 37/50 [18:38<07:13, 33.32s/it]

[I 2025-10-07 13:09:32,168] Trial 36 finished with value: 0.6996561248431834 and parameters: {'n_estimators': 218, 'max_depth': 44, 'min_samples_split': 15, 'min_samples_leaf': 15, 'max_features': 0.5}. Best is trial 23 with value: 0.7527259026087686.


Best trial: 37. Best value: 0.754028:  76%|███████▌  | 38/50 [19:18<07:03, 35.33s/it]

[I 2025-10-07 13:10:12,184] Trial 37 finished with value: 0.7540281206990932 and parameters: {'n_estimators': 303, 'max_depth': 47, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 37 with value: 0.7540281206990932.


Best trial: 37. Best value: 0.754028:  78%|███████▊  | 39/50 [19:50<06:15, 34.14s/it]

[I 2025-10-07 13:10:43,553] Trial 38 finished with value: 0.7237752541680014 and parameters: {'n_estimators': 313, 'max_depth': 22, 'min_samples_split': 20, 'min_samples_leaf': 9, 'max_features': 0.3}. Best is trial 37 with value: 0.7540281206990932.


Best trial: 37. Best value: 0.754028:  80%|████████  | 40/50 [20:31<06:02, 36.25s/it]

[I 2025-10-07 13:11:24,723] Trial 39 finished with value: 0.7430051535856944 and parameters: {'n_estimators': 340, 'max_depth': 37, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 0.3}. Best is trial 37 with value: 0.7540281206990932.


Best trial: 37. Best value: 0.754028:  82%|████████▏ | 41/50 [21:14<05:43, 38.17s/it]

[I 2025-10-07 13:12:07,371] Trial 40 finished with value: 0.743566923952489 and parameters: {'n_estimators': 384, 'max_depth': 16, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 37 with value: 0.7540281206990932.


Best trial: 41. Best value: 0.754265:  84%|████████▍ | 42/50 [21:55<05:13, 39.18s/it]

[I 2025-10-07 13:12:48,902] Trial 41 finished with value: 0.7542650333282206 and parameters: {'n_estimators': 294, 'max_depth': 47, 'min_samples_split': 19, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 41 with value: 0.7542650333282206.


Best trial: 41. Best value: 0.754265:  86%|████████▌ | 43/50 [22:42<04:49, 41.34s/it]

[I 2025-10-07 13:13:35,254] Trial 42 finished with value: 0.7518144683828221 and parameters: {'n_estimators': 315, 'max_depth': 45, 'min_samples_split': 18, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 41 with value: 0.7542650333282206.


Best trial: 41. Best value: 0.754265:  88%|████████▊ | 44/50 [23:42<04:41, 46.98s/it]

[I 2025-10-07 13:14:35,424] Trial 43 finished with value: 0.7433945869011742 and parameters: {'n_estimators': 499, 'max_depth': 47, 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_features': 0.3}. Best is trial 41 with value: 0.7542650333282206.


Best trial: 44. Best value: 0.755229:  90%|█████████ | 45/50 [24:27<03:52, 46.58s/it]

[I 2025-10-07 13:15:21,069] Trial 44 finished with value: 0.7552294858921716 and parameters: {'n_estimators': 354, 'max_depth': 43, 'min_samples_split': 17, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 44 with value: 0.7552294858921716.


Best trial: 44. Best value: 0.755229:  92%|█████████▏| 46/50 [25:06<02:56, 44.07s/it]

[I 2025-10-07 13:15:59,292] Trial 45 finished with value: 0.749114160842057 and parameters: {'n_estimators': 354, 'max_depth': 40, 'min_samples_split': 17, 'min_samples_leaf': 3, 'max_features': 0.3}. Best is trial 44 with value: 0.7552294858921716.


Best trial: 44. Best value: 0.755229:  94%|█████████▍| 47/50 [25:20<01:45, 35.12s/it]

[I 2025-10-07 13:16:13,526] Trial 46 finished with value: 0.7505116042102433 and parameters: {'n_estimators': 121, 'max_depth': 50, 'min_samples_split': 19, 'min_samples_leaf': 2, 'max_features': 0.3}. Best is trial 44 with value: 0.7552294858921716.


Best trial: 44. Best value: 0.755229:  96%|█████████▌| 48/50 [26:03<01:15, 37.52s/it]

[I 2025-10-07 13:16:56,657] Trial 47 finished with value: 0.742154853097788 and parameters: {'n_estimators': 403, 'max_depth': 43, 'min_samples_split': 17, 'min_samples_leaf': 5, 'max_features': 0.3}. Best is trial 44 with value: 0.7552294858921716.


Best trial: 44. Best value: 0.755229:  98%|█████████▊| 49/50 [26:44<00:38, 38.57s/it]

[I 2025-10-07 13:17:37,678] Trial 48 finished with value: 0.7552209736272391 and parameters: {'n_estimators': 301, 'max_depth': 47, 'min_samples_split': 16, 'min_samples_leaf': 1, 'max_features': 0.3}. Best is trial 44 with value: 0.7552294858921716.


Best trial: 44. Best value: 0.755229: 100%|██████████| 50/50 [27:15<00:00, 32.70s/it]


[I 2025-10-07 13:18:08,329] Trial 49 finished with value: 0.7402174898307601 and parameters: {'n_estimators': 287, 'max_depth': 47, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 0.3}. Best is trial 44 with value: 0.7552294858921716.
RF best R2: 0.7552294858921716
RF best params: {'n_estimators': 354, 'max_depth': 43, 'min_samples_split': 17, 'min_samples_leaf': 1, 'max_features': 0.3}


# 7. Try a meta learner or ensemble of the best models

In [23]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

# reproducible folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 1) Get OOF predictions for each base model (these preds are made by models
#    that were trained without the corresponding sample — no leakage)
print("Generating OOF preds (this may take time)...")
oof_knn = cross_val_predict(knn_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_svm = cross_val_predict(svm_pipe, X, y, cv=kf, n_jobs=-1, method='predict')
oof_rf  = cross_val_predict(rf_pipe,  X, y, cv=kf, n_jobs=-1, method='predict')

# Stack OOF predictions (n_samples x n_models)
stack_oof = np.vstack([oof_knn, oof_svm, oof_rf]).T

# 2) Simple average ensemble
ens_mean = stack_oof.mean(axis=1)
r2_mean = r2_score(y, ens_mean)
rmse_mean = np.sqrt(mean_squared_error(y, ens_mean))
print(f"Simple average ensemble -> R2: {r2_mean:.4f}, RMSE: {rmse_mean:.4f}")

Generating OOF preds (this may take time)...


Simple average ensemble -> R2: 0.7724, RMSE: 0.6491


In [24]:
# 3) Stacking: train a Ridge meta-learner on the OOF stack
meta = Ridge(alpha=1.0)
meta.fit(stack_oof, y)                 # training on OOF preds is OK (no leakage)
ens_stack = meta.predict(stack_oof)    # predictions on the same OOF matrix
r2_stack = r2_score(y, ens_stack)
rmse_stack = np.sqrt(mean_squared_error(y, ens_stack))
print(f"Stacking (Ridge) ensemble -> R2: {r2_stack:.4f}, RMSE: {rmse_stack:.4f}")

# Show meta-learner weights
print("Meta-learner coefficients (weights):", meta.coef_)
print("Meta-learner intercept:", meta.intercept_)

Stacking (Ridge) ensemble -> R2: 0.7728, RMSE: 0.6487
Meta-learner coefficients (weights): [0.3226784  0.29527209 0.40098253]
Meta-learner intercept: -0.11706293555618341


In [25]:
joblib.dump(meta, "../models/ii_meta.joblib")
print("Saved final base pipelines and meta-learner.")

Saved final base pipelines and meta-learner.
