# Random forest (RF) and Support Vector Regression (SVR)

In [1]:
# %pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [3]:
# import required packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# import datasets
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.head())

# split features and target variable
X_train = train.drop(["id", "SMILES", "Tm"], axis = 1)
y_train = train["Tm"]
X_test = test.drop(["id", "SMILES"], axis = 1) 
id_test = test["id"] 

     id                       SMILES      Tm  Group 1  Group 2  Group 3  \
0  2175        FC1=C(F)C(F)(F)C1(F)F  213.15        0        0        0   
1  1222  c1ccc2c(c1)ccc3Nc4ccccc4c23  407.15        0        0        0   
2  2994          CCN1C(C)=Nc2ccccc12  324.15        2        1        0   
3  1704                   CC#CC(=O)O  351.15        1        0        0   
4  2526                    CCCCC(S)C  126.15        2        3        0   

   Group 4  Group 5  Group 6  Group 7  ...  Group 415  Group 416  Group 417  \
0        0        0        0        0  ...          0          0          0   
1        0        0        0        0  ...          0          0          0   
2        0        0        0        0  ...          0          0          0   
3        0        0        0        0  ...          0          0          0   
4        0        0        0        0  ...          0          0          0   

   Group 418  Group 419  Group 420  Group 421  Group 422  Group 423  Group

# 1) Random Forest

### 1.1.) RandomizedSearchCV

- defining a range/distribution of values for each hyperparameter --> random sampling of a fixed number of combinations
- faster than Grid Search; good for a first pass

In [3]:
# parameter grid to sample from
parameters_grid = {
    "n_estimators": [100, 200, 400, 600, 800, 1000],
    "max_depth": [10, 15, 20, 25, 30, 35, None],
    "min_samples_split": [2, 3, 5, 7, 10],
    "min_samples_leaf": [1, 3, 5],
    "max_features": ["sqrt", "log2", 0.02, 0.05, 0.1, 0.2]} # percentages, e.g. 0.05 equals 5 % of features

# Model initialization
rf = RandomForestRegressor(random_state = 123)

# RandomizedSearchCV initialization
rf_randomized_search = RandomizedSearchCV(
    estimator = rf,
    param_distributions = parameters_grid,
    n_iter = 70,
    cv = 5,
    n_jobs = -1,
    scoring = "neg_mean_absolute_error",
    random_state = 123,
    error_score = "raise")

# RandomizedSearchCV fit to the training data
rf_randomized_search.fit(X_train, y_train)

# Extraction of best result
print("Best parameters found: ", rf_randomized_search.best_params_)
print("Best score (Neg. MAE): ", rf_randomized_search.best_score_)

Fitting 5 folds for each of 70 candidates, totalling 350 fits
Best parameters found:  {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_depth': 30}
Best score (Neg. MAE):  -37.117318102559906


In [9]:
# random forest with found best parameters from randomized search

rf1 = RandomForestRegressor(n_estimators = 400, min_samples_split = 2, min_samples_leaf = 1, max_features = 0.2, max_depth = 30, 
                            random_state = 123).fit(X_train, y_train)
y_pred = rf1.predict(X_test)
results_rf1 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf1.index.name = "id"
results_rf1.to_csv("prediction_rf1.csv", index=False)

### 1.2.) GridSearchCV

- every single combination of specified hyperparameters tested --> best combination within the defined grid found
- can be very slow for large grids
- good to use for fine-tuning after narrowing down via Random Search (done here)

In [13]:
## grid search 1

# grid with parameters to search (for fine-tuning of n_estimators, max_depth and max_features
parameters_grid = {
    "n_estimators": [400, 600],
    "max_depth": [20, 30, 40, None],
    "max_features": [0.1, 0.15, 0.2, 0.25, 0.3]}

# model initialisation
rf = RandomForestRegressor(min_samples_split = 2, min_samples_leaf = 1, random_state = 123)

# GridSearchCV initialisation
rf_grid_search = GridSearchCV(estimator = rf, param_grid = parameters_grid, cv = 5, scoring = "neg_mean_absolute_error", n_jobs = -1,
                              error_score = "raise")

# GridSearchCV fit to the training data
rf_grid_search.fit(X_train, y_train)

# Extraction of best result
print("Best parameters found: ", rf_grid_search.best_params_)
print("Best score (Neg. MAE): ", rf_grid_search.best_score_)

Best parameters found:  {'max_depth': None, 'max_features': 0.3, 'n_estimators': 600}
Best score (Neg. MAE):  -35.96683474779305


In [14]:
# random forest with found best parameters from grid search 1

rf2 = RandomForestRegressor(n_estimators = 600, min_samples_split = 2, min_samples_leaf = 1, max_features = 0.3, max_depth = None, 
                            random_state = 123).fit(X_train, y_train)
y_pred = rf2.predict(X_test)
results_rf2 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf2.index.name = "id"
results_rf2.to_csv("prediction_rf2.csv", index=False)

In [15]:
## grid search 2

parameters_grid = {
    "n_estimators": [350, 400, 450, 500],
    "max_depth": [25, 30, 35],
    "max_features": [0.15, 0.2, 0.25]}

rf = RandomForestRegressor(min_samples_split = 2, min_samples_leaf = 1, random_state = 123)

rf_grid_search = GridSearchCV(estimator = rf, param_grid = parameters_grid, cv = 5, scoring = "neg_mean_absolute_error", n_jobs = -1,
                              error_score = "raise")

rf_grid_search.fit(X_train, y_train)

print("Best parameters found: ", rf_grid_search.best_params_)
print("Best score (Neg. MAE): ", rf_grid_search.best_score_)

Best parameters found:  {'max_depth': 35, 'max_features': 0.25, 'n_estimators': 400}
Best score (Neg. MAE):  -36.463945340580196


In [16]:
# random forest with found best parameters from grid search 2

rf3 = RandomForestRegressor(n_estimators = 400, min_samples_split = 2, min_samples_leaf = 1, max_features = 0.25, max_depth = 35, 
                            random_state = 123).fit(X_train, y_train)
y_pred = rf3.predict(X_test)
results_rf3 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf3.index.name = "id"
results_rf3.to_csv("prediction_rf3.csv", index=False)

## 1.3) Optuna

employs intelligent, adaptive sampling techniques like Bayesian optimization to guide the search, allowing it to converge on better results faster

In [7]:
# Define the Objective Function
def objective(trial):
    
    n_estimators = trial.suggest_int('n_estimators', 50, 700)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.25, 0.3, 0.5, 0.7])
    criterion = trial.suggest_categorical('criterion', ['squared_error', 'absolute_error'])

    # Initialize the Model
    rf = RandomForestRegressor(
        n_estimators = n_estimators,
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        max_features = max_features,
        criterion = criterion,
        random_state = 123,
        n_jobs=-1)
    
    # Evaluate the Model using Cross-Validation
    score = cross_val_score(
        rf, 
        X_train, 
        y_train, 
        n_jobs = -1, 
        cv = 5, 
        scoring = "neg_mean_absolute_error")
    
    # Return the mean cross-validation score
    mean_absolute_error = -score.mean()
    return mean_absolute_error

# Create the Study
study = optuna.create_study(
    direction = 'minimize', 
    sampler = optuna.samplers.TPESampler(seed = 123))

# Run Optimization
# The study runs the objective function either 100 times (each time suggesting a new set of parameters based on the results of the previous 
# trials), or for 1 hour (timeout = 3600; trial that is ongoing after 1 hour is still finished)
print("Starting optimization...")
study.optimize(objective, n_trials = 100, timeout = 3600)
print("Optimization finished.")

[I 2025-11-25 11:03:18,036] A new study created in memory with name: no-name-27fcfc02-f146-420a-8a8c-0526f0e02c9b


Starting optimization...


[I 2025-11-25 11:03:38,535] Trial 0 finished with value: 39.28955919757761 and parameters: {'n_estimators': 503, 'max_depth': 18, 'min_samples_split': 4, 'max_features': 0.3, 'criterion': 'squared_error'}. Best is trial 0 with value: 39.28955919757761.
[I 2025-11-25 11:05:31,886] Trial 1 finished with value: 48.2996878757893 and parameters: {'n_estimators': 524, 'max_depth': 25, 'min_samples_split': 2, 'max_features': 'log2', 'criterion': 'absolute_error'}. Best is trial 0 with value: 39.28955919757761.
[I 2025-11-25 11:05:55,740] Trial 2 finished with value: 36.48492253640203 and parameters: {'n_estimators': 521, 'max_depth': 33, 'min_samples_split': 8, 'max_features': 0.5, 'criterion': 'squared_error'}. Best is trial 2 with value: 36.48492253640203.
[I 2025-11-25 11:11:47,088] Trial 3 finished with value: 37.75728692507363 and parameters: {'n_estimators': 371, 'max_depth': 24, 'min_samples_split': 4, 'max_features': 0.25, 'criterion': 'absolute_error'}. Best is trial 2 with value: 36

Optimization finished.


In [10]:
# Get the best trial information
best_trial = study.best_trial

print("\n--- Best Trial Results ---")
print(f"Best cross-validation mean absolute error: {best_trial.value:.4f}")
print("Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")


--- Best Trial Results ---
Best cross-validation mean absolute error: 36.3218
Best hyperparameters:
  n_estimators: 698
  max_depth: 48
  min_samples_split: 10
  max_features: 0.7
  criterion: absolute_error


In [11]:
# Train the final model with the best parameters on the entire training set
rf4 = RandomForestRegressor(n_estimators = 698, max_depth = 48, min_samples_split = 10, max_features = 0.7, criterion = "absolute_error",
                            random_state = 123).fit(X_train, y_train)
y_pred = rf4.predict(X_test)
results_rf4 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf4.index.name = "id"
results_rf4.to_csv("Submissions/prediction_rf4.csv", index=False)

In [5]:
## optuna 2nd try (with pruning & enqueuing the baseline!)

# Define the Objective Function
def objective(trial):

    n_estimators = trial.suggest_int('n_estimators', 50, 700)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.25, 0.3, 0.5, 0.7])

    # Initialize the Model
    rf = RandomForestRegressor(
        n_estimators = n_estimators,
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        max_features = max_features,
        random_state = 123,
        n_jobs = -1)

    # Manual cross-validation loop (necessary in case of pruning)
    kf = KFold(n_splits = 5, shuffle = True, random_state = 123)
    mae_scores = []

    for step, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Fit and Predict
        rf.fit(X_tr, y_tr)
        preds = rf.predict(X_val)
        
        # Calculate MAE for this fold
        fold_mae = mean_absolute_error(y_val, preds)
        mae_scores.append(fold_mae)

        # Report the MEAN MAE up to this point
        intermediate_value = np.mean(mae_scores)
        
        trial.report(intermediate_value, step = step)

        if trial.should_prune():
            raise optuna.TrialPruned()

    # Return overall mean MAE
    return np.mean(mae_scores)

# Create study
study = optuna.create_study(
    direction = "minimize",
    pruner = optuna.pruners.MedianPruner(n_warmup_steps = 1))

# ENQUEUE YOUR BASELINE (starts at the prior best model)
study.enqueue_trial({
    "n_estimators": 698,
    "max_depth": 48,
    "min_samples_split": 10,
    "max_features": 0.7})

# Run optimization
print("Starting optimization...")
study.optimize(objective, n_trials = 100, timeout = 5400)
print("Optimization finished.")

[I 2025-11-25 19:49:10,295] A new study created in memory with name: no-name-cd2fff42-a579-458e-96e2-d569ea887b42


Starting optimization...


[I 2025-11-25 19:49:45,770] Trial 0 finished with value: 36.27434170732677 and parameters: {'n_estimators': 698, 'max_depth': 48, 'min_samples_split': 10, 'max_features': 0.7}. Best is trial 0 with value: 36.27434170732677.
[I 2025-11-25 19:49:47,979] Trial 1 finished with value: 50.656079842865864 and parameters: {'n_estimators': 140, 'max_depth': 12, 'min_samples_split': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 36.27434170732677.
[I 2025-11-25 19:50:04,626] Trial 2 finished with value: 37.196079847994184 and parameters: {'n_estimators': 639, 'max_depth': 30, 'min_samples_split': 10, 'max_features': 0.3}. Best is trial 0 with value: 36.27434170732677.
[I 2025-11-25 19:50:10,662] Trial 3 finished with value: 40.95141933563673 and parameters: {'n_estimators': 266, 'max_depth': 33, 'min_samples_split': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 36.27434170732677.
[I 2025-11-25 19:50:38,747] Trial 4 finished with value: 36.10516335839534 and parameters: {'n_est

Optimization finished.


In [6]:
# Get the best trial information
best_trial = study.best_trial

print("\n--- Best Trial Results ---")
print(f"Best cross-validation mean absolute error: {best_trial.value:.4f}")
print("Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")


--- Best Trial Results ---
Best cross-validation mean absolute error: 35.8421
Best hyperparameters:
  n_estimators: 515
  max_depth: 49
  min_samples_split: 4
  max_features: 0.5


In [8]:
# Train the final model with the best parameters on the entire training set
rf5 = RandomForestRegressor(n_estimators = 515, max_depth = 49, min_samples_split = 4, max_features = 0.5, 
                            random_state = 123).fit(X_train, y_train)
y_pred = rf5.predict(X_test)
results_rf5 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf5.index.name = "id"
results_rf5.to_csv("Submissions/prediction_rf5.csv", index=False)

## 2.) SVR with radial kernel

SVR: a tube with an estimated function (hyperplane) in the middle and boundaries on either side defined by ε. The algorithm’s goal is to minimize the error by identifying a function that puts more of the original points inside the tube while at the same time reducing the "slack."
Difference to SVM: the support vectors are the points that fall outside the tube rather than the ones at the margin

Radial and polynomial kernel: A kernel is a function that takes the original non-linear problem and transforms it into a linear one, which is then handled by the algorithm in a higher-dimensional space.

Hyperparameters that need to be tuned:
- C (regularization parameter): high C leads to stronger penalization of errors (points outside the tube), leading to a more complex model that fits the data more closely; small C allows more errors and leads to a simpler model with a larger margin
- epsilon: value of epsilon determines the width of the tube around the estimated function (hyperplane); points that fall inside this tube are considered as correct predictions and are not penalized
- gamma: inverse of the radius of influence of samples selected by the model as support vectors; with a low gamma the influence of individual training examples reaches further, affecting a larger region of the feature space (leads to a smoother and less complex decision boundary, but can result in underfitting); with a high gamma the influence is closer, affecting only the region near the training example (can lead to overfitting)

In [4]:
## randomized search

parameters_grid = {
    "C": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 1000],
    "epsilon": [0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2, 4],
    "gamma": ["scale", "auto", 0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.01, 1, 10]}

svr_rbf = SVR(kernel = "rbf")

svr_rbf_randomized_search = RandomizedSearchCV(
    estimator = svr_rbf,
    param_distributions = parameters_grid,
    n_iter = 70,
    cv = 5,
    n_jobs = -1,
    scoring = "neg_mean_absolute_error",
    random_state = 123,
    error_score = "raise")

svr_rbf_randomized_search.fit(X_train, y_train)

print("Best parameters found: ", svr_rbf_randomized_search.best_params_)
print("Best score (Neg. MAE): ", svr_rbf_randomized_search.best_score_)

Best parameters found:  {'gamma': 'scale', 'epsilon': 0.01, 'C': 1000}
Best score (Neg. MAE):  -32.52189746601051


In [6]:
svr_rbf_1 = SVR(kernel = "rbf", gamma = "scale", epsilon = 0.01, C = 1000).fit(X_train, y_train)

y_pred = svr_rbf_1.predict(X_test)
results_svr_rbf_1 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_svr_rbf_1.index.name = "id"
results_svr_rbf_1.to_csv("prediction_svr_rbf_1.csv", index=False)

In [7]:
print(svr_rbf_1._gamma)
# gamma = "scale" uses this value for gamma:

0.023719825417948447


In [9]:
## grid search

parameters_grid = {
    "C": [0.1, 0.5, 1, 10, 100, 500, 1000, 2000],
    "epsilon": [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
    "gamma": ["scale", "auto", 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]}

svr_rbf = SVR(kernel = "rbf")

svr_rbf_grid_search = GridSearchCV(
    estimator = svr_rbf,
    param_grid = parameters_grid,
    cv = 5,
    n_jobs = -1,
    scoring = "neg_mean_absolute_error",
    error_score = "raise")

svr_rbf_grid_search.fit(X_train, y_train)

print("Best parameters found: ", svr_rbf_grid_search.best_params_)
print("Best score (Neg. MAE): ", svr_rbf_grid_search.best_score_)

Best parameters found:  {'C': 1000, 'epsilon': 0.5, 'gamma': 'scale'}
Best score (Neg. MAE):  -32.48013996312687


In [10]:
svr_rbf_2 = SVR(kernel = "rbf", gamma = "scale", epsilon = 0.5, C = 1000).fit(X_train, y_train)

y_pred = svr_rbf_2.predict(X_test)
results_svr_rbf_2 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_svr_rbf_2.index.name = "id"
results_svr_rbf_2.to_csv("prediction_svr_rbf_2.csv", index=False)

In [4]:
## optuna

# Define the Objective Function
def objective_svr_rbf(trial):

    gamma = trial.suggest_float("gamma", 0.0001, 10, log = True)
    epsilon = trial.suggest_float("epsilon", 0.01, 5, log=True)
    C = trial.suggest_float("C", 1, 10000, log=True)

    # Initialize the Model
    svr_rbf = SVR(
        kernel = "rbf",
        gamma = gamma,
        epsilon = epsilon,
        C = C)
    
    # Evaluate the Model using Cross-Validation
    score = cross_val_score(
        svr_rbf, 
        X_train, 
        y_train, 
        n_jobs = -1, 
        cv = 5, 
        scoring = "neg_mean_absolute_error")
    
    # Return the mean cross-validation score
    mean_absolute_error = -score.mean()
    return mean_absolute_error

# Create the Study
study = optuna.create_study(
    direction = 'minimize', 
    sampler = optuna.samplers.TPESampler(seed = 123))

# Run Optimization
# The study runs the objective function either 100 times (each time suggesting a new set of parameters based on the results of the previous 
# trials), or for 1.5 hours (timeout = 5400; trial that is ongoing after 1.5 hours is still finished)
print("Starting optimization...")
study.optimize(objective_svr_rbf, n_trials = 100, timeout = 5400)
print("Optimization finished.")

[I 2025-11-25 16:21:39,304] A new study created in memory with name: no-name-b70e2240-78e0-40d5-a0ea-e90e2c357016


Starting optimization...


[I 2025-11-25 16:21:56,316] Trial 0 finished with value: 47.85090855990434 and parameters: {'gamma': 0.3036308728080769, 'epsilon': 0.059194517112258364, 'C': 8.07989680831947}. Best is trial 0 with value: 47.85090855990434.
[I 2025-11-25 16:22:05,856] Trial 1 finished with value: 36.179502677455176 and parameters: {'gamma': 0.057091814418589744, 'epsilon': 0.874631608797138, 'C': 49.252223377910575}. Best is trial 1 with value: 36.179502677455176.
[I 2025-11-25 16:22:14,196] Trial 2 finished with value: 62.63939422705969 and parameters: {'gamma': 8.01347694748292, 'epsilon': 0.7052367291265605, 'C': 83.89336334792569}. Best is trial 1 with value: 36.179502677455176.
[I 2025-11-25 16:22:23,483] Trial 3 finished with value: 33.55832319385047 and parameters: {'gamma': 0.009132456052661999, 'epsilon': 0.08437760066092627, 'C': 824.5155098953618}. Best is trial 3 with value: 33.55832319385047.
[I 2025-11-25 16:22:32,112] Trial 4 finished with value: 39.721156656057566 and parameters: {'gam

Optimization finished.


In [5]:
# Get the best trial information
best_trial = study.best_trial

print("\n--- Best Trial Results ---")
print(f"Best cross-validation mean absolute error: {best_trial.value:.4f}")
print("Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")


--- Best Trial Results ---
Best cross-validation mean absolute error: 32.4705
Best hyperparameters:
  gamma: 0.037280069763315986
  epsilon: 0.14239737749344758
  C: 498.7516274892395


In [6]:
# Train the final model with the best parameters on the entire training set
svr_rbf_3 = SVR(**best_trial.params, kernel = "rbf").fit(X_train, y_train)
y_pred = svr_rbf_3.predict(X_test)
results_svr_rbf_3 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_svr_rbf_3.index.name = "id"
results_svr_rbf_3.to_csv("Submissions/prediction_svr_rbf_3.csv", index=False)

## 3.) SVM with polynomial kernel

Hyperparameters that need to be tuned:
- C (see above)
- epsilon (see above)
- d (degree): degree of the polynomial function; determines the dimensionality of the feature space transformation; higher d --> more complex, non-linear relationships can be captured, but computation time and risk of overfitting increases; d=1 is linear kernel, d=2 is quadratic, d=3 is cubic
- gamma: see above; often left at default value or searched over a small space
- coef0 (r): independent term in polynomial kernel; shifts the input data; allows the kernel to map the data non-linearly even if d=1; often less crucial than C and d

In [11]:
## grid search (directly done and with smaller grid and lower cv as SVR with polynomial kernel very computationally heavy - took forever before)

parameters_grid = {
    "C": [0.1, 1, 10, 100],
    "epsilon": [0.001, 0.01, 0.1],
    "degree": [2, 3],
    "gamma": ["scale", 0.001, 0.01],
    "coef0": [-1, 0, 1]} 

svr_poly = SVR(kernel = "poly")

svr_poly_grid_search = GridSearchCV(
    estimator = svr_poly,
    param_grid = parameters_grid,
    cv = 4,
    n_jobs = -1,
    scoring = "neg_mean_absolute_error",
    error_score = "raise")

svr_poly_grid_search.fit(X_train, y_train)

print("Best parameters found: ", svr_poly_grid_search.best_params_)
print("Best score (Neg. MAE): ", svr_poly_grid_search.best_score_)

Best parameters found:  {'C': 100, 'coef0': 1, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale'}
Best score (Neg. MAE):  -36.46079321797002


In [7]:
svr_poly_1 = SVR(kernel = "poly", gamma = "scale", epsilon = 0.1, C = 100, degree = 3, coef0 = 1).fit(X_train, y_train)

y_pred = svr_poly_1.predict(X_test)
results_svr_poly_1 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_svr_poly_1.index.name = "id"
results_svr_poly_1.to_csv("Submissions/prediction_svr_poly_1.csv", index=False)

In [None]:
## optuna (with pruning & enqueuing the baseline)

# Define the Objective Function
def objective_svr_poly(trial):

    gamma = trial.suggest_categorical("gamma", ["scale", "auto"])
    epsilon = trial.suggest_float("epsilon", 0.01, 1.0, log=True)
    C = trial.suggest_float("C", 0.1, 1000, log=True)
    degree = trial.suggest_int("degree", 2, 5)
    coef0 = trial.suggest_float("coef0", 0, 2)

    # Initialize the Model
    svr_poly = SVR(
        kernel = "poly",
        gamma = gamma,
        epsilon = epsilon,
        C = C,
        degree = degree,
        coef0 = coef0)

    # Manual cross-validation loop (necessary in case of pruning)
    kf = KFold(n_splits = 4, shuffle = True, random_state = 123)
    mae_scores = []

    for step, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # Fit and Predict
        svr_poly.fit(X_tr, y_tr)
        preds = svr_poly.predict(X_val)
        
        # Calculate MAE for this fold
        fold_mae = mean_absolute_error(y_val, preds)
        mae_scores.append(fold_mae)

        # Report the MEAN MAE up to this point
        intermediate_value = np.mean(mae_scores)
        
        trial.report(intermediate_value, step = step)

        if trial.should_prune():
            raise optuna.TrialPruned()

    # Return overall mean MAE
    return np.mean(mae_scores)

# Create study
study = optuna.create_study(
    direction = "minimize",
    pruner = optuna.pruners.MedianPruner(n_warmup_steps = 1))

# ENQUEUE YOUR BASELINE (starts at the prior best model)
study.enqueue_trial({
    "gamma": "scale",
    "epsilon": 0.1,
    "C": 100,
    "degree": 3,
    "coef0": 1.0})

# Run optimization
print("Starting optimization...")
study.optimize(objective_svr_poly, n_trials = 100, timeout = 5400)
print("Optimization finished.")

[I 2025-11-25 18:02:04,719] A new study created in memory with name: no-name-2e12925a-6641-4c61-a63c-431f6f38a738


Starting optimization...


[I 2025-11-25 18:02:24,760] Trial 0 finished with value: 36.46027345008784 and parameters: {'gamma': 'scale', 'epsilon': 0.1, 'C': 100, 'degree': 3, 'coef0': 1.0}. Best is trial 0 with value: 36.46027345008784.
[I 2025-11-25 18:02:30,742] Trial 1 finished with value: 46.012186461981884 and parameters: {'gamma': 'auto', 'epsilon': 0.2097887315997636, 'C': 12.183006480888618, 'degree': 4, 'coef0': 1.5879375883879039}. Best is trial 0 with value: 36.46027345008784.
[I 2025-11-25 18:02:37,132] Trial 2 finished with value: 39.34129333173601 and parameters: {'gamma': 'auto', 'epsilon': 0.28815130342751305, 'C': 209.28896449099057, 'degree': 5, 'coef0': 0.9310097183979786}. Best is trial 0 with value: 36.46027345008784.
[I 2025-11-25 18:02:43,118] Trial 3 finished with value: 54.56178832008926 and parameters: {'gamma': 'scale', 'epsilon': 0.13803766172895515, 'C': 0.37163677370699527, 'degree': 2, 'coef0': 1.7870190366306102}. Best is trial 0 with value: 36.46027345008784.
[I 2025-11-25 18:02

In [1]:
# optuna search process was left running for over 1.5 hours (then stopped by interrupting the kernel) but only achieved 5 trials in this 
# amount of time (reason why the 6th trial took so long unclear);
# no improvement compared to the model found by grid search