# Machine learning with random forest and SVM

## 1.) Random forest

In [8]:
## import required packages
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.svm import SVR
#import optuna

In [9]:
## Load the training and test sets and preview the training data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.head())

## Separate features and target variable
# "id" and "SMILES" are not used as features; "Tm" is the target and is
# therefore removed from the training features as well.
X_train = train.drop(columns=["id", "SMILES", "Tm"])
y_train = train["Tm"]

# The test set is reduced to the same feature columns; the ids are kept
# separately so that they can be written next to the predictions.
X_test = test.drop(columns=["id", "SMILES"])
id_test = test["id"]

     id                       SMILES      Tm  Group 1  Group 2  Group 3  \
0  2175        FC1=C(F)C(F)(F)C1(F)F  213.15        0        0        0   
1  1222  c1ccc2c(c1)ccc3Nc4ccccc4c23  407.15        0        0        0   
2  2994          CCN1C(C)=Nc2ccccc12  324.15        2        1        0   
3  1704                   CC#CC(=O)O  351.15        1        0        0   
4  2526                    CCCCC(S)C  126.15        2        3        0   

   Group 4  Group 5  Group 6  Group 7  ...  Group 415  Group 416  Group 417  \
0        0        0        0        0  ...          0          0          0   
1        0        0        0        0  ...          0          0          0   
2        0        0        0        0  ...          0          0          0   
3        0        0        0        0  ...          0          0          0   
4        0        0        0        0  ...          0          0          0   

   Group 418  Group 419  Group 420  Group 421  Group 422  Group 423  Group

### 1.1.) RandomizedSearchCV

- defining a range/distribution of values for each hyperparameter --> random sampling of a fixed number of combinations
- faster than Grid Search; good for a first pass

In [3]:
## Candidate values from which the randomized search samples its combinations
parameters_grid = {
    "n_estimators": [100, 200, 400, 600, 800, 1000],
    "max_depth": [10, 15, 20, 25, 30, 35, None],
    "min_samples_split": [2, 3, 5, 7, 10],
    "min_samples_leaf": [1, 3, 5],
    # float entries are fractions of the features, e.g. 0.05 equals 5 % of features
    "max_features": ["sqrt", "log2", 0.02, 0.05, 0.1, 0.2],
}

## Seeded base model whose hyperparameters are searched
rf = RandomForestRegressor(random_state=123)

## Randomized search: 70 sampled combinations, 5-fold CV, scored by negative MAE
rf_randomized_search = RandomizedSearchCV(
    rf,
    parameters_grid,
    n_iter=70,
    scoring="neg_mean_absolute_error",
    cv=5,
    random_state=123,
    n_jobs=-1,
    error_score="raise",
)

## Run the search on the training data
rf_randomized_search.fit(X_train, y_train)

## Report the best combination and its cross-validated score
print("Best parameters found: ", rf_randomized_search.best_params_)
print("Best score (Neg. MAE): ", rf_randomized_search.best_score_)

Fitting 5 folds for each of 70 candidates, totalling 350 fits
Best parameters found:  {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_depth': 30}
Best score (Neg. MAE):  -37.117318102559906


In [9]:
# Random forest refitted on the full training set with the best parameters
# reported by the randomized search
rf1 = RandomForestRegressor(
    n_estimators=400,
    max_depth=30,
    max_features=0.2,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)
rf1.fit(X_train, y_train)

# Predict the test set and store the predictions next to the test ids
y_pred = rf1.predict(X_test)
results_rf1 = pd.DataFrame(data={"id": id_test, "Tm": y_pred})
results_rf1.index.name = "id"
results_rf1.to_csv("prediction_rf1.csv", index=False)

### 1.2.) GridSearchCV

- every single combination of specified hyperparameters tested --> best combination within the defined grid found
- can be very slow for large grids
- good to use for fine-tuning after narrowing down via Random Search (done here)

In [13]:
## grid search 1

# Grid for fine-tuning n_estimators, max_depth and max_features
parameters_grid = {
    "n_estimators": [400, 600],
    "max_depth": [20, 30, 40, None],
    "max_features": [0.1, 0.15, 0.2, 0.25, 0.3],
}

# Base model: the two min_samples_* values are fixed, only the grid entries vary
rf = RandomForestRegressor(
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)

# Exhaustive search over the grid with 5-fold CV, scored by negative MAE
rf_grid_search = GridSearchCV(
    rf,
    parameters_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    error_score="raise",
)

# Run the search on the training data
rf_grid_search.fit(X_train, y_train)

## Report the best combination and its cross-validated score
print("Best parameters found: ", rf_grid_search.best_params_)
print("Best score (Neg. MAE): ", rf_grid_search.best_score_)

Best parameters found:  {'max_depth': None, 'max_features': 0.3, 'n_estimators': 600}
Best score (Neg. MAE):  -35.96683474779305


In [14]:
# Random forest refitted on the full training set with the best parameters
# reported by grid search 1
rf2 = RandomForestRegressor(
    n_estimators=600,
    max_depth=None,
    max_features=0.3,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)
rf2.fit(X_train, y_train)

# Predict the test set and store the predictions next to the test ids
y_pred = rf2.predict(X_test)
results_rf2 = pd.DataFrame(data={"id": id_test, "Tm": y_pred})
results_rf2.index.name = "id"
results_rf2.to_csv("prediction_rf2.csv", index=False)

In [15]:
## grid search 2

# Second grid over n_estimators, max_depth and max_features
parameters_grid = {
    "n_estimators": [350, 400, 450, 500],
    "max_depth": [25, 30, 35],
    "max_features": [0.15, 0.2, 0.25],
}

# Base model with the two min_samples_* values fixed
rf = RandomForestRegressor(
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)

# Exhaustive search over the grid with 5-fold CV, scored by negative MAE
rf_grid_search = GridSearchCV(
    rf,
    parameters_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    error_score="raise",
)

rf_grid_search.fit(X_train, y_train)

# Report the best combination and its cross-validated score
print("Best parameters found: ", rf_grid_search.best_params_)
print("Best score (Neg. MAE): ", rf_grid_search.best_score_)

Best parameters found:  {'max_depth': 35, 'max_features': 0.25, 'n_estimators': 400}
Best score (Neg. MAE):  -36.463945340580196


In [16]:
# Random forest refitted on the full training set with the best parameters
# reported by grid search 2
rf3 = RandomForestRegressor(
    n_estimators=400,
    max_depth=35,
    max_features=0.25,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=123,
)
rf3.fit(X_train, y_train)

# Predict the test set and store the predictions next to the test ids
y_pred = rf3.predict(X_test)
results_rf3 = pd.DataFrame(data={"id": id_test, "Tm": y_pred})
results_rf3.index.name = "id"
results_rf3.to_csv("prediction_rf3.csv", index=False)

### 1.3.) Optuna

Optuna employs intelligent, adaptive sampling techniques such as Bayesian optimization to guide the search, allowing it to converge on better results faster.

In [None]:
# Optuna is imported in this cell because its import in the package cell is
# commented out; the rest of the notebook therefore still runs without Optuna.
import optuna


# --- 1. Define the Objective Function ---

def objective(trial):
    """Score one Optuna trial of a random forest regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters of this run.

    Returns
    -------
    float
        Mean of the 5-fold cross-validation scores on the training data,
        measured as negative mean absolute error (the same metric as in the
        random/grid searches of this notebook). Values closer to zero are
        better, so the study has to maximize this value.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 700),
        # suggest_int needs a finite upper bound (None is not accepted)
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "max_features": trial.suggest_categorical(
            "max_features", ["sqrt", "log2", 0.25, 0.3, 0.5, 0.7]
        ),
    }
    # The split criterion is not tuned: "gini"/"entropy" are classification
    # criteria, so the regressor keeps its default criterion here.

    # Initialize the model; the trees of one forest are built in parallel
    rf = RandomForestRegressor(**params, random_state=123, n_jobs=-1)

    # 5-fold cross-validation on the training set. The folds run one after
    # another because the forest itself already uses all cores.
    scores = cross_val_score(
        rf,
        X_train,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
    )

    return scores.mean()


# --- 2. Create and Run the Study ---

# Negative MAE is "higher is better", hence direction="maximize".
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),  # seeded for reproducible trials
)

# The objective is evaluated 100 times; each trial's parameters are suggested
# by the sampler based on the results of the previous trials.
print("Starting optimization...")
study.optimize(objective, n_trials=100)
print("Optimization finished.")


# --- 3. Analyze Results ---

best_trial = study.best_trial

print("\n--- Best Trial Results ---")
print(f"Best cross-validation score (Neg. MAE): {best_trial.value:.4f}")
print("Best hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")


# --- 4. Final Model Training and Prediction ---

# Refit on the entire training set with the best parameters, using the same
# seed as inside the objective so the final model matches the tuned one.
rf_optuna = RandomForestRegressor(**best_trial.params, random_state=123, n_jobs=-1)
rf_optuna.fit(X_train, y_train)

# The test set has no target column, so it cannot be scored here; the
# predictions are written out like those of the other models instead.
y_pred = rf_optuna.predict(X_test)
results_rf_optuna = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_rf_optuna.to_csv("prediction_rf_optuna.csv", index=False)

## 2.) SVR with radial kernel

SVR: a tube with an estimated function (hyperplane) in the middle and boundaries at a distance ε on either side. The algorithm's goal is to find a function that keeps as many of the original points as possible inside the tube while at the same time minimizing the "slack" (the distance by which points lie outside the tube).
Difference to SVM (classification): the support vectors are the points that lie on or outside the tube rather than the ones at the margin.

Radial and polynomial kernel: A kernel is a function that takes the original non-linear problem and transforms it into a linear one, which is then handled by the algorithm in a higher-dimensional space.

Hyperparameters that need to be tuned:
- C (regularization parameter): high C leads to stronger penalization of errors (points outside the tube), leading to a more complex model that fits the data more closely; small C allows more errors and leads to a simpler model with a larger margin
- epsilon: value of epsilon determines the width of the tube around the estimated function (hyperplane); points that fall inside this tube are considered as correct predictions and are not penalized
- gamma: inverse of the radius of influence of samples selected by the model as support vectors; with a low gamma the influence of individual training examples reaches further, affecting a larger region of the feature space (leads to a smoother and less complex decision boundary, but can result in underfitting); with a high gamma the influence is closer, affecting only the region near the training example (can lead to overfitting)

In [4]:
## randomized search

# Candidate values for C, epsilon and gamma that the search samples from
parameters_grid = {
    "C": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 1000],
    "epsilon": [0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2, 4],
    "gamma": ["scale", "auto", 0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.01, 1, 10],
}

# Base model: SVR with radial (rbf) kernel
svr_rbf = SVR(kernel="rbf")

# Randomized search: 70 sampled combinations, 5-fold CV, scored by negative MAE
svr_rbf_randomized_search = RandomizedSearchCV(
    svr_rbf,
    parameters_grid,
    n_iter=70,
    scoring="neg_mean_absolute_error",
    cv=5,
    random_state=123,
    n_jobs=-1,
    error_score="raise",
)

svr_rbf_randomized_search.fit(X_train, y_train)

# Report the best combination and its cross-validated score
print("Best parameters found: ", svr_rbf_randomized_search.best_params_)
print("Best score (Neg. MAE): ", svr_rbf_randomized_search.best_score_)

Best parameters found:  {'gamma': 'scale', 'epsilon': 0.01, 'C': 1000}
Best score (Neg. MAE):  -32.52189746601051


In [6]:
# SVR (rbf kernel) refitted on the full training set with the best parameters
# reported by the randomized search
svr_rbf_1 = SVR(kernel="rbf", C=1000, epsilon=0.01, gamma="scale")
svr_rbf_1.fit(X_train, y_train)

# Predict the test set and store the predictions next to the test ids
y_pred = svr_rbf_1.predict(X_test)
results_svr_rbf_1 = pd.DataFrame(data={"id": id_test, "Tm": y_pred})
results_svr_rbf_1.index.name = "id"
results_svr_rbf_1.to_csv("prediction_svr_rbf_1.csv", index=False)

In [7]:
# Value that gamma = "scale" resolved to in the fitted model
print(svr_rbf_1._gamma)

0.023719825417948447


In [9]:
## grid search

# Grid of C, epsilon and gamma values that is searched exhaustively
parameters_grid = {
    "C": [0.1, 0.5, 1, 10, 100, 500, 1000, 2000],
    "epsilon": [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
    "gamma": ["scale", "auto", 0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
}

# Base model: SVR with radial (rbf) kernel
svr_rbf = SVR(kernel="rbf")

# Grid search with 5-fold CV, scored by negative MAE
svr_rbf_grid_search = GridSearchCV(
    svr_rbf,
    parameters_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
    error_score="raise",
)

svr_rbf_grid_search.fit(X_train, y_train)

# Report the best combination and its cross-validated score
print("Best parameters found: ", svr_rbf_grid_search.best_params_)
print("Best score (Neg. MAE): ", svr_rbf_grid_search.best_score_)

Best parameters found:  {'C': 1000, 'epsilon': 0.5, 'gamma': 'scale'}
Best score (Neg. MAE):  -32.48013996312687


In [10]:
# SVR (rbf kernel) refitted on the full training set with the best parameters
# reported by the grid search
svr_rbf_2 = SVR(kernel="rbf", C=1000, epsilon=0.5, gamma="scale")
svr_rbf_2.fit(X_train, y_train)

# Predict the test set and store the predictions next to the test ids
y_pred = svr_rbf_2.predict(X_test)
results_svr_rbf_2 = pd.DataFrame(data={"id": id_test, "Tm": y_pred})
results_svr_rbf_2.index.name = "id"
results_svr_rbf_2.to_csv("prediction_svr_rbf_2.csv", index=False)

## 3.) SVR with polynomial kernel

Hyperparameters that need to be tuned:
- C (see above)
- epsilon (see above)
- d (degree): degree of the polynomial function; determines the dimensionality of the feature space transformation; higher d --> more complex, non-linear relationships can be captured, but computation time and risk of overfitting increases; d=1 is linear kernel, d=2 is quadratic, d=3 is cubic
- gamma: see above; often left at default value or searched over a small space
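- coef0 (r): independent term in the polynomial kernel $(\gamma \langle x, x' \rangle + r)^d$; shifts the kernel value and controls how strongly lower-order terms contribute compared to the degree-d terms (with r=0 only terms of exactly degree d remain; with d=1 the kernel stays linear regardless of r); often less crucial than C and d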

In [11]:
## grid search (run directly, with a smaller grid and fewer CV folds, because SVR
## with a polynomial kernel is computationally very heavy - took forever before)

# Grid of C, epsilon, degree, gamma and coef0 values that is searched exhaustively
parameters_grid = {
    "C": [0.1, 1, 10, 100],
    "epsilon": [0.001, 0.01, 0.1],
    "degree": [2, 3],
    "gamma": ["scale", 0.001, 0.01],
    "coef0": [-1, 0, 1],
}

# Base model: SVR with polynomial kernel
svr_poly = SVR(kernel="poly")

# Grid search with 4-fold CV, scored by negative MAE
svr_poly_grid_search = GridSearchCV(
    svr_poly,
    parameters_grid,
    scoring="neg_mean_absolute_error",
    cv=4,
    n_jobs=-1,
    error_score="raise",
)

svr_poly_grid_search.fit(X_train, y_train)

# Report the best combination and its cross-validated score
print("Best parameters found: ", svr_poly_grid_search.best_params_)
print("Best score (Neg. MAE): ", svr_poly_grid_search.best_score_)

Best parameters found:  {'C': 100, 'coef0': 1, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale'}
Best score (Neg. MAE):  -36.46079321797002


In [12]:
# SVR (polynomial kernel) refitted on the full training set with the best
# parameters reported by the grid search
svr_poly_1 = SVR(kernel = "poly", gamma = "scale", epsilon = 0.1, C = 100, degree = 3, coef0 = 1).fit(X_train, y_train)

# Predict the test set and store the predictions next to the test ids
y_pred = svr_poly_1.predict(X_test)
results_svr_poly_1 = pd.DataFrame({"id": id_test, "Tm": y_pred})
results_svr_poly_1.index.name = "id"

# Unlike the other exports, this file goes into a sub-directory. to_csv does
# not create missing directories, so make sure the folder exists first.
svr_poly_1_path = Path("Submissions/prediction_svr_poly_1.csv")
svr_poly_1_path.parent.mkdir(parents=True, exist_ok=True)
results_svr_poly_1.to_csv(svr_poly_1_path, index=False)

In [None]:
## TODO: repeat all hyperparameter searches with Optuna too
## TODO: which measures can be taken to prevent overfitting?
## TODO: standardization/normalization of the data?
## TODO: also try SVR with other kernels: 'linear' (if a linear relationship is suspected), 'sigmoid' (also for non-linear), 'precomputed'