In [58]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import smogn
from sklearn.metrics import r2_score
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
import os

In [59]:
df = pd.read_csv(f"data/data.csv")
df.head(1)

Unnamed: 0,OBJECTID_x,ID,pH,As,Cd,Ni,Pb,Zn,Hg,slope,...,HYDGRP_D,ANION_EXCL,SOL_CRK,SOL_K1,CLAY1,SILT1,SAND1,ROCK1,SOL_EC1,distance
0,1,KW-SK-S-36,7.63,17.46,0.48,1.65,91.1,,0.0,36.896198,...,0,0.5,0.5,43.15,6.0,45.0,50.0,58.0,0,26.743154


In [60]:
# Select only the columns of interest
selected_columns = ['pH', 'As', 'Cd', 'Ni', 'slope', 'altitude', 'profile_curve', 'planform_curve',
                    'HYDGRP_A', 'HYDGRP_B', 'HYDGRP_C', 'HYDGRP_D', 'SOL_K1', 'CLAY1', 'SILT1', 'SAND1', 'ROCK1', 'distance']
df_filtered = df[selected_columns]

In [61]:
# Function to remove outliers for a specific column using 1.5*IQR method
def remove_outliers_iqr_column(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    df_out = df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
    return df_out

# Remove outliers for 'As'
df_cleaned_As = remove_outliers_iqr_column(df_filtered, 'As')

# Remove outliers for 'Cd'
df_cleaned_Cd = remove_outliers_iqr_column(df_filtered, 'Cd')

# Remove outliers for 'Ni'
df_cleaned_Ni = remove_outliers_iqr_column(df_filtered, 'Ni')

In [62]:
# Features and targets
features = ['pH', 'slope', 'altitude', 'profile_curve', 'planform_curve', 'HYDGRP_A', 'HYDGRP_B', 'HYDGRP_C', 'HYDGRP_D',
            'SOL_K1', 'CLAY1', 'SILT1', 'SAND1', 'ROCK1', 'distance']
targets = ['As', 'Cd', 'Ni']

In [63]:
# # Splitting the data into training and testing sets (80:20)
# X = df_cleaned[features]

In [64]:
# Initialize StandardScaler
scaler_As = StandardScaler()
scaler_Cd = StandardScaler()
scaler_Ni = StandardScaler()

# Features and targets for cleaned DataFrames (with target log convert)
X_As = df_cleaned_As[features]
y_As = df_cleaned_As[['As']]

X_Cd = df_cleaned_Cd[features]
y_Cd = df_cleaned_Cd[['Cd']]

X_Ni = df_cleaned_Ni[features]
y_Ni = df_cleaned_Ni[['Ni']]


# Split and scale for 'As'
As_X_train, As_X_test, As_y_train, As_y_test = train_test_split(X_As, y_As, test_size=0.2, random_state=42)
As_X_train = scaler_As.fit_transform(As_X_train)
As_X_test = scaler_As.transform(As_X_test)

# Split and scale for 'Cd'
Cd_X_train, Cd_X_test, Cd_y_train, Cd_y_test = train_test_split(X_Cd, y_Cd, test_size=0.2, random_state=42)
Cd_X_train = scaler_Cd.fit_transform(Cd_X_train)
Cd_X_test = scaler_Cd.transform(Cd_X_test)

# Split and scale for 'Ni'
Ni_X_train, Ni_X_test, Ni_y_train, Ni_y_test = train_test_split(X_Ni, y_Ni, test_size=0.2, random_state=42)
Ni_X_train = scaler_Ni.fit_transform(Ni_X_train)
Ni_X_test = scaler_Ni.transform(Ni_X_test)

In [65]:
# # Initialize StandardScaler
# scaler = StandardScaler()

# # Split and scale for 'As'
# As_X_train, As_X_test, As_y_train, As_y_test = train_test_split(X, y[['As']], test_size=0.2, random_state=42)
# As_X_train = scaler.fit_transform(As_X_train)
# As_X_test = scaler.transform(As_X_test)

# # Split and scale for 'Cd'
# Cd_X_train, Cd_X_test, Cd_y_train, Cd_y_test = train_test_split(X, y[['Cd']], test_size=0.2, random_state=42)
# Cd_X_train = scaler.fit_transform(Cd_X_train)
# Cd_X_test = scaler.transform(Cd_X_test)

# # Split and scale for 'Ni'
# Ni_X_train, Ni_X_test, Ni_y_train, Ni_y_test = train_test_split(X, y[['Ni']], test_size=0.2, random_state=42)
# Ni_X_train = scaler.fit_transform(Ni_X_train)
# Ni_X_test = scaler.transform(Ni_X_test)

In [66]:
# Show the shapes to confirm
As_X_train.shape, As_X_test.shape, Cd_X_train.shape, Cd_X_test.shape, Ni_X_train.shape, Ni_X_test.shape

((1030, 15), (258, 15), (1016, 15), (255, 15), (1139, 15), (285, 15))

In [67]:
# For As
As_balanced_data = smogn.smoter(
    data=pd.DataFrame(np.hstack((As_X_train, As_y_train)), columns=features + ['As']),
    y='As',
    k=5,
    samp_method='balance',
    rel_thres=0.3
)
As_X_train = As_balanced_data[features]
As_y_train = As_balanced_data['As']

# For Cd
Cd_balanced_data = smogn.smoter(
    data=pd.DataFrame(np.hstack((Cd_X_train, Cd_y_train)), columns=features + ['Cd']),
    y='Cd',
    k=5,
    samp_method='balance',
    rel_thres=0.3
)
Cd_X_train = Cd_balanced_data[features]
Cd_y_train = Cd_balanced_data['Cd']

# For Ni
Ni_balanced_data = smogn.smoter(
    data=pd.DataFrame(np.hstack((Ni_X_train, Ni_y_train)), columns=features + ['Ni']),
    y='Ni',
    k=5,
    samp_method='balance',
    rel_thres=0.3
)
Ni_X_train = Ni_balanced_data[features]
Ni_y_train = Ni_balanced_data['Ni']

dist_matrix:   0%|          | 0/219 [00:00<?, ?it/s]

dist_matrix: 100%|##########| 219/219 [00:14<00:00, 15.62it/s]
synth_matrix: 100%|##########| 219/219 [00:00<00:00, 578.67it/s]
r_index: 100%|##########| 76/76 [00:00<00:00, 633.19it/s]
dist_matrix: 100%|##########| 188/188 [00:10<00:00, 17.41it/s]
synth_matrix: 100%|##########| 188/188 [00:00<00:00, 627.97it/s]
r_index: 100%|##########| 131/131 [00:00<00:00, 609.95it/s]
dist_matrix: 100%|##########| 289/289 [00:26<00:00, 11.04it/s]
r_index: 100%|##########| 281/281 [00:00<00:00, 631.58it/s]


In [68]:
As_X_train.shape, As_X_test.shape, Cd_X_train.shape, Cd_X_test.shape, Ni_X_train.shape, Ni_X_test.shape

((810, 15), (258, 15), (828, 15), (255, 15), (850, 15), (285, 15))

In [69]:
# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# RF

### As

In [70]:
# Define the hyperparameter space
param_space_rf = {
    'n_estimators': (10, 100),
    'max_depth': (1, 50),
    'min_samples_leaf': (1, 4),
    'min_samples_split': (2, 10)
}

In [71]:
# Initialize and fit BayesSearchCV with k-fold
opt_rf = BayesSearchCV(
    RandomForestRegressor(),
    param_space_rf,
    n_iter=100,
    cv=kf,
    n_jobs=-1
)
opt_rf.fit(As_X_train, As_y_train.values.ravel())

In [72]:
# Best hyperparameters and score for RF
print("Best hyperparameters for RF: ", opt_rf.best_params_)
print("Best score for RF: ", opt_rf.best_score_)

Best hyperparameters for RF:  OrderedDict([('max_depth', 50), ('min_samples_leaf', 2), ('min_samples_split', 3), ('n_estimators', 100)])
Best score for RF:  0.30235001407684525


In [73]:
# Predict on the training set
As_y_pred_rf_train = opt_rf.predict(As_X_train)

# Calculate RMSE and R2 for the training set
rmse_rf_train = np.sqrt(mean_squared_error(As_y_train, As_y_pred_rf_train))
r2_rf_train = r2_score(As_y_train, As_y_pred_rf_train)

# Predict on the test set
As_y_pred_rf_test = opt_rf.predict(As_X_test)

# Calculate RMSE and R2 for the test set
rmse_rf_test = np.sqrt(mean_squared_error(As_y_test, As_y_pred_rf_test))
r2_rf_test = r2_score(As_y_test, As_y_pred_rf_test)



In [74]:
print(f'Random Forest Training RMSE: {rmse_rf_train}')
print(f'Random Forest Test RMSE: {rmse_rf_test}\n')
print(f'Random Forest Training R2: {r2_rf_train}')
print(f'Random Forest Test R2: {r2_rf_test}')

Random Forest Training RMSE: 0.4024312770356228
Random Forest Test RMSE: 0.9220493070681306

Random Forest Training R2: 0.835911802377212
Random Forest Test R2: -0.05811738521330345


### Cd

In [75]:
# Define the hyperparameter space
param_space_rf = {
    'n_estimators': (10, 100),
    'max_depth': (1, 50),
    'min_samples_leaf': (1, 4),
    'min_samples_split': (2, 10)
}

In [76]:
# Initialize and fit BayesSearchCV with k-fold
opt_rf = BayesSearchCV(
    RandomForestRegressor(),
    param_space_rf,
    n_iter=100,
    cv=kf,
    n_jobs=-1
)
opt_rf.fit(Cd_X_train, Cd_y_train.values.ravel())



In [77]:
# Best hyperparameters and score for RF
print("Best hyperparameters for RF: ", opt_rf.best_params_)
print("Best score for RF: ", opt_rf.best_score_)

Best hyperparameters for RF:  OrderedDict([('max_depth', 43), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 53)])
Best score for RF:  0.4156689854936726


In [78]:
# Predict on the training set
Cd_y_pred_rf_train = opt_rf.predict(Cd_X_train)

# Calculate RMSE and R2 for the training set
rmse_rf_train = np.sqrt(mean_squared_error(Cd_y_train, Cd_y_pred_rf_train))
r2_rf_train = r2_score(Cd_y_train, Cd_y_pred_rf_train)

# Predict on the test set
Cd_y_pred_rf_test = opt_rf.predict(Cd_X_test)

# Calculate RMSE and R2 for the test set
rmse_rf_test = np.sqrt(mean_squared_error(Cd_y_test, Cd_y_pred_rf_test))
r2_rf_test = r2_score(Cd_y_test, Cd_y_pred_rf_test)



In [79]:
print(f'Random Forest Training RMSE: {rmse_rf_train}')
print(f'Random Forest Test RMSE: {rmse_rf_test}\n')
print(f'Random Forest Training R2: {r2_rf_train}')
print(f'Random Forest Test R2: {r2_rf_test}')

Random Forest Training RMSE: 0.02310794539635266
Random Forest Test RMSE: 0.06839738160441662

Random Forest Training R2: 0.9185113281830278
Random Forest Test R2: -0.1483959657371683


### Ni

In [80]:
# Define the hyperparameter space
param_space_rf = {
    'n_estimators': (10, 100),
    'max_depth': (1, 50),
    'min_samples_leaf': (1, 4),
    'min_samples_split': (2, 10)
}

In [81]:
# Initialize and fit BayesSearchCV with k-fold
opt_rf = BayesSearchCV(
    RandomForestRegressor(),
    param_space_rf,
    n_iter=100,
    cv=kf,
    n_jobs=-1
)
opt_rf.fit(Ni_X_train, Ni_y_train.values.ravel())



In [82]:
# Best hyperparameters and score for RF
print("Best hyperparameters for RF: ", opt_rf.best_params_)
print("Best score for RF: ", opt_rf.best_score_)

Best hyperparameters for RF:  OrderedDict([('max_depth', 48), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 100)])
Best score for RF:  0.6130331340440616


In [83]:
# Predict on the training set
Ni_y_pred_rf_train = opt_rf.predict(Ni_X_train)

# Calculate RMSE and R2 for the training set
rmse_rf_train = np.sqrt(mean_squared_error(Ni_y_train, Ni_y_pred_rf_train))
r2_rf_train = r2_score(Ni_y_train, Ni_y_pred_rf_train)

# Predict on the test set
Ni_y_pred_rf_test = opt_rf.predict(Ni_X_test)

# Calculate RMSE and R2 for the test set
rmse_rf_test = np.sqrt(mean_squared_error(Ni_y_test, Ni_y_pred_rf_test))
r2_rf_test = r2_score(Ni_y_test, Ni_y_pred_rf_test)



In [84]:
print(f'Random Forest Training RMSE: {rmse_rf_train}')
print(f'Random Forest Test RMSE: {rmse_rf_test}\n')
print(f'Random Forest Training R2: {r2_rf_train}')
print(f'Random Forest Test R2: {r2_rf_test}')

Random Forest Training RMSE: 2.9310616704390746
Random Forest Test RMSE: 7.485183309980645

Random Forest Training R2: 0.9485034090664214
Random Forest Test R2: 0.6044334767097723


# GBM

### As

In [85]:
# Define the hyperparameter space for GBR
param_space_gbr = {
    'n_estimators': (10, 100),
    'max_depth': (1, 50),
    'min_samples_leaf': (1, 4),
    'min_samples_split': (2, 10),
    'subsample': (0.5, 1)
}

In [86]:
# Initialize and fit BayesSearchCV with k-fold for GBR
opt_gbr = BayesSearchCV(
    GradientBoostingRegressor(),
    param_space_gbr,
    n_iter=100,
    cv=kf,
    n_jobs=-1
)
opt_gbr.fit(As_X_train, As_y_train.ravel())

In [87]:
# Best hyperparameters and score for GBR
print("Best hyperparameters for GBR: ", opt_gbr.best_params_)
print("Best score for GBR: ", opt_gbr.best_score_)

Best hyperparameters for GBR:  OrderedDict([('max_depth', 6), ('min_samples_leaf', 3), ('min_samples_split', 9), ('n_estimators', 20), ('subsample', 0.6460224183143063)])
Best score for GBR:  0.2867939269936689


In [88]:
# Predict on the training set
As_y_pred_gbr_train = opt_gbr.predict(As_X_train)

# Calculate RMSE and R2 for the training set
rmse_gbr_train = np.sqrt(mean_squared_error(As_y_train, As_y_pred_gbr_train))
r2_gbr_train = r2_score(As_y_train, As_y_pred_gbr_train)

# Predict on the test set
As_y_pred_gbr_test = opt_gbr.predict(As_X_test)

# Calculate RMSE and R2 for the test set
rmse_gbr_test = np.sqrt(mean_squared_error(As_y_test, As_y_pred_gbr_test))
r2_gbr_test = r2_score(As_y_test, As_y_pred_gbr_test)



In [89]:
print(f'GBM Training RMSE: {rmse_gbr_train}')
print(f'GBM Test RMSE: {rmse_gbr_test}\n')
print(f'GBM Training R2: {r2_gbr_train}')
print(f'GBM Test R2: {r2_gbr_test}')

GBM Training RMSE: 0.5859472489062496
GBM Test RMSE: 0.9447139728559318

GBM Training R2: 0.6521348436803882
GBM Test R2: -0.11077535698123842


### Cd

In [90]:
# Define the hyperparameter space for GBR
param_space_gbr = {
    'n_estimators': (10, 100),
    'max_depth': (1, 50),
    'min_samples_leaf': (1, 4),
    'min_samples_split': (2, 10),
    'subsample': (0.5, 1)
}

In [91]:
# Initialize and fit BayesSearchCV with k-fold for GBR
opt_gbr = BayesSearchCV(
    GradientBoostingRegressor(),
    param_space_gbr,
    n_iter=100,
    cv=kf,
    n_jobs=-1
)
opt_gbr.fit(Cd_X_train, Cd_y_train.ravel())

In [92]:
# Best hyperparameters and score for GBR
print("Best hyperparameters for GBR: ", opt_gbr.best_params_)
print("Best score for GBR: ", opt_gbr.best_score_)

Best hyperparameters for GBR:  OrderedDict([('max_depth', 3), ('min_samples_leaf', 4), ('min_samples_split', 10), ('n_estimators', 100), ('subsample', 0.8941040359252148)])
Best score for GBR:  0.40546652339051975


In [93]:
# Predict on the training set
Cd_y_pred_gbr_train = opt_gbr.predict(Cd_X_train)

# Calculate RMSE and R2 for the training set
rmse_gbr_train = np.sqrt(mean_squared_error(Cd_y_train, Cd_y_pred_gbr_train))
r2_gbr_train = r2_score(Cd_y_train, Cd_y_pred_gbr_train)

# Predict on the test set
Cd_y_pred_gbr_test = opt_gbr.predict(Cd_X_test)

# Calculate RMSE and R2 for the test set
rmse_gbr_test = np.sqrt(mean_squared_error(Cd_y_test, Cd_y_pred_gbr_test))
r2_gbr_test = r2_score(Cd_y_test, Cd_y_pred_gbr_test)



In [94]:
print(f'GBM Training RMSE: {rmse_gbr_train}')
print(f'GBM Training R2: {r2_gbr_train}')
print(f'GBM Test RMSE: {rmse_gbr_test}')
print(f'GBM Test R2: {r2_gbr_test}')

GBM Training RMSE: 0.045547322266831135
GBM Training R2: 0.683407754151965
GBM Test RMSE: 0.06717511098640949
GBM Test R2: -0.10771870882893464


### Ni

In [95]:
# Define the hyperparameter space for GBR
param_space_gbr = {
    'n_estimators': (10, 100),
    'max_depth': (1, 50),
    'min_samples_leaf': (1, 4),
    'min_samples_split': (2, 10),
    'subsample': (0.5, 1)
}

In [96]:
# Initialize and fit BayesSearchCV with k-fold for GBR
opt_gbr = BayesSearchCV(
    GradientBoostingRegressor(),
    param_space_gbr,
    n_iter=100,
    cv=kf,
    n_jobs=-1
)
opt_gbr.fit(Ni_X_train, Ni_y_train.ravel())

In [97]:
# Best hyperparameters and score for GBR
print("Best hyperparameters for GBR: ", opt_gbr.best_params_)
print("Best score for GBR: ", opt_gbr.best_score_)

Best hyperparameters for GBR:  OrderedDict([('max_depth', 28), ('min_samples_leaf', 3), ('min_samples_split', 3), ('n_estimators', 61), ('subsample', 0.5784805195638673)])
Best score for GBR:  0.630609539357946


In [98]:
# Predict on the training set
Ni_y_pred_gbr_train = opt_gbr.predict(Ni_X_train)

# Calculate RMSE and R2 for the training set
rmse_gbr_train = np.sqrt(mean_squared_error(Ni_y_train, Ni_y_pred_gbr_train))
r2_gbr_train = r2_score(Ni_y_train, Ni_y_pred_gbr_train)

# Predict on the test set
Ni_y_pred_gbr_test = opt_gbr.predict(Ni_X_test)

# Calculate RMSE and R2 for the test set
rmse_gbr_test = np.sqrt(mean_squared_error(Ni_y_test, Ni_y_pred_gbr_test))
r2_gbr_test = r2_score(Ni_y_test, Ni_y_pred_gbr_test)



In [99]:
print(f'GBM Training RMSE: {rmse_gbr_train}')
print(f'GBM Test RMSE: {rmse_gbr_test}\n')
print(f'GBM Training R2: {r2_gbr_train}')
print(f'GBM Test R2: {r2_gbr_test}')

GBM Training RMSE: 1.263457233287823
GBM Test RMSE: 7.386841472488777

GBM Training R2: 0.9904313722496192
GBM Test R2: 0.6147592614772992


# SVM

### As

In [100]:
# Define the hyperparameter space for SVM
param_space_svm = {
    'C': (1e-6, 1e+6, 'log-uniform'),
    'epsilon': (1e-6, 1e+1, 'log-uniform'),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': (1, 5)
}

In [101]:
# Initialize and fit BayesSearchCV with k-fold for SVM
opt_svm = BayesSearchCV(
    SVR(),
    param_space_svm,
    n_iter=20,
    cv=kf,
    n_jobs=-1
)
opt_svm.fit(As_X_train, As_y_train.ravel())

In [102]:
# Best hyperparameters and score for SVM
print("Best hyperparameters for SVM: ", opt_svm.best_params_)
print("Best score for SVM: ", opt_svm.best_score_)

Best hyperparameters for SVM:  OrderedDict([('C', 2.0402114962615974), ('degree', 4), ('epsilon', 0.035482409357961285), ('kernel', 'rbf')])
Best score for SVM:  0.25942033607656423


In [103]:
# Predict on the training set
As_y_pred_svm_train = opt_svm.predict(As_X_train)

# Calculate RMSE and R2 for the training set
rmse_svm_train = np.sqrt(mean_squared_error(As_y_train, As_y_pred_svm_train))
r2_svm_train = r2_score(As_y_train, As_y_pred_svm_train)

# Predict on the test set
As_y_pred_svm_test = opt_svm.predict(As_X_test)

# Calculate RMSE and R2 for the test set
rmse_svm_test = np.sqrt(mean_squared_error(As_y_test, As_y_pred_svm_test))
r2_svm_test = r2_score(As_y_test, As_y_pred_svm_test)



In [104]:
print(f'SVM Training RMSE: {rmse_svm_train}')
print(f'SVM Test RMSE: {rmse_svm_test}\n')
print(f'SVM Training R2: {r2_svm_train}')
print(f'SVM Test R2: {r2_svm_test}')

SVM Training RMSE: 0.6744909708298944
SVM Test RMSE: 0.9525192011290202

SVM Training R2: 0.5390581169816
SVM Test R2: -0.12920563461018086


### Cd

In [105]:
# Define the hyperparameter space for SVM
param_space_svm = {
    'C': (1e-6, 1e+6, 'log-uniform'),
    'epsilon': (1e-6, 1e+1, 'log-uniform'),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': (1, 5)
}

In [106]:
# Initialize and fit BayesSearchCV with k-fold for SVM
opt_svm = BayesSearchCV(
    SVR(),
    param_space_svm,
    n_iter=20,
    cv=kf,
    n_jobs=-1
)
opt_svm.fit(Cd_X_train, Cd_y_train.ravel())

In [107]:
# Best hyperparameters and score for SVM
print("Best hyperparameters for SVM: ", opt_svm.best_params_)
print("Best score for SVM: ", opt_svm.best_score_)

Best hyperparameters for SVM:  OrderedDict([('C', 0.02266518746180975), ('degree', 5), ('epsilon', 6.119884560960456e-05), ('kernel', 'rbf')])
Best score for SVM:  0.3233050566599334


In [108]:
# Predict on the training set
Cd_y_pred_svm_train = opt_svm.predict(Cd_X_train)

# Calculate RMSE and R2 for the training set
rmse_svm_train = np.sqrt(mean_squared_error(Cd_y_train, Cd_y_pred_svm_train))
r2_svm_train = r2_score(Cd_y_train, Cd_y_pred_svm_train)

# Predict on the test set
Cd_y_pred_svm_test = opt_svm.predict(Cd_X_test)

# Calculate RMSE and R2 for the test set
rmse_svm_test = np.sqrt(mean_squared_error(Cd_y_test, Cd_y_pred_svm_test))
r2_svm_test = r2_score(Cd_y_test, Cd_y_pred_svm_test)



In [109]:
print(f'SVM Training RMSE: {rmse_svm_train}')
print(f'SVM Test RMSE: {rmse_svm_test}\n')
print(f'SVM Training R2: {r2_svm_train}')
print(f'SVM Test R2: {r2_svm_test}')

SVM Training RMSE: 0.06085605286739849
SVM Test RMSE: 0.06520632777830501

SVM Training R2: 0.43482605137102137
SVM Test R2: -0.04373967182340199


### Ni

In [110]:
# Define the hyperparameter space for SVM
param_space_svm = {
    'C': (1e-6, 1e+6, 'log-uniform'),
    'epsilon': (1e-6, 1e+1, 'log-uniform'),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': (1, 5)
}

In [111]:
# Initialize and fit BayesSearchCV with k-fold for SVM
opt_svm = BayesSearchCV(
    SVR(),
    param_space_svm,
    n_iter=20,
    cv=kf,
    n_jobs=-1
)
opt_svm.fit(Ni_X_train, Ni_y_train.ravel())

In [112]:
# Best hyperparameters and score for SVM
print("Best hyperparameters for SVM: ", opt_svm.best_params_)
print("Best score for SVM: ", opt_svm.best_score_)

Best hyperparameters for SVM:  OrderedDict([('C', 579.4030798388169), ('degree', 2), ('epsilon', 10.0), ('kernel', 'rbf')])
Best score for SVM:  0.3604488233087428


In [113]:
# Predict on the training set
Ni_y_pred_svm_train = opt_svm.predict(Ni_X_train)

# Calculate RMSE and R2 for the training set
rmse_svm_train = np.sqrt(mean_squared_error(Ni_y_train, Ni_y_pred_svm_train))
r2_svm_train = r2_score(Ni_y_train, Ni_y_pred_svm_train)

# Predict on the test set
Ni_y_pred_svm_test = opt_svm.predict(Ni_X_test)

# Calculate RMSE and R2 for the test set
rmse_svm_test = np.sqrt(mean_squared_error(Ni_y_test, Ni_y_pred_svm_test))
r2_svm_test = r2_score(Ni_y_test, Ni_y_pred_svm_test)



In [114]:
print(f'SVM Training RMSE: {rmse_svm_train}')
print(f'SVM Test RMSE: {rmse_svm_test}\n')
print(f'SVM Training R2: {r2_svm_train}')
print(f'SVM Test R2: {r2_svm_test}')

SVM Training RMSE: 7.209898588347954
SVM Test RMSE: 11.634932472036043

SVM Training R2: 0.6884075832965584
SVM Test R2: 0.04425423305743548
