In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import smogn
from sklearn.metrics import r2_score
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
import os

In [2]:
# Load the raw data table; the path is relative to the notebook's working directory.
# A plain string literal is used because the path contains no placeholders to format.
df = pd.read_csv("data/data.csv")
# Display the first row as a quick check of the loaded columns.
df.head(1)

Unnamed: 0,OBJECTID_x,ID,pH,As,Cd,Ni,Pb,Zn,Hg,slope,...,HYDGRP_D,ANION_EXCL,SOL_CRK,SOL_K1,CLAY1,SILT1,SAND1,ROCK1,SOL_EC1,distance
0,1,KW-SK-S-36,7.63,17.46,0.48,1.65,91.1,,0.0,36.896198,...,0,0.5,0.5,43.15,6.0,45.0,50.0,58.0,0,26.743154


In [3]:
# Restrict the table to the columns used for modelling:
# pH, the three target elements, and the remaining predictor columns.
selected_columns = [
    'pH', 'As', 'Cd', 'Ni',
    'slope', 'altitude', 'profile_curve', 'planform_curve',
    'HYDGRP_A', 'HYDGRP_B', 'HYDGRP_C', 'HYDGRP_D',
    'SOL_K1', 'CLAY1', 'SILT1', 'SAND1', 'ROCK1',
    'distance',
]
df_filtered = df.loc[:, selected_columns]

In [4]:
# Row filter for a single column based on the 1.5*IQR fences
def remove_outliers_iqr_column(df, column):
    """Return the rows of `df` whose `column` value is not outside the 1.5*IQR fences.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR, with the quartiles taken from
    `column` itself. Only values strictly below the lower fence or strictly above
    the upper fence are dropped, so values exactly on a fence are kept, and rows
    for which neither comparison is true (e.g. NaN) are kept as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column the fences are computed from.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index labels.
    """
    values = df[column]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    below_fence = values < (lower_quartile - 1.5 * spread)
    above_fence = values > (upper_quartile + 1.5 * spread)
    return df[~(below_fence | above_fence)]

# Build one outlier-filtered frame per target element. Each frame is filtered on its
# own target column only, so the three frames can contain different rows.
df_cleaned_As, df_cleaned_Cd, df_cleaned_Ni = (
    remove_outliers_iqr_column(df_filtered, element)
    for element in ('As', 'Cd', 'Ni')
)

In [5]:
# Response columns; each one is modelled separately below
targets = ['As', 'Cd', 'Ni']

# Predictor columns shared by every model
features = [
    'pH',
    'slope', 'altitude', 'profile_curve', 'planform_curve',
    'HYDGRP_A', 'HYDGRP_B', 'HYDGRP_C', 'HYDGRP_D',
    'SOL_K1', 'CLAY1', 'SILT1', 'SAND1', 'ROCK1',
    'distance',
]

In [6]:
# # Splitting the data into training and testing sets (80:20)
# X = df_cleaned[features]

In [7]:
# One scaler per target: each target has its own outlier-filtered frame, so each
# training split is standardised independently.
scaler_As = StandardScaler()
scaler_Cd = StandardScaler()
scaler_Ni = StandardScaler()

# Features and targets for the cleaned DataFrames.
# The targets are used on their original scale (no log transform is applied here) and
# are selected with double brackets so they stay single-column DataFrames.
X_As = df_cleaned_As[features]
y_As = df_cleaned_As[['As']]

X_Cd = df_cleaned_Cd[features]
y_Cd = df_cleaned_Cd[['Cd']]

X_Ni = df_cleaned_Ni[features]
y_Ni = df_cleaned_Ni[['Ni']]


def _scaled_frame(values, like):
    """Wrap a scaled array in a DataFrame that carries `like`'s columns and index.

    StandardScaler returns a bare ndarray. Keeping the column names lets the train
    and test features share the same feature names when they reach the estimators.
    """
    return pd.DataFrame(values, columns=like.columns, index=like.index)


# Split and scale for 'As' (80:20 split; the scaler is fit on the training part only)
As_X_train, As_X_test, As_y_train, As_y_test = train_test_split(X_As, y_As, test_size=0.2, random_state=42)
As_X_train = _scaled_frame(scaler_As.fit_transform(As_X_train), As_X_train)
As_X_test = _scaled_frame(scaler_As.transform(As_X_test), As_X_test)

# Split and scale for 'Cd'
Cd_X_train, Cd_X_test, Cd_y_train, Cd_y_test = train_test_split(X_Cd, y_Cd, test_size=0.2, random_state=42)
Cd_X_train = _scaled_frame(scaler_Cd.fit_transform(Cd_X_train), Cd_X_train)
Cd_X_test = _scaled_frame(scaler_Cd.transform(Cd_X_test), Cd_X_test)

# Split and scale for 'Ni'
Ni_X_train, Ni_X_test, Ni_y_train, Ni_y_test = train_test_split(X_Ni, y_Ni, test_size=0.2, random_state=42)
Ni_X_train = _scaled_frame(scaler_Ni.fit_transform(Ni_X_train), Ni_X_train)
Ni_X_test = _scaled_frame(scaler_Ni.transform(Ni_X_test), Ni_X_test)

In [8]:
# # Initialize StandardScaler
# scaler = StandardScaler()

# # Split and scale for 'As'
# As_X_train, As_X_test, As_y_train, As_y_test = train_test_split(X, y[['As']], test_size=0.2, random_state=42)
# As_X_train = scaler.fit_transform(As_X_train)
# As_X_test = scaler.transform(As_X_test)

# # Split and scale for 'Cd'
# Cd_X_train, Cd_X_test, Cd_y_train, Cd_y_test = train_test_split(X, y[['Cd']], test_size=0.2, random_state=42)
# Cd_X_train = scaler.fit_transform(Cd_X_train)
# Cd_X_test = scaler.transform(Cd_X_test)

# # Split and scale for 'Ni'
# Ni_X_train, Ni_X_test, Ni_y_train, Ni_y_test = train_test_split(X, y[['Ni']], test_size=0.2, random_state=42)
# Ni_X_train = scaler.fit_transform(Ni_X_train)
# Ni_X_test = scaler.transform(Ni_X_test)

In [9]:
# Display (rows, columns) for every split, ordered As, Cd, Ni with train before test
tuple(
    split.shape
    for split in (As_X_train, As_X_test, Cd_X_train, Cd_X_test, Ni_X_train, Ni_X_test)
)

((1030, 15), (258, 15), (1016, 15), (255, 15), (1139, 15), (285, 15))

In [10]:
def _smogn_balance(X_train, y_train, target):
    """Run smogn.smoter on one training split and return the resampled frame.

    Parameters
    ----------
    X_train : array-like, 2-D
        Training features, one column per entry of `features`.
    y_train : array-like, 2-D with a single column
        Training target; it must be 2-D because it is stacked next to `X_train`.
    target : str
        Column name given to the target in the stacked frame.

    Returns
    -------
    pandas.DataFrame
        Output of smogn.smoter, holding the `features` columns plus `target`.
    """
    stacked = pd.DataFrame(np.hstack((X_train, y_train)), columns=features + [target])
    return smogn.smoter(
        data=stacked,
        y=target,
        k=5,
        samp_method='extreme',
        rel_thres=0.7
    )


# NOTE: this cell rebinds the *_X_train / *_y_train names (y becomes a 1-D Series),
# so it is meant to run once after the split-and-scale cell.

# For As
As_balanced_data = _smogn_balance(As_X_train, As_y_train, 'As')
As_X_train = As_balanced_data[features]
As_y_train = As_balanced_data['As']

# For Cd
Cd_balanced_data = _smogn_balance(Cd_X_train, Cd_y_train, 'Cd')
Cd_X_train = Cd_balanced_data[features]
Cd_y_train = Cd_balanced_data['Cd']

# For Ni
Ni_balanced_data = _smogn_balance(Ni_X_train, Ni_y_train, 'Ni')
Ni_X_train = Ni_balanced_data[features]
Ni_y_train = Ni_balanced_data['Ni']

dist_matrix:   2%|2         | 3/144 [00:00<00:06, 20.83it/s]

dist_matrix: 100%|##########| 144/144 [00:06<00:00, 23.22it/s]
synth_matrix: 100%|##########| 144/144 [00:01<00:00, 112.77it/s]
r_index: 100%|##########| 28/28 [00:00<00:00, 474.48it/s]
dist_matrix: 100%|##########| 86/86 [00:02<00:00, 39.10it/s]
synth_matrix: 100%|##########| 86/86 [00:01<00:00, 64.35it/s]
r_index: 100%|##########| 68/68 [00:00<00:00, 540.64it/s]
dist_matrix: 100%|##########| 105/105 [00:03<00:00, 32.48it/s]
synth_matrix: 100%|##########| 105/105 [00:01<00:00, 71.94it/s]
r_index: 100%|##########| 94/94 [00:00<00:00, 533.97it/s]


In [11]:
# Display the split shapes again now that the training sets have been resampled
tuple(
    split.shape
    for split in (As_X_train, As_X_test, Cd_X_train, Cd_X_test, Ni_X_train, Ni_X_test)
)

((1601, 15), (258, 15), (1765, 15), (255, 15), (1970, 15), (285, 15))

In [12]:
# Initialize k-fold cross-validation: 5 shuffled folds with a fixed seed, reused by
# every hyperparameter search below.
# NOTE(review): the searches are fit on the SMOGN-resampled training sets, so the
# validation folds include synthetic rows. The recorded CV scores (e.g. 0.63 for RF on
# As) sit far above the held-out test R2 (0.11), so best_score_ looks optimistic —
# TODO confirm by resampling inside each fold instead.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# RF

### As

In [13]:
# Random-forest search space: one (low, high) integer range per hyperparameter
param_space_rf = dict(
    n_estimators=(10, 100),
    max_depth=(1, 50),
    min_samples_leaf=(1, 4),
    min_samples_split=(2, 10),
)

In [14]:
# Bayesian hyperparameter search for the As random forest, cross-validated with `kf`.
# Both the forest and the search are seeded so that the selected hyperparameters and
# scores can be reproduced after a kernel restart.
opt_rf = BayesSearchCV(
    RandomForestRegressor(random_state=42),
    param_space_rf,
    n_iter=100,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() passes the target as a flat 1-D array
opt_rf.fit(As_X_train, As_y_train.values.ravel())



In [15]:
# Report the configuration selected by the RF search and its best search score
for label, value in (
    ("Best hyperparameters for RF: ", opt_rf.best_params_),
    ("Best score for RF: ", opt_rf.best_score_),
):
    print(label, value)

Best hyperparameters for RF:  OrderedDict([('max_depth', 11), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 100)])
Best score for RF:  0.6277329583980213


In [16]:
# Predictions of the fitted RF search on the As training and test features
As_y_pred_rf_train = opt_rf.predict(As_X_train)
As_y_pred_rf_test = opt_rf.predict(As_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_rf_train = r2_score(As_y_train, As_y_pred_rf_train)
rmse_rf_train = np.sqrt(mean_squared_error(As_y_train, As_y_pred_rf_train))

# Test-set metrics, computed the same way
r2_rf_test = r2_score(As_y_test, As_y_pred_rf_test)
rmse_rf_test = np.sqrt(mean_squared_error(As_y_test, As_y_pred_rf_test))



In [17]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'Random Forest Training RMSE: {rmse_rf_train}',
    f'Random Forest Test RMSE: {rmse_rf_test}\n',
    f'Random Forest Training R2: {r2_rf_train}',
    f'Random Forest Test R2: {r2_rf_test}',
    sep='\n',
)

Random Forest Training RMSE: 0.36182899493067777
Random Forest Test RMSE: 0.8456551434239846

Random Forest Training R2: 0.8894829968331133
Random Forest Test R2: 0.10995462498983977


### Cd

In [18]:
# Random-forest search space for Cd: one (low, high) integer range per hyperparameter
param_space_rf = dict(
    n_estimators=(10, 100),
    max_depth=(1, 50),
    min_samples_leaf=(1, 4),
    min_samples_split=(2, 10),
)

In [19]:
# Bayesian hyperparameter search for the Cd random forest, cross-validated with `kf`.
# Both the forest and the search are seeded so that the selected hyperparameters and
# scores can be reproduced after a kernel restart.
opt_rf = BayesSearchCV(
    RandomForestRegressor(random_state=42),
    param_space_rf,
    n_iter=100,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() passes the target as a flat 1-D array
opt_rf.fit(Cd_X_train, Cd_y_train.values.ravel())



In [20]:
# Report the configuration selected by the RF search on Cd and its best search score
for label, value in (
    ("Best hyperparameters for RF: ", opt_rf.best_params_),
    ("Best score for RF: ", opt_rf.best_score_),
):
    print(label, value)

Best hyperparameters for RF:  OrderedDict([('max_depth', 28), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 85)])
Best score for RF:  0.7395824138562748


In [21]:
# Predictions of the fitted RF search on the Cd training and test features
Cd_y_pred_rf_train = opt_rf.predict(Cd_X_train)
Cd_y_pred_rf_test = opt_rf.predict(Cd_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_rf_train = r2_score(Cd_y_train, Cd_y_pred_rf_train)
rmse_rf_train = np.sqrt(mean_squared_error(Cd_y_train, Cd_y_pred_rf_train))

# Test-set metrics, computed the same way
r2_rf_test = r2_score(Cd_y_test, Cd_y_pred_rf_test)
rmse_rf_test = np.sqrt(mean_squared_error(Cd_y_test, Cd_y_pred_rf_test))



In [22]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'Random Forest Training RMSE: {rmse_rf_train}',
    f'Random Forest Test RMSE: {rmse_rf_test}\n',
    f'Random Forest Training R2: {r2_rf_train}',
    f'Random Forest Test R2: {r2_rf_test}',
    sep='\n',
)

Random Forest Training RMSE: 0.018664407117952347
Random Forest Test RMSE: 0.06162297647198577

Random Forest Training R2: 0.9635676324415786
Random Forest Test R2: 0.0678237184259668


### Ni

In [23]:
# Random-forest search space for Ni: one (low, high) integer range per hyperparameter
param_space_rf = dict(
    n_estimators=(10, 100),
    max_depth=(1, 50),
    min_samples_leaf=(1, 4),
    min_samples_split=(2, 10),
)

In [24]:
# Bayesian hyperparameter search for the Ni random forest, cross-validated with `kf`.
# Both the forest and the search are seeded so that the selected hyperparameters and
# scores can be reproduced after a kernel restart.
opt_rf = BayesSearchCV(
    RandomForestRegressor(random_state=42),
    param_space_rf,
    n_iter=100,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() passes the target as a flat 1-D array
opt_rf.fit(Ni_X_train, Ni_y_train.values.ravel())



In [25]:
# Report the configuration selected by the RF search on Ni and its best search score
for label, value in (
    ("Best hyperparameters for RF: ", opt_rf.best_params_),
    ("Best score for RF: ", opt_rf.best_score_),
):
    print(label, value)

Best hyperparameters for RF:  OrderedDict([('max_depth', 16), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 97)])
Best score for RF:  0.8696497303886433


In [26]:
# Predictions of the fitted RF search on the Ni training and test features
Ni_y_pred_rf_train = opt_rf.predict(Ni_X_train)
Ni_y_pred_rf_test = opt_rf.predict(Ni_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_rf_train = r2_score(Ni_y_train, Ni_y_pred_rf_train)
rmse_rf_train = np.sqrt(mean_squared_error(Ni_y_train, Ni_y_pred_rf_train))

# Test-set metrics, computed the same way
r2_rf_test = r2_score(Ni_y_test, Ni_y_pred_rf_test)
rmse_rf_test = np.sqrt(mean_squared_error(Ni_y_test, Ni_y_pred_rf_test))



In [27]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'Random Forest Training RMSE: {rmse_rf_train}',
    f'Random Forest Test RMSE: {rmse_rf_test}\n',
    f'Random Forest Training R2: {r2_rf_train}',
    f'Random Forest Test R2: {r2_rf_test}',
    sep='\n',
)

Random Forest Training RMSE: 2.0044219784917017
Random Forest Test RMSE: 6.37428947231096

Random Forest Training R2: 0.9834803159369605
Random Forest Test R2: 0.7131345711430482


# GBM

### As

In [28]:
# Gradient-boosting search space: one (low, high) range per hyperparameter
param_space_gbr = dict(
    n_estimators=(10, 100),
    max_depth=(1, 50),
    min_samples_leaf=(1, 4),
    min_samples_split=(2, 10),
    subsample=(0.5, 1),
)

In [29]:
# Bayesian hyperparameter search for the As gradient-boosting model, cross-validated
# with `kf`. The booster and the search are both seeded so results can be reproduced
# after a kernel restart.
opt_gbr = BayesSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_space_gbr,
    n_iter=100,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() (as in the RF cells) avoids calling the deprecated Series.ravel()
opt_gbr.fit(As_X_train, As_y_train.values.ravel())

In [30]:
# Report the configuration selected by the GBR search on As and its best search score
for label, value in (
    ("Best hyperparameters for GBR: ", opt_gbr.best_params_),
    ("Best score for GBR: ", opt_gbr.best_score_),
):
    print(label, value)

Best hyperparameters for GBR:  OrderedDict([('max_depth', 7), ('min_samples_leaf', 4), ('min_samples_split', 7), ('n_estimators', 33), ('subsample', 1.0)])
Best score for GBR:  0.6146111056169924


In [31]:
# Predictions of the fitted GBR search on the As training and test features
As_y_pred_gbr_train = opt_gbr.predict(As_X_train)
As_y_pred_gbr_test = opt_gbr.predict(As_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_gbr_train = r2_score(As_y_train, As_y_pred_gbr_train)
rmse_gbr_train = np.sqrt(mean_squared_error(As_y_train, As_y_pred_gbr_train))

# Test-set metrics, computed the same way
r2_gbr_test = r2_score(As_y_test, As_y_pred_gbr_test)
rmse_gbr_test = np.sqrt(mean_squared_error(As_y_test, As_y_pred_gbr_test))



In [32]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'GBM Training RMSE: {rmse_gbr_train}',
    f'GBM Test RMSE: {rmse_gbr_test}\n',
    f'GBM Training R2: {r2_gbr_train}',
    f'GBM Test R2: {r2_gbr_test}',
    sep='\n',
)

GBM Training RMSE: 0.42344079756275343
GBM Test RMSE: 0.8869995980924382

GBM Training R2: 0.8486411685326466
GBM Test R2: 0.020797745882137564


### Cd

In [33]:
# Gradient-boosting search space for Cd: one (low, high) range per hyperparameter
param_space_gbr = dict(
    n_estimators=(10, 100),
    max_depth=(1, 50),
    min_samples_leaf=(1, 4),
    min_samples_split=(2, 10),
    subsample=(0.5, 1),
)

In [34]:
# Bayesian hyperparameter search for the Cd gradient-boosting model, cross-validated
# with `kf`. The booster and the search are both seeded so results can be reproduced
# after a kernel restart.
opt_gbr = BayesSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_space_gbr,
    n_iter=100,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() (as in the RF cells) avoids calling the deprecated Series.ravel()
opt_gbr.fit(Cd_X_train, Cd_y_train.values.ravel())

In [35]:
# Report the configuration selected by the GBR search on Cd and its best search score
for label, value in (
    ("Best hyperparameters for GBR: ", opt_gbr.best_params_),
    ("Best score for GBR: ", opt_gbr.best_score_),
):
    print(label, value)

Best hyperparameters for GBR:  OrderedDict([('max_depth', 12), ('min_samples_leaf', 3), ('min_samples_split', 6), ('n_estimators', 33), ('subsample', 0.8308248789745161)])
Best score for GBR:  0.7291908194706933


In [36]:
# Predictions of the fitted GBR search on the Cd training and test features
Cd_y_pred_gbr_train = opt_gbr.predict(Cd_X_train)
Cd_y_pred_gbr_test = opt_gbr.predict(Cd_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_gbr_train = r2_score(Cd_y_train, Cd_y_pred_gbr_train)
rmse_gbr_train = np.sqrt(mean_squared_error(Cd_y_train, Cd_y_pred_gbr_train))

# Test-set metrics, computed the same way
r2_gbr_test = r2_score(Cd_y_test, Cd_y_pred_gbr_test)
rmse_gbr_test = np.sqrt(mean_squared_error(Cd_y_test, Cd_y_pred_gbr_test))



In [37]:
# Same layout as the other metric cells: RMSE for both splits, a blank line,
# then R2 for both splits, so results can be compared cell to cell.
print(f'GBM Training RMSE: {rmse_gbr_train}')
print(f'GBM Test RMSE: {rmse_gbr_test}\n')
print(f'GBM Training R2: {r2_gbr_train}')
print(f'GBM Test R2: {r2_gbr_test}')

GBM Training RMSE: 0.015880181265560712
GBM Training R2: 0.9736263700846902
GBM Test RMSE: 0.06228556591383879
GBM Test R2: 0.047669847868091586


### Ni

In [38]:
# Gradient-boosting search space for Ni: one (low, high) range per hyperparameter
param_space_gbr = dict(
    n_estimators=(10, 100),
    max_depth=(1, 50),
    min_samples_leaf=(1, 4),
    min_samples_split=(2, 10),
    subsample=(0.5, 1),
)

In [39]:
# Bayesian hyperparameter search for the Ni gradient-boosting model, cross-validated
# with `kf`. The booster and the search are both seeded so results can be reproduced
# after a kernel restart.
opt_gbr = BayesSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_space_gbr,
    n_iter=100,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() (as in the RF cells) avoids calling the deprecated Series.ravel()
opt_gbr.fit(Ni_X_train, Ni_y_train.values.ravel())

In [40]:
# Report the configuration selected by the GBR search on Ni and its best search score
for label, value in (
    ("Best hyperparameters for GBR: ", opt_gbr.best_params_),
    ("Best score for GBR: ", opt_gbr.best_score_),
):
    print(label, value)

Best hyperparameters for GBR:  OrderedDict([('max_depth', 14), ('min_samples_leaf', 3), ('min_samples_split', 2), ('n_estimators', 35), ('subsample', 0.7029082320325464)])
Best score for GBR:  0.8711427957346997


In [41]:
# Predictions of the fitted GBR search on the Ni training and test features
Ni_y_pred_gbr_train = opt_gbr.predict(Ni_X_train)
Ni_y_pred_gbr_test = opt_gbr.predict(Ni_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_gbr_train = r2_score(Ni_y_train, Ni_y_pred_gbr_train)
rmse_gbr_train = np.sqrt(mean_squared_error(Ni_y_train, Ni_y_pred_gbr_train))

# Test-set metrics, computed the same way
r2_gbr_test = r2_score(Ni_y_test, Ni_y_pred_gbr_test)
rmse_gbr_test = np.sqrt(mean_squared_error(Ni_y_test, Ni_y_pred_gbr_test))



In [42]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'GBM Training RMSE: {rmse_gbr_train}',
    f'GBM Test RMSE: {rmse_gbr_test}\n',
    f'GBM Training R2: {r2_gbr_train}',
    f'GBM Test R2: {r2_gbr_test}',
    sep='\n',
)

GBM Training RMSE: 1.469813845328491
GBM Test RMSE: 6.516702809774502

GBM Training R2: 0.9911172366296761
GBM Test R2: 0.7001731786564442


# SVM

### As

In [43]:
# SVR search space: log-uniform ranges for C and epsilon, a categorical list of
# kernels, and an integer range for the polynomial degree
param_space_svm = dict(
    C=(1e-6, 1e+6, 'log-uniform'),
    epsilon=(1e-6, 1e+1, 'log-uniform'),
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=(1, 5),
)

In [44]:
# Bayesian hyperparameter search for the As SVR model, cross-validated with `kf`.
# The search is seeded so the sampled configurations can be reproduced after a
# kernel restart.
opt_svm = BayesSearchCV(
    SVR(),
    param_space_svm,
    n_iter=10,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() (as in the RF cells) avoids calling the deprecated Series.ravel()
opt_svm.fit(As_X_train, As_y_train.values.ravel())

In [45]:
# Report the configuration selected by the SVM search on As and its best search score
for label, value in (
    ("Best hyperparameters for SVM: ", opt_svm.best_params_),
    ("Best score for SVM: ", opt_svm.best_score_),
):
    print(label, value)

Best hyperparameters for SVM:  OrderedDict([('C', 7.219864234978867), ('degree', 2), ('epsilon', 0.008118511982856591), ('kernel', 'rbf')])
Best score for SVM:  0.5456921629994023


In [46]:
# Predictions of the fitted SVM search on the As training and test features
As_y_pred_svm_train = opt_svm.predict(As_X_train)
As_y_pred_svm_test = opt_svm.predict(As_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_svm_train = r2_score(As_y_train, As_y_pred_svm_train)
rmse_svm_train = np.sqrt(mean_squared_error(As_y_train, As_y_pred_svm_train))

# Test-set metrics, computed the same way
r2_svm_test = r2_score(As_y_test, As_y_pred_svm_test)
rmse_svm_test = np.sqrt(mean_squared_error(As_y_test, As_y_pred_svm_test))



In [47]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'SVM Training RMSE: {rmse_svm_train}',
    f'SVM Test RMSE: {rmse_svm_test}\n',
    f'SVM Training R2: {r2_svm_train}',
    f'SVM Test R2: {r2_svm_test}',
    sep='\n',
)

SVM Training RMSE: 0.5350429015763649
SVM Test RMSE: 0.955449309298481

SVM Training R2: 0.7583428877277805
SVM Test R2: -0.13616357036572824


### Cd

In [48]:
# SVR search space for Cd: log-uniform ranges for C and epsilon, a categorical list
# of kernels, and an integer range for the polynomial degree
param_space_svm = dict(
    C=(1e-6, 1e+6, 'log-uniform'),
    epsilon=(1e-6, 1e+1, 'log-uniform'),
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=(1, 5),
)

In [49]:
# Bayesian hyperparameter search for the Cd SVR model, cross-validated with `kf`.
# The search is seeded so the sampled configurations can be reproduced after a
# kernel restart.
opt_svm = BayesSearchCV(
    SVR(),
    param_space_svm,
    n_iter=10,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() (as in the RF cells) avoids calling the deprecated Series.ravel()
opt_svm.fit(Cd_X_train, Cd_y_train.values.ravel())

In [50]:
# Report the configuration selected by the SVM search on Cd and its best search score
for label, value in (
    ("Best hyperparameters for SVM: ", opt_svm.best_params_),
    ("Best score for SVM: ", opt_svm.best_score_),
):
    print(label, value)

Best hyperparameters for SVM:  OrderedDict([('C', 492.3881419514819), ('degree', 2), ('epsilon', 7.683539036118337e-05), ('kernel', 'linear')])
Best score for SVM:  0.6135735484744306


In [51]:
# Predictions of the fitted SVM search on the Cd training and test features
Cd_y_pred_svm_train = opt_svm.predict(Cd_X_train)
Cd_y_pred_svm_test = opt_svm.predict(Cd_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_svm_train = r2_score(Cd_y_train, Cd_y_pred_svm_train)
rmse_svm_train = np.sqrt(mean_squared_error(Cd_y_train, Cd_y_pred_svm_train))

# Test-set metrics, computed the same way
r2_svm_test = r2_score(Cd_y_test, Cd_y_pred_svm_test)
rmse_svm_test = np.sqrt(mean_squared_error(Cd_y_test, Cd_y_pred_svm_test))



In [52]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'SVM Training RMSE: {rmse_svm_train}',
    f'SVM Test RMSE: {rmse_svm_test}\n',
    f'SVM Training R2: {r2_svm_train}',
    f'SVM Test R2: {r2_svm_test}',
    sep='\n',
)

SVM Training RMSE: 0.0595447156441702
SVM Test RMSE: 0.06588870772095909

SVM Training R2: 0.6291951139531589
SVM Test R2: -0.0656993112113542


### Ni

In [53]:
# SVR search space for Ni: log-uniform ranges for C and epsilon, a categorical list
# of kernels, and an integer range for the polynomial degree
param_space_svm = dict(
    C=(1e-6, 1e+6, 'log-uniform'),
    epsilon=(1e-6, 1e+1, 'log-uniform'),
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=(1, 5),
)

In [54]:
# Bayesian hyperparameter search for the Ni SVR model, cross-validated with `kf`.
# The search is seeded so the sampled configurations can be reproduced after a
# kernel restart.
opt_svm = BayesSearchCV(
    SVR(),
    param_space_svm,
    n_iter=10,
    cv=kf,
    n_jobs=-1,
    random_state=42
)
# .values.ravel() (as in the RF cells) avoids calling the deprecated Series.ravel()
opt_svm.fit(Ni_X_train, Ni_y_train.values.ravel())

In [55]:
# Report the configuration selected by the SVM search on Ni and its best search score
for label, value in (
    ("Best hyperparameters for SVM: ", opt_svm.best_params_),
    ("Best score for SVM: ", opt_svm.best_score_),
):
    print(label, value)

Best hyperparameters for SVM:  OrderedDict([('C', 13.15468862536928), ('degree', 5), ('epsilon', 0.0008006652352888595), ('kernel', 'rbf')])
Best score for SVM:  0.7666114402752845


In [56]:
# Predictions of the fitted SVM search on the Ni training and test features
Ni_y_pred_svm_train = opt_svm.predict(Ni_X_train)
Ni_y_pred_svm_test = opt_svm.predict(Ni_X_test)

# Training-set metrics: R2, and RMSE as the square root of the MSE
r2_svm_train = r2_score(Ni_y_train, Ni_y_pred_svm_train)
rmse_svm_train = np.sqrt(mean_squared_error(Ni_y_train, Ni_y_pred_svm_train))

# Test-set metrics, computed the same way
r2_svm_test = r2_score(Ni_y_test, Ni_y_pred_svm_test)
rmse_svm_test = np.sqrt(mean_squared_error(Ni_y_test, Ni_y_pred_svm_test))



In [57]:
# Print RMSE for both splits, a blank line, then R2 for both splits
print(
    f'SVM Training RMSE: {rmse_svm_train}',
    f'SVM Test RMSE: {rmse_svm_test}\n',
    f'SVM Training R2: {r2_svm_train}',
    f'SVM Test R2: {r2_svm_test}',
    sep='\n',
)

SVM Training RMSE: 6.664121770600545
SVM Test RMSE: 9.805542927290096

SVM Training R2: 0.8173964261000011
SVM Test R2: 0.3211748258047179
