In [None]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.5.0


In [None]:
#Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import Lasso, Ridge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xg
import joblib

In [None]:
# Read the full dataset from the Excel workbook and keep a handle on its
# column index. (The file name suggests the features were standardized
# upstream; that is not verified in this notebook.)
data = pd.read_excel(io="StandardScaledData.xlsx")
columns = data.columns

In [None]:
# Hold out every row whose 'Condition' equals 5 as the validation set;
# the remaining rows (data2) are what gets split into train/test afterwards.
validation = data.loc[data['Condition'].eq(5)]
data2 = data.drop(index=validation.index)

In [None]:
# 80/20 train/test split of the non-validation rows, seeded for reproducibility.
train, test = train_test_split(
    data2,
    test_size=0.2,
    random_state=30,
)

In [None]:
# Remove the identifier / grouping columns from the training frame, then
# separate the target column 'GL-LC-MS' from the remaining feature columns.
train = train.drop(columns=['Identified', 'Name', 'Isomeric SMILES', 'Condition'])
y_train = train['GL-LC-MS']
x_train = train.drop(columns=['GL-LC-MS'])

In [None]:
# Remove the identifier / grouping columns from the test frame, then
# separate the target column 'GL-LC-MS' from the remaining feature columns.
test = test.drop(columns=['Identified', 'Name', 'Isomeric SMILES', 'Condition'])
y_test = test['GL-LC-MS']
x_test = test.drop(columns=['GL-LC-MS'])

In [None]:
# Remove the identifier / grouping columns from the validation frame, then
# separate the target column 'GL-LC-MS' from the remaining feature columns.
validation = validation.drop(columns=['Identified', 'Name', 'Isomeric SMILES', 'Condition'])
y_val = validation['GL-LC-MS']
x_val = validation.drop(columns=['GL-LC-MS'])

In [None]:
# Build the models
# Each estimator is created with fixed, hard-coded hyperparameters (their
# provenance is not shown in this notebook) and collected in `models`, in the
# same order as the file names used when the fitted models are saved.
LassoModel = Lasso(alpha=1e-04)

# random_state is set so that the random restarts of the kernel-hyperparameter
# optimizer (n_restarts_optimizer=9) give the same fit on every run, using the
# same seed as the other stochastic estimators below.
GPR = GaussianProcessRegressor(alpha=0.063, kernel=RBF(length_scale=66),
                               n_restarts_optimizer=9, normalize_y=True,
                               random_state=30)

XGBR = xg.XGBRegressor(n_estimators=150, gamma=0.075, learning_rate=0.3,
                       max_depth=6, min_child_weight=1, reg_alpha=0.5,
                       reg_lambda=0.25, subsample=0.8, random_state=30)

# max_features is intentionally left at its default: the earlier explicit
# max_features='auto' is no longer accepted by recent scikit-learn releases.
RFR = RandomForestRegressor(n_estimators=32, max_depth=None, min_samples_split=2,
                            min_samples_leaf=1, bootstrap=True,
                            random_state=30)

svr = SVR(gamma=0.001, epsilon=0.045, C=115000, kernel='rbf')

models = [LassoModel, GPR, XGBR, RFR, svr]

In [None]:
# Train the models and save them
# filenames[i] is the output file for models[i]; each model is fitted on the
# training split, recorded in trained_mod, and then serialized with joblib.
filenames = ['Lasso.sav', 'GPR.sav', 'XGBR.sav', 'RFR.sav', 'SVR.sav']
trained_mod = []
for position, estimator in enumerate(models):
    estimator.fit(x_train, y_train)
    trained_mod.append(estimator)
    joblib.dump(estimator, filenames[position])

In [None]:
# Compute the metrics
# For every fitted model, evaluate MSE, RMSE and R2 on the train, test and
# validation splits. There is one list per (metric, split); entries follow the
# order of `trained_mod`.
MSE_train, MSE_test, MSE_val = [], [], []
RMSE_train, RMSE_test, RMSE_val = [], [], []
R2_train, R2_test, R2_val = [], [], []

for model in trained_mod:
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    y_pred_val = model.predict(x_val)

    # mean_squared_error with default arguments returns the MSE. The `squared`
    # keyword is deliberately avoided: it is deprecated in scikit-learn 1.4+
    # and removed in later releases.
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    mse_val = mean_squared_error(y_val, y_pred_val)
    MSE_train.append(mse_train)
    MSE_test.append(mse_test)
    MSE_val.append(mse_val)

    # RMSE is the square root of the MSE computed above.
    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)
    rmse_val = np.sqrt(mse_val)
    RMSE_train.append(rmse_train)
    RMSE_test.append(rmse_test)
    RMSE_val.append(rmse_val)

    # Coefficient of determination on each split.
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    r2_val = r2_score(y_val, y_pred_val)
    R2_train.append(r2_train)
    R2_test.append(r2_test)
    R2_val.append(r2_val)



In [None]:
# Assemble one row per model, with its MSE / RMSE / R2 on each split.
metric_columns = {
    'Model': models,
    'MSE Train': MSE_train,
    'MSE Test': MSE_test,
    'MSE Validation': MSE_val,
    'RMSE Train': RMSE_train,
    'RMSE Test': RMSE_test,
    'RMSE Validation': RMSE_val,
    'R2 Train': R2_train,
    'R2 Test': R2_test,
    'R2 Validation': R2_val,
}
performance_metrics = pd.DataFrame(metric_columns)
# Display the table rounded to three decimals; the stored frame keeps full precision.
performance_metrics.round(3)

Unnamed: 0,Model,MSE Train,MSE Test,MSE Validation,RMSE Train,RMSE Test,RMSE Validation,R2 Train,R2 Test,R2 Validation
0,Lasso(alpha=0.0001),0.721,0.71,0.696,0.52,0.505,0.484,0.579,0.62,0.603
1,"GaussianProcessRegressor(alpha=0.063, kernel=R...",0.242,0.406,0.339,0.058,0.165,0.115,0.953,0.876,0.906
2,"XGBRegressor(base_score=None, booster=None, ca...",0.182,0.389,0.272,0.033,0.151,0.074,0.973,0.886,0.939
3,"(DecisionTreeRegressor(max_features=1.0, rando...",0.146,0.361,0.334,0.021,0.13,0.111,0.983,0.902,0.909
4,"SVR(C=115000, epsilon=0.045, gamma=0.001)",0.24,0.384,0.315,0.057,0.148,0.099,0.953,0.889,0.919
