# batch script for reproducibility

jupyter nbconvert --to script generate_results.ipynb

In [1]:
import matplotlib
matplotlib.use('Agg')
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import LinearSegmentedColormap as lsc
from matplotlib import pyplot as plt

In [2]:
import pandas as pd
import numpy as np
import scipy
import sklearn 
import xgboost

import math
import random 
import re
import itertools

from collections import Counter 

from scipy.stats import binom_test
from scipy.stats import norm

from functools import partial

from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.preprocessing import PolynomialFeatures as plf
from sklearn import preprocessing

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict 
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge 
from xgboost import XGBRegressor

from ksuzuki_pylib import *

In [3]:
# Record the version of every library the results depend on, so that a
# re-run can be compared against the environment that produced the outputs.
for _lib_name, _lib in (('sklearn', sklearn),
                        ('xgboost', xgboost),
                        ('pandas', pd),
                        ('numpy', np),
                        ('scipy', scipy),
                        ('matplotlib', matplotlib)):
    print(_lib_name + ':', _lib.__version__)

sklearn: 0.21.0
xgboost: 0.82
pandas: 0.24.2
numpy: 1.16.3
scipy: 1.2.1
matplotlib: 3.0.3


In [4]:
# Seed NumPy's global RNG and the stdlib `random` module (in that order) with
# the same value so that steps relying on either global state are repeatable.
SEED = 0
for _seed_rng in (np.random.seed, random.seed):
    _seed_rng(SEED)

In [5]:
# Candidate numbers of trees ("n_estimators") for the grid search.
range_ntree = [500, 1500]

# xgb ranges
# NOTE(review): the XGBRegressor reprs saved in the outputs further down show
# max_depth=3/4 and n_estimators=100, which are not part of these grids, so the
# stored outputs appear to come from a run with different ranges -- re-run the
# notebook before relying on the printed tables.
range_depth = list(range(5, 11))            # 5, 6, 7, 8, 9, 10
range_subsample = [0.6, 0.7, 0.8, 0.9, 1]
range_colsample = list(range_subsample)     # same candidates, independent list
range_lr = [0.1, 0.05, 0.02]
range_gamma = [0, 0.1, 1.0, 10.0]

In [8]:
# XGB
# One exhaustive grid-search object, re-fitted on each dataset below.
# The grid holds 2 * 6 * 5 * 5 * 3 * 4 = 3600 parameter combinations. No `cv`,
# `scoring` or `n_jobs` is passed, so scikit-learn's defaults apply to those.
xgb_param_grid = {
    "n_estimators": range_ntree,
    "max_depth": range_depth,
    "subsample": range_subsample,
    "colsample_bytree": range_colsample,
    "learning_rate": range_lr,
    "gamma": range_gamma,
}
cv_xgb = GridSearchCV(estimator=XGBRegressor(random_state=929),
                      param_grid=xgb_param_grid)

## OCM data

In [9]:
# Load the OCM matrix; its 'Unnamed: 0' column is discarded.
ocm = pd.read_csv("input/OCM_matrix.csv").drop(['Unnamed: 0'], axis=1)

# Load the descriptor variant of the matrix and use its 'Unnamed: 0' column
# as the row index (the column itself is then dropped).
ocm_desc = pd.read_csv("input/OCM_matrix_desc.csv")
ind = ocm_desc['Unnamed: 0']
ocm_desc.index = ind
ocm_desc = ocm_desc.drop(['Unnamed: 0'], axis=1)

# Exclude catalysts that contain 'Th' in their composition (keep rows where Th == 0).
# NOTE(review): this filter is applied to `ocm` only, not to `ocm_desc` --
# confirm the descriptor file already excludes those rows.
ind = ocm['Th'] == 0
ocm = ocm[ind]

# Exclude rows with 5 or more non-zero entries in the columns up to and
# including 'Zr' (presumably the element-composition columns); i.e. keep
# rows with fewer than 5.
ind = (ocm.loc[:,:'Zr']>0).sum(axis=1)
ind = ind < 5
ocm = ocm.loc[ind]

# Same "fewer than 5" filter for the descriptor frame.
ind = (ocm_desc.loc[:,:'Zr']>0).sum(axis=1)
ind = ind < 5
ocm_desc = ocm_desc.loc[ind]

# Remove the 'Support_Co' column from both frames.
ocm = ocm.drop(['Support_Co'],axis=1)
ocm_desc = ocm_desc.drop(['Support_Co'],axis=1)

# Collect groups of column names via label slices (nothing is renamed here).
# Label slices are inclusive on both ends and depend on the CSV column order.
sup = ocm.loc[:,'Support_Zr':'Support_Si'].columns
prom = ocm.loc[:,'Promotor_B':'Promotor_S'].columns
comp = ocm.loc[:,:'Zr'].columns
cond = ocm.loc[:,'Temperature, K':'P total, bar'].columns
prep = ocm.loc[:,'Impregnation':'Therm.decomp.'].columns

# Build the variant without experimental-condition features: composition and
# support columns only, followed by the last column of `ocm` (the column the
# fitting cells use as the target via `.iloc[:, -1]`).
nocond = list(comp) + list(sup)
ocm_nocond = ocm[nocond]
ocm_nocond = pd.concat([ocm_nocond,ocm.iloc[:,-1]], axis=1)

In [10]:
# Descriptor table: the first CSV row is skipped, rows are indexed by the
# 'symbol' column, and the listed columns are discarded.
desc = (
    pd.read_csv('input/Descriptors.csv', skiprows=[0], index_col='symbol')
    .drop(columns=['Unnamed: 0',
                   'name',
                   'ionic radius',
                   'covalent radius',
                   'VdW radius',
                   'crystal radius',
                   'a x 106 ',
                   'Heat capacity ',
                   'l',
                   'electron affinity ',
                   'VE',
                   'Surface energy '])
)
# Keep only the rows labelled by the `ocm` columns up to and including 'Zr',
# in that column order.
desc = desc.loc[ocm.loc[:, :'Zr'].columns, :]
# Replace missing descriptor values with the column mean of the selected rows.
desc = desc.fillna(desc.mean())

In [11]:
# num of elements in each multicomponent catalyst:
# per row, count the non-zero columns up to and including 'Zr', then tally
# how many rows have each count.
n_elements_per_row = (ocm.loc[:, :'Zr'] > 0).sum(axis=1)
print(Counter(n_elements_per_row))

Counter({2: 779, 3: 578, 1: 345, 4: 131})


## Fig.3 & Table 3

In [12]:
def crossvalid(xx,yy,model,cv,txt):
    """Cross-validate ``model`` on ``(xx, yy)`` and report train/test RMSE.

    Parameters
    ----------
    xx : features, forwarded to ``Cv_Pred_Expected`` as ``X``.
    yy : targets, forwarded as ``y``.
    model : estimator, forwarded as ``estimator``.
    cv : forwarded as ``cv`` (presumably the number of folds -- confirm
        against the ``Cv_Pred_Expected`` implementation).
    txt : label used as the prefix of the two printed summary lines.

    Returns
    -------
    dict
        ``{'tes_mean', 'tes_sd', 'trn_mean', 'trn_sd'}`` taken from the
        validator's ``rmse``, ``std``, ``rmse_train`` and ``std_train``
        attributes after ``cross_validation()`` has run.
    """
    # The caller's ``cv`` is forwarded. Previously it was ignored: the local
    # name ``cv`` was overwritten by the validator object and ``cv=10`` was
    # hard-coded. A separate local name avoids that shadowing.
    validator = Cv_Pred_Expected(estimator=model, X=xx, y=yy, cv=cv, redc=True)
    validator.cross_validation()

    tes_rmse, tes_std = validator.rmse, validator.std
    trn_rmse, trn_std = validator.rmse_train, validator.std_train

    print()
    print("[%s] RMSE %1.3f (STD: %1.3f) ... test" % (txt, tes_rmse, tes_std))
    print("[%s] RMSE %1.3f (STD: %1.3f) ... train" % (txt, trn_rmse, trn_std))

    return {
        'tes_mean': tes_rmse,
        'tes_sd': tes_std,
        'trn_mean': trn_rmse,
        'trn_sd': trn_std,
    }

### ocm_nocond

In [13]:
# `target_data` (original row order) is what the grid search is fitted on;
# `data` (a reproducibly shuffled copy) is what crossvalid() is run on.
target_data = ocm_nocond
data = shuffle(target_data, random_state=10)
# Per-model RMSE summaries for this feature set.
rmse_ocm_nocond = {}

In [14]:
# XGB
# Grid search on `target_data`: every column but the last is a feature, the
# last column is the target.
# NOTE(review): the hyper-parameters are chosen on the same rows that
# crossvalid() scores afterwards -- confirm this is the intended protocol.
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])

# Unpack the winning grid point into the cv* names.
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = (
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma'))

# Fresh, unfitted regressor configured with the selected values; both
# `random_state` and `seed` are passed, as before.
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929, seed=929, n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [15]:
# Show the tuned configuration, then cross-validate it on the shuffled frame
# (features = all columns but the last, target = last column).
model = best_xgb
print(model)
rmse_ocm_nocond['XGB'] = crossvalid(
    xx=data.iloc[:, :-1],
    yy=data.iloc[:, -1],
    model=model,
    cv=10,
    txt='XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.9, gamma=0, importance_type='gain',
             learning_rate=0.05, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=1)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 5.722 STD_test: 0.424
RMSE_train: 5.248 STD_train: 0.048
Data size:(1833, 69)

[XGB] RMSE 5.722 (STD: 0.424) ... test
[XGB] RMSE 5.248 (STD: 0.048) ... train


### ocm

In [16]:
# `target_data` (original row order) is what the grid search is fitted on;
# `data` (a reproducibly shuffled copy) is what crossvalid() is run on.
target_data = ocm
data = shuffle(target_data, random_state=10)
# Per-model RMSE summaries for this feature set.
rmse_ocm = {}

In [17]:
# XGB
# Grid search on `target_data`: every column but the last is a feature, the
# last column is the target.
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])

# Unpack the winning grid point into the cv* names.
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = (
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma'))

# Fresh, unfitted regressor configured with the selected values; both
# `random_state` and `seed` are passed, as before.
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929, seed=929, n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [18]:
# Show the tuned configuration, then cross-validate it on the shuffled frame
# (features = all columns but the last, target = last column).
model = best_xgb
print(model)
rmse_ocm['XGB'] = crossvalid(
    xx=data.iloc[:, :-1],
    yy=data.iloc[:, -1],
    model=model,
    cv=10,
    txt='XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.9, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=1)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 4.812 STD_test: 0.351
RMSE_train: 3.824 STD_train: 0.038
Data size:(1833, 104)

[XGB] RMSE 4.812 (STD: 0.351) ... test
[XGB] RMSE 3.824 (STD: 0.038) ... train


### ocm_desc

In [19]:
# `target_data` (original row order) is what the grid search is fitted on;
# `data` (a reproducibly shuffled copy) is what crossvalid() is run on.
# NOTE(review): the shuffle seed here is 929, whereas the `ocm` and
# `ocm_nocond` cells use 10 -- confirm the difference is intentional.
target_data = ocm_desc
data = shuffle(target_data, random_state=929)
# Per-model RMSE summaries for this feature set.
rmse_ocm_desc = {}

In [20]:
# XGB
# Grid search on `target_data`: every column but the last is a feature, the
# last column is the target.
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])

# Unpack the winning grid point into the cv* names.
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = (
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma'))

# Fresh, unfitted regressor configured with the selected values; both
# `random_state` and `seed` are passed, as before.
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929, seed=929, n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [21]:
# Show the tuned configuration, then cross-validate it on the shuffled frame
# (features = all columns but the last, target = last column).
model = best_xgb
print(model)
rmse_ocm_desc['XGB'] = crossvalid(
    xx=data.iloc[:, :-1],
    yy=data.iloc[:, -1],
    model=model,
    cv=10,
    txt='XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, importance_type='gain',
             learning_rate=0.05, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=0.9)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 4.831 STD_test: 0.366
RMSE_train: 3.949 STD_train: 0.050
Data size:(1833, 148)

[XGB] RMSE 4.831 (STD: 0.366) ... test
[XGB] RMSE 3.949 (STD: 0.050) ... train


### Table 3

In [22]:
# Table 3: one train/test RMSE summary per OCM feature set, printed with
# two decimals (the precision setting stays in effect for later cells).
pd.set_option('display.precision', 2)
for _rmse_table in (rmse_ocm_nocond, rmse_ocm, rmse_ocm_desc):
    print(pd.DataFrame(_rmse_table))

           XGB
tes_mean  5.72
tes_sd    0.42
trn_mean  5.25
trn_sd    0.05
           XGB
tes_mean  4.81
tes_sd    0.35
trn_mean  3.82
trn_sd    0.04
           XGB
tes_mean  4.83
tes_sd    0.37
trn_mean  3.95
trn_sd    0.05


## WGS data

In [23]:
# Reload the descriptor table for the WGS section: the first CSV row is
# skipped, rows are indexed by the 'symbol' column, and the listed columns
# are discarded.
desc = (
    pd.read_csv('input/Descriptors.csv', skiprows=[0], index_col='symbol')
    .drop(columns=['Unnamed: 0',
                   'name',
                   'ionic radius',
                   'covalent radius',
                   'VdW radius',
                   'crystal radius',
                   'a x 106 ',
                   'Heat capacity ',
                   'l',
                   'electron affinity ',
                   'VE',
                   'Surface energy '])
)
# Keep only the first 83 rows of the table.
desc = desc.iloc[:83, :]

In [24]:
# Load the WGS matrix; the values of its first column become the row index
# (built from a plain list, so the index carries no name) and the column is
# then dropped.
wgs = pd.read_csv("input/wgs.csv")
wgs.index = list(wgs.iloc[:,0])
wgs = wgs.drop([wgs.columns[0]], axis=1)

# Same treatment for the descriptor variant of the WGS matrix.
wgs_desc = pd.read_csv("input/wgs_desc.csv")
wgs_desc.index = list(wgs_desc.iloc[:,0])
wgs_desc = wgs_desc.drop([wgs_desc.columns[0]], axis=1)

# Names of the `wgs` columns up to and including 'Pd' plus those from 'Li'
# through 'Sr' (presumably element symbols -- they are used as row labels of
# `desc` on the next line).
atom = list(wgs.loc[:,:'Pd'].columns) + list(wgs.loc[:,'Li':'Sr'].columns)
desc = desc.loc[atom]

# Replace missing descriptor values with the column mean of the selected rows.
desc=desc.fillna(desc.mean())

## Table 4 (wgs)

### wgs_nocond

In [25]:
# Feature set without experimental conditions: the columns up to and
# including 'Pd', followed by the 'CO Conversion' column as the last column.
comppro = wgs.loc[:, :'Pd'].columns.tolist()
wgs_nocond = wgs.loc[:, [*comppro, 'CO Conversion']]

In [26]:
# `target_data` (original row order) is what the grid search is fitted on;
# `data` (a reproducibly shuffled copy) is what crossvalid() is run on.
target_data = wgs_nocond
data = shuffle(target_data, random_state=10)
# Per-model RMSE summaries for this feature set.
rmse_wgs_nocond = {}

In [27]:
# XGB
# Grid search on `target_data`: every column but the last is a feature, the
# last column is the target.
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])

# Unpack the winning grid point into the cv* names.
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = (
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma'))

# Fresh, unfitted regressor configured with the selected values; both
# `random_state` and `seed` are passed, as before.
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929, seed=929, n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [28]:
# Show the tuned configuration, then cross-validate it on the shuffled frame
# (features = all columns but the last, target = last column).
model = best_xgb
print(model)
rmse_wgs_nocond['XGB'] = crossvalid(
    xx=data.iloc[:, :-1],
    yy=data.iloc[:, -1],
    model=model,
    cv=10,
    txt='XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, importance_type='gain',
             learning_rate=0.05, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=1)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 31.436 STD_test: 0.579
RMSE_train: 31.053 STD_train: 0.068
Data size:(4185, 35)

[XGB] RMSE 31.436 (STD: 0.579) ... test
[XGB] RMSE 31.053 (STD: 0.068) ... train


### wgs

In [29]:
# `target_data` (original row order) is what the grid search is fitted on;
# `data` (a reproducibly shuffled copy) is what crossvalid() is run on.
target_data = wgs
data = shuffle(target_data, random_state=10)
# Per-model RMSE summaries for this feature set.
rmse_wgs = {}

In [30]:
# XGB
# Grid search on `target_data`: every column but the last is a feature, the
# last column is the target.
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])

# Unpack the winning grid point into the cv* names.
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = (
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma'))

# Fresh, unfitted regressor configured with the selected values; both
# `random_state` and `seed` are passed, as before.
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929, seed=929, n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [31]:
# Show the tuned configuration, then cross-validate it on the shuffled frame
# (features = all columns but the last, target = last column).
model = best_xgb
print(model)
rmse_wgs['XGB'] = crossvalid(
    xx=data.iloc[:, :-1],
    yy=data.iloc[:, -1],
    model=model,
    cv=10,
    txt='XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=0.9)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 15.470 STD_test: 0.376
RMSE_train: 14.006 STD_train: 0.129
Data size:(4185, 79)

[XGB] RMSE 15.470 (STD: 0.376) ... test
[XGB] RMSE 14.006 (STD: 0.129) ... train


### wgs_desc

In [32]:
# `target_data` (original row order) is what the grid search is fitted on;
# `data` (a reproducibly shuffled copy) is kept for the later evaluation.
# NOTE(review): the shuffle seed here is 929, whereas the `wgs` and
# `wgs_nocond` cells use 10 -- confirm the difference is intentional.
target_data = wgs_desc
data = shuffle(target_data, random_state=929)
# Per-model RMSE summaries for this feature set.
rmse_wgs_desc = {}

In [33]:
# XGB
# Grid search on `target_data`: every column but the last is a feature, the
# last column is the target.
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])

# Unpack the winning grid point into the cv* names.
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = (
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma'))

# Fresh, unfitted regressor configured with the selected values; both
# `random_state` and `seed` are passed, as before.
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929, seed=929, n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [34]:
# Score the tuned XGB regressor on the shuffled wgs_desc table and keep the
# returned summary under the 'XGB' key.
model = best_xgb
print(model)
rmse_wgs_desc['XGB'] = crossvalid(
    data.iloc[:, :-1],  # features: every column except the last
    data.iloc[:, -1],   # response: the last column
    model,
    10,  # NOTE(review): meaning of 10 is set by crossvalid (defined outside this chunk) - confirm
    'XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, importance_type='gain',
             learning_rate=0.05, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=0.9)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 19.802 STD_test: 0.556
RMSE_train: 19.040 STD_train: 0.073
Data size:(4185, 134)

[XGB] RMSE 19.802 (STD: 0.556) ... test
[XGB] RMSE 19.040 (STD: 0.073) ... train


### Table 4

In [35]:
# Table 4 (wgs): print the three result dictionaries as DataFrames, in the
# order nocond -> full -> desc, with pandas' float display precision set to 2.
pd.set_option('display.precision', 2)

for rmse_summary in (rmse_wgs_nocond, rmse_wgs, rmse_wgs_desc):
    print(pd.DataFrame(rmse_summary))

            XGB
tes_mean  31.44
tes_sd     0.58
trn_mean  31.05
trn_sd     0.07
            XGB
tes_mean  15.47
tes_sd     0.38
trn_mean  14.01
trn_sd     0.13
            XGB
tes_mean  19.80
tes_sd     0.56
trn_mean  19.04
trn_sd     0.07


## CO oxidation data

In [36]:
# Load the two CO-oxidation tables. In each one the 'Data No' column is
# removed from the columns and used as the row index instead.
co = pd.read_csv('input/co.csv')
ind = co['Data No']
co = co.drop(columns=['Data No'])
co.index = ind

co_desc = pd.read_csv('input/co_desc.csv')
ind = co_desc['Data No']
co_desc = co_desc.drop(columns=['Data No'])
co_desc.index = ind

In [37]:
# Divide the columns of `co` into groups.
# Label slices with .loc include both end points.
baseatom = co.loc[:, :'Pd'].columns          # first column through 'Pd'
supatom = co.loc[:, 'Al_s':'Co_s'].columns   # 'Al_s' through 'Co_s'
pro = co.loc[:, 'Ce_p':'Cu_p']               # kept as a DataFrame, not just labels

# Element-like tokens (capital letter + optional lowercase letter) extracted
# from the labels of the `pro` columns, flattened into a single list.
proatom = [
    symbol
    for label in pro.columns
    for symbol in re.findall(r'[A-Z][a-z]?', label)
]

# Labels left after removing the baseatom and pro columns, minus the last one.
env = co.drop(columns=list(baseatom) + list(pro.columns)).columns[:-1]

# The last column of `co`.
conv = co.iloc[:, -1]

In [38]:
# Descriptor table: read the CSV (skipping its first line, indexed by 'symbol'),
# discard the columns listed below, keep only the rows for the baseatom and
# proatom labels, and fill missing values with the per-column mean of the
# selected rows.
unused_descriptor_columns = [
    'Unnamed: 0',
    'name',
    'ionic radius',
    'covalent radius',
    'VdW radius',
    'crystal radius',
    'a x 106 ',
    'Heat capacity ',
    'l',
    'electron affinity ',
    'VE',
    'Surface energy ',
]
desc = pd.read_csv('input/Descriptors.csv', skiprows=[0], index_col='symbol')
desc = desc.drop(columns=unused_descriptor_columns)
# NOTE(review): a label present in both lists is selected twice, and newer
# pandas raises KeyError for labels missing from the index - confirm inputs.
desc = desc.loc[list(baseatom) + list(proatom), :]
desc = desc.fillna(desc.mean())

## Table 4 (co)

### co_nocond

In [39]:
# co_nocond: column-wise concatenation of three pieces of `co`.
co_nocond = pd.concat(
    [
        co.loc[:, :'Pd'],  # first column through 'Pd'
        pro,               # the 'Ce_p'..'Cu_p' block
        co.iloc[:, -1],    # last column of co
    ],
    axis=1,
)

In [40]:
# co_nocond setup: `target_data` is the frame in its original row order and
# `data` is a row-shuffled copy of it (fixed seed 10). Results for this table
# are collected in `rmse_co_nocond`.
target_data = co_nocond
data = shuffle(target_data, random_state=10)
rmse_co_nocond = {}

In [41]:
# XGB
# Run the hyper-parameter search on target_data (last column = response,
# all other columns = features), then build a new, unfitted regressor that
# uses the winning combination.
# NOTE(review): cv_xgb is defined earlier in the notebook (not visible here).
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = [
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma')
]
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929,
    seed=929,
    n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [42]:
# Score the tuned XGB regressor on the shuffled co_nocond table and keep the
# returned summary under the 'XGB' key.
model = best_xgb
print(model)
rmse_co_nocond['XGB'] = crossvalid(
    data.iloc[:, :-1],  # features: every column except the last
    data.iloc[:, -1],   # response: the last column
    model,
    10,  # NOTE(review): meaning of 10 is set by crossvalid (defined outside this chunk) - confirm
    'XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.9, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=1)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 31.381 STD_test: 0.579
RMSE_train: 30.946 STD_train: 0.076
Data size:(5567, 26)

[XGB] RMSE 31.381 (STD: 0.579) ... test
[XGB] RMSE 30.946 (STD: 0.076) ... train


### co

In [43]:
# co setup: `target_data` is the frame in its original row order and `data`
# is a row-shuffled copy of it (fixed seed 10). Results for this table are
# collected in `rmse_co`.
target_data = co
data = shuffle(target_data, random_state=10)
rmse_co = {}

In [44]:
# XGB
# Run the hyper-parameter search on target_data (last column = response,
# all other columns = features), then build a new, unfitted regressor that
# uses the winning combination.
# NOTE(review): cv_xgb is defined earlier in the notebook (not visible here).
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = [
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma')
]
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929,
    seed=929,
    n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [45]:
# Score the tuned XGB regressor on the shuffled co table and keep the
# returned summary under the 'XGB' key.
model = best_xgb
print(model)
rmse_co['XGB'] = crossvalid(
    data.iloc[:, :-1],  # features: every column except the last
    data.iloc[:, -1],   # response: the last column
    model,
    10,  # NOTE(review): meaning of 10 is set by crossvalid (defined outside this chunk) - confirm
    'XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.9, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=0.9)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 24.524 STD_test: 0.740
RMSE_train: 23.565 STD_train: 0.087
Data size:(5567, 60)

[XGB] RMSE 24.524 (STD: 0.740) ... test
[XGB] RMSE 23.565 (STD: 0.087) ... train


### co_desc

In [46]:
# co_desc setup: `target_data` is the frame in its original row order and
# `data` is a row-shuffled copy of it (fixed seed 929). Results for this table
# are collected in `rmse_co_desc`.
target_data = co_desc
data = shuffle(target_data, random_state=929)
rmse_co_desc = {}

In [47]:
# XGB
# Run the hyper-parameter search on target_data (last column = response,
# all other columns = features), then build a new, unfitted regressor that
# uses the winning combination.
# NOTE(review): cv_xgb is defined earlier in the notebook (not visible here).
cv_xgb.fit(target_data.iloc[:, :-1], target_data.iloc[:, -1])
(cvntree, cvdepth, cvsubsample,
 cvcolsample, cvlr, cvgamma) = [
    cv_xgb.best_params_[key]
    for key in ('n_estimators', 'max_depth', 'subsample',
                'colsample_bytree', 'learning_rate', 'gamma')
]
best_xgb = XGBRegressor(
    n_estimators=cvntree,
    max_depth=cvdepth,
    subsample=cvsubsample,
    colsample_bytree=cvcolsample,
    learning_rate=cvlr,
    gamma=cvgamma,
    random_state=929,
    seed=929,
    n_jobs=-1,
)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None

In [48]:
# Score the tuned XGB regressor on the shuffled co_desc table and keep the
# returned summary under the 'XGB' key.
model = best_xgb
print(model)
rmse_co_desc['XGB'] = crossvalid(
    data.iloc[:, :-1],  # features: every column except the last
    data.iloc[:, -1],   # response: the last column
    model,
    10,  # NOTE(review): meaning of 10 is set by crossvalid (defined outside this chunk) - confirm
    'XGB',
)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
             nthread=None, objective='reg:linear', random_state=929,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=929,
             silent=True, subsample=0.9)


  if getattr(data, 'base', None) is not None and \


RMSE_test: 23.935 STD_test: 0.528
RMSE_train: 23.007 STD_train: 0.115
Data size:(5567, 115)

[XGB] RMSE 23.935 (STD: 0.528) ... test
[XGB] RMSE 23.007 (STD: 0.115) ... train


### Table 4

In [49]:
# Table 4 (co): print the three result dictionaries as DataFrames, in the
# order nocond -> full -> desc, with pandas' float display precision set to 2.
pd.set_option('display.precision', 2)

for rmse_summary in (rmse_co_nocond, rmse_co, rmse_co_desc):
    print(pd.DataFrame(rmse_summary))

            XGB
tes_mean  31.38
tes_sd     0.58
trn_mean  30.95
trn_sd     0.08
            XGB
tes_mean  24.52
tes_sd     0.74
trn_mean  23.56
trn_sd     0.09
            XGB
tes_mean  23.94
tes_sd     0.53
trn_mean  23.01
trn_sd     0.11
