# Model selection

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF,Exponentiation,Matern,RationalQuadratic,ExpSineSquared,DotProduct,WhiteKernel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import RFE, SelectKBest, f_regression

In [4]:
import seaborn as sns

Import data from csv files into a dataframe

In [5]:
data = pd.read_csv('Combined_all_3rd_fit.csv', encoding='cp1252')
print(data.columns.values)

['Name' 'Coef_a' 'Coef_b' 'Coef_c' 'Coef_d' 'A_site' 'B_site' 'X_site'
 'Spacegroup' 'Ehull' 'BulkModulus' 'Energy' 'ZPE' 's_A' 's_B' 's_X'
 'density' 'mean_A2B' 'mean_A2X' 'mean_B2X' 'mean_X2X' 'std_A2B' 'std_A2X'
 'std_B2X' 'std_X2X' 'E_coh' 'TF' 'OF' 'A_Z' 'B_Z' 'X_Z' 'A_M' 'B_M' 'X_M'
 'A_G' 'B_G' 'X_G' 'A_IEI' 'B_IEI' 'X_IEI' 'A_IEII' 'B_IEII' 'X_IEII'
 'A_EA' 'B_EA' 'X_EA' 'A_ChiP' 'B_ChiP' 'X_ChiP' 'A_ChiA' 'X_ChiA'
 'A_Rvdw' 'B_Rvdw' 'X_Rvdw' 'A_Rc' 'B_Rc' 'X_Rc' 'A_Ra' 'B_Ra' 'X_Ra'
 'A_MP' 'B_MP' 'X_MP' 'A_BP' 'B_BP' 'X_BP' 'A_Rho' 'B_Rho' 'A_MV' 'B_MV'
 'X_MV' 'A_Hf' 'B_Hf' 'X_Hf' 'A_Hv' 'B_Hv' 'X_Hv' 'A_Kappa' 'B_Kappa'
 'X_Kappa' 'A_CvM' 'B_CvM' 'X_CvM' 'A_B' 'B_B' 'X_B' 'A_MendeleevNo'
 'B_MendeleevNo' 'X_MendeleevNo']


Drop the unnecessary columns and fill the empty cells with zero

In [6]:
data.drop(['Name', 'A_site', 'B_site', 'X_site', 'Spacegroup','BulkModulus',
           'Ehull','Energy','ZPE','Coef_a', 'Coef_c', 'Coef_d'], axis=1, inplace = True)
data.fillna(0, inplace= True)
d=data.copy()
columns = list(d.columns.values)
print(d.shape)

(80, 77)


Set the target variable to machine learn

In [7]:
target = 'Coef_b'

Define various methods to be used for buildig and validating the models

In [8]:
"""
    Functions to perform scaling
    
    """
def standard_scaling(target):
    data_std=d.copy()
    data_std[columns]= StandardScaler().fit_transform(d[columns])
    Y = data_std[target] 
    X = data_std.drop([target], axis=1)
    return X,Y

def minmax_scaling(target):
    data_mm=d.copy()
    data_mm[columns]= MinMaxScaler().fit_transform(d[columns])
    Y = data_mm[target] 
    X = data_mm.drop([target], axis=1)
    return X,Y

In [9]:
"""
    Function to build machine learning models by hyper parameter tuning. It also plots scatter plot
    
    """
def build_cv_model(X,Y,b_drop=False,target='Coef_d'):
    if b_drop :
        correlated_features = ["A_Rc","A_Ra","A_M","A_MP","A_MV","A_MendeleevNo","A_Hf","A_Hv","B_Rc",
                       "B_Rvdw","B_M","B_BP","B_MendeleevNo","B_Hv","X_Rc","X_Rvdw","X_M","X_BP","X_MP",
                       "X_MendeleevNo","X_Hf","X_Hv","X_G","X_B","X_CvM","X_ChiP"]
        X.drop(labels=correlated_features, axis=1, inplace=True)
        
    print("The shape of X is ",X.shape)
    
    ml_model, cv_results = train_model(X,Y,hyperparams=hyperparams, cv=True, return_cv=True)
    
    return ml_model

In [10]:
def run_model(X,Y,model,target='Coef_d'):
    k_opt,rmse,r2 = run_cv(model, X, Y, n_cv = 5)
    print("-----------------------------------------------------")
    print("The R2 is ",r2)
    print("The RMSE is ",rmse)

In [11]:
"""
    Function to perform feature elimination using select K best of RFE method. It also prints the 
    feature scores obtained as:
    SelectKBest: The scores obtained from the selector
    RFE: The feature importance obtained from the RFR model 
    
    """
def feature_elimination(X,Y,n,estimator,method='skb'):
    if method=='skb':
        bestfeatures = SelectKBest(score_func=f_regression, k=10)
        fit = bestfeatures.fit(X,Y)
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(X.columns)
        #concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns,dfscores],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureScores.nlargest(n,'Score')['Feature'].values]
    elif method=='rfe':
        selector = RFE(estimator, n_features_to_select=n, step=1)
        selector = selector.fit(X, Y)
        dfscores = pd.DataFrame(selector.ranking_)
        dfcolumns = pd.DataFrame(selector.feature_names_in_)
        #concat two dataframes for better visualization 
        featureSelection = pd.concat([dfcolumns,dfscores],axis=1)
        featureSelection.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureSelection.nsmallest(n,'Score')['Feature'].values]
        estimator.fit(X,Y)
        importance = pd.DataFrame(estimator.feature_importances_)
        featureNames = pd.DataFrame(X.columns.values)
        featureScores = pd.concat([featureNames,importance],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        
    print(featureScores)
    print("-----------------------------------------------------")
    return X

In [12]:
def train_model(X, y, hyperparams = None, cv = False, return_cv = False):
    """
    Function to train the ML model on the given data X (features) and y (target property). 
    If hyperparams argument is passed, all possible combinations of alpha and kernel values
    will be tried to find combination with minimum CV error. The final  model is trained
    using the fixed kernel and alpha params as determined using CV.
    
    """
    
    if cv:
        cv_results = {'kernel':[],'alpha':[],'cv_rmse':[],'cv_r2':[]}
        for k,a in itertools.product(hyperparams['kernel'], 
                                           hyperparams['alpha']):

            gaussian_processes = GaussianProcessRegressor(optimizer='fmin_l_bfgs_b',random_state=50,
            kernel=k, alpha=a, n_restarts_optimizer=1)

            k_opt, cv_error, cv_r2 = run_cv(gaussian_processes, X, y, n_cv = 5)
            cv_results['cv_rmse'].append(cv_error)
            cv_results['cv_r2'].append(cv_r2)
            cv_results['kernel'].append(k_opt)
            cv_results['alpha'].append(a)

        cv_results = pd.DataFrame(cv_results)
        cv_results = cv_results.sort_values('cv_rmse')
        k_opt = cv_results.iloc[0]['kernel']
        a_opt = cv_results.iloc[0]['alpha']
        
    else:
        k_opt = hyperparams['kernel'][0]
        a_opt = hyperparams['alpha'][0]

    gaussian_processes = GaussianProcessRegressor(
            optimizer='fmin_l_bfgs_b',random_state=50,
            kernel=k_opt, alpha=a_opt, n_restarts_optimizer=1)
    model = gaussian_processes.fit(X,y)
    print("-----------------------------------------------------")
    print("The R2 for ",k_opt,",",a_opt," is ",cv_results.iloc[0]['cv_r2'])
    print("The RMSE for ",k_opt,",",a_opt," is ",cv_results.iloc[0]['cv_rmse'])
    return [model, cv_results] if return_cv else model


def run_cv(ml_model, X, y, n_cv = 5):
    """
    Function to run Cross-validation
    """
    kf = KFold(n_splits=n_cv,shuffle=True,random_state=50)
    y_val = []
    y_pred = []
    errors = []
    models = []

    for idx, (train, val) in enumerate(kf.split(X)):
        
        X_cv_train = X.values[train]
        X_cv_val = X.values[val]

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]    

        # Model fit and prediction
        model = ml_model.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        
        y_val.append(y_cv_val)
        y_pred.append(y_pred_val)
        
        # Computing errors
        rmse_val = np.sqrt(mean_squared_error(y_cv_val, y_pred_val))

        errors.append(rmse_val)
        models.append(model)

    # Computing errors
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    k_opt = models[np.argmin(errors)].kernel_
    
    return k_opt, rmse, r2

Define the hyperparameters to be tuned for the given algorithm

In [13]:
hyperparams={'kernel': [1 * RBF(length_scale=1),
                        1 + RBF(length_scale=1),
                        Exponentiation(RBF(length_scale=1), exponent=2),
                        1* Matern(length_scale=1.0, nu=1.5),
                        1 + Matern(length_scale=1.0, nu=1.5),
                        Exponentiation(Matern(length_scale=1.0, nu=1.5), exponent=2),
                        1*RationalQuadratic(length_scale=1.0, alpha=1.5),
                        1+RationalQuadratic(length_scale=1.0, alpha=1.5),
                        Exponentiation(RationalQuadratic(length_scale=1.0, alpha=1.5),exponent=2),
                        DotProduct() + WhiteKernel(),
                        RBF(length_scale=100.0)+ WhiteKernel(noise_level=1)],
             'alpha':[1E-5,0.001,0.01,0.1],}

## Without scaling

In [14]:
data_std=d.copy()
Y = data_std[target] 
X = data_std.drop([target], axis=1)

In [15]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)












ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

-----------------------------------------------------
The R2 for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  0.6499439950709109
The RMSE for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  2.3883180005549462e-05




### Drop correlated features

In [16]:
ml_model_prime=build_cv_model(X,Y,True,target)



The shape of X is  (80, 50)












ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st



-----------------------------------------------------
The R2 for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  0.4214553245004551
The RMSE for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  2.9813974751285917e-05




### Feature elimination

In [17]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0        s_A   51.729561
1        s_B   59.910760
2        s_X   92.541192
3    density    0.920746
4   mean_A2B  126.044006
5   mean_A2X   51.569746
6   mean_B2X    0.226120
7   mean_X2X   75.264201
8    std_A2B    6.140295
9    std_A2X   17.648197
10   std_B2X   13.967425
11   std_X2X    3.509274
12     E_coh   33.972057
13        TF    2.221933
14        OF   12.249707
15       A_Z   12.150767
16       B_Z    9.949890
17       X_Z   91.454183
18       A_G    4.721854
19       B_G    1.232756
20     A_IEI   38.391765
21     B_IEI    0.860003
22     X_IEI    3.817711
23    A_IEII    0.004706
24    B_IEII    1.325940
25    X_IEII   24.989071
26      A_EA   12.418282
27      B_EA    3.303905
28      X_EA  103.886132
29    A_ChiP   16.845472
30    B_ChiP    1.949703
31    A_ChiA   20.970549
32    X_ChiA    4.797986
33    A_Rvdw   57.675309
34      B_Ra   13.090297
35      X_Ra   93.260694
36      B_MP   16.262954
37      A_BP   13.449987
38     A_Rho    1.321795










ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st



-----------------------------------------------------
The R2 for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  0.4783677833598214
The RMSE for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  2.8766763973962887e-05




In [18]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
20      B_Ra   13.090297
21      A_EA   12.418282
22        OF   12.249707
23       A_Z   12.150767
24   X_Kappa   10.287385
25       B_Z    9.949890
26     A_CvM    8.675611
27   A_Kappa    8.297802
28   B_Kappa    6.966805
29      B_MV    6.707130
30   std_A2B    6.140295
31      X_MV    6.058738
32       A_B    5.295637
33    X_ChiA    4.797986
34       A_G    4.721854
35     B_CvM    3.909868
36     X_IEI    3.817711
37     B_Rho    3.675736
38   std_X2X    3.509274










-----------------------------------------------------
The R2 for  DotProduct(sigma_0=0.00477) + WhiteKernel(noise_level=1e-05) , 0.1  is  0.2539476556351173
The RMSE for  DotProduct(sigma_0=0.00477) + WhiteKernel(noise_level=1e-05) , 0.1  is  2.8636463740202358e-05




In [19]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
20      B_Ra   13.090297
21      A_EA   12.418282
22        OF   12.249707
23       A_Z   12.150767
24   X_Kappa   10.287385
25       B_Z    9.949890
26     A_CvM    8.675611
27   A_Kappa    8.297802
28   B_Kappa    6.966805
29      B_MV    6.707130
-----------------------------------------------------
The shape of X is  (80, 20)


















-----------------------------------------------------
The R2 for  DotProduct(sigma_0=0.000261) + WhiteKernel(noise_level=1e-05) , 0.001  is  0.6324901666046967
The RMSE for  DotProduct(sigma_0=0.000261) + WhiteKernel(noise_level=1e-05) , 0.001  is  1.9370325637694617e-05




In [20]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
-----------------------------------------------------
The shape of X is  (80, 10)




















-----------------------------------------------------
The R2 for  DotProduct(sigma_0=0.00166) + WhiteKernel(noise_level=1e-05) , 0.1  is  0.5240231064036531
The RMSE for  DotProduct(sigma_0=0.00166) + WhiteKernel(noise_level=1e-05) , 0.1  is  2.1549576124975075e-05




## Standard scaling

In [21]:
X,Y = standard_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=16.6, nu=1.5) , 0.001  is  0.8514431305677771
The RMSE for  0.00316**2 + Matern(length_scale=16.6, nu=1.5) , 0.001  is  0.28322165805224986




### Drop correlated features

In [22]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)












-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=13.2, nu=1.5) , 0.001  is  0.8343703696578271
The RMSE for  0.00316**2 + Matern(length_scale=13.2, nu=1.5) , 0.001  is  0.29137288627805746




### Feature elimination

In [23]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0        s_A   51.729561
1        s_B   59.910760
2        s_X   92.541192
3    density    0.920746
4   mean_A2B  126.044006
5   mean_A2X   51.569746
6   mean_B2X    0.226120
7   mean_X2X   75.264201
8    std_A2B    6.140295
9    std_A2X   17.648197
10   std_B2X   13.967425
11   std_X2X    3.509274
12     E_coh   33.972057
13        TF    2.221933
14        OF   12.249707
15       A_Z   12.150767
16       B_Z    9.949890
17       X_Z   91.454183
18       A_G    4.721854
19       B_G    1.232756
20     A_IEI   38.391765
21     B_IEI    0.860003
22     X_IEI    3.817711
23    A_IEII    0.004706
24    B_IEII    1.325940
25    X_IEII   24.989071
26      A_EA   12.418282
27      B_EA    3.303905
28      X_EA  103.886132
29    A_ChiP   16.845472
30    B_ChiP    1.949703
31    A_ChiA   20.970549
32    X_ChiA    4.797986
33    A_Rvdw   57.675309
34      B_Ra   13.090297
35      X_Ra   93.260694
36      B_MP   16.262954
37      A_BP   13.449987
38     A_Rho    1.321795




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)






-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=11.9, nu=1.5) , 0.01  is  0.8403862306617426
The RMSE for  0.00316**2 + Matern(length_scale=11.9, nu=1.5) , 0.01  is  0.28617774510434546
-----------------------------------------------------
The R2 is  0.8429121883357844
The RMSE is  0.2885593636462619




In [24]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
20      B_Ra   13.090297
21      A_EA   12.418282
22        OF   12.249707
23       A_Z   12.150767
24   X_Kappa   10.287385
25       B_Z    9.949890
26     A_CvM    8.675611
27   A_Kappa    8.297802
28   B_Kappa    6.966805
29      B_MV    6.707130
30   std_A2B    6.140295
31      X_MV    6.058738
32       A_B    5.295637
33    X_ChiA    4.797986
34       A_G    4.721854
35     B_CvM    3.909868
36     X_IEI    3.817711
37     B_Rho    3.675736
38   std_X2X    3.509274








-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=9.98, nu=1.5) , 0.01  is  0.8608647524288496
The RMSE for  0.00316**2 + Matern(length_scale=9.98, nu=1.5) , 0.01  is  0.27375800802259537
-----------------------------------------------------
The R2 is  0.8697484585001529
The RMSE is  0.27536124568078846




In [25]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
20      B_Ra   13.090297
21      A_EA   12.418282
22        OF   12.249707
23       A_Z   12.150767
24   X_Kappa   10.287385
25       B_Z    9.949890
26     A_CvM    8.675611
27   A_Kappa    8.297802
28   B_Kappa    6.966805
29      B_MV    6.707130
-----------------------------------------------------
The shape of X is  (80, 20)








-----------------------------------------------------
The R2 for  1.33**2 * RationalQuadratic(alpha=0.181, length_scale=4.78) , 1e-05  is  0.7773245018084795
The RMSE for  1.33**2 * RationalQuadratic(alpha=0.181, length_scale=4.78) , 1e-05  is  0.27791223660032044
-----------------------------------------------------
The R2 is  -0.36395227728544477
The RMSE is  0.8816079593920372


In [26]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
-----------------------------------------------------
The shape of X is  (80, 10)








-----------------------------------------------------
The R2 for  0.00339**2 + Matern(length_scale=3.02, nu=1.5) , 0.01  is  0.7268059885436353
The RMSE for  0.00339**2 + Matern(length_scale=3.02, nu=1.5) , 0.01  is  0.2890048046191134
-----------------------------------------------------
The R2 is  -0.7948530734579535
The RMSE is  1.0000012125438058




## Minmax scaling

In [27]:
X,Y = minmax_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)






-----------------------------------------------------
The R2 for  0.637**2 * RationalQuadratic(alpha=0.116, length_scale=5.94) , 1e-05  is  0.8544259822207116
The RMSE for  0.637**2 * RationalQuadratic(alpha=0.116, length_scale=5.94) , 1e-05  is  0.07324478802438002




### Drop correlated features

In [28]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)








-----------------------------------------------------
The R2 for  0.412**2 * RBF(length_scale=2.57) , 1e-05  is  0.8495532201194836
The RMSE for  0.412**2 * RBF(length_scale=2.57) , 1e-05  is  0.07133884827735683




### Feature elimination

In [29]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0        s_A   51.729561
1        s_B   59.910760
2        s_X   92.541192
3    density    0.920746
4   mean_A2B  126.044006
5   mean_A2X   51.569746
6   mean_B2X    0.226120
7   mean_X2X   75.264201
8    std_A2B    6.140295
9    std_A2X   17.648197
10   std_B2X   13.967425
11   std_X2X    3.509274
12     E_coh   33.972057
13        TF    2.221933
14        OF   12.249707
15       A_Z   12.150767
16       B_Z    9.949890
17       X_Z   91.454183
18       A_G    4.721854
19       B_G    1.232756
20     A_IEI   38.391765
21     B_IEI    0.860003
22     X_IEI    3.817711
23    A_IEII    0.004706
24    B_IEII    1.325940
25    X_IEII   24.989071
26      A_EA   12.418282
27      B_EA    3.303905
28      X_EA  103.886132
29    A_ChiP   16.845472
30    B_ChiP    1.949703
31    A_ChiA   20.970549
32    X_ChiA    4.797986
33    A_Rvdw   57.675309
34      B_Ra   13.090297
35      X_Ra   93.260694
36      B_MP   16.262954
37      A_BP   13.449987
38     A_Rho    1.321795




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




-----------------------------------------------------
The R2 for  0.674**2 * RationalQuadratic(alpha=0.122, length_scale=4.74) , 0.001  is  0.8431531084261943
The RMSE for  0.674**2 * RationalQuadratic(alpha=0.122, length_scale=4.74) , 0.001  is  0.07400690225981804
-----------------------------------------------------
The R2 is  0.8218183796287433
The RMSE is  0.07797616132470543




In [30]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
20      B_Ra   13.090297
21      A_EA   12.418282
22        OF   12.249707
23       A_Z   12.150767
24   X_Kappa   10.287385
25       B_Z    9.949890
26     A_CvM    8.675611
27   A_Kappa    8.297802
28   B_Kappa    6.966805
29      B_MV    6.707130
30   std_A2B    6.140295
31      X_MV    6.058738
32       A_B    5.295637
33    X_ChiA    4.797986
34       A_G    4.721854
35     B_CvM    3.909868
36     X_IEI    3.817711
37     B_Rho    3.675736
38   std_X2X    3.509274








-----------------------------------------------------
The R2 for  0.00316**2 + RationalQuadratic(alpha=0.053, length_scale=5.85) , 0.001  is  0.8416915381523784
The RMSE for  0.00316**2 + RationalQuadratic(alpha=0.053, length_scale=5.85) , 0.001  is  0.07477204296308418
-----------------------------------------------------
The R2 is  0.7963537886424159
The RMSE is  0.08556314841021466




In [31]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
20      B_Ra   13.090297
21      A_EA   12.418282
22        OF   12.249707
23       A_Z   12.150767
24   X_Kappa   10.287385
25       B_Z    9.949890
26     A_CvM    8.675611
27   A_Kappa    8.297802
28   B_Kappa    6.966805
29      B_MV    6.707130
-----------------------------------------------------
The shape of X is  (80, 20)




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




-----------------------------------------------------
The R2 for  0.00316**2 + RationalQuadratic(alpha=0.0224, length_scale=3.59) , 1e-05  is  0.7926727336569065
The RMSE for  0.00316**2 + RationalQuadratic(alpha=0.0224, length_scale=3.59) , 1e-05  is  0.07121272900868511
-----------------------------------------------------
The R2 is  -2.101865715335016
The RMSE is  0.27465851913032513




In [32]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  126.044006
1       X_EA  103.886132
2       X_Ra   93.260694
3        s_X   92.541192
4        X_Z   91.454183
5   mean_X2X   75.264201
6        s_B   59.910760
7     A_Rvdw   57.675309
8        s_A   51.729561
9   mean_A2X   51.569746
10     A_IEI   38.391765
11     E_coh   33.972057
12    X_IEII   24.989071
13    A_ChiA   20.970549
14      B_Hf   20.432377
15   std_A2X   17.648197
16    A_ChiP   16.845472
17      B_MP   16.262954
18   std_B2X   13.967425
19      A_BP   13.449987
-----------------------------------------------------
The shape of X is  (80, 10)








-----------------------------------------------------
The R2 for  0.00316**2 + RationalQuadratic(alpha=0.0175, length_scale=2.03) , 0.001  is  0.7422063301583497
The RMSE for  0.00316**2 + RationalQuadratic(alpha=0.0175, length_scale=2.03) , 0.001  is  0.07720945908937378
-----------------------------------------------------
The R2 is  -6.26186709717347
The RMSE is  0.3949049144572068


