# Model selection

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF,Exponentiation,Matern,RationalQuadratic,ExpSineSquared,DotProduct,WhiteKernel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import RFE, SelectKBest, f_regression

In [4]:
import seaborn as sns

Import data from csv files into a dataframe

In [5]:
data = pd.read_csv('Combined_all_3rd_fit.csv', encoding='cp1252')
print(data.columns.values)

['Name' 'Coef_a' 'Coef_b' 'Coef_c' 'Coef_d' 'A_site' 'B_site' 'X_site'
 'Spacegroup' 'Ehull' 'BulkModulus' 'Energy' 'ZPE' 's_A' 's_B' 's_X'
 'density' 'mean_A2B' 'mean_A2X' 'mean_B2X' 'mean_X2X' 'std_A2B' 'std_A2X'
 'std_B2X' 'std_X2X' 'E_coh' 'TF' 'OF' 'A_Z' 'B_Z' 'X_Z' 'A_M' 'B_M' 'X_M'
 'A_G' 'B_G' 'X_G' 'A_IEI' 'B_IEI' 'X_IEI' 'A_IEII' 'B_IEII' 'X_IEII'
 'A_EA' 'B_EA' 'X_EA' 'A_ChiP' 'B_ChiP' 'X_ChiP' 'A_ChiA' 'X_ChiA'
 'A_Rvdw' 'B_Rvdw' 'X_Rvdw' 'A_Rc' 'B_Rc' 'X_Rc' 'A_Ra' 'B_Ra' 'X_Ra'
 'A_MP' 'B_MP' 'X_MP' 'A_BP' 'B_BP' 'X_BP' 'A_Rho' 'B_Rho' 'A_MV' 'B_MV'
 'X_MV' 'A_Hf' 'B_Hf' 'X_Hf' 'A_Hv' 'B_Hv' 'X_Hv' 'A_Kappa' 'B_Kappa'
 'X_Kappa' 'A_CvM' 'B_CvM' 'X_CvM' 'A_B' 'B_B' 'X_B' 'A_MendeleevNo'
 'B_MendeleevNo' 'X_MendeleevNo']


Drop the unnecessary columns and fill the empty cells with zero

In [6]:
data.drop(['Name', 'A_site', 'B_site', 'X_site', 'Spacegroup','BulkModulus',
           'Ehull','Energy','ZPE','Coef_b', 'Coef_c', 'Coef_d'], axis=1, inplace = True)
data.fillna(0, inplace= True)
d=data.copy()
columns = list(d.columns.values)
print(d.shape)

(80, 77)


Set the target variable to machine learn

In [7]:
target = 'Coef_a'

Define various methods to be used for buildig and validating the models

In [8]:
"""
    Functions to perform scaling
    
    """
def standard_scaling(target):
    data_std=d.copy()
    data_std[columns]= StandardScaler().fit_transform(d[columns])
    Y = data_std[target] 
    X = data_std.drop([target], axis=1)
    return X,Y

def minmax_scaling(target):
    data_mm=d.copy()
    data_mm[columns]= MinMaxScaler().fit_transform(d[columns])
    Y = data_mm[target] 
    X = data_mm.drop([target], axis=1)
    return X,Y

In [9]:
"""
    Function to build machine learning models by hyper parameter tuning. It also plots scatter plot
    
    """
def build_cv_model(X,Y,b_drop=False,target='Coef_d'):
    if b_drop :
        correlated_features = ["A_Rc","A_Ra","A_M","A_MP","A_MV","A_MendeleevNo","A_Hf","A_Hv","B_Rc",
                       "B_Rvdw","B_M","B_BP","B_MendeleevNo","B_Hv","X_Rc","X_Rvdw","X_M","X_BP","X_MP",
                       "X_MendeleevNo","X_Hf","X_Hv","X_G","X_B","X_CvM","X_ChiP"]
        X.drop(labels=correlated_features, axis=1, inplace=True)
        
    print("The shape of X is ",X.shape)
    
    ml_model, cv_results = train_model(X,Y,hyperparams=hyperparams, cv=True, return_cv=True)
    
    return ml_model

In [10]:
def run_model(X,Y,model,target='Coef_d'):
    k_opt,rmse,r2 = run_cv(model, X, Y, n_cv = 5)
    print("-----------------------------------------------------")
    print("The R2 is ",r2)
    print("The RMSE is ",rmse)

In [11]:
"""
    Function to perform feature elimination using select K best of RFE method. It also prints the 
    feature scores obtained as:
    SelectKBest: The scores obtained from the selector
    RFE: The feature importance obtained from the RFR model 
    
    """
def feature_elimination(X,Y,n,estimator,method='skb'):
    if method=='skb':
        bestfeatures = SelectKBest(score_func=f_regression, k=10)
        fit = bestfeatures.fit(X,Y)
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(X.columns)
        #concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns,dfscores],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureScores.nlargest(n,'Score')['Feature'].values]
    elif method=='rfe':
        selector = RFE(estimator, n_features_to_select=n, step=1)
        selector = selector.fit(X, Y)
        dfscores = pd.DataFrame(selector.ranking_)
        dfcolumns = pd.DataFrame(selector.feature_names_in_)
        #concat two dataframes for better visualization 
        featureSelection = pd.concat([dfcolumns,dfscores],axis=1)
        featureSelection.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureSelection.nsmallest(n,'Score')['Feature'].values]
        estimator.fit(X,Y)
        importance = pd.DataFrame(estimator.feature_importances_)
        featureNames = pd.DataFrame(X.columns.values)
        featureScores = pd.concat([featureNames,importance],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        
    print(featureScores)
    print("-----------------------------------------------------")
    return X

In [12]:
def train_model(X, y, hyperparams = None, cv = False, return_cv = False):
    """
    Function to train the ML model on the given data X (features) and y (target property). 
    If hyperparams argument is passed, all possible combinations of alpha and kernel values
    will be tried to find combination with minimum CV error. The final  model is trained
    using the fixed kernel and alpha params as determined using CV.
    
    """
    
    if cv:
        cv_results = {'kernel':[],'alpha':[],'cv_rmse':[],'cv_r2':[]}
        for k,a in itertools.product(hyperparams['kernel'], 
                                           hyperparams['alpha']):

            gaussian_processes = GaussianProcessRegressor(optimizer='fmin_l_bfgs_b',random_state=50,
            kernel=k, alpha=a, n_restarts_optimizer=1)

            k_opt, cv_error, cv_r2 = run_cv(gaussian_processes, X, y, n_cv = 5)
            cv_results['cv_rmse'].append(cv_error)
            cv_results['cv_r2'].append(cv_r2)
            cv_results['kernel'].append(k_opt)
            cv_results['alpha'].append(a)

        cv_results = pd.DataFrame(cv_results)
        cv_results = cv_results.sort_values('cv_rmse')
        k_opt = cv_results.iloc[0]['kernel']
        a_opt = cv_results.iloc[0]['alpha']
        
    else:
        k_opt = hyperparams['kernel'][0]
        a_opt = hyperparams['alpha'][0]

    gaussian_processes = GaussianProcessRegressor(
            optimizer='fmin_l_bfgs_b',random_state=50,
            kernel=k_opt, alpha=a_opt, n_restarts_optimizer=1)
    model = gaussian_processes.fit(X,y)
    print("-----------------------------------------------------")
    print("The R2 for ",k_opt,",",a_opt," is ",cv_results.iloc[0]['cv_r2'])
    print("The RMSE for ",k_opt,",",a_opt," is ",cv_results.iloc[0]['cv_rmse'])
    return [model, cv_results] if return_cv else model


def run_cv(ml_model, X, y, n_cv = 5):
    """
    Function to run Cross-validation
    """
    kf = KFold(n_splits=n_cv,shuffle=True,random_state=50)
    y_val = []
    y_pred = []
    errors = []
    models = []

    for idx, (train, val) in enumerate(kf.split(X)):
        
        X_cv_train = X.values[train]
        X_cv_val = X.values[val]

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]    

        # Model fit and prediction
        model = ml_model.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        
        y_val.append(y_cv_val)
        y_pred.append(y_pred_val)
        
        # Computing errors
        rmse_val = np.sqrt(mean_squared_error(y_cv_val, y_pred_val))

        errors.append(rmse_val)
        models.append(model)

    # Computing errors
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    k_opt = models[np.argmin(errors)].kernel_
    
    return k_opt, rmse, r2

Define the hyperparameters to be tuned for the given algorithm

In [13]:
hyperparams={'kernel': [1 * RBF(length_scale=1),
                        1 + RBF(length_scale=1),
                        Exponentiation(RBF(length_scale=1), exponent=2),
                        1* Matern(length_scale=1.0, nu=1.5),
                        1 + Matern(length_scale=1.0, nu=1.5),
                        Exponentiation(Matern(length_scale=1.0, nu=1.5), exponent=2),
                        1*RationalQuadratic(length_scale=1.0, alpha=1.5),
                        1+RationalQuadratic(length_scale=1.0, alpha=1.5),
                        Exponentiation(RationalQuadratic(length_scale=1.0, alpha=1.5),exponent=2),
                        DotProduct() + WhiteKernel(),
                        RBF(length_scale=100.0)+ WhiteKernel(noise_level=1)],
             'alpha':[1E-5,0.001,0.01,0.1],}

## Without scaling

In [14]:
data_std=d.copy()
Y = data_std[target] 
X = data_std.drop([target], axis=1)

In [15]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)










ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


-----------------------------------------------------
The R2 for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  0.5878222787820958
The RMSE for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  1.3055578095819063e-08




### Drop correlated features

In [16]:
ml_model_prime=build_cv_model(X,Y,True,target)



The shape of X is  (80, 50)










ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




-----------------------------------------------------
The R2 for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  0.32321243900694396
The RMSE for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  1.6204593806387867e-08




### Feature elimination

In [17]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0        s_A   56.272893
1        s_B   51.629862
2        s_X   97.521570
3    density    1.587649
4   mean_A2B  135.546281
5   mean_A2X   54.501373
6   mean_B2X    0.133049
7   mean_X2X   77.101385
8    std_A2B    6.331587
9    std_A2X   19.689607
10   std_B2X   13.447665
11   std_X2X    3.078784
12     E_coh   36.615505
13        TF    1.134618
14        OF    9.747189
15       A_Z   13.107769
16       B_Z    9.046251
17       X_Z   87.153415
18       A_G    5.190708
19       B_G    1.023577
20     A_IEI   41.774757
21     B_IEI    1.145381
22     X_IEI    5.135719
23    A_IEII    0.027469
24    B_IEII    1.235456
25    X_IEII   19.320300
26      A_EA   12.189494
27      B_EA    3.297773
28      X_EA   87.903070
29    A_ChiP   18.434576
30    B_ChiP    1.889334
31    A_ChiA   21.738026
32    X_ChiA    7.287500
33    A_Rvdw   64.765740
34      B_Ra   10.925759
35      X_Ra   88.664521
36      B_MP   15.078645
37      A_BP   14.590560
38     A_Rho    1.556167










ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/st



-----------------------------------------------------
The R2 for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  0.4103109636584606
The RMSE for  RBF(length_scale=1e+05) + WhiteKernel(noise_level=1e-05) , 1e-05  is  1.5525410501928884e-08


In [18]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
20       A_Z   13.107769
21   X_Kappa   12.936079
22      A_EA   12.189494
23      B_Ra   10.925759
24        OF    9.747189
25     A_CvM    9.336114
26       B_Z    9.046251
27   A_Kappa    8.888579
28   B_Kappa    7.588177
29    X_ChiA    7.287500
30   std_A2B    6.331587
31       A_B    5.989468
32      B_MV    5.917000
33      X_MV    5.412506
34       A_G    5.190708
35     X_IEI    5.135719
36     B_Rho    3.339847
37      B_EA    3.297773
38     B_CvM    3.125268










ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


-----------------------------------------------------
The R2 for  0.00329**2 + RationalQuadratic(alpha=1e-05, length_scale=1e+05) , 1e-05  is  0.35769925481029147
The RMSE for  0.00329**2 + RationalQuadratic(alpha=1e-05, length_scale=1e+05) , 1e-05  is  1.5095901278190806e-08




In [19]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
20       A_Z   13.107769
21   X_Kappa   12.936079
22      A_EA   12.189494
23      B_Ra   10.925759
24        OF    9.747189
25     A_CvM    9.336114
26       B_Z    9.046251
27   A_Kappa    8.888579
28   B_Kappa    7.588177
29    X_ChiA    7.287500
-----------------------------------------------------
The shape of X is  (80, 20)














-----------------------------------------------------
The R2 for  DotProduct(sigma_0=0.00248) + WhiteKernel(noise_level=1e-05) , 0.1  is  0.452711868476672
The RMSE for  DotProduct(sigma_0=0.00248) + WhiteKernel(noise_level=1e-05) , 0.1  is  1.3128778265089964e-08




In [20]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
#run_model(X,Y,ml_model_prime,target)



     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
-----------------------------------------------------
The shape of X is  (80, 10)






















-----------------------------------------------------
The R2 for  DotProduct(sigma_0=0.00166) + WhiteKernel(noise_level=1e-05) , 0.1  is  0.5218126297861303
The RMSE for  DotProduct(sigma_0=0.00166) + WhiteKernel(noise_level=1e-05) , 0.1  is  1.2432738592925744e-08




## Standard scaling

In [21]:
X,Y = standard_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)






ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=16.6, nu=1.5) , 0.01  is  0.8136000683909219
The RMSE for  0.00316**2 + Matern(length_scale=16.6, nu=1.5) , 0.01  is  0.34277709788849375




### Drop correlated features

In [22]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)












-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=12.6, nu=1.5) , 0.01  is  0.8017475342000077
The RMSE for  0.00316**2 + Matern(length_scale=12.6, nu=1.5) , 0.01  is  0.3414212154105168




### Feature elimination

In [23]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0        s_A   56.272893
1        s_B   51.629862
2        s_X   97.521570
3    density    1.587649
4   mean_A2B  135.546281
5   mean_A2X   54.501373
6   mean_B2X    0.133049
7   mean_X2X   77.101385
8    std_A2B    6.331587
9    std_A2X   19.689607
10   std_B2X   13.447665
11   std_X2X    3.078784
12     E_coh   36.615505
13        TF    1.134618
14        OF    9.747189
15       A_Z   13.107769
16       B_Z    9.046251
17       X_Z   87.153415
18       A_G    5.190708
19       B_G    1.023577
20     A_IEI   41.774757
21     B_IEI    1.145381
22     X_IEI    5.135719
23    A_IEII    0.027469
24    B_IEII    1.235456
25    X_IEII   19.320300
26      A_EA   12.189494
27      B_EA    3.297773
28      X_EA   87.903070
29    A_ChiP   18.434576
30    B_ChiP    1.889334
31    A_ChiA   21.738026
32    X_ChiA    7.287500
33    A_Rvdw   64.765740
34      B_Ra   10.925759
35      X_Ra   88.664521
36      B_MP   15.078645
37      A_BP   14.590560
38     A_Rho    1.556167


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=11, nu=1.5) , 0.01  is  0.8101276941197897
The RMSE for  0.00316**2 + Matern(length_scale=11, nu=1.5) , 0.01  is  0.3382046911507881
-----------------------------------------------------
The R2 is  0.8101276898224646
The RMSE is  0.33820469379540957




In [24]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
20       A_Z   13.107769
21   X_Kappa   12.936079
22      A_EA   12.189494
23      B_Ra   10.925759
24        OF    9.747189
25     A_CvM    9.336114
26       B_Z    9.046251
27   A_Kappa    8.888579
28   B_Kappa    7.588177
29    X_ChiA    7.287500
30   std_A2B    6.331587
31       A_B    5.989468
32      B_MV    5.917000
33      X_MV    5.412506
34       A_G    5.190708
35     X_IEI    5.135719
36     B_Rho    3.339847
37      B_EA    3.297773
38     B_CvM    3.125268




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)




-----------------------------------------------------
The R2 for  0.00316**2 + Matern(length_scale=8.86, nu=1.5) , 0.01  is  0.8243921742665313
The RMSE for  0.00316**2 + Matern(length_scale=8.86, nu=1.5) , 0.01  is  0.31942879897835025
-----------------------------------------------------
The R2 is  0.8243920543011468
The RMSE is  0.3194288643059133




In [25]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
20       A_Z   13.107769
21   X_Kappa   12.936079
22      A_EA   12.189494
23      B_Ra   10.925759
24        OF    9.747189
25     A_CvM    9.336114
26       B_Z    9.046251
27   A_Kappa    8.888579
28   B_Kappa    7.588177
29    X_ChiA    7.287500
-----------------------------------------------------
The shape of X is  (80, 20)








-----------------------------------------------------
The R2 for  RationalQuadratic(alpha=0.26, length_scale=4.74) ** 2 , 1e-05  is  0.7376548476240048
The RMSE for  RationalQuadratic(alpha=0.26, length_scale=4.74) ** 2 , 1e-05  is  0.3386721107470151
-----------------------------------------------------
The R2 is  0.7523104755651913
The RMSE is  0.3433983910498886




In [26]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
-----------------------------------------------------
The shape of X is  (80, 10)








-----------------------------------------------------
The R2 for  RationalQuadratic(alpha=0.209, length_scale=2.9) ** 2 , 0.01  is  0.6814138691189833
The RMSE for  RationalQuadratic(alpha=0.209, length_scale=2.9) ** 2 , 0.01  is  0.3291749379506852
-----------------------------------------------------
The R2 is  -0.39574771904233064
The RMSE is  0.8806314910932126




## Minmax scaling

In [27]:
X,Y = minmax_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)






-----------------------------------------------------
The R2 for  0.387**2 * RationalQuadratic(alpha=0.435, length_scale=3.86) , 1e-05  is  0.8079515376085019
The RMSE for  0.387**2 * RationalQuadratic(alpha=0.435, length_scale=3.86) , 1e-05  is  0.08230122691870613


### Drop correlated features

In [28]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)




ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


-----------------------------------------------------
The R2 for  0.00316**2 + RationalQuadratic(alpha=0.0394, length_scale=7.84) , 1e-05  is  0.8081806285065505
The RMSE for  0.00316**2 + RationalQuadratic(alpha=0.0394, length_scale=7.84) , 1e-05  is  0.07925798782847711




### Feature elimination

In [29]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0        s_A   56.272893
1        s_B   51.629862
2        s_X   97.521570
3    density    1.587649
4   mean_A2B  135.546281
5   mean_A2X   54.501373
6   mean_B2X    0.133049
7   mean_X2X   77.101385
8    std_A2B    6.331587
9    std_A2X   19.689607
10   std_B2X   13.447665
11   std_X2X    3.078784
12     E_coh   36.615505
13        TF    1.134618
14        OF    9.747189
15       A_Z   13.107769
16       B_Z    9.046251
17       X_Z   87.153415
18       A_G    5.190708
19       B_G    1.023577
20     A_IEI   41.774757
21     B_IEI    1.145381
22     X_IEI    5.135719
23    A_IEII    0.027469
24    B_IEII    1.235456
25    X_IEII   19.320300
26      A_EA   12.189494
27      B_EA    3.297773
28      X_EA   87.903070
29    A_ChiP   18.434576
30    B_ChiP    1.889334
31    A_ChiA   21.738026
32    X_ChiA    7.287500
33    A_Rvdw   64.765740
34      B_Ra   10.925759
35      X_Ra   88.664521
36      B_MP   15.078645
37      A_BP   14.590560
38     A_Rho    1.556167










-----------------------------------------------------
The R2 for  RationalQuadratic(alpha=0.0211, length_scale=10.5) ** 2 , 0.001  is  0.8058460628565824
The RMSE for  RationalQuadratic(alpha=0.0211, length_scale=10.5) ** 2 , 0.001  is  0.07843473022150789
-----------------------------------------------------
The R2 is  0.815789684650753
The RMSE is  0.07935560057783646




In [30]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
20       A_Z   13.107769
21   X_Kappa   12.936079
22      A_EA   12.189494
23      B_Ra   10.925759
24        OF    9.747189
25     A_CvM    9.336114
26       B_Z    9.046251
27   A_Kappa    8.888579
28   B_Kappa    7.588177
29    X_ChiA    7.287500
30   std_A2B    6.331587
31       A_B    5.989468
32      B_MV    5.917000
33      X_MV    5.412506
34       A_G    5.190708
35     X_IEI    5.135719
36     B_Rho    3.339847
37      B_EA    3.297773
38     B_CvM    3.125268










-----------------------------------------------------
The R2 for  0.00316**2 + RationalQuadratic(alpha=0.0391, length_scale=6.16) , 0.001  is  0.8060382869550266
The RMSE for  0.00316**2 + RationalQuadratic(alpha=0.0391, length_scale=6.16) , 0.001  is  0.0763455677818159
-----------------------------------------------------
The R2 is  0.8270321372864189
The RMSE is  0.0771541463373296




In [31]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
20       A_Z   13.107769
21   X_Kappa   12.936079
22      A_EA   12.189494
23      B_Ra   10.925759
24        OF    9.747189
25     A_CvM    9.336114
26       B_Z    9.046251
27   A_Kappa    8.888579
28   B_Kappa    7.588177
29    X_ChiA    7.287500
-----------------------------------------------------
The shape of X is  (80, 20)










-----------------------------------------------------
The R2 for  0.00316**2 + RationalQuadratic(alpha=0.0184, length_scale=3.74) , 1e-05  is  0.7586405204133269
The RMSE for  0.00316**2 + RationalQuadratic(alpha=0.0184, length_scale=3.74) , 1e-05  is  0.07689698854389693
-----------------------------------------------------
The R2 is  0.7586404692858638
The RMSE is  0.07689699363847312




In [32]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  135.546281
1        s_X   97.521570
2       X_Ra   88.664521
3       X_EA   87.903070
4        X_Z   87.153415
5   mean_X2X   77.101385
6     A_Rvdw   64.765740
7        s_A   56.272893
8   mean_A2X   54.501373
9        s_B   51.629862
10     A_IEI   41.774757
11     E_coh   36.615505
12    A_ChiA   21.738026
13   std_A2X   19.689607
14    X_IEII   19.320300
15      B_Hf   18.855125
16    A_ChiP   18.434576
17      B_MP   15.078645
18      A_BP   14.590560
19   std_B2X   13.447665
-----------------------------------------------------
The shape of X is  (80, 10)








-----------------------------------------------------
The R2 for  0.493**2 * RationalQuadratic(alpha=0.0957, length_scale=1.2) , 0.001  is  0.721931947247573
The RMSE for  0.493**2 * RationalQuadratic(alpha=0.0957, length_scale=1.2) , 0.001  is  0.07967535443733824




-----------------------------------------------------
The R2 is  0.7200319039098315
The RMSE is  0.07973078522845906


