# Model selection

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import RFE, SelectKBest, f_regression

In [4]:
import seaborn as sns

Import data from csv files into a dataframe

In [5]:
data = pd.read_csv('Combined_all_3rd_fit.csv', encoding='cp1252')
print(data.columns.values)

['Name' 'Coef_a' 'Coef_b' 'Coef_c' 'Coef_d' 'A_site' 'B_site' 'X_site'
 'Spacegroup' 'Ehull' 'BulkModulus' 'Energy' 'ZPE' 's_A' 's_B' 's_X'
 'density' 'mean_A2B' 'mean_A2X' 'mean_B2X' 'mean_X2X' 'std_A2B' 'std_A2X'
 'std_B2X' 'std_X2X' 'E_coh' 'TF' 'OF' 'A_Z' 'B_Z' 'X_Z' 'A_M' 'B_M' 'X_M'
 'A_G' 'B_G' 'X_G' 'A_IEI' 'B_IEI' 'X_IEI' 'A_IEII' 'B_IEII' 'X_IEII'
 'A_EA' 'B_EA' 'X_EA' 'A_ChiP' 'B_ChiP' 'X_ChiP' 'A_ChiA' 'X_ChiA'
 'A_Rvdw' 'B_Rvdw' 'X_Rvdw' 'A_Rc' 'B_Rc' 'X_Rc' 'A_Ra' 'B_Ra' 'X_Ra'
 'A_MP' 'B_MP' 'X_MP' 'A_BP' 'B_BP' 'X_BP' 'A_Rho' 'B_Rho' 'A_MV' 'B_MV'
 'X_MV' 'A_Hf' 'B_Hf' 'X_Hf' 'A_Hv' 'B_Hv' 'X_Hv' 'A_Kappa' 'B_Kappa'
 'X_Kappa' 'A_CvM' 'B_CvM' 'X_CvM' 'A_B' 'B_B' 'X_B' 'A_MendeleevNo'
 'B_MendeleevNo' 'X_MendeleevNo']


Drop the unnecessary columns and fill the empty cells with zero

In [6]:
data.drop(['Name', 'A_site', 'B_site', 'X_site', 'Spacegroup','BulkModulus',
           'Ehull','Energy','ZPE','Coef_a', 'Coef_c', 'Coef_d'], axis=1, inplace = True)
data.fillna(0, inplace= True)
d=data.copy()
columns = list(d.columns.values)
print(d.shape)

(80, 77)


Set the target variable to machine learn

In [7]:
target = 'Coef_b'

Define various methods to be used for buildig and validating the models

In [8]:
"""
    Functions to perform scaling
    
    """
def standard_scaling(target):
    data_std=d.copy()
    data_std[columns]= StandardScaler().fit_transform(d[columns])
    Y = data_std[target] 
    X = data_std.drop([target], axis=1)
    return X,Y

def minmax_scaling(target):
    data_mm=d.copy()
    data_mm[columns]= MinMaxScaler().fit_transform(d[columns])
    Y = data_mm[target] 
    X = data_mm.drop([target], axis=1)
    return X,Y

In [9]:
"""
    Function to build machine learning models by hyper parameter tuning. It also plots scatter plot
    
    """
def build_cv_model(X,Y,b_drop=False,target='Coef_d'):
    if b_drop :
        correlated_features = ["A_Rc","A_Ra","A_M","A_MP","A_MV","A_MendeleevNo","A_Hf","A_Hv","B_Rc",
                       "B_Rvdw","B_M","B_BP","B_MendeleevNo","B_Hv","X_Rc","X_Rvdw","X_M","X_BP","X_MP",
                       "X_MendeleevNo","X_Hf","X_Hv","X_G","X_B","X_CvM","X_ChiP"]
        X.drop(labels=correlated_features, axis=1, inplace=True)
        
    print("The shape of X is ",X.shape)
    
    ml_model, cv_results = train_model(X,Y,hyperparams=hyperparams, cv=True, return_cv=True)
    
    return ml_model

In [10]:
def run_model(X,Y,model,target='Coef_d'):
    rmse,r2 = run_cv(model, X, Y, n_cv = 5)
    print("-----------------------------------------------------")
    print("The R2 is ",r2)
    print("The RMSE is ",rmse)

In [11]:
"""
    Function to perform feature elimination using select K best of RFE method. It also prints the 
    feature scores obtained as:
    SelectKBest: The scores obtained from the selector
    RFE: The feature importance obtained from the RFR model 
    
    """
def feature_elimination(X,Y,n,estimator,method='skb'):
    if method=='skb':
        bestfeatures = SelectKBest(score_func=f_regression, k=10)
        fit = bestfeatures.fit(X,Y)
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(X.columns)
        #concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns,dfscores],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        print(featureScores.nlargest(n,'Score'))  #print 10 best features
        X=X[featureScores.nlargest(n,'Score')['Feature'].values]
    elif method=='rfe':
        selector = RFE(estimator, n_features_to_select=n, step=1)
        selector = selector.fit(X, Y)
        dfscores = pd.DataFrame(selector.ranking_)
        dfcolumns = pd.DataFrame(selector.feature_names_in_)
        #concat two dataframes for better visualization 
        featureSelection = pd.concat([dfcolumns,dfscores],axis=1)
        featureSelection.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureSelection.nsmallest(n,'Score')['Feature'].values]
        estimator.fit(X,Y)
        importance = pd.DataFrame(estimator.feature_importances_)
        featureNames = pd.DataFrame(X.columns.values)
        featureScores = pd.concat([featureNames,importance],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        
    print(featureScores)
    print("-----------------------------------------------------")
    return X

In [12]:
def train_model(X, y, hyperparams = None, cv = False, return_cv = False):
    """
    Function to train the ML model on the given data X (features) and y (target property). 
    If hyperparams argument is passed, all possible combinations of alpha and kernel values
    will be tried to find combination with minimum CV error. The final  model is trained
    using the fixed kernel and alpha params as determined using CV.
    
    """
    
    if cv:
        cv_results = {'n_estimators':[],'criterion':[],'min_samples_split':[],
                      'min_samples_leaf':[],'max_features':[],'cv_rmse':[],'cv_r2':[]}
        for n,c,s,l,f in itertools.product(hyperparams['n_estimators'], 
                                           hyperparams['criterion'],hyperparams['min_samples_split'],
                                          hyperparams['min_samples_leaf'],hyperparams['max_features']):

            random_forests = RandomForestRegressor(
            n_estimators=n, criterion=c, min_samples_split=s, min_samples_leaf=l, max_features=f)

            cv_error, cv_r2 = run_cv(random_forests, X, y, n_cv = 5)
            cv_results['cv_rmse'].append(cv_error)
            cv_results['cv_r2'].append(cv_r2)
            cv_results['n_estimators'].append(n)
            cv_results['criterion'].append(c)
            cv_results['min_samples_split'].append(s)
            cv_results['min_samples_leaf'].append(l)
            cv_results['max_features'].append(f)

        cv_results = pd.DataFrame(cv_results)
        cv_results = cv_results.sort_values('cv_rmse')
        n_opt = cv_results.iloc[0]['n_estimators']
        c_opt = cv_results.iloc[0]['criterion']
        s_opt = cv_results.iloc[0]['min_samples_split']
        l_opt = cv_results.iloc[0]['min_samples_leaf']
        f_opt = cv_results.iloc[0]['max_features']
        
    else:
        n_opt = hyperparams['n_estimators'][0]
        c_opt = hyperparams['criterion'][0]
        s_opt = hyperparams['min_samples_split'][0]
        l_opt = hyperparams['min_samples_leaf'][0]
        f_opt = hyperparams['max_features'][0]

    random_forests = RandomForestRegressor(
            n_estimators=n_opt, criterion=c_opt, min_samples_split=s_opt, min_samples_leaf=l_opt, max_features=f_opt)
    model = random_forests.fit(X,y)
    print("-----------------------------------------------------")
    print("The R2 for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_r2'])
    print("The RMSE for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_rmse'])
    return [model, cv_results] if return_cv else model


def run_cv(ml_model, X, y, n_cv = 5):
    """
    Function to run Cross-validation
    """
    kf = KFold(n_splits=n_cv,shuffle=True,random_state=50)
    y_val = []
    y_pred = []

    for idx, (train, val) in enumerate(kf.split(X)):
        
        X_cv_train = X.values[train]
        X_cv_val = X.values[val]

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]    

        # Model fit and prediction
        model = ml_model.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        
        y_val.append(y_cv_val)
        y_pred.append(y_pred_val)

    # Computing errors
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    
    return rmse, r2

Define the hyperparameters to be tuned for the given algorithm

In [13]:
hyperparams={'n_estimators': [10,100],
             'criterion':["squared_error", "absolute_error", "friedman_mse"],
            'min_samples_split':[2,4,8],
            'min_samples_leaf':[1,2,4,8],
            'max_features':["sqrt","log2",0.3,1]}

## Without scaling

In [14]:
data_std=d.copy()
Y = data_std[target] 
X = data_std.drop([target], axis=1)

In [15]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  10 , squared_error , 4 , 1 , 0.3  is  0.7952517014618457
The RMSE for  10 , squared_error , 4 , 1 , 0.3  is  1.710532379619482e-05


### Drop correlated features

In [16]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  10 , squared_error , 2 , 1 , sqrt  is  0.7911047281514435
The RMSE for  10 , squared_error , 2 , 1 , sqrt  is  1.6592635470405666e-05


### Feature elimination

In [17]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.068558
1        s_B  0.055535
2        s_X  0.001677
3    density  0.010372
4   mean_A2B  0.032260
5   mean_A2X  0.129708
6   mean_B2X  0.004168
7   mean_X2X  0.004187
8    std_A2B  0.001461
9    std_A2X  0.010932
10   std_X2X  0.001357
11     E_coh  0.023421
12        OF  0.003454
13       A_Z  0.028011
14       B_Z  0.004365
15       X_Z  0.084386
16       B_G  0.010691
17     B_IEI  0.001666
18     X_IEI  0.075030
19    B_IEII  0.012768
20    X_IEII  0.067367
21      A_EA  0.000534
22      B_EA  0.008834
23      X_EA  0.172309
24    A_ChiP  0.009972
25    B_ChiP  0.000626
26    A_ChiA  0.015945
27    X_ChiA  0.045401
28    A_Rvdw  0.000846
29      B_Ra  0.000826
30      A_BP  0.042298
31     B_Rho  0.011056
32      B_MV  0.002966
33      X_MV  0.013609
34      B_Hf  0.013316
35   A_Kappa  0.000660
36   B_Kappa  0.003657
37     A_CvM  0.002811
38     B_CvM  0.018401
39       B_B  0.004562
-----------------------------------------------------
The

In [18]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.025253
1        s_B  0.030726
2        s_X  0.053797
3    density  0.027343
4   mean_A2B  0.059548
5   mean_A2X  0.050768
6   mean_B2X  0.016601
7   mean_X2X  0.068987
8      E_coh  0.076207
9         OF  0.036188
10       A_Z  0.012421
11       B_Z  0.019614
12       X_Z  0.064217
13       B_G  0.013082
14     X_IEI  0.065970
15    B_IEII  0.016738
16    X_IEII  0.074445
17      X_EA  0.042651
18    B_ChiP  0.014624
19    A_ChiA  0.029235
20    X_ChiA  0.018174
21    A_Rvdw  0.020649
22      B_Ra  0.015668
23      A_BP  0.027511
24     B_Rho  0.020968
25      B_MV  0.015044
26      X_MV  0.028849
27      B_Hf  0.014989
28   B_Kappa  0.023397
29     B_CvM  0.016333
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  10 , squared_error , 2 , 1 , sqrt  is  0.7491817306105445
The RMSE for  10 , squared_error , 2 , 1 , sqrt  is  1.7043259233265106e-05
-----

In [19]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.095299
1        s_B  0.019101
2        s_X  0.055738
3    density  0.023739
4   mean_A2B  0.015143
5   mean_A2X  0.004777
6   mean_X2X  0.069583
7      E_coh  0.096428
8         OF  0.017203
9        A_Z  0.004290
10     X_IEI  0.278233
11    X_IEII  0.077086
12      X_EA  0.110091
13    A_ChiA  0.017204
14    A_Rvdw  0.016870
15      B_Ra  0.007349
16     B_Rho  0.014532
17      B_MV  0.022668
18      X_MV  0.026789
19     B_CvM  0.027878
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , sqrt  is  0.7523853423225629
The RMSE for  100 , friedman_mse , 2 , 1 , sqrt  is  1.7226061523969107e-05
-----------------------------------------------------
The R2 is  0.6161463389922925
The RMSE is  2.1956883731591216e-05


In [20]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_A  0.060248
1       s_B  0.042981
2       s_X  0.117935
3  mean_A2B  0.114535
4  mean_X2X  0.117950
5     E_coh  0.078499
6        OF  0.053634
7     X_IEI  0.099611
8    X_IEII  0.096214
9      X_EA  0.218392
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , sqrt  is  0.7647063792663624
The RMSE for  100 , squared_error , 2 , 1 , sqrt  is  1.6751442499325257e-05
-----------------------------------------------------
The R2 is  0.7065142824891277
The RMSE is  1.942335123988546e-05


## Standard scaling

In [21]:
X,Y = standard_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , log2  is  0.7686461140390373
The RMSE for  100 , absolute_error , 2 , 1 , log2  is  0.29776602491811627


In [22]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , log2  is  0.7535232449012561
The RMSE for  100 , friedman_mse , 2 , 1 , log2  is  0.30625327535314134


### Drop correlated features

In [23]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  10 , squared_error , 4 , 1 , sqrt  is  0.7464122116410496
The RMSE for  10 , squared_error , 4 , 1 , sqrt  is  0.31033264558869433


### Feature elimination

In [24]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.020149
1        s_B  0.013477
2        s_X  0.006082
3    density  0.028715
4   mean_A2B  0.133369
5   mean_A2X  0.063196
6   mean_B2X  0.007372
7    std_A2B  0.000813
8    std_A2X  0.001649
9    std_B2X  0.000768
10   std_X2X  0.000634
11     E_coh  0.018733
12        TF  0.006471
13        OF  0.020403
14       A_Z  0.000491
15       B_Z  0.011724
16       X_Z  0.004747
17     A_IEI  0.038612
18     B_IEI  0.001712
19     X_IEI  0.077741
20    A_IEII  0.000135
21    B_IEII  0.003881
22    X_IEII  0.144593
23      B_EA  0.043724
24      X_EA  0.107235
25    A_ChiP  0.002544
26    B_ChiP  0.001577
27    X_ChiA  0.017300
28    A_Rvdw  0.013472
29      B_Ra  0.003361
30      X_Ra  0.136794
31      B_MP  0.004314
32      A_BP  0.008619
33     B_Rho  0.015203
34      B_MV  0.007031
35      B_Hf  0.003464
36   A_Kappa  0.003907
37   B_Kappa  0.009860
38     A_CvM  0.003542
39     B_CvM  0.012587
-----------------------------------------------------
The

In [25]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.005538
1        s_B  0.096296
2        s_X  0.101094
3    density  0.023712
4   mean_A2B  0.015128
5   mean_A2X  0.030372
6   mean_B2X  0.005177
7    std_A2B  0.000422
8      E_coh  0.102020
9         TF  0.003095
10        OF  0.003620
11       B_Z  0.027109
12     A_IEI  0.006252
13     B_IEI  0.001054
14     X_IEI  0.038671
15    A_IEII  0.005069
16    X_IEII  0.144393
17      B_EA  0.000764
18      X_EA  0.133445
19    B_ChiP  0.001485
20    X_ChiA  0.088022
21    A_Rvdw  0.053637
22      B_Ra  0.005412
23      X_Ra  0.031492
24      B_MP  0.007777
25     B_Rho  0.002106
26      B_MV  0.030824
27      B_Hf  0.015896
28   B_Kappa  0.016467
29     A_CvM  0.003652
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  10 , squared_error , 2 , 1 , 0.3  is  0.7484216043384077
The RMSE for  10 , squared_error , 2 , 1 , 0.3  is  0.29821854817740445
----------

In [26]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.027725
1        s_B  0.011792
2        s_X  0.114495
3    density  0.012997
4   mean_A2B  0.140459
5   mean_A2X  0.201742
6   mean_B2X  0.003895
7      E_coh  0.020957
8         TF  0.012851
9         OF  0.027187
10       B_Z  0.002365
11     A_IEI  0.025907
12     X_IEI  0.096877
13      X_EA  0.133274
14    X_ChiA  0.065126
15    A_Rvdw  0.025682
16     B_Rho  0.031485
17      B_MV  0.005570
18   B_Kappa  0.035256
19     A_CvM  0.004360
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  10 , squared_error , 8 , 1 , log2  is  0.8081713943327709
The RMSE for  10 , squared_error , 8 , 1 , log2  is  0.3070366431489904
-----------------------------------------------------
The R2 is  0.7664750630542634
The RMSE is  0.33666774214962497


In [27]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_A  0.082696
1       s_B  0.013888
2  mean_A2B  0.194902
3     E_coh  0.088780
4        OF  0.106007
5     X_IEI  0.195423
6      X_EA  0.164973
7    X_ChiA  0.016532
8    A_Rvdw  0.077050
9     B_Rho  0.059749
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 4 , 1 , 0.3  is  0.7937170414289682
The RMSE for  100 , friedman_mse , 4 , 1 , 0.3  is  0.2991541905837797
-----------------------------------------------------
The R2 is  0.7048925540415597
The RMSE is  0.3543109584857616


## Minmax scaling

In [28]:
X,Y = minmax_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , log2  is  0.7358248531959299
The RMSE for  100 , friedman_mse , 2 , 1 , log2  is  0.08227613505313357


### Drop correlated features

In [29]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  10 , friedman_mse , 4 , 1 , 0.3  is  0.6574020357562174
The RMSE for  10 , friedman_mse , 4 , 1 , 0.3  is  0.08401871062975376


### Feature elimination

In [30]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.008388
1        s_B  0.024456
2        s_X  0.069698
3    density  0.007600
4   mean_A2B  0.035305
5   mean_A2X  0.000165
6   mean_B2X  0.008323
7   mean_X2X  0.071282
8    std_A2B  0.000013
9    std_A2X  0.004836
10   std_B2X  0.003918
11   std_X2X  0.001195
12     E_coh  0.035239
13        TF  0.005789
14        OF  0.056861
15       B_Z  0.008809
16       X_Z  0.264486
17     A_IEI  0.002315
18     B_IEI  0.001891
19     X_IEI  0.071901
20    A_IEII  0.001031
21    B_IEII  0.005843
22    X_IEII  0.123343
23      B_EA  0.008212
24      X_EA  0.078683
25    A_ChiP  0.001147
26    B_ChiP  0.005975
27    A_ChiA  0.019450
28    X_ChiA  0.000531
29    A_Rvdw  0.003412
30      B_Ra  0.003692
31      B_MP  0.001535
32     B_Rho  0.032854
33      B_MV  0.000000
34      X_MV  0.004410
35      B_Hf  0.003136
36   A_Kappa  0.000021
37   B_Kappa  0.001602
38     A_CvM  0.010784
39     B_CvM  0.011871
-----------------------------------------------------
The

In [31]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.024248
1        s_B  0.032525
2        s_X  0.091382
3    density  0.015724
4   mean_A2B  0.037821
5   mean_A2X  0.072035
6   mean_B2X  0.006271
7   mean_X2X  0.078374
8      E_coh  0.038073
9         TF  0.006596
10        OF  0.028806
11       B_Z  0.013167
12       X_Z  0.066938
13     A_IEI  0.026006
14     X_IEI  0.086067
15    B_IEII  0.005380
16    X_IEII  0.082608
17      X_EA  0.089695
18    B_ChiP  0.005932
19    A_ChiA  0.033341
20    X_ChiA  0.016543
21    A_Rvdw  0.024585
22      B_Ra  0.007952
23      B_MP  0.007714
24     B_Rho  0.014616
25      B_MV  0.008359
26      X_MV  0.035629
27      B_Hf  0.011416
28   B_Kappa  0.015716
29     B_CvM  0.016481
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  10 , squared_error , 2 , 1 , sqrt  is  0.7633449861839745
The RMSE for  10 , squared_error , 2 , 1 , sqrt  is  0.08253948762575748
--------

In [32]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.031090
1        s_B  0.025095
2        s_X  0.052514
3    density  0.012625
4   mean_A2B  0.183729
5   mean_A2X  0.213080
6   mean_B2X  0.008125
7   mean_X2X  0.019508
8      E_coh  0.027202
9         TF  0.008693
10        OF  0.007618
11       X_Z  0.026346
12     X_IEI  0.126684
13    B_IEII  0.030211
14    X_IEII  0.124294
15    B_ChiP  0.010341
16    A_ChiA  0.051223
17    A_Rvdw  0.007686
18     B_Rho  0.028350
19      B_Hf  0.005588
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 4 , 1 , log2  is  0.7357018580683086
The RMSE for  100 , friedman_mse , 4 , 1 , log2  is  0.0833617348566881
-----------------------------------------------------
The R2 is  0.7386381248632004
The RMSE is  0.09110529171039324


In [33]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_A  0.068948
1       s_B  0.071045
2       s_X  0.112332
3  mean_A2B  0.100423
4  mean_A2X  0.094618
5  mean_X2X  0.077905
6     E_coh  0.067217
7       X_Z  0.166418
8     X_IEI  0.140728
9    X_IEII  0.100365
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , 0.3  is  0.7219080951995194
The RMSE for  100 , absolute_error , 2 , 1 , 0.3  is  0.0795977783406477
-----------------------------------------------------
The R2 is  0.3615649907786877
The RMSE is  0.10776182357875981
