# Model selection

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import RFE, SelectKBest, f_regression

In [4]:
import seaborn as sns

Import data from csv files into a dataframe

In [5]:
data = pd.read_csv('Combined_all_3rd_fit.csv', encoding='cp1252')
print(data.columns.values)

['Name' 'Coef_a' 'Coef_b' 'Coef_c' 'Coef_d' 'A_site' 'B_site' 'X_site'
 'Spacegroup' 'Ehull' 'BulkModulus' 'Energy' 'ZPE' 's_A' 's_B' 's_X'
 'density' 'mean_A2B' 'mean_A2X' 'mean_B2X' 'mean_X2X' 'std_A2B' 'std_A2X'
 'std_B2X' 'std_X2X' 'E_coh' 'TF' 'OF' 'A_Z' 'B_Z' 'X_Z' 'A_M' 'B_M' 'X_M'
 'A_G' 'B_G' 'X_G' 'A_IEI' 'B_IEI' 'X_IEI' 'A_IEII' 'B_IEII' 'X_IEII'
 'A_EA' 'B_EA' 'X_EA' 'A_ChiP' 'B_ChiP' 'X_ChiP' 'A_ChiA' 'X_ChiA'
 'A_Rvdw' 'B_Rvdw' 'X_Rvdw' 'A_Rc' 'B_Rc' 'X_Rc' 'A_Ra' 'B_Ra' 'X_Ra'
 'A_MP' 'B_MP' 'X_MP' 'A_BP' 'B_BP' 'X_BP' 'A_Rho' 'B_Rho' 'A_MV' 'B_MV'
 'X_MV' 'A_Hf' 'B_Hf' 'X_Hf' 'A_Hv' 'B_Hv' 'X_Hv' 'A_Kappa' 'B_Kappa'
 'X_Kappa' 'A_CvM' 'B_CvM' 'X_CvM' 'A_B' 'B_B' 'X_B' 'A_MendeleevNo'
 'B_MendeleevNo' 'X_MendeleevNo']


Drop the unnecessary columns and fill the empty cells with zero

In [6]:
data.drop(['Name', 'A_site', 'B_site', 'X_site', 'Spacegroup','BulkModulus',
           'Ehull','Energy','ZPE','Coef_b', 'Coef_c', 'Coef_d'], axis=1, inplace = True)
data.fillna(0, inplace= True)
d=data.copy()
columns = list(d.columns.values)
print(d.shape)

(80, 77)


Set the target variable to machine learn

In [7]:
target = 'Coef_a'

Define various methods to be used for buildig and validating the models

In [8]:
"""
    Functions to perform scaling
    
    """
def standard_scaling(target):
    data_std=d.copy()
    data_std[columns]= StandardScaler().fit_transform(d[columns])
    Y = data_std[target] 
    X = data_std.drop([target], axis=1)
    return X,Y

def minmax_scaling(target):
    data_mm=d.copy()
    data_mm[columns]= MinMaxScaler().fit_transform(d[columns])
    Y = data_mm[target] 
    X = data_mm.drop([target], axis=1)
    return X,Y

In [9]:
"""
    Function to build machine learning models by hyper parameter tuning. It also plots scatter plot
    
    """
def build_cv_model(X,Y,b_drop=False,target='Coef_d'):
    if b_drop :
        correlated_features = ["A_Rc","A_Ra","A_M","A_MP","A_MV","A_MendeleevNo","A_Hf","A_Hv","B_Rc",
                       "B_Rvdw","B_M","B_BP","B_MendeleevNo","B_Hv","X_Rc","X_Rvdw","X_M","X_BP","X_MP",
                       "X_MendeleevNo","X_Hf","X_Hv","X_G","X_B","X_CvM","X_ChiP"]
        X.drop(labels=correlated_features, axis=1, inplace=True)
        
    print("The shape of X is ",X.shape)
    
    ml_model, cv_results = train_model(X,Y,hyperparams=hyperparams, cv=True, return_cv=True)
    
    return ml_model

In [10]:
def run_model(X,Y,model,target='Coef_d'):
    rmse,r2 = run_cv(model, X, Y, n_cv = 5)
    print("-----------------------------------------------------")
    print("The R2 is ",r2)
    print("The RMSE is ",rmse)

In [11]:
"""
    Function to perform feature elimination using select K best of RFE method. It also prints the 
    feature scores obtained as:
    SelectKBest: The scores obtained from the selector
    RFE: The feature importance obtained from the RFR model 
    
    """
def feature_elimination(X,Y,n,estimator,method='skb'):
    if method=='skb':
        bestfeatures = SelectKBest(score_func=f_regression, k=10)
        fit = bestfeatures.fit(X,Y)
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(X.columns)
        #concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns,dfscores],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        print(featureScores.nlargest(n,'Score'))  #print 10 best features
        X=X[featureScores.nlargest(n,'Score')['Feature'].values]
    elif method=='rfe':
        selector = RFE(estimator, n_features_to_select=n, step=1)
        selector = selector.fit(X, Y)
        dfscores = pd.DataFrame(selector.ranking_)
        dfcolumns = pd.DataFrame(selector.feature_names_in_)
        #concat two dataframes for better visualization 
        featureSelection = pd.concat([dfcolumns,dfscores],axis=1)
        featureSelection.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureSelection.nsmallest(n,'Score')['Feature'].values]
        estimator.fit(X,Y)
        importance = pd.DataFrame(estimator.feature_importances_)
        featureNames = pd.DataFrame(X.columns.values)
        featureScores = pd.concat([featureNames,importance],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        
    print(featureScores)
    print("-----------------------------------------------------")
    return X

In [12]:
def train_model(X, y, hyperparams = None, cv = False, return_cv = False):
    """
    Function to train the ML model on the given data X (features) and y (target property). 
    If hyperparams argument is passed, all possible combinations of alpha and kernel values
    will be tried to find combination with minimum CV error. The final  model is trained
    using the fixed kernel and alpha params as determined using CV.
    
    """
    
    if cv:
        cv_results = {'n_estimators':[],'criterion':[],'min_samples_split':[],
                      'min_samples_leaf':[],'max_features':[],'cv_rmse':[],'cv_r2':[]}
        for n,c,s,l,f in itertools.product(hyperparams['n_estimators'], 
                                           hyperparams['criterion'],hyperparams['min_samples_split'],
                                          hyperparams['min_samples_leaf'],hyperparams['max_features']):

            random_forests = RandomForestRegressor(
            n_estimators=n, criterion=c, min_samples_split=s, min_samples_leaf=l, max_features=f)

            cv_error, cv_r2 = run_cv(random_forests, X, y, n_cv = 5)
            cv_results['cv_rmse'].append(cv_error)
            cv_results['cv_r2'].append(cv_r2)
            cv_results['n_estimators'].append(n)
            cv_results['criterion'].append(c)
            cv_results['min_samples_split'].append(s)
            cv_results['min_samples_leaf'].append(l)
            cv_results['max_features'].append(f)

        cv_results = pd.DataFrame(cv_results)
        cv_results = cv_results.sort_values('cv_rmse')
        n_opt = cv_results.iloc[0]['n_estimators']
        c_opt = cv_results.iloc[0]['criterion']
        s_opt = cv_results.iloc[0]['min_samples_split']
        l_opt = cv_results.iloc[0]['min_samples_leaf']
        f_opt = cv_results.iloc[0]['max_features']
        
    else:
        n_opt = hyperparams['n_estimators'][0]
        c_opt = hyperparams['criterion'][0]
        s_opt = hyperparams['min_samples_split'][0]
        l_opt = hyperparams['min_samples_leaf'][0]
        f_opt = hyperparams['max_features'][0]

    random_forests = RandomForestRegressor(
            n_estimators=n_opt, criterion=c_opt, min_samples_split=s_opt, min_samples_leaf=l_opt, max_features=f_opt)
    model = random_forests.fit(X,y)
    print("-----------------------------------------------------")
    print("The R2 for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_r2'])
    print("The RMSE for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_rmse'])
    return [model, cv_results] if return_cv else model


def run_cv(ml_model, X, y, n_cv = 5):
    """
    Function to run Cross-validation
    """
    kf = KFold(n_splits=n_cv,shuffle=True,random_state=50)
    y_val = []
    y_pred = []

    for idx, (train, val) in enumerate(kf.split(X)):
        
        X_cv_train = X.values[train]
        X_cv_val = X.values[val]

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]    

        # Model fit and prediction
        model = ml_model.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        
        y_val.append(y_cv_val)
        y_pred.append(y_pred_val)

    # Computing errors
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    
    return rmse, r2

Define the hyperparameters to be tuned for the given algorithm

In [13]:
hyperparams={'n_estimators': [10,100],
             'criterion':["squared_error", "absolute_error", "friedman_mse"],
            'min_samples_split':[2,4,8],
            'min_samples_leaf':[1,2,4,8],
            'max_features':["sqrt","log2",0.3,1]}

## Without scaling

In [14]:
data_std=d.copy()
Y = data_std[target] 
X = data_std.drop([target], axis=1)

In [15]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , absolute_error , 4 , 1 , sqrt  is  0.7150912496141888
The RMSE for  100 , absolute_error , 4 , 1 , sqrt  is  1.0077144136816847e-08


### Drop correlated features

In [16]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , sqrt  is  0.7096140201983336
The RMSE for  100 , absolute_error , 2 , 1 , sqrt  is  1.0228644851876512e-08


### Feature elimination

In [17]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.016217
1        s_B  0.022005
2        s_X  0.064469
3    density  0.017545
4   mean_A2B  0.051121
5   mean_A2X  0.068738
6   mean_B2X  0.014089
7   mean_X2X  0.060720
8    std_A2X  0.010125
9    std_X2X  0.010051
10     E_coh  0.047217
11        TF  0.015017
12        OF  0.029552
13       A_Z  0.009220
14       B_Z  0.016558
15       X_Z  0.043983
16       B_G  0.014274
17     A_IEI  0.013486
18     B_IEI  0.009722
19     X_IEI  0.048954
20    B_IEII  0.010622
21    X_IEII  0.029809
22      B_EA  0.014019
23      X_EA  0.069239
24    A_ChiP  0.008067
25    B_ChiP  0.009316
26    A_ChiA  0.016781
27    X_ChiA  0.014651
28    A_Rvdw  0.021431
29      B_Ra  0.011891
30      X_Ra  0.061354
31      B_MP  0.009716
32      A_BP  0.017039
33     B_Rho  0.019959
34      B_MV  0.008726
35      X_MV  0.027761
36      B_Hf  0.010663
37   B_Kappa  0.021762
38     A_CvM  0.014416
39     B_CvM  0.019714
-----------------------------------------------------
The

In [18]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.015685
1        s_B  0.031449
2        s_X  0.069498
3    density  0.023518
4   mean_A2B  0.070520
5   mean_A2X  0.046278
6   mean_B2X  0.013765
7   mean_X2X  0.049830
8      E_coh  0.091073
9         TF  0.016714
10        OF  0.028310
11       B_Z  0.020492
12       X_Z  0.062221
13     A_IEI  0.021861
14     B_IEI  0.013798
15     X_IEI  0.044201
16    B_IEII  0.017512
17    X_IEII  0.071802
18      X_EA  0.041904
19    A_ChiA  0.014668
20    X_ChiA  0.016878
21    A_Rvdw  0.031742
22      B_Ra  0.019073
23      X_Ra  0.040153
24      B_MP  0.023262
25      A_BP  0.023655
26     B_Rho  0.014129
27      X_MV  0.029821
28   B_Kappa  0.020897
29     B_CvM  0.015291
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , log2  is  0.7286927993506649
The RMSE for  100 , absolute_error , 2 , 1 , log2  is  9.919189570903163e-09
--

In [19]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.031660
1        s_B  0.037471
2        s_X  0.057387
3    density  0.036350
4   mean_A2B  0.069895
5   mean_A2X  0.057000
6   mean_X2X  0.086540
7      E_coh  0.098817
8         TF  0.026445
9         OF  0.044419
10       X_Z  0.085047
11     B_IEI  0.031129
12     X_IEI  0.073677
13    X_IEII  0.021345
14      X_EA  0.071669
15    A_ChiA  0.025832
16    A_Rvdw  0.032257
17      X_Ra  0.056810
18      X_MV  0.025936
19   B_Kappa  0.030313
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , sqrt  is  0.7424412250990297
The RMSE for  100 , absolute_error , 2 , 1 , sqrt  is  9.68108429343687e-09
-----------------------------------------------------
The R2 is  0.7027572522502206
The RMSE is  1.0087775349459164e-08


In [20]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_A  0.084484
1  mean_A2B  0.099376
2  mean_A2X  0.084159
3  mean_X2X  0.136790
4     E_coh  0.129703
5        OF  0.096108
6       X_Z  0.117392
7     X_IEI  0.098272
8    X_IEII  0.058712
9      X_Ra  0.095004
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , log2  is  0.6712961893149187
The RMSE for  100 , absolute_error , 2 , 1 , log2  is  9.853490567978308e-09
-----------------------------------------------------
The R2 is  0.6857607531223684
The RMSE is  1.0214606996950752e-08


## Standard scaling

In [21]:
X,Y = standard_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , absolute_error , 4 , 1 , log2  is  0.7256529506968343
The RMSE for  100 , absolute_error , 4 , 1 , log2  is  0.34887329100765346


In [22]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  10 , absolute_error , 2 , 1 , sqrt  is  0.7300372463767805
The RMSE for  10 , absolute_error , 2 , 1 , sqrt  is  0.3451602844326805


### Drop correlated features

In [23]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , 0.3  is  0.6670366635336236
The RMSE for  100 , absolute_error , 2 , 1 , 0.3  is  0.35634693896923597


### Feature elimination

In [24]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.022969
1        s_B  0.022307
2        s_X  0.056658
3    density  0.023047
4   mean_A2B  0.069133
5   mean_A2X  0.041174
6   mean_B2X  0.015400
7   mean_X2X  0.104976
8    std_A2B  0.006693
9    std_A2X  0.006114
10   std_X2X  0.007201
11     E_coh  0.083726
12        TF  0.014595
13        OF  0.028202
14       A_Z  0.010414
15       B_Z  0.017421
16       X_Z  0.077164
17       B_G  0.008984
18     A_IEI  0.010522
19     B_IEI  0.013231
20     X_IEI  0.039574
21    B_IEII  0.017934
22    X_IEII  0.053025
23      X_EA  0.040939
24    B_ChiP  0.007327
25    A_ChiA  0.008459
26    X_ChiA  0.013370
27    A_Rvdw  0.012783
28      B_Ra  0.015469
29      X_Ra  0.041449
30      B_MP  0.007630
31      A_BP  0.007477
32     B_Rho  0.015242
33      B_MV  0.010523
34      X_MV  0.017665
35      B_Hf  0.011387
36   A_Kappa  0.004197
37   B_Kappa  0.017739
38     A_CvM  0.006138
39     B_CvM  0.011740
-----------------------------------------------------
The

In [25]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.008150
1        s_B  0.024000
2    density  0.013739
3   mean_A2B  0.027945
4   mean_A2X  0.135296
5   mean_B2X  0.023971
6   mean_X2X  0.026079
7    std_A2X  0.006331
8    std_X2X  0.005714
9      E_coh  0.042614
10        TF  0.018837
11        OF  0.031271
12       B_Z  0.046030
13       X_Z  0.058765
14     A_IEI  0.048070
15     B_IEI  0.019322
16     X_IEI  0.079870
17    B_IEII  0.007472
18    X_IEII  0.052987
19    B_ChiP  0.004326
20    A_ChiA  0.049835
21    X_ChiA  0.035305
22    A_Rvdw  0.028301
23      B_Ra  0.022053
24      X_Ra  0.075578
25      B_MP  0.056084
26      A_BP  0.005569
27     B_Rho  0.017545
28      B_Hf  0.009107
29   B_Kappa  0.019833
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , log2  is  0.7010809469735997
The RMSE for  100 , absolute_error , 2 , 1 , log2  is  0.3575412258060018
-----

In [26]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.034249
1        s_B  0.046948
2    density  0.029813
3   mean_A2B  0.076566
4   mean_A2X  0.084488
5   mean_B2X  0.026604
6   mean_X2X  0.087200
7      E_coh  0.124534
8         TF  0.029720
9         OF  0.045335
10       B_Z  0.023390
11       X_Z  0.094606
12     A_IEI  0.022039
13     X_IEI  0.053381
14    X_IEII  0.061175
15    A_ChiA  0.025063
16    X_ChiA  0.023668
17      X_Ra  0.054100
18     B_Rho  0.030109
19   B_Kappa  0.027012
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , log2  is  0.7324434852918892
The RMSE for  100 , squared_error , 2 , 1 , log2  is  0.3408676215127614
-----------------------------------------------------
The R2 is  0.692504312071586
The RMSE is  0.36331754479749595


In [27]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_A  0.064415
1  mean_A2B  0.091884
2  mean_A2X  0.105996
3  mean_X2X  0.116719
4     E_coh  0.092946
5        OF  0.054793
6       X_Z  0.203738
7     X_IEI  0.106410
8    X_IEII  0.088805
9      X_Ra  0.074295
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  10 , friedman_mse , 2 , 1 , sqrt  is  0.7364208929219016
The RMSE for  10 , friedman_mse , 2 , 1 , sqrt  is  0.3275529888805771
-----------------------------------------------------
The R2 is  0.6874048526315097
The RMSE is  0.3551149438464788


## Minmax scaling

In [28]:
X,Y = minmax_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , log2  is  0.7154245647334927
The RMSE for  100 , friedman_mse , 2 , 1 , log2  is  0.08259235411583014


### Drop correlated features

In [29]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , log2  is  0.7042663515799649
The RMSE for  100 , friedman_mse , 2 , 1 , log2  is  0.08391307854173612


### Feature elimination

In [30]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.029087
1        s_B  0.022848
2        s_X  0.047588
3    density  0.012638
4   mean_A2B  0.065850
5   mean_A2X  0.092873
6   mean_B2X  0.006099
7   mean_X2X  0.061790
8    std_A2X  0.007555
9      E_coh  0.028566
10        TF  0.006799
11        OF  0.016148
12       A_Z  0.006415
13       B_Z  0.005946
14       X_Z  0.097367
15       B_G  0.003713
16     A_IEI  0.030306
17     B_IEI  0.007936
18     X_IEI  0.099724
19    B_IEII  0.009355
20    X_IEII  0.065900
21      B_EA  0.004278
22      X_EA  0.014592
23    A_ChiP  0.005576
24    B_ChiP  0.003714
25    A_ChiA  0.034528
26    X_ChiA  0.009008
27    A_Rvdw  0.012346
28      B_Ra  0.006380
29      X_Ra  0.068490
30      B_MP  0.011119
31      A_BP  0.009849
32     B_Rho  0.011034
33      B_MV  0.004872
34      X_MV  0.036690
35      B_Hf  0.010147
36   A_Kappa  0.006816
37   B_Kappa  0.008083
38   X_Kappa  0.006804
39     B_CvM  0.011168
-----------------------------------------------------
The

In [31]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.018667
1        s_B  0.013632
2    density  0.032119
3   mean_A2B  0.030150
4   mean_A2X  0.138752
5   mean_B2X  0.013071
6   mean_X2X  0.115522
7      E_coh  0.044271
8         TF  0.024864
9        A_Z  0.009003
10       X_Z  0.108967
11       B_G  0.010875
12     A_IEI  0.019730
13     B_IEI  0.019573
14    B_IEII  0.018066
15    X_IEII  0.044715
16      X_EA  0.118592
17    B_ChiP  0.013230
18    A_ChiA  0.032763
19    A_Rvdw  0.012085
20      B_Ra  0.025694
21      B_MP  0.023373
22      A_BP  0.012493
23     B_Rho  0.032952
24      B_MV  0.005792
25      B_Hf  0.007775
26   A_Kappa  0.010881
27   B_Kappa  0.027498
28   X_Kappa  0.003317
29     B_CvM  0.011579
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 4 , 1 , sqrt  is  0.6410208132063082
The RMSE for  100 , friedman_mse , 4 , 1 , sqrt  is  0.08498836194779248
--------

In [32]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.022834
1        s_B  0.035789
2    density  0.015330
3   mean_A2B  0.114681
4   mean_A2X  0.109069
5   mean_X2X  0.091142
6      E_coh  0.069941
7         TF  0.016505
8        X_Z  0.153190
9      A_IEI  0.030977
10    B_IEII  0.014472
11    X_IEII  0.089559
12      X_EA  0.088547
13    A_ChiA  0.025809
14    A_Rvdw  0.040197
15      B_Ra  0.011650
16      B_MP  0.017437
17      A_BP  0.025226
18     B_Rho  0.014677
19      B_Hf  0.012967
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 4 , 1 , log2  is  0.6618764129778725
The RMSE for  100 , friedman_mse , 4 , 1 , log2  is  0.08339067888023043
-----------------------------------------------------
The R2 is  0.6858643991821475
The RMSE is  0.08243766777358864


In [33]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_A  0.049868
1       s_B  0.063272
2  mean_A2B  0.179719
3  mean_A2X  0.103358
4  mean_X2X  0.125586
5     E_coh  0.061740
6       X_Z  0.107805
7    X_IEII  0.147867
8      X_EA  0.129965
9     B_Rho  0.030821
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , sqrt  is  0.7215120286279206
The RMSE for  100 , friedman_mse , 2 , 1 , sqrt  is  0.08128361059348344
-----------------------------------------------------
The R2 is  0.7111489511373789
The RMSE is  0.08163550154887624
