# Model selection

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import RFE, SelectKBest, f_regression

In [4]:
import seaborn as sns

Import data from csv files into a dataframe

In [5]:
data = pd.read_csv('Combined_all_3rd_fit.csv', encoding='cp1252')
print(data.columns.values)

['Name' 'Coef_a' 'Coef_b' 'Coef_c' 'Coef_d' 'A_site' 'B_site' 'X_site'
 'Spacegroup' 'Ehull' 'BulkModulus' 'Energy' 'ZPE' 's_A' 's_B' 's_X'
 'density' 'mean_A2B' 'mean_A2X' 'mean_B2X' 'mean_X2X' 'std_A2B' 'std_A2X'
 'std_B2X' 'std_X2X' 'E_coh' 'TF' 'OF' 'A_Z' 'B_Z' 'X_Z' 'A_M' 'B_M' 'X_M'
 'A_G' 'B_G' 'X_G' 'A_IEI' 'B_IEI' 'X_IEI' 'A_IEII' 'B_IEII' 'X_IEII'
 'A_EA' 'B_EA' 'X_EA' 'A_ChiP' 'B_ChiP' 'X_ChiP' 'A_ChiA' 'X_ChiA'
 'A_Rvdw' 'B_Rvdw' 'X_Rvdw' 'A_Rc' 'B_Rc' 'X_Rc' 'A_Ra' 'B_Ra' 'X_Ra'
 'A_MP' 'B_MP' 'X_MP' 'A_BP' 'B_BP' 'X_BP' 'A_Rho' 'B_Rho' 'A_MV' 'B_MV'
 'X_MV' 'A_Hf' 'B_Hf' 'X_Hf' 'A_Hv' 'B_Hv' 'X_Hv' 'A_Kappa' 'B_Kappa'
 'X_Kappa' 'A_CvM' 'B_CvM' 'X_CvM' 'A_B' 'B_B' 'X_B' 'A_MendeleevNo'
 'B_MendeleevNo' 'X_MendeleevNo']


Drop the unnecessary columns and fill the empty cells with zero

In [6]:
data.drop(['Name', 'A_site', 'B_site', 'X_site', 'Spacegroup','BulkModulus',
           'Ehull','Energy','ZPE','Coef_b', 'Coef_c', 'Coef_a'], axis=1, inplace = True)
data.fillna(0, inplace= True)
d=data.copy()
columns = list(d.columns.values)
print(d.shape)

(80, 77)


Set the target variable to machine learn

In [7]:
target = 'Coef_d'

Define various methods to be used for buildig and validating the models

In [8]:
"""
    Functions to perform scaling
    
    """
def standard_scaling(target):
    data_std=d.copy()
    data_std[columns]= StandardScaler().fit_transform(d[columns])
    Y = data_std[target] 
    X = data_std.drop([target], axis=1)
    return X,Y

def minmax_scaling(target):
    data_mm=d.copy()
    data_mm[columns]= MinMaxScaler().fit_transform(d[columns])
    Y = data_mm[target] 
    X = data_mm.drop([target], axis=1)
    return X,Y

In [9]:
"""
    Function to build machine learning models by hyper parameter tuning. It also plots scatter plot
    
    """
def build_cv_model(X,Y,b_drop=False,target='Coef_d'):
    if b_drop :
        correlated_features = ["A_Rc","A_Ra","A_M","A_MP","A_MV","A_MendeleevNo","A_Hf","A_Hv","B_Rc",
                       "B_Rvdw","B_M","B_BP","B_MendeleevNo","B_Hv","X_Rc","X_Rvdw","X_M","X_BP","X_MP",
                       "X_MendeleevNo","X_Hf","X_Hv","X_G","X_B","X_CvM","X_ChiP"]
        X.drop(labels=correlated_features, axis=1, inplace=True)
        
    print("The shape of X is ",X.shape)
    
    ml_model, cv_results = train_model(X,Y,hyperparams=hyperparams, cv=True, return_cv=True)
    
    return ml_model

In [10]:
def run_model(X,Y,model,target='Coef_d'):
    rmse,r2 = run_cv(model, X, Y, n_cv = 5)
    print("-----------------------------------------------------")
    print("The R2 is ",r2)
    print("The RMSE is ",rmse)

In [11]:
"""
    Function to perform feature elimination using select K best of RFE method. It also prints the 
    feature scores obtained as:
    SelectKBest: The scores obtained from the selector
    RFE: The feature importance obtained from the RFR model 
    
    """
def feature_elimination(X,Y,n,estimator,method='skb'):
    if method=='skb':
        bestfeatures = SelectKBest(score_func=f_regression, k=10)
        fit = bestfeatures.fit(X,Y)
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(X.columns)
        #concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns,dfscores],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        print(featureScores.nlargest(n,'Score'))  #print 10 best features
        X=X[featureScores.nlargest(n,'Score')['Feature'].values]
    elif method=='rfe':
        selector = RFE(estimator, n_features_to_select=n, step=1)
        selector = selector.fit(X, Y)
        dfscores = pd.DataFrame(selector.ranking_)
        dfcolumns = pd.DataFrame(selector.feature_names_in_)
        #concat two dataframes for better visualization 
        featureSelection = pd.concat([dfcolumns,dfscores],axis=1)
        featureSelection.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureSelection.nsmallest(n,'Score')['Feature'].values]
        estimator.fit(X,Y)
        importance = pd.DataFrame(estimator.feature_importances_)
        featureNames = pd.DataFrame(X.columns.values)
        featureScores = pd.concat([featureNames,importance],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        
    print(featureScores)
    print("-----------------------------------------------------")
    return X

In [12]:
def train_model(X, y, hyperparams = None, cv = False, return_cv = False):
    """
    Function to train the ML model on the given data X (features) and y (target property). 
    If hyperparams argument is passed, all possible combinations of alpha and kernel values
    will be tried to find combination with minimum CV error. The final  model is trained
    using the fixed kernel and alpha params as determined using CV.
    
    """
    
    if cv:
        cv_results = {'n_estimators':[],'criterion':[],'min_samples_split':[],
                      'min_samples_leaf':[],'max_features':[],'cv_rmse':[],'cv_r2':[]}
        for n,c,s,l,f in itertools.product(hyperparams['n_estimators'], 
                                           hyperparams['criterion'],hyperparams['min_samples_split'],
                                          hyperparams['min_samples_leaf'],hyperparams['max_features']):

            random_forests = RandomForestRegressor(
            n_estimators=n, criterion=c, min_samples_split=s, min_samples_leaf=l, max_features=f)

            cv_error, cv_r2 = run_cv(random_forests, X, y, n_cv = 5)
            cv_results['cv_rmse'].append(cv_error)
            cv_results['cv_r2'].append(cv_r2)
            cv_results['n_estimators'].append(n)
            cv_results['criterion'].append(c)
            cv_results['min_samples_split'].append(s)
            cv_results['min_samples_leaf'].append(l)
            cv_results['max_features'].append(f)

        cv_results = pd.DataFrame(cv_results)
        cv_results = cv_results.sort_values('cv_rmse')
        n_opt = cv_results.iloc[0]['n_estimators']
        c_opt = cv_results.iloc[0]['criterion']
        s_opt = cv_results.iloc[0]['min_samples_split']
        l_opt = cv_results.iloc[0]['min_samples_leaf']
        f_opt = cv_results.iloc[0]['max_features']
        
    else:
        n_opt = hyperparams['n_estimators'][0]
        c_opt = hyperparams['criterion'][0]
        s_opt = hyperparams['min_samples_split'][0]
        l_opt = hyperparams['min_samples_leaf'][0]
        f_opt = hyperparams['max_features'][0]

    random_forests = RandomForestRegressor(
            n_estimators=n_opt, criterion=c_opt, min_samples_split=s_opt, min_samples_leaf=l_opt, max_features=f_opt)
    model = random_forests.fit(X,y)
    print("-----------------------------------------------------")
    print("The R2 for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_r2'])
    print("The RMSE for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_rmse'])
    return [model, cv_results] if return_cv else model


def run_cv(ml_model, X, y, n_cv = 5):
    """
    Function to run Cross-validation
    """
    kf = KFold(n_splits=n_cv,shuffle=True,random_state=50)
    y_val = []
    y_pred = []

    for idx, (train, val) in enumerate(kf.split(X)):
        
        X_cv_train = X.values[train]
        X_cv_val = X.values[val]

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]    

        # Model fit and prediction
        model = ml_model.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        
        y_val.append(y_cv_val)
        y_pred.append(y_pred_val)

    # Computing errors
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    
    return rmse, r2

Define the hyperparameters to be tuned for the given algorithm

In [13]:
hyperparams={'n_estimators': [10,100],
             'criterion':["squared_error", "absolute_error", "friedman_mse"],
            'min_samples_split':[2,4,8],
            'min_samples_leaf':[1,2,4,8],
            'max_features':["sqrt","log2",0.3,1]}

## Without scaling

In [14]:
data_std=d.copy()
Y = data_std[target] 
X = data_std.drop([target], axis=1)

In [15]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , 0.3  is  0.827283215021097
The RMSE for  100 , absolute_error , 2 , 1 , 0.3  is  3.1790823140359916


### Drop correlated features

In [16]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , 0.3  is  0.8190212396217834
The RMSE for  100 , squared_error , 2 , 1 , 0.3  is  3.2835294484225974


### Feature elimination

In [17]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.011419
1        s_B  0.037430
2        s_X  0.029880
3    density  0.059588
4   mean_A2B  0.021548
5   mean_A2X  0.019430
6   mean_B2X  0.005920
7   mean_X2X  0.032513
8    std_A2B  0.005422
9    std_A2X  0.009246
10   std_B2X  0.008586
11   std_X2X  0.004649
12     E_coh  0.021278
13        TF  0.005339
14        OF  0.037560
15       A_Z  0.002562
16       B_Z  0.060142
17       X_Z  0.142848
18     A_IEI  0.003493
19     B_IEI  0.003146
20     X_IEI  0.024868
21    B_IEII  0.012188
22    X_IEII  0.071674
23      B_EA  0.001524
24      X_EA  0.125424
25    B_ChiP  0.002944
26    A_ChiA  0.019223
27    X_ChiA  0.008598
28    A_Rvdw  0.011663
29      B_Ra  0.034527
30      X_Ra  0.034614
31      A_BP  0.007827
32     B_Rho  0.041691
33      B_MV  0.014848
34      X_MV  0.009172
35      B_Hf  0.004741
36   B_Kappa  0.005916
37   X_Kappa  0.005727
38     A_CvM  0.001818
39     B_CvM  0.039016
-----------------------------------------------------
The

In [18]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.024388
1        s_B  0.032096
2        s_X  0.020710
3    density  0.039917
4   mean_A2B  0.048186
5   mean_A2X  0.022559
6   mean_B2X  0.014882
7   mean_X2X  0.047622
8    std_A2X  0.017259
9    std_X2X  0.020920
10     E_coh  0.041490
11        TF  0.015659
12        OF  0.031208
13       B_Z  0.040081
14       X_Z  0.120586
15     A_IEI  0.009369
16     X_IEI  0.036617
17    B_IEII  0.015798
18    X_IEII  0.019535
19      X_EA  0.117776
20    B_ChiP  0.014071
21    A_ChiA  0.024106
22    X_ChiA  0.017598
23    A_Rvdw  0.014803
24      B_Ra  0.020031
25      X_Ra  0.042710
26     B_Rho  0.063346
27      B_MV  0.016836
28      X_MV  0.012089
29     B_CvM  0.037752
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  10 , friedman_mse , 4 , 1 , log2  is  0.8464574204589639
The RMSE for  10 , friedman_mse , 4 , 1 , log2  is  3.163241944951811
------------

In [19]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.017961
1        s_B  0.003574
2        s_X  0.059582
3    density  0.065801
4   mean_A2B  0.105742
5   mean_A2X  0.004025
6   mean_X2X  0.061994
7    std_X2X  0.012786
8      E_coh  0.063069
9         OF  0.034813
10       B_Z  0.075808
11       X_Z  0.136901
12     A_IEI  0.011535
13    X_IEII  0.053513
14      X_EA  0.136904
15    A_ChiA  0.006114
16    X_ChiA  0.033147
17    A_Rvdw  0.014576
18      B_Ra  0.051230
19     B_CvM  0.050924
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , 0.3  is  0.8104377430379512
The RMSE for  100 , absolute_error , 2 , 1 , 0.3  is  3.207826562530581
-----------------------------------------------------
The R2 is  0.7857824520250977
The RMSE is  3.4190230751756343


In [20]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_B  0.090684
1       s_X  0.077266
2  mean_A2B  0.104604
3  mean_X2X  0.104584
4     E_coh  0.090981
5        OF  0.080178
6       B_Z  0.075433
7       X_Z  0.171285
8      X_EA  0.129627
9     B_CvM  0.075358
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  10 , squared_error , 2 , 2 , sqrt  is  0.7401503001189571
The RMSE for  10 , squared_error , 2 , 2 , sqrt  is  3.4080905264018013
-----------------------------------------------------
The R2 is  0.707815022175427
The RMSE is  3.6160612187310868


## Standard scaling

In [21]:
X,Y = standard_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , 0.3  is  0.8369014199495345
The RMSE for  100 , friedman_mse , 2 , 1 , 0.3  is  0.28663365311628486


In [22]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , absolute_error , 4 , 1 , 0.3  is  0.8018946850965183
The RMSE for  100 , absolute_error , 4 , 1 , 0.3  is  0.2911881710630327


### Drop correlated features

In [23]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , 0.3  is  0.8173605214074204
The RMSE for  100 , absolute_error , 2 , 1 , 0.3  is  0.31452000927686646


### Feature elimination

In [24]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.018828
1        s_B  0.021535
2        s_X  0.030620
3    density  0.036011
4   mean_A2B  0.047558
5   mean_A2X  0.036103
6   mean_B2X  0.013196
7   mean_X2X  0.046999
8    std_A2B  0.008468
9    std_A2X  0.017698
10   std_B2X  0.008122
11   std_X2X  0.012947
12     E_coh  0.033907
13        TF  0.015127
14        OF  0.038184
15       A_Z  0.007002
16       B_Z  0.027125
17       X_Z  0.122105
18       B_G  0.005465
19     A_IEI  0.012228
20     B_IEI  0.006159
21     X_IEI  0.030646
22    B_IEII  0.011855
23    X_IEII  0.026100
24      A_EA  0.004224
25      X_EA  0.100521
26    A_ChiP  0.006851
27    B_ChiP  0.012222
28    A_ChiA  0.021289
29    X_ChiA  0.013449
30    A_Rvdw  0.009751
31      B_Ra  0.026532
32      X_Ra  0.030903
33     B_Rho  0.055898
34      B_MV  0.015406
35      X_MV  0.014957
36      B_Hf  0.009214
37   B_Kappa  0.011916
38     A_CvM  0.006850
39     B_CvM  0.026029
-----------------------------------------------------
The

In [25]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.001385
1        s_B  0.012940
2        s_X  0.021625
3    density  0.015934
4   mean_A2B  0.137313
5   mean_A2X  0.000881
6   mean_B2X  0.011173
7   mean_X2X  0.095371
8    std_A2B  0.001933
9    std_A2X  0.005278
10   std_B2X  0.003436
11   std_X2X  0.009862
12     E_coh  0.017448
13        TF  0.005259
14        OF  0.008133
15       B_Z  0.091193
16       X_Z  0.142281
17     A_IEI  0.025040
18     B_IEI  0.000729
19     X_IEI  0.076209
20    B_IEII  0.033529
21    X_IEII  0.060188
22    B_ChiP  0.023225
23    A_ChiA  0.088452
24    A_Rvdw  0.004112
25      B_Ra  0.004901
26     B_Rho  0.086189
27      B_MV  0.004621
28      X_MV  0.006856
29     B_CvM  0.004506
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  10 , absolute_error , 4 , 1 , 0.3  is  0.7663816162014894
The RMSE for  10 , absolute_error , 4 , 1 , 0.3  is  0.3209674339605244
---------

In [26]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.034480
1        s_B  0.056594
2    density  0.031796
3   mean_A2B  0.086826
4   mean_A2X  0.060801
5   mean_B2X  0.018170
6   mean_X2X  0.126291
7    std_A2X  0.021339
8      E_coh  0.021927
9         OF  0.053653
10       B_Z  0.056041
11       X_Z  0.220099
12     X_IEI  0.017755
13    X_IEII  0.029917
14    B_ChiP  0.013358
15    A_ChiA  0.014183
16      B_Ra  0.025137
17     B_Rho  0.062926
18      X_MV  0.011305
19     B_CvM  0.037402
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  10 , absolute_error , 2 , 1 , 0.3  is  0.7178329069897816
The RMSE for  10 , absolute_error , 2 , 1 , 0.3  is  0.3089299188579175
-----------------------------------------------------
The R2 is  0.7580250415863289
The RMSE is  0.3262403455841206


In [27]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_B  0.121106
1  mean_A2B  0.157373
2  mean_A2X  0.177057
3  mean_X2X  0.075775
4     E_coh  0.095768
5        OF  0.086355
6       X_Z  0.077381
7    A_ChiA  0.043454
8     B_Rho  0.067788
9      X_MV  0.097943
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , sqrt  is  0.8089131088876933
The RMSE for  100 , absolute_error , 2 , 1 , sqrt  is  0.30909957372976365
-----------------------------------------------------
The R2 is  0.7490027633543264
The RMSE is  0.32815799096736387


## Minmax scaling

In [28]:
X,Y = minmax_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  10 , absolute_error , 2 , 1 , 0.3  is  0.8670251999733172
The RMSE for  10 , absolute_error , 2 , 1 , 0.3  is  0.06163889350949852


### Drop correlated features

In [29]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , absolute_error , 4 , 1 , 0.3  is  0.8166542976126345
The RMSE for  100 , absolute_error , 4 , 1 , 0.3  is  0.07260205772268902


### Feature elimination

In [30]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.018708
1        s_B  0.035489
2        s_X  0.022515
3    density  0.039201
4   mean_A2B  0.050684
5   mean_A2X  0.035215
6   mean_B2X  0.008352
7   mean_X2X  0.056739
8    std_A2B  0.006451
9    std_A2X  0.012796
10   std_B2X  0.012629
11   std_X2X  0.012572
12     E_coh  0.038705
13        TF  0.014857
14        OF  0.035526
15       A_Z  0.006735
16       B_Z  0.040875
17       X_Z  0.094126
18     A_IEI  0.009921
19     B_IEI  0.005518
20     X_IEI  0.021904
21    B_IEII  0.012902
22    X_IEII  0.040431
23      B_EA  0.005613
24      X_EA  0.092628
25    A_ChiP  0.008010
26    B_ChiP  0.007632
27    A_ChiA  0.020621
28    X_ChiA  0.018196
29    A_Rvdw  0.007111
30      B_Ra  0.018499
31      X_Ra  0.063928
32     B_Rho  0.051636
33      B_MV  0.009071
34      X_MV  0.009465
35      B_Hf  0.005356
36   B_Kappa  0.010306
37   X_Kappa  0.009549
38     A_CvM  0.007473
39     B_CvM  0.022054
-----------------------------------------------------
The

In [31]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.022725
1        s_B  0.036726
2        s_X  0.046711
3    density  0.036088
4   mean_A2B  0.049637
5   mean_A2X  0.027899
6   mean_B2X  0.017989
7   mean_X2X  0.060823
8    std_A2B  0.011014
9    std_A2X  0.019581
10   std_X2X  0.017606
11     E_coh  0.037489
12        TF  0.016739
13        OF  0.027856
14       B_Z  0.044049
15       X_Z  0.093838
16     X_IEI  0.032231
17    B_IEII  0.010901
18    X_IEII  0.041156
19      X_EA  0.090184
20    B_ChiP  0.013167
21    A_ChiA  0.024109
22    X_ChiA  0.023466
23    A_Rvdw  0.018995
24      B_Ra  0.034956
25      X_Ra  0.024154
26     B_Rho  0.051874
27      B_MV  0.019043
28      X_MV  0.015936
29     B_CvM  0.033057
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , 0.3  is  0.7806031440737156
The RMSE for  100 , squared_error , 2 , 1 , 0.3  is  0.06959391140387751
--------

In [32]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.023079
1        s_B  0.021343
2        s_X  0.011658
3    density  0.036329
4   mean_A2B  0.051580
5   mean_A2X  0.040855
6   mean_X2X  0.037217
7      E_coh  0.035107
8         OF  0.036212
9        B_Z  0.060838
10       X_Z  0.188955
11     X_IEI  0.011587
12    X_IEII  0.050936
13      X_EA  0.165822
14    A_ChiA  0.036845
15      B_Ra  0.038110
16      X_Ra  0.034441
17     B_Rho  0.047221
18      B_MV  0.027996
19     B_CvM  0.043867
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  10 , friedman_mse , 2 , 1 , sqrt  is  0.7561823156337781
The RMSE for  10 , friedman_mse , 2 , 1 , sqrt  is  0.06442553724919195
-----------------------------------------------------
The R2 is  0.8324494852262329
The RMSE is  0.06967600174085363


In [33]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_B  0.056022
1       s_X  0.060305
2   density  0.085454
3  mean_X2X  0.185902
4        OF  0.066367
5       X_Z  0.251430
6      X_EA  0.118340
7      B_Ra  0.093129
8     B_Rho  0.062074
9     B_CvM  0.020977
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  10 , friedman_mse , 4 , 2 , log2  is  0.713831263729366
The RMSE for  10 , friedman_mse , 4 , 2 , log2  is  0.07966426501270088
-----------------------------------------------------
The R2 is  0.6583973589879744
The RMSE is  0.08257110177939408
