# Model selection

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import RFE, SelectKBest, f_regression

In [4]:
import seaborn as sns

Import data from csv files into a dataframe

In [5]:
data = pd.read_csv('Combined_all_3rd_fit.csv', encoding='cp1252')
print(data.columns.values)

['Name' 'Coef_a' 'Coef_b' 'Coef_c' 'Coef_d' 'A_site' 'B_site' 'X_site'
 'Spacegroup' 'Ehull' 'BulkModulus' 'Energy' 'ZPE' 's_A' 's_B' 's_X'
 'density' 'mean_A2B' 'mean_A2X' 'mean_B2X' 'mean_X2X' 'std_A2B' 'std_A2X'
 'std_B2X' 'std_X2X' 'E_coh' 'TF' 'OF' 'A_Z' 'B_Z' 'X_Z' 'A_M' 'B_M' 'X_M'
 'A_G' 'B_G' 'X_G' 'A_IEI' 'B_IEI' 'X_IEI' 'A_IEII' 'B_IEII' 'X_IEII'
 'A_EA' 'B_EA' 'X_EA' 'A_ChiP' 'B_ChiP' 'X_ChiP' 'A_ChiA' 'X_ChiA'
 'A_Rvdw' 'B_Rvdw' 'X_Rvdw' 'A_Rc' 'B_Rc' 'X_Rc' 'A_Ra' 'B_Ra' 'X_Ra'
 'A_MP' 'B_MP' 'X_MP' 'A_BP' 'B_BP' 'X_BP' 'A_Rho' 'B_Rho' 'A_MV' 'B_MV'
 'X_MV' 'A_Hf' 'B_Hf' 'X_Hf' 'A_Hv' 'B_Hv' 'X_Hv' 'A_Kappa' 'B_Kappa'
 'X_Kappa' 'A_CvM' 'B_CvM' 'X_CvM' 'A_B' 'B_B' 'X_B' 'A_MendeleevNo'
 'B_MendeleevNo' 'X_MendeleevNo']


Drop the unnecessary columns and fill the empty cells with zero

In [6]:
data.drop(['Name', 'A_site', 'B_site', 'X_site', 'Spacegroup','BulkModulus',
           'Ehull','Energy','ZPE','Coef_b', 'Coef_a', 'Coef_d'], axis=1, inplace = True)
data.fillna(0, inplace= True)
d=data.copy()
columns = list(d.columns.values)
print(d.shape)

(80, 77)


Set the target variable to machine learn

In [7]:
target = 'Coef_c'

Define various methods to be used for buildig and validating the models

In [8]:
"""
    Functions to perform scaling
    
    """
def standard_scaling(target):
    data_std=d.copy()
    data_std[columns]= StandardScaler().fit_transform(d[columns])
    Y = data_std[target] 
    X = data_std.drop([target], axis=1)
    return X,Y

def minmax_scaling(target):
    data_mm=d.copy()
    data_mm[columns]= MinMaxScaler().fit_transform(d[columns])
    Y = data_mm[target] 
    X = data_mm.drop([target], axis=1)
    return X,Y

In [9]:
"""
    Function to build machine learning models by hyper parameter tuning. It also plots scatter plot
    
    """
def build_cv_model(X,Y,b_drop=False,target='Coef_d'):
    if b_drop :
        correlated_features = ["A_Rc","A_Ra","A_M","A_MP","A_MV","A_MendeleevNo","A_Hf","A_Hv","B_Rc",
                       "B_Rvdw","B_M","B_BP","B_MendeleevNo","B_Hv","X_Rc","X_Rvdw","X_M","X_BP","X_MP",
                       "X_MendeleevNo","X_Hf","X_Hv","X_G","X_B","X_CvM","X_ChiP"]
        X.drop(labels=correlated_features, axis=1, inplace=True)
        
    print("The shape of X is ",X.shape)
    
    ml_model, cv_results = train_model(X,Y,hyperparams=hyperparams, cv=True, return_cv=True)
    
    return ml_model

In [10]:
def run_model(X,Y,model,target='Coef_d'):
    rmse,r2 = run_cv(model, X, Y, n_cv = 5)
    print("-----------------------------------------------------")
    print("The R2 is ",r2)
    print("The RMSE is ",rmse)

In [11]:
"""
    Function to perform feature elimination using select K best of RFE method. It also prints the 
    feature scores obtained as:
    SelectKBest: The scores obtained from the selector
    RFE: The feature importance obtained from the RFR model 
    
    """
def feature_elimination(X,Y,n,estimator,method='skb'):
    if method=='skb':
        bestfeatures = SelectKBest(score_func=f_regression, k=10)
        fit = bestfeatures.fit(X,Y)
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(X.columns)
        #concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns,dfscores],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        print(featureScores.nlargest(n,'Score'))  #print 10 best features
        X=X[featureScores.nlargest(n,'Score')['Feature'].values]
    elif method=='rfe':
        selector = RFE(estimator, n_features_to_select=n, step=1)
        selector = selector.fit(X, Y)
        dfscores = pd.DataFrame(selector.ranking_)
        dfcolumns = pd.DataFrame(selector.feature_names_in_)
        #concat two dataframes for better visualization 
        featureSelection = pd.concat([dfcolumns,dfscores],axis=1)
        featureSelection.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureSelection.nsmallest(n,'Score')['Feature'].values]
        estimator.fit(X,Y)
        importance = pd.DataFrame(estimator.feature_importances_)
        featureNames = pd.DataFrame(X.columns.values)
        featureScores = pd.concat([featureNames,importance],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        
    print(featureScores)
    print("-----------------------------------------------------")
    return X

In [12]:
def train_model(X, y, hyperparams = None, cv = False, return_cv = False):
    """
    Function to train the ML model on the given data X (features) and y (target property). 
    If hyperparams argument is passed, all possible combinations of alpha and kernel values
    will be tried to find combination with minimum CV error. The final  model is trained
    using the fixed kernel and alpha params as determined using CV.
    
    """
    
    if cv:
        cv_results = {'n_estimators':[],'criterion':[],'min_samples_split':[],
                      'min_samples_leaf':[],'max_features':[],'cv_rmse':[],'cv_r2':[]}
        for n,c,s,l,f in itertools.product(hyperparams['n_estimators'], 
                                           hyperparams['criterion'],hyperparams['min_samples_split'],
                                          hyperparams['min_samples_leaf'],hyperparams['max_features']):

            random_forests = RandomForestRegressor(
            n_estimators=n, criterion=c, min_samples_split=s, min_samples_leaf=l, max_features=f)

            cv_error, cv_r2 = run_cv(random_forests, X, y, n_cv = 5)
            cv_results['cv_rmse'].append(cv_error)
            cv_results['cv_r2'].append(cv_r2)
            cv_results['n_estimators'].append(n)
            cv_results['criterion'].append(c)
            cv_results['min_samples_split'].append(s)
            cv_results['min_samples_leaf'].append(l)
            cv_results['max_features'].append(f)

        cv_results = pd.DataFrame(cv_results)
        cv_results = cv_results.sort_values('cv_rmse')
        n_opt = cv_results.iloc[0]['n_estimators']
        c_opt = cv_results.iloc[0]['criterion']
        s_opt = cv_results.iloc[0]['min_samples_split']
        l_opt = cv_results.iloc[0]['min_samples_leaf']
        f_opt = cv_results.iloc[0]['max_features']
        
    else:
        n_opt = hyperparams['n_estimators'][0]
        c_opt = hyperparams['criterion'][0]
        s_opt = hyperparams['min_samples_split'][0]
        l_opt = hyperparams['min_samples_leaf'][0]
        f_opt = hyperparams['max_features'][0]

    random_forests = RandomForestRegressor(
            n_estimators=n_opt, criterion=c_opt, min_samples_split=s_opt, min_samples_leaf=l_opt, max_features=f_opt)
    model = random_forests.fit(X,y)
    print("-----------------------------------------------------")
    print("The R2 for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_r2'])
    print("The RMSE for ",n_opt,",",c_opt,",",s_opt,",",l_opt,",",f_opt," is ",cv_results.iloc[0]['cv_rmse'])
    return [model, cv_results] if return_cv else model


def run_cv(ml_model, X, y, n_cv = 5):
    """
    Function to run Cross-validation
    """
    kf = KFold(n_splits=n_cv,shuffle=True,random_state=50)
    y_val = []
    y_pred = []

    for idx, (train, val) in enumerate(kf.split(X)):
        
        X_cv_train = X.values[train]
        X_cv_val = X.values[val]

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]    

        # Model fit and prediction
        model = ml_model.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        
        y_val.append(y_cv_val)
        y_pred.append(y_pred_val)

    # Computing errors
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    
    return rmse, r2

Define the hyperparameters to be tuned for the given algorithm

In [13]:
hyperparams={'n_estimators': [10,100],
             'criterion':["squared_error", "absolute_error", "friedman_mse"],
            'min_samples_split':[2,4,8],
            'min_samples_leaf':[1,2,4,8],
            'max_features':["sqrt","log2",0.3,1]}

## Without scaling

In [14]:
data_std=d.copy()
Y = data_std[target] 
X = data_std.drop([target], axis=1)

In [15]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , absolute_error , 4 , 1 , 0.3  is  0.6163435413684029
The RMSE for  100 , absolute_error , 4 , 1 , 0.3  is  0.010579104654806826


### Drop correlated features

In [16]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , sqrt  is  0.6206340471297189
The RMSE for  100 , friedman_mse , 2 , 1 , sqrt  is  0.00999729922200379


### Feature elimination

In [17]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.013505
1        s_B  0.018819
2        s_X  0.047119
3    density  0.007471
4   mean_A2B  0.045888
5   mean_A2X  0.094831
6   mean_B2X  0.004231
7   mean_X2X  0.149365
8    std_A2B  0.003682
9    std_A2X  0.004879
10   std_B2X  0.005151
11   std_X2X  0.004631
12     E_coh  0.052148
13        TF  0.005955
14        OF  0.015545
15       B_Z  0.006879
16       X_Z  0.094694
17       B_G  0.007295
18     A_IEI  0.003524
19     B_IEI  0.003323
20     X_IEI  0.070725
21    B_IEII  0.002251
22    X_IEII  0.080246
23      B_EA  0.026824
24    A_ChiP  0.008863
25    B_ChiP  0.006445
26    A_ChiA  0.018743
27    X_ChiA  0.031486
28    A_Rvdw  0.004957
29      B_Ra  0.001427
30      X_Ra  0.083443
31      B_MP  0.005007
32      A_BP  0.002624
33     B_Rho  0.003922
34      B_MV  0.011345
35      X_MV  0.010752
36      B_Hf  0.007868
37   B_Kappa  0.022291
38   X_Kappa  0.008227
39     B_CvM  0.003621
-----------------------------------------------------
The

In [18]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.015436
1        s_B  0.016974
2        s_X  0.000674
3    density  0.030403
4   mean_A2B  0.072196
5   mean_A2X  0.076952
6   mean_B2X  0.007249
7   mean_X2X  0.068827
8    std_A2B  0.008281
9    std_A2X  0.007359
10   std_X2X  0.004959
11     E_coh  0.084482
12        TF  0.014322
13        OF  0.033173
14       B_Z  0.012751
15       X_Z  0.199687
16     A_IEI  0.007267
17     X_IEI  0.001786
18    B_IEII  0.007632
19    X_IEII  0.101515
20      B_EA  0.044490
21    B_ChiP  0.038219
22    A_Rvdw  0.013897
23      B_Ra  0.003223
24      X_Ra  0.004591
25      B_MP  0.022923
26      B_MV  0.019043
27      X_MV  0.057442
28   B_Kappa  0.018039
29     B_CvM  0.006208
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , 0.3  is  0.6698013048914349
The RMSE for  100 , friedman_mse , 2 , 1 , 0.3  is  0.009849590268089172
---------

In [19]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.008332
1        s_B  0.018033
2        s_X  0.058382
3    density  0.008758
4   mean_A2B  0.074396
5   mean_A2X  0.153903
6   mean_X2X  0.227226
7      E_coh  0.053162
8         OF  0.011027
9        X_Z  0.104759
10     X_IEI  0.067644
11    X_IEII  0.049926
12      B_EA  0.016219
13    B_ChiP  0.014550
14    A_Rvdw  0.006860
15      X_Ra  0.044339
16      B_MP  0.013108
17      B_MV  0.010520
18      X_MV  0.041946
19   B_Kappa  0.016910
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , sqrt  is  0.5934406982768725
The RMSE for  100 , squared_error , 2 , 1 , sqrt  is  0.009392067790178253
-----------------------------------------------------
The R2 is  0.6024285351932639
The RMSE is  0.009927502208708118


In [20]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_X  0.093754
1  mean_A2B  0.137371
2  mean_A2X  0.117100
3  mean_X2X  0.296935
4     E_coh  0.048948
5       X_Z  0.098902
6    X_IEII  0.081843
7    B_ChiP  0.034164
8      X_Ra  0.060709
9      B_MV  0.030273
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  10 , absolute_error , 2 , 1 , log2  is  0.3327209155143463
The RMSE for  10 , absolute_error , 2 , 1 , log2  is  0.010189607275625862
-----------------------------------------------------
The R2 is  0.43783663887255675
The RMSE is  0.010332869428157337


## Standard scaling

In [21]:
X,Y = standard_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  10 , squared_error , 2 , 2 , 0.3  is  0.5117552296204634
The RMSE for  10 , squared_error , 2 , 2 , 0.3  is  0.26858446706846256


In [22]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 4 , 1 , 0.3  is  0.654807858856931
The RMSE for  100 , friedman_mse , 4 , 1 , 0.3  is  0.26724491541134776


### Drop correlated features

In [23]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , log2  is  0.6050760635105432
The RMSE for  100 , squared_error , 2 , 1 , log2  is  0.2685058137756022


### Feature elimination

In [24]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.027792
1        s_B  0.045745
2        s_X  0.036183
3    density  0.004112
4   mean_A2B  0.086140
5   mean_A2X  0.105564
6   mean_B2X  0.008779
7   mean_X2X  0.094555
8    std_A2X  0.005816
9    std_B2X  0.003046
10   std_X2X  0.005222
11     E_coh  0.062629
12        TF  0.006121
13        OF  0.014948
14       B_Z  0.002105
15       X_Z  0.067402
16       B_G  0.008856
17     A_IEI  0.004945
18     B_IEI  0.005246
19     X_IEI  0.024407
20    B_IEII  0.002522
21    X_IEII  0.069682
22      B_EA  0.017785
23      X_EA  0.002911
24    A_ChiP  0.007956
25    B_ChiP  0.011695
26    A_ChiA  0.007534
27    X_ChiA  0.049592
28    A_Rvdw  0.016607
29      B_Ra  0.003138
30      X_Ra  0.058459
31      B_MP  0.007662
32      A_BP  0.004787
33     B_Rho  0.007467
34      B_MV  0.010791
35      X_MV  0.055871
36      B_Hf  0.009200
37   B_Kappa  0.024956
38   X_Kappa  0.008860
39     B_CvM  0.002914
-----------------------------------------------------
The

In [25]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.009571
1        s_B  0.030742
2        s_X  0.046138
3    density  0.005946
4   mean_A2B  0.125140
5   mean_A2X  0.153090
6   mean_B2X  0.004762
7   mean_X2X  0.214095
8    std_A2X  0.004188
9    std_X2X  0.001621
10     E_coh  0.040549
11        TF  0.003311
12        OF  0.006378
13       X_Z  0.116854
14       B_G  0.011217
15     B_IEI  0.003900
16     X_IEI  0.027089
17    X_IEII  0.040316
18      B_EA  0.013762
19    B_ChiP  0.009799
20    A_ChiA  0.003870
21    X_ChiA  0.004451
22    A_Rvdw  0.004370
23      X_Ra  0.039232
24      B_MP  0.007107
25     B_Rho  0.002589
26      B_MV  0.012818
27      X_MV  0.035001
28      B_Hf  0.004639
29   B_Kappa  0.017457
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  10 , squared_error , 4 , 1 , 0.3  is  0.6159366170452449
The RMSE for  10 , squared_error , 4 , 1 , 0.3  is  0.24922617127476268
----------

In [26]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.004596
1        s_B  0.005833
2    density  0.003381
3   mean_A2B  0.183585
4   mean_A2X  0.243323
5   mean_X2X  0.258530
6    std_X2X  0.009592
7      E_coh  0.027905
8         TF  0.004149
9         OF  0.016466
10       X_Z  0.081413
11      B_EA  0.014607
12    B_ChiP  0.020237
13    A_Rvdw  0.008174
14      X_Ra  0.080234
15      B_MP  0.000543
16     B_Rho  0.003608
17      B_MV  0.001801
18      B_Hf  0.019506
19   B_Kappa  0.012516
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , sqrt  is  0.6183131644110667
The RMSE for  100 , squared_error , 2 , 1 , sqrt  is  0.24883932703018102
-----------------------------------------------------
The R2 is  0.6252838716567338
The RMSE is  0.2593211225515855


In [27]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_B  0.049930
1  mean_A2B  0.143039
2  mean_A2X  0.148477
3  mean_X2X  0.244481
4     E_coh  0.079085
5        OF  0.024337
6       X_Z  0.136114
7      B_EA  0.032954
8    B_ChiP  0.022315
9      X_Ra  0.119267
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  100 , absolute_error , 2 , 1 , 1  is  0.48170431004947756
The RMSE for  100 , absolute_error , 2 , 1 , 1  is  0.2532531062947911
-----------------------------------------------------
The R2 is  0.5154216942138229
The RMSE is  0.25677291503448857


## Minmax scaling

In [28]:
X,Y = minmax_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  100 , friedman_mse , 2 , 1 , 0.3  is  0.5973160624072644
The RMSE for  100 , friedman_mse , 2 , 1 , 0.3  is  0.06650439208396082


### Drop correlated features

In [29]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  10 , friedman_mse , 4 , 1 , 0.3  is  0.5611552915894443
The RMSE for  10 , friedman_mse , 4 , 1 , 0.3  is  0.06338714655682112


### Feature elimination

In [30]:
X=feature_elimination(X,Y,40,ml_model_prime,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.001782
1        s_B  0.002693
2        s_X  0.000000
3    density  0.009110
4   mean_A2B  0.115753
5   mean_A2X  0.229756
6   mean_B2X  0.000698
7   mean_X2X  0.163343
8    std_A2B  0.000493
9    std_A2X  0.006882
10   std_B2X  0.000382
11   std_X2X  0.005033
12     E_coh  0.034918
13        TF  0.002492
14        OF  0.011674
15       A_Z  0.001355
16       B_Z  0.005114
17       X_Z  0.067327
18       B_G  0.000088
19     A_IEI  0.000083
20     B_IEI  0.001952
21     X_IEI  0.013214
22    B_IEII  0.001508
23    X_IEII  0.167796
24      A_EA  0.000154
25      B_EA  0.009343
26    A_ChiP  0.000244
27    B_ChiP  0.022910
28    A_ChiA  0.001469
29    A_Rvdw  0.007056
30      B_Ra  0.000724
31      B_MP  0.000085
32     A_Rho  0.000306
33      B_MV  0.009333
34      X_MV  0.070730
35      B_Hf  0.005251
36   A_Kappa  0.000000
37   B_Kappa  0.013080
38     A_CvM  0.008584
39     B_CvM  0.007288
-----------------------------------------------------
The

In [31]:
X=feature_elimination(X,Y,30,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.005411
1        s_B  0.005520
2        s_X  0.071218
3    density  0.008444
4   mean_A2B  0.118307
5   mean_A2X  0.160500
6   mean_B2X  0.003458
7   mean_X2X  0.237087
8    std_A2X  0.009401
9    std_B2X  0.002470
10   std_X2X  0.003116
11     E_coh  0.046421
12        TF  0.004548
13        OF  0.014135
14       X_Z  0.118768
15       B_G  0.009728
16     A_IEI  0.004211
17     B_IEI  0.007196
18     X_IEI  0.025200
19    X_IEII  0.018749
20      B_EA  0.014372
21    B_ChiP  0.013068
22    A_ChiA  0.003512
23    A_Rvdw  0.004689
24      B_MP  0.006196
25      B_MV  0.014858
26      X_MV  0.049208
27      B_Hf  0.004063
28   B_Kappa  0.014509
29     B_CvM  0.001637
-----------------------------------------------------
The shape of X is  (80, 30)
-----------------------------------------------------
The R2 for  100 , squared_error , 2 , 1 , 0.3  is  0.6508968565517499
The RMSE for  100 , squared_error , 2 , 1 , 0.3  is  0.06253506038994433
--------

In [32]:
X=feature_elimination(X,Y,20,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature     Score
0        s_A  0.013126
1        s_B  0.013850
2        s_X  0.049327
3    density  0.008376
4   mean_A2B  0.138862
5   mean_A2X  0.071020
6   mean_X2X  0.231596
7      E_coh  0.053221
8         OF  0.009530
9        X_Z  0.165606
10       B_G  0.012250
11     B_IEI  0.005837
12     X_IEI  0.048274
13    X_IEII  0.040342
14      B_EA  0.014900
15    B_ChiP  0.013609
16      B_MP  0.009535
17      B_MV  0.019766
18      X_MV  0.069246
19   B_Kappa  0.011728
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  10 , squared_error , 4 , 2 , 0.3  is  0.5899988756510315
The RMSE for  10 , squared_error , 4 , 2 , 0.3  is  0.05880875037932017
-----------------------------------------------------
The R2 is  0.4811707422669907
The RMSE is  0.07056853108057042


In [33]:
X=feature_elimination(X,Y,10,ml_model,method='rfe')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

    Feature     Score
0       s_X  0.183994
1   density  0.011547
2  mean_A2B  0.119056
3  mean_A2X  0.114251
4  mean_X2X  0.243878
5     E_coh  0.096120
6        OF  0.022291
7      B_EA  0.030939
8      X_MV  0.149384
9   B_Kappa  0.028540
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  10 , friedman_mse , 4 , 1 , 0.3  is  0.5376504063148901
The RMSE for  10 , friedman_mse , 4 , 1 , 0.3  is  0.058833823017968606
-----------------------------------------------------
The R2 is  0.36087768023542766
The RMSE is  0.07043214516675138
