# Model selection

In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

In [3]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.feature_selection import RFE, SelectKBest, f_regression

In [4]:
import seaborn as sns

Import data from csv files into a dataframe

In [5]:
data = pd.read_csv('Combined_all_3rd_fit.csv', encoding='cp1252')
print(data.columns.values)

['Name' 'Coef_a' 'Coef_b' 'Coef_c' 'Coef_d' 'A_site' 'B_site' 'X_site'
 'Spacegroup' 'Ehull' 'BulkModulus' 'Energy' 'ZPE' 's_A' 's_B' 's_X'
 'density' 'mean_A2B' 'mean_A2X' 'mean_B2X' 'mean_X2X' 'std_A2B' 'std_A2X'
 'std_B2X' 'std_X2X' 'E_coh' 'TF' 'OF' 'A_Z' 'B_Z' 'X_Z' 'A_M' 'B_M' 'X_M'
 'A_G' 'B_G' 'X_G' 'A_IEI' 'B_IEI' 'X_IEI' 'A_IEII' 'B_IEII' 'X_IEII'
 'A_EA' 'B_EA' 'X_EA' 'A_ChiP' 'B_ChiP' 'X_ChiP' 'A_ChiA' 'X_ChiA'
 'A_Rvdw' 'B_Rvdw' 'X_Rvdw' 'A_Rc' 'B_Rc' 'X_Rc' 'A_Ra' 'B_Ra' 'X_Ra'
 'A_MP' 'B_MP' 'X_MP' 'A_BP' 'B_BP' 'X_BP' 'A_Rho' 'B_Rho' 'A_MV' 'B_MV'
 'X_MV' 'A_Hf' 'B_Hf' 'X_Hf' 'A_Hv' 'B_Hv' 'X_Hv' 'A_Kappa' 'B_Kappa'
 'X_Kappa' 'A_CvM' 'B_CvM' 'X_CvM' 'A_B' 'B_B' 'X_B' 'A_MendeleevNo'
 'B_MendeleevNo' 'X_MendeleevNo']


Drop the unnecessary columns and fill the empty cells with zero

In [6]:
data.drop(['Name', 'A_site', 'B_site', 'X_site', 'Spacegroup','BulkModulus',
           'Ehull','Energy','ZPE','Coef_b', 'Coef_a', 'Coef_d'], axis=1, inplace = True)
data.fillna(0, inplace= True)
d=data.copy()
columns = list(d.columns.values)
print(d.shape)

(80, 77)


Set the target variable to machine learn

In [7]:
target = 'Coef_c'

Define various methods to be used for buildig and validating the models

In [8]:
"""
    Functions to perform scaling
    
    """
def standard_scaling(target):
    data_std=d.copy()
    data_std[columns]= StandardScaler().fit_transform(d[columns])
    Y = data_std[target] 
    X = data_std.drop([target], axis=1)
    return X,Y

def minmax_scaling(target):
    data_mm=d.copy()
    data_mm[columns]= MinMaxScaler().fit_transform(d[columns])
    Y = data_mm[target] 
    X = data_mm.drop([target], axis=1)
    return X,Y

In [9]:
"""
    Function to build machine learning models by hyper parameter tuning. It also plots scatter plot
    
    """
def build_cv_model(X,Y,b_drop=False,target='Coef_d'):
    if b_drop :
        correlated_features = ["A_Rc","A_Ra","A_M","A_MP","A_MV","A_MendeleevNo","A_Hf","A_Hv","B_Rc",
                       "B_Rvdw","B_M","B_BP","B_MendeleevNo","B_Hv","X_Rc","X_Rvdw","X_M","X_BP","X_MP",
                       "X_MendeleevNo","X_Hf","X_Hv","X_G","X_B","X_CvM","X_ChiP"]
        X.drop(labels=correlated_features, axis=1, inplace=True)
        
    print("The shape of X is ",X.shape)
    
    ml_model, cv_results = train_model(X,Y,hyperparams=hyperparams, cv=True, return_cv=True)
    
    return ml_model

In [10]:
def run_model(X,Y,model,target='Coef_d'):
    rmse,r2 = run_cv(model, X, Y, n_cv = 5)
    print("-----------------------------------------------------")
    print("The R2 is ",r2)
    print("The RMSE is ",rmse)

In [11]:
"""
    Function to perform feature elimination using select K best of RFE method. It also prints the 
    feature scores obtained as:
    SelectKBest: The scores obtained from the selector
    RFE: The feature importance obtained from the RFR model 
    
    """
def feature_elimination(X,Y,n,estimator,method='skb'):
    if method=='skb':
        bestfeatures = SelectKBest(score_func=f_regression, k=10)
        fit = bestfeatures.fit(X,Y)
        dfscores = pd.DataFrame(fit.scores_)
        dfcolumns = pd.DataFrame(X.columns)
        #concat two dataframes for better visualization 
        featureScores = pd.concat([dfcolumns,dfscores],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureScores.nlargest(n,'Score')['Feature'].values]
    elif method=='rfe':
        selector = RFE(estimator, n_features_to_select=n, step=1)
        selector = selector.fit(X, Y)
        dfscores = pd.DataFrame(selector.ranking_)
        dfcolumns = pd.DataFrame(selector.feature_names_in_)
        #concat two dataframes for better visualization 
        featureSelection = pd.concat([dfcolumns,dfscores],axis=1)
        featureSelection.columns = ['Feature','Score']  #naming the dataframe columns
        X=X[featureSelection.nsmallest(n,'Score')['Feature'].values]
        estimator.fit(X,Y)
        importance = pd.DataFrame(estimator.feature_importances_)
        featureNames = pd.DataFrame(X.columns.values)
        featureScores = pd.concat([featureNames,importance],axis=1)
        featureScores.columns = ['Feature','Score']  #naming the dataframe columns
        
    print(featureScores)
    print("-----------------------------------------------------")
    return X

In [12]:
def train_model(X, y, hyperparams = None, cv = False, return_cv = False):
    """
    Function to train the ML model on the given data X (features) and y (target property). 
    If hyperparams argument is passed, all possible combinations of alpha and kernel values
    will be tried to find combination with minimum CV error. The final  model is trained
    using the fixed kernel and alpha params as determined using CV.
    
    """
    
    if cv:
        cv_results = {'alpha':[],'cv_rmse':[],'cv_r2':[]}
        for a in hyperparams['alpha']:

            lasso = Lasso(alpha=a)

            cv_error, cv_r2 = run_cv(lasso, X, y, n_cv = 5)
            cv_results['cv_rmse'].append(cv_error)
            cv_results['cv_r2'].append(cv_r2)
            cv_results['alpha'].append(a)

        cv_results = pd.DataFrame(cv_results)
        cv_results = cv_results.sort_values('cv_rmse')
        a_opt = cv_results.iloc[0]['alpha']
        
    else:
        a_opt = hyperparams['alpha'][0]

    lasso = Lasso(alpha=a_opt)
    model = lasso.fit(X,y)
    print("-----------------------------------------------------")
    print("The R2 for ",a_opt," is ",cv_results.iloc[0]['cv_r2'])
    print("The RMSE for ",a_opt," is ",cv_results.iloc[0]['cv_rmse'])
    return [model, cv_results] if return_cv else model


def run_cv(ml_model, X, y, n_cv = 5):
    """
    Function to run Cross-validation
    """
    kf = KFold(n_splits=n_cv,shuffle=True,random_state=50)
    y_val = []
    y_pred = []

    for idx, (train, val) in enumerate(kf.split(X)):
        
        X_cv_train = X.values[train]
        X_cv_val = X.values[val]

        y_cv_train = y.values[train]
        y_cv_val = y.values[val]    

        # Model fit and prediction
        model = ml_model.fit(X_cv_train, y_cv_train)
        y_pred_val = model.predict(X_cv_val)
        
        y_val.append(y_cv_val)
        y_pred.append(y_pred_val)

    # Computing errors
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2 = r2_score(y_val, y_pred)
    
    return rmse, r2

Define the hyperparameters to be tuned for the given algorithm

In [13]:
hyperparams={'alpha': [0.0001,0.0005,0.001,0.005,0.01,0.02,0.05,0.1,1,2,4,8]}

## Without scaling

In [14]:
data_std=d.copy()
Y = data_std[target] 
X = data_std.drop([target], axis=1)

In [15]:
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  0.0001  is  0.24379997503697431
The RMSE for  0.0001  is  0.011425317392015228


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

### Drop correlated features

In [16]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  0.0001  is  -0.07228355496194672
The RMSE for  0.0001  is  0.010921035083269274


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### Feature elimination

In [17]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0        s_A   35.416986
1        s_B   50.869033
2        s_X  224.085608
3    density    3.438603
4   mean_A2B  342.561672
5   mean_A2X   96.188443
6   mean_B2X    0.414239
7   mean_X2X  144.584059
8    std_A2B    2.548061
9    std_A2X   16.241729
10   std_B2X   10.809447
11   std_X2X    1.814497
12     E_coh   42.583723
13        TF    3.212161
14        OF    5.799041
15       A_Z   12.628214
16       B_Z    9.440231
17       X_Z  200.923811
18       A_G    3.038836
19       B_G    1.727768
20     A_IEI   27.313322
21     B_IEI    4.995951
22     X_IEI   20.065186
23    A_IEII    0.476045
24    B_IEII    0.037003
25    X_IEII    9.352237
26      A_EA    5.343736
27      B_EA   14.477919
28      X_EA   43.047947
29    A_ChiP   11.969555
30    B_ChiP    7.492487
31    A_ChiA   14.678545
32    X_ChiA   30.952649
33    A_Rvdw   44.335733
34      B_Ra    3.679989
35      X_Ra  143.785064
36      B_MP    9.089462
37      A_BP   11.579900
38     A_Rho    1.106778


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [18]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
20       A_Z   12.628214
21    A_ChiP   11.969555
22      A_BP   11.579900
23   std_B2X   10.809447
24       B_Z    9.440231
25    X_IEII    9.352237
26      B_MP    9.089462
27   A_Kappa    8.458841
28     A_CvM    8.388239
29      B_Hf    8.046263
30    B_ChiP    7.492487
31        OF    5.799041
32       A_B    5.431259
33      A_EA    5.343736
34     B_Rho    5.310746
35     B_IEI    4.995951
36      B_MV    4.896555
37      B_Ra    3.679989
38   density    3.438603


In [19]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
20       A_Z   12.628214
21    A_ChiP   11.969555
22      A_BP   11.579900
23   std_B2X   10.809447
24       B_Z    9.440231
25    X_IEII    9.352237
26      B_MP    9.089462
27   A_Kappa    8.458841
28     A_CvM    8.388239
29      B_Hf    8.046263
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  0.0005  is  0.3145638817182349
The RMSE for  0.0005  is  0.01090447264692752


In [20]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  0.0001  is  -0.3626530838042661
The RMSE for  0.0001  is  0.0145135464104277
-----------------------------------------------------
The R2 is  -0.3626530838042661
The RMSE is  0.0145135464104277


## Standard scaling

In [21]:
X,Y = standard_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)
-----------------------------------------------------
The R2 for  0.005  is  0.5727372233594823
The RMSE for  0.005  is  0.20573521928326802


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### Drop correlated features

In [22]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


-----------------------------------------------------
The R2 for  0.005  is  0.5174888503986625
The RMSE for  0.005  is  0.23469266539185613


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


### Feature elimination

In [23]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0        s_A   35.416986
1        s_B   50.869033
2        s_X  224.085608
3    density    3.438603
4   mean_A2B  342.561672
5   mean_A2X   96.188443
6   mean_B2X    0.414239
7   mean_X2X  144.584059
8    std_A2B    2.548061
9    std_A2X   16.241729
10   std_B2X   10.809447
11   std_X2X    1.814497
12     E_coh   42.583723
13        TF    3.212161
14        OF    5.799041
15       A_Z   12.628214
16       B_Z    9.440231
17       X_Z  200.923811
18       A_G    3.038836
19       B_G    1.727768
20     A_IEI   27.313322
21     B_IEI    4.995951
22     X_IEI   20.065186
23    A_IEII    0.476045
24    B_IEII    0.037003
25    X_IEII    9.352237
26      A_EA    5.343736
27      B_EA   14.477919
28      X_EA   43.047947
29    A_ChiP   11.969555
30    B_ChiP    7.492487
31    A_ChiA   14.678545
32    X_ChiA   30.952649
33    A_Rvdw   44.335733
34      B_Ra    3.679989
35      X_Ra  143.785064
36      B_MP    9.089462
37      A_BP   11.579900
38     A_Rho    1.106778


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


-----------------------------------------------------
The R2 for  0.005  is  0.534117804907998
The RMSE for  0.005  is  0.21892441234928592
-----------------------------------------------------
The R2 is  0.534117804907998
The RMSE is  0.21892441234928592


In [24]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
20       A_Z   12.628214
21    A_ChiP   11.969555
22      A_BP   11.579900
23   std_B2X   10.809447
24       B_Z    9.440231
25    X_IEII    9.352237
26      B_MP    9.089462
27   A_Kappa    8.458841
28     A_CvM    8.388239
29      B_Hf    8.046263
30    B_ChiP    7.492487
31        OF    5.799041
32       A_B    5.431259
33      A_EA    5.343736
34     B_Rho    5.310746
35     B_IEI    4.995951
36      B_MV    4.896555
37      B_Ra    3.679989
38   density    3.438603


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [25]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
20       A_Z   12.628214
21    A_ChiP   11.969555
22      A_BP   11.579900
23   std_B2X   10.809447
24       B_Z    9.440231
25    X_IEII    9.352237
26      B_MP    9.089462
27   A_Kappa    8.458841
28     A_CvM    8.388239
29      B_Hf    8.046263
-----------------------------------------------------
The shape of X is  (80, 20)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


-----------------------------------------------------
The R2 for  0.02  is  0.38006315049453965
The RMSE for  0.02  is  0.2847276587538069
-----------------------------------------------------
The R2 is  0.30571884229073243
The RMSE is  0.29269243939369977


In [26]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  0.02  is  -0.14648142613108683
The RMSE for  0.02  is  0.37968725978390094
-----------------------------------------------------
The R2 is  -0.27923492476918543
The RMSE is  0.3851267679553882


## Minmax scaling

In [27]:
X,Y = minmax_scaling(target)
ml_model=build_cv_model(X,Y,False,target)

The shape of X is  (80, 76)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


-----------------------------------------------------
The R2 for  0.0005  is  0.6332717849642497
The RMSE for  0.0005  is  0.052188154036029


### Drop correlated features

In [28]:
ml_model_prime=build_cv_model(X,Y,True,target)

The shape of X is  (80, 50)
-----------------------------------------------------
The R2 for  0.0005  is  0.5538503879236123
The RMSE for  0.0005  is  0.058537690719815676


### Feature elimination

In [29]:
X=feature_elimination(X,Y,40,ml_model_prime,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0        s_A   35.416986
1        s_B   50.869033
2        s_X  224.085608
3    density    3.438603
4   mean_A2B  342.561672
5   mean_A2X   96.188443
6   mean_B2X    0.414239
7   mean_X2X  144.584059
8    std_A2B    2.548061
9    std_A2X   16.241729
10   std_B2X   10.809447
11   std_X2X    1.814497
12     E_coh   42.583723
13        TF    3.212161
14        OF    5.799041
15       A_Z   12.628214
16       B_Z    9.440231
17       X_Z  200.923811
18       A_G    3.038836
19       B_G    1.727768
20     A_IEI   27.313322
21     B_IEI    4.995951
22     X_IEI   20.065186
23    A_IEII    0.476045
24    B_IEII    0.037003
25    X_IEII    9.352237
26      A_EA    5.343736
27      B_EA   14.477919
28      X_EA   43.047947
29    A_ChiP   11.969555
30    B_ChiP    7.492487
31    A_ChiA   14.678545
32    X_ChiA   30.952649
33    A_Rvdw   44.335733
34      B_Ra    3.679989
35      X_Ra  143.785064
36      B_MP    9.089462
37      A_BP   11.579900
38     A_Rho    1.106778


In [30]:
X=feature_elimination(X,Y,30,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
20       A_Z   12.628214
21    A_ChiP   11.969555
22      A_BP   11.579900
23   std_B2X   10.809447
24       B_Z    9.440231
25    X_IEII    9.352237
26      B_MP    9.089462
27   A_Kappa    8.458841
28     A_CvM    8.388239
29      B_Hf    8.046263
30    B_ChiP    7.492487
31        OF    5.799041
32       A_B    5.431259
33      A_EA    5.343736
34     B_Rho    5.310746
35     B_IEI    4.995951
36      B_MV    4.896555
37      B_Ra    3.679989
38   density    3.438603


In [31]:
X=feature_elimination(X,Y,20,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
20       A_Z   12.628214
21    A_ChiP   11.969555
22      A_BP   11.579900
23   std_B2X   10.809447
24       B_Z    9.440231
25    X_IEII    9.352237
26      B_MP    9.089462
27   A_Kappa    8.458841
28     A_CvM    8.388239
29      B_Hf    8.046263
-----------------------------------------------------
The shape of X is  (80, 20)
-----------------------------------------------------
The R2 for  0.001  is  0.3644758930379193
The RMSE for  0.001  is  0.06862870865468035
--

In [32]:
X=feature_elimination(X,Y,10,ml_model,method='skb')
ml_model=build_cv_model(X,Y,False,target)
run_model(X,Y,ml_model_prime,target)

     Feature       Score
0   mean_A2B  342.561672
1        s_X  224.085608
2        X_Z  200.923811
3   mean_X2X  144.584059
4       X_Ra  143.785064
5   mean_A2X   96.188443
6        s_B   50.869033
7     A_Rvdw   44.335733
8       X_EA   43.047947
9      E_coh   42.583723
10       s_A   35.416986
11   X_Kappa   32.559689
12    X_ChiA   30.952649
13     A_IEI   27.313322
14     X_IEI   20.065186
15      X_MV   19.114335
16   B_Kappa   19.043771
17   std_A2X   16.241729
18    A_ChiA   14.678545
19      B_EA   14.477919
-----------------------------------------------------
The shape of X is  (80, 10)
-----------------------------------------------------
The R2 for  0.001  is  -0.003189766610636538
The RMSE for  0.001  is  0.09108317503661613
-----------------------------------------------------
The R2 is  -0.1221836534287522
The RMSE is  0.09179048515802878
