In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
import seaborn as sns 
import re
 
import warnings

# Suppress every warning category for the rest of the session
# (note: this also hides deprecation notices from the libraries used below).
warnings.simplefilter('ignore')

In [2]:
import sklearn

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, scale, StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_selection import RFE, SelectKBest, f_regression

from sklearn.neural_network import MLPClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF, Matern, RationalQuadratic
from sklearn.gaussian_process.kernels import Kernel, Hyperparameter, ConstantKernel
from sklearn.gaussian_process.kernels import GenericKernelMixin

from sklearn import metrics
from sklearn.metrics import classification_report

from sklearn.base import clone

In [3]:
# Load the raw table; the CSV is decoded with the cp1252 (Windows-1252) codec.
data = pd.read_csv(
    filepath_or_buffer='Combined_all_3rd.csv',
    encoding='cp1252',
)

In [4]:
# Display the column names of the table that was just loaded.
data.columns.values

array(['Name', 'Coef_a', 'Coef_b', 'Coef_c', 'Coef_d', 'A_site', 'B_site',
       'X_site', 'Spacegroup', 'Ehull', 'BulkModulus', 'Energy', 'ZPE',
       's_A', 's_B', 's_X', 'density', 'mean_A2B', 'mean_A2X', 'mean_B2X',
       'mean_X2X', 'std_A2B', 'std_A2X', 'std_B2X', 'std_X2X', 'E_coh',
       'TF', 'OF', 'A_Z', 'B_Z', 'X_Z', 'A_M', 'B_M', 'X_M', 'A_G', 'B_G',
       'X_G', 'A_IEI', 'B_IEI', 'X_IEI', 'A_IEII', 'B_IEII', 'X_IEII',
       'A_EA', 'B_EA', 'X_EA', 'A_ChiP', 'B_ChiP', 'X_ChiP', 'A_ChiA',
       'X_ChiA', 'A_Rvdw', 'B_Rvdw', 'X_Rvdw', 'A_Rc', 'B_Rc', 'X_Rc',
       'A_Ra', 'B_Ra', 'X_Ra', 'A_MP', 'B_MP', 'X_MP', 'A_BP', 'B_BP',
       'X_BP', 'A_Rho', 'B_Rho', 'A_MV', 'B_MV', 'X_MV', 'A_Hf', 'B_Hf',
       'X_Hf', 'A_Hv', 'B_Hv', 'X_Hv', 'A_Kappa', 'B_Kappa', 'X_Kappa',
       'A_CvM', 'B_CvM', 'X_CvM', 'A_B', 'B_B', 'X_B', 'A_MendeleevNo',
       'B_MendeleevNo', 'X_MendeleevNo'], dtype=object)

In [5]:
# Drop the columns that are not carried forward into the scaled table.
#
# `data` is re-bound to the result instead of being mutated with inplace=True, and
# errors='ignore' skips labels that are already absent. Together these make the cell
# safe to re-run: the original in-place drop raised KeyError on a second execution
# because the columns had already been removed.
# NOTE: errors='ignore' also means a misspelled label is skipped silently, so keep
# this list in sync with the CSV header.
data = data.drop(
    columns=[
        'Name', 'A_site', 'B_site', 'X_site', 'Spacegroup',
        'Ehull', 'BulkModulus', 'Energy',
        's_A', 's_B', 's_X',
        'Coef_b', 'Coef_c', 'Coef_d',
    ],
    errors='ignore',
)

In [6]:
# Replace every missing entry with 0, modifying `data` in place.
data.fillna(value=0, inplace=True)

In [7]:
# Display the column names that remain in `data` at this point.
data.columns.values

array(['Coef_a', 'ZPE', 'density', 'mean_A2B', 'mean_A2X', 'mean_B2X',
       'mean_X2X', 'std_A2B', 'std_A2X', 'std_B2X', 'std_X2X', 'E_coh',
       'TF', 'OF', 'A_Z', 'B_Z', 'X_Z', 'A_M', 'B_M', 'X_M', 'A_G', 'B_G',
       'X_G', 'A_IEI', 'B_IEI', 'X_IEI', 'A_IEII', 'B_IEII', 'X_IEII',
       'A_EA', 'B_EA', 'X_EA', 'A_ChiP', 'B_ChiP', 'X_ChiP', 'A_ChiA',
       'X_ChiA', 'A_Rvdw', 'B_Rvdw', 'X_Rvdw', 'A_Rc', 'B_Rc', 'X_Rc',
       'A_Ra', 'B_Ra', 'X_Ra', 'A_MP', 'B_MP', 'X_MP', 'A_BP', 'B_BP',
       'X_BP', 'A_Rho', 'B_Rho', 'A_MV', 'B_MV', 'X_MV', 'A_Hf', 'B_Hf',
       'X_Hf', 'A_Hv', 'B_Hv', 'X_Hv', 'A_Kappa', 'B_Kappa', 'X_Kappa',
       'A_CvM', 'B_CvM', 'X_CvM', 'A_B', 'B_B', 'X_B', 'A_MendeleevNo',
       'B_MendeleevNo', 'X_MendeleevNo'], dtype=object)

In [8]:
# Column list selecting 'ZPE'; kept as a list so that data[columns] is a 2-D DataFrame.
# NOTE: the name `columns` is re-assigned further down in this notebook.
columns = ['ZPE']

# MinMax Scaling

In [9]:
# Fit a min-max scaler on the column(s) named in `columns`, taken from the unscaled
# table. The fitted object is kept as `scalerZPE` so it can be reused later.
scalerZPE = MinMaxScaler().fit(data[columns])

# Work on a copy so that `data` itself keeps the unscaled values.
data_std = data.copy()
data_std[columns] = scalerZPE.transform(data[columns])

# Split the scaled table into the feature matrix (X) and the target column (Y).
X = data_std.drop(columns=['ZPE'])
Y = data_std['ZPE']

In [10]:
# Start with an empty collection of highly correlated feature names.
correlated_features = set()

# Pairwise correlation matrix of the feature columns (pandas' default method).
corrmat = X.corr()
top_corr_features = corrmat.index

# Optional heat map of the correlations (left disabled):
# plt.figure(figsize=(20, 20))
# g = sns.heatmap(data[top_corr_features].corr(), annot=True, cmap="RdYlGn")

In [11]:
# Flag the later column of every feature pair whose absolute correlation exceeds
# the threshold: for a pair (i, j) with j < i, column i is the one that is flagged.
CORR_THRESHOLD = 0.9

# Strict lower triangle of |corr|. Entries on and above the diagonal become 0, and
# NaN correlations compare as False, so neither can exceed the threshold.
abs_corr_lower = np.tril(corrmat.abs().to_numpy(), k=-1)
is_redundant = (abs_corr_lower > CORR_THRESHOLD).any(axis=1)

# The result is derived from `corrmat` alone (not from len(X.columns)) and the set is
# re-created on every run, so re-executing this cell, or running it after X has been
# re-bound, cannot index past the matrix or accumulate stale names.
correlated_features = set(corrmat.columns[is_redundant])

# Print the count and a sorted list; iteration order of a set of strings is not
# stable between kernel sessions, which made the original output irreproducible.
print(len(correlated_features), sorted(correlated_features))

{'A_Hv', 'X_Hv', 'X_Rc', 'X_B', 'X_BP', 'X_Ra', 'A_M', 'X_Hf', 'B_BP', 'X_MP', 'X_MendeleevNo', 'X_CvM', 'A_Ra', 'B_M', 'A_Rc', 'A_MendeleevNo', 'X_IEII', 'A_MV', 'A_BP', 'X_ChiA', 'B_Hv', 'A_Hf', 'X_M'}


In [12]:
# Remove the highly correlated features from the scaled table.
# Re-binding `data_std` with errors='ignore' (instead of an in-place drop) keeps the
# cell re-runnable: a second execution would otherwise raise KeyError because the
# flagged columns are already gone.
data_std = data_std.drop(columns=list(correlated_features), errors='ignore')

# Rebuild the target column and the feature matrix from the reduced table.
Y = data_std['ZPE']
X = data_std.drop(columns=['ZPE'])

# Kept as the last expression so the resulting shape is actually displayed
# (mid-cell it was evaluated and discarded).
data_std.shape

In [13]:
# Write `data_std` to disk. to_csv also writes the row index as the first column
# unless index=False is passed.
data_std.to_csv(path_or_buf='scaled_ZPE.csv')

In [14]:
# Load 'sisso_out_ZPE.csv' into `sisso_out`.
# NOTE(review): presumably produced externally from scaled_ZPE.csv — confirm provenance.
sisso_out = pd.read_csv(filepath_or_buffer='sisso_out_ZPE.csv')

In [15]:
# Display the column names of the loaded `sisso_out` table.
sisso_out.columns.values

array(['Calculated', 'descriptor_1', 'descriptor_2', 'descriptor_3',
       'descriptor_4', 'descriptor_5', 'descriptor_6', 'descriptor_7',
       'Predicted'], dtype=object)

In [16]:
# Re-point the shared `columns` list at the 'Calculated' column of `sisso_out`.
columns = ['Calculated']

In [17]:
# Map the 'Calculated' column back through the scaler fitted on ZPE.
# The column is named explicitly rather than read from the shared `columns` variable,
# which is re-assigned several times in this notebook; with the original form, running
# cells out of order silently rescaled whichever column `columns` last pointed at.
# NOTE(review): assumes the values are on the scale produced by scalerZPE — confirm.
# NOTE: not idempotent — executing this cell twice applies the inverse transform twice.
sisso_out[['Calculated']] = scalerZPE.inverse_transform(sisso_out[['Calculated']])

In [18]:
# Re-point the shared `columns` list at the 'Predicted' column of `sisso_out`.
columns = ['Predicted']

In [19]:
# Map the 'Predicted' column back through the scaler fitted on ZPE.
# The column is named explicitly rather than read from the shared `columns` variable,
# which is re-assigned several times in this notebook; with the original form, running
# cells out of order silently rescaled whichever column `columns` last pointed at.
# NOTE(review): assumes the values are on the scale produced by scalerZPE — confirm.
# NOTE: not idempotent — executing this cell twice applies the inverse transform twice.
sisso_out[['Predicted']] = scalerZPE.inverse_transform(sisso_out[['Predicted']])

In [20]:
# Write the `sisso_out` table to disk. to_csv also writes the row index as the first
# column unless index=False is passed.
sisso_out.to_csv(path_or_buf='unscaled_ZPE.csv')