In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
raw = pd.read_excel('../data/stability_paper_data/full_features.xlsx', 
                    sheet_name='Dataset with Generated Features', 
                    index_col=0)

In [3]:
# raw = pd.read_excel('1-s2.0-S2352340918305092-mmc2.xlsx', sheet_name='Dataset with Generated Features')
df = raw.copy()
df.drop(['Material Composition', 'Formation_energy'], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,goldschmidt_TF,goldschmidt_TF_ionic,octahedral_factor,octahedral_factor_ionic,A_O,B_O,A_B,num_of_atoms_host_Asite0,shannon_radii_host_Asite0,host_Asite0_Ionic Radius (angstroms),...,Bsite_NdUnfilled_range,Bsite_NdValence_range,Bsite_NfUnfilled_range,Bsite_NfValence_range,Bsite_NpUnfilled_range,Bsite_NpValence_range,Bsite_NsUnfilled_range,Bsite_NsValence_range,Bsite_NUnfilled_range,EnergyAboveHull
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.021823,0.976828,0.414286,0.385714,2.86125,1.98,2.04125,7,1.44,1.26,...,0,0,0,0,0,0,0,0,0,29.747707
2,0.987385,0.889057,0.378571,0.464286,2.695,1.93,1.825,4,1.2,1.13,...,0,0,0,0,0,0,0,0,0,106.702335
3,0.976009,0.90836,0.452857,0.392857,2.8075,2.034,2.0415,6,1.34,1.0,...,0,0,0,0,0,0,0,0,0,171.608093
4,1.026809,0.865275,0.342857,0.492857,2.73,1.88,1.81,4,1.2,1.13,...,0,0,0,0,0,0,0,0,0,284.89819
5,0.909001,0.916519,0.452857,0.392857,2.61475,2.034,1.84875,6,1.083,1.03,...,0,0,0,0,0,0,0,0,0,270.007913


In [4]:
from sklearn.feature_selection import VarianceThreshold

# remove features with 0 variance
vt = VarianceThreshold()
vt.fit(df)
df = df.loc[:, vt.variances_ > 1e-8 ]
df.shape

(1929, 792)

In [5]:
init_x = df.loc[:, df.columns !='EnergyAboveHull']
init_y = df.loc[:, 'EnergyAboveHull']

# remove low correlated features
low_corr = []
for col in init_x.columns:
    if abs(init_x[col].corr(init_y)) < 1e-2:
        low_corr.append(col)
        
from collections import defaultdict
corr_df = init_x.corr()
# get features that have correlation > 0.95 
def high_corr(corr):
    raw_corrs = defaultdict(float)
    for col in corr.columns:
        index = corr[col].index
        for pos in range(len(index)):
            if np.abs(corr[col][pos]) >= 0.90 and index[pos] != col:
                raw_corrs[(col, index[pos])] =  corr[col][pos]
    
    # remove duplicates by score
    result = defaultdict(float)
    for key,value in raw_corrs.items():
        if value not in result.values():
            result[key] = value
            
    return result

high_corrs = high_corr(corr_df)

# keep only one of the features that are highly correlated
# keep the one that has highest correlation with target
df.drop(low_corr, axis=1, inplace=True)

for key, val in high_corrs.items():
    try: 
        if np.abs(df[key[0]].corr(df['EnergyAboveHull'])) > np.abs(df[key[1]].corr(df['EnergyAboveHull'])):
            df = df.loc[:, df.columns != key[1]]
        else: 
            df = df.loc[:, df.columns != key[0]]
    except KeyError:
        continue
        
print(df.shape)

(1929, 397)


In [6]:
class_df = df.copy()
class_df['class'] = 0
class_df.loc[class_df['EnergyAboveHull'] > 40, 'class'] = 1
class_df.drop(['EnergyAboveHull'], axis=1, inplace=True)

In [7]:
class_df['class'].value_counts(dropna=False)

1    1362
0     567
Name: class, dtype: int64

In [8]:
X_class = class_df.loc[:, class_df.columns !='class'].copy()
y_class = class_df.loc[:, 'class'].copy()

In [9]:
from sklearn.feature_selection import f_regression, f_classif, SelectKBest, RFECV, RFE
from sklearn.model_selection import train_test_split

X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class)

f_values, p_values = f_classif(X_train_class, y_train_class)

pval_df = pd.DataFrame({'features': X_class.columns, 'p_val': p_values})
pval_df = pval_df[pval_df['p_val'] >= 0.05].copy()

class_df.drop(pval_df['features'], axis=1, inplace=True)
X_class.drop(pval_df['features'], axis=1, inplace=True)

In [10]:
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class)

In [11]:
# rfe_gb = RFECV(GradientBoostingClassifier(random_state=0), scoring='f1')
# rfe_gb.fit(X_train_class, y_train_class)
# print('Number of Features:',  rfe_gb.n_features_)

In [12]:
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, KFold, StratifiedKFold,RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from imblearn.over_sampling import RandomOverSampler, SMOTE
# rfe_et_class = RFECV(ExtraTreesClassifier(random_state=0), cv=StratifiedKFold(5, shuffle=True, random_state=0), scoring='f1')
# rfe_et_class.fit(X_train_class, y_train_class)
# print('Number of Features:',  rfe_et_class.n_features_)

In [13]:
%%time
rfe_tree = RFECV(DecisionTreeClassifier(random_state=0), cv=StratifiedKFold(5, shuffle=True, random_state=0), scoring='f1')
rfe_tree.fit(X_train_class, y_train_class)
print('Number of Features:',  rfe_tree.n_features_)

Number of Features: 217
CPU times: user 1min 20s, sys: 391 ms, total: 1min 20s
Wall time: 1min 21s


In [14]:
X_tree = X_class.iloc[:, rfe_tree.support_]

In [15]:
def append_composition_features_1(X, composition_series):
    composition_df = composition_series.to_frame('composition')
    
    a_ref_compositions = ['Ba', 'Pr', 'Y', 'La', 'Sr',]
    b_ref_compositions = ['Fe', 'V', 'Ni', 'Mn',]
    
    def _ref_composition(composition,
                         ref_compositions):
        for c in ref_compositions:
            if c in composition:
                return c
        return 'Other'
    
    composition_df['a_category'] = composition_df['composition'].apply(
        lambda x: _ref_composition(x, a_ref_compositions))
    composition_df['b_category'] = composition_df['composition'].apply(
        lambda x: _ref_composition(x, b_ref_compositions))
    
    composition_df = composition_df.assign(
        **pd.get_dummies(composition_df['a_category'], prefix='a_'))
    composition_df = composition_df.assign(
        **pd.get_dummies(composition_df['b_category'], prefix='b_'))
    
    additional_features = composition_df.drop(
        ['composition', 'a_category', 'b_category'], axis=1)
    X_new = pd.concat((X, additional_features), axis=1)
    return X_new
    
    
def append_composition_features_2(X, composition_series):
    """
    Indicator feature for whether A-site includes 'Pr', 'La', 'Y'
    Another indicator feature for when A-site includes 'Pr' or 'Y' and B-site includes 'V'
    """
    composition_df = composition_series.to_frame('composition')
    
    def _get_aside_feature(composition):
        for c in ['Pr', 'La', 'Y']:
            if c in composition:
                return 1

        return 0

    def _get_bside_feature(composition):

        if 'V' not in composition:
            return 0

        for c in ['Pr', 'Y']:
            if c in composition:
                return 1

        return 0
    
    composition_df['a_ind'] = composition_df['composition'].apply(_get_aside_feature)
    composition_df['b_ind'] = composition_df['composition'].apply(_get_bside_feature)
    
    additional_features = composition_df[['a_ind', 'b_ind']]
    X_new = pd.concat((X, additional_features), axis=1)
    return X_new

In [16]:
# additional_features = composition_df.drop(['composition', 'a_category', 'b_category'], axis=1)
# additional_features = composition_df[['a_ind', 'b_ind']]

In [17]:
import xgboost
xgb = xgboost.XGBClassifier(random_state=0, n_estimators=400, max_depth=5)

In [18]:

def oversample(sampling_strategy, X, y):
    xtrains = []
    ytrains = []
    xtests = []
    ytests = []
    f1scores=[]
    acc = []
    if sampling_strategy == 'smote':
        sampler = SMOTE()
    elif sampling_strategy == 'over':
        sampler = RandomOverSampler(random_state=828)
        
    kf = StratifiedKFold(n_splits=5)
    for train_index, test_index in kf.split(X, y):
        X_train, y_train = X[train_index], y.iloc[train_index]
        X_test, y_test = X[test_index], y.iloc[test_index]
        X_train_oversampled, y_train_oversampled = sampler.fit_sample(X_train, y_train)
        xgb.fit(X_train_oversampled, y_train_oversampled)
        y_pred = xgb.predict(X_test)
        acc.append(xgb.score(X_test, y_test))
        f1scores.append(f1_score(y_test, y_pred))
        #xtrains.append(X_train_oversampled)
        #ytrains.append(y_train_oversampled)
        #xtests.append(X_test)
        #ytests.append(y_test)
    # return np.array(xtrains), np.array(ytrains), np.array(xtests), np.array(ytests)
    return acc, f1scores

#xtrains, ytrains, xtests, ytests = oversample('over', X_class.to_numpy(), y_class)
acc, f1scores = oversample('over', X_tree.to_numpy(), y_class)
print('Acc: ' + str(np.mean(acc)))
print('f1: ' + str(np.mean(f1scores)))

Acc: 0.7481017428167689
f1: 0.8125096008538227


In [19]:
composition_series = raw['Material Composition']

X_tree_1 = append_composition_features_1(X_tree, composition_series)
X_tree_2 = append_composition_features_2(X_tree, composition_series)

X_tree_1.shape, X_tree_2.shape

((1929, 228), (1929, 219))

In [20]:
print('Composition Feature 1:')
acc, f1scores = oversample('over', X_tree_1.to_numpy(), y_class)
print('Acc: ' + str(np.mean(acc)))
print('f1: ' + str(np.mean(f1scores)))
print('\n')

print('Composition Feature 2:')
acc, f1scores = oversample('over', X_tree_2.to_numpy(), y_class)
print('Acc: ' + str(np.mean(acc)))
print('f1: ' + str(np.mean(f1scores)))
print('\n')

Composition Feature 1:
Acc: 0.7522495121458853
f1: 0.8163506451889386


Composition Feature 2:
Acc: 0.7512213175425612
f1: 0.8155965844759157




In [21]:
# oversampled train dataset for the first fold is xtrains[0]
# ytrains[0] are the corresponding labels for xtrains[0]
# firstfoldtrain = pd.DataFrame(xtrains[0])
# firstfoldtrain['class'] = ytrains[0]
# firstfoldtrain.columns = class_df.columns
# firstfoldtrain

## Reg

In [22]:
# X_reg = df.loc[:, df.columns !='EnergyAboveHull']
# y_reg = df.loc[:, 'EnergyAboveHull']

# X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg)

# # remove statistically insignificant features based on lin reg
# f_values, p_values = f_regression(X_train_reg, y_train_reg)
# pval_df = pd.DataFrame({'features': X_reg.columns, 'p_val': p_values})
# pval_df = pval_df[pval_df['p_val'] >= 0.05]

# X_reg.drop(pval_df['features'], axis=1, inplace=True)
# X_reg.shape

# # apply inverse hyperbolic sine transform to normalize data
# # logarithmic apply to test data as well because model is fitted to log data
# y_reg_log = y_reg.apply(lambda x: np.arcsinh(x))

# X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg_log)

# %%time

# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# rfe_log = RFECV(DecisionTreeRegressor(random_state=0), cv=KFold(10, shuffle=True), scoring='r2')
# rfe_log.fit(X_train_reg, y_train_reg)
# print('Number of Features:',  rfe_log.n_features_)

# X_dtr = X_reg.iloc[:, rfe_log.support_]
# X_dtr_train, X_dtr_test, y_dtr_train, y_dtr_test = train_test_split(X_dtr, y_reg_log)

In [23]:
# %%time

# from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, ExtraTreesRegressor
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# grid = RandomizedSearchCV(ExtraTreesRegressor(random_state=0), param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# grid.fit(X_dtr_train, y_dtr_train)
# grid.best_params_

In [24]:
tree = ExtraTreesRegressor(random_state=0, max_depth=20, n_estimators=1800, min_samples_split=2, min_samples_leaf=1,
                          max_features='auto', bootstrap=False)

In [26]:
# oversample for regression
import smogn

def oversample_reg(df, model):
    dftrains=[]
    dftests=[]
    r2 = []
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    for train_index, test_index in kf.split(df):
        try:
            df_train = df.iloc[train_index]
            df_test = df.iloc[test_index]
            df_smogn = smogn.smoter(data = df_train.reset_index(drop=True), y = 'EnergyAboveHull')
            X_train_smogn = df_smogn.loc[:, df_smogn.columns!='EnergyAboveHull']
            y_train_smogn = df_smogn['EnergyAboveHull']
            X_test = df_test.loc[:, df_test.columns!='EnergyAboveHull']
            y_test = df_test['EnergyAboveHull']
            model.fit(X_train_smogn, y_train_smogn)
            r2.append(model.score(X_test, y_test))
        #dftrains.append(df_smogn)
        #dftests.append(df_test)
        except:
            continue
        
    #return dftrains, dftests
    return r2

In [27]:
%%time 
r2 = oversample_reg(df, tree)
np.mean(r2)

dist_matrix: 100%|##########| 201/201 [03:03<00:00,  1.09it/s]
synth_matrix: 100%|##########| 201/201 [00:12<00:00, 16.15it/s]
r_index: 100%|##########| 168/168 [00:05<00:00, 32.42it/s]
dist_matrix: 100%|##########| 194/194 [02:50<00:00,  1.14it/s]
synth_matrix: 100%|##########| 194/194 [00:11<00:00, 16.29it/s]
r_index: 100%|##########| 190/190 [00:06<00:00, 30.23it/s]
dist_matrix: 100%|##########| 198/198 [02:59<00:00,  1.11it/s]
synth_matrix: 100%|##########| 198/198 [00:12<00:00, 16.28it/s]
r_index: 100%|##########| 177/177 [00:05<00:00, 32.58it/s]
dist_matrix: 100%|##########| 191/191 [02:51<00:00,  1.11it/s]
synth_matrix: 100%|##########| 191/191 [00:17<00:00, 10.77it/s]
r_index: 100%|##########| 7/7 [00:00<00:00, 23.12it/s]
dist_matrix: 100%|##########| 201/201 [03:01<00:00,  1.11it/s]
synth_matrix: 100%|##########| 201/201 [00:11<00:00, 17.37it/s]
r_index: 100%|##########| 168/168 [00:04<00:00, 33.86it/s]

CPU times: user 17min 44s, sys: 3.99 s, total: 17min 48s
Wall time: 17min 48s





0.6343180021942851

In [28]:
df_1 = append_composition_features_1(df, composition_series)
df.shape, df_1.shape

((1929, 397), (1929, 408))

In [30]:
%%time

# added composition features 
r2 = oversample_reg(df_1, tree)
np.mean(r2)

dist_matrix: 100%|##########| 201/201 [03:11<00:00,  1.05it/s]
synth_matrix: 100%|##########| 201/201 [00:13<00:00, 15.44it/s]
r_index: 100%|##########| 168/168 [00:05<00:00, 28.42it/s]
dist_matrix: 100%|##########| 194/194 [03:01<00:00,  1.07it/s]
synth_matrix: 100%|##########| 194/194 [00:12<00:00, 15.79it/s]
r_index: 100%|##########| 190/190 [00:06<00:00, 30.39it/s]
dist_matrix: 100%|##########| 198/198 [03:10<00:00,  1.04it/s]
synth_matrix: 100%|##########| 198/198 [00:12<00:00, 16.08it/s]
r_index: 100%|##########| 177/177 [00:05<00:00, 30.17it/s]
dist_matrix: 100%|##########| 191/191 [02:56<00:00,  1.08it/s]
synth_matrix: 100%|##########| 191/191 [00:19<00:00,  9.81it/s]
r_index: 100%|##########| 7/7 [00:00<00:00, 45.52it/s]
dist_matrix: 100%|##########| 201/201 [03:16<00:00,  1.02it/s]
synth_matrix: 100%|##########| 201/201 [00:11<00:00, 16.91it/s]
r_index: 100%|##########| 168/168 [00:05<00:00, 33.39it/s]


CPU times: user 18min 41s, sys: 4.91 s, total: 18min 46s
Wall time: 18min 48s


0.7397315296261919