In [1]:
import sys
import os
import numpy as np
import pandas as pd
import pymatgen as mg
import matplotlib.pyplot as plt
%matplotlib notebook
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.ticker import MultipleLocator
import ternary
import seaborn as sn
import re

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# add modules
module_dir = '../modules'
if module_dir not in sys.path:
    sys.path.append(module_dir)
#imports from ../modules
from pickle_tools import save_pickle, load_pickle
import model_eval as mv
import feature_select as fs
import file_load as fl
from BCA_featurizer import BCA_Featurizer, GenericFeaturizer
from bca_plotting import get_coords_from_comp, add_colorbar, plot_labeled_ternary, featurize_simplex, \
    estimator_ternary_heatmap, scatter_over_heatmap, draw_guidelines


# add alepython directory
# The location of the local ALEpython clone is machine-specific. Set the ALEPYTHON_DIR
# environment variable to point at your own clone; the original author's path is only
# used as a fallback when the variable is not set.
ale_dir = os.environ.get(
    'ALEPYTHON_DIR',
    'c:\\users\\jdhuang\\onedrive - colorado school of mines\\python\\cloned_repos\\ALEpython'
)
if ale_dir not in sys.path:
    sys.path.append(ale_dir)
from alepython import ale


%load_ext autoreload
%autoreload 2

Created MatProjCalc instance
Created MatProjCalc instance


In [2]:
# Root folders: raw data batches are read from basedatadir, figures are written to plotdir
basedatadir = '../data'
plotdir = '../images'
# second element of the first os.walk() entry = names of the immediate sub-directories
datadirs = next(os.walk(basedatadir))[1]

# global matplotlib defaults: sans-serif font family and default image colormap
plt.rcParams.update({
    'font.sans-serif': 'Helvetica',
    'image.cmap': 'inferno',
})

# Load and aggregate data

In [3]:
%matplotlib notebook
init_df = fl.load_dir(os.path.join(basedatadir,'init_files_Cadigan'))
supp_df = fl.load_dir(os.path.join(basedatadir,'new_files_20190416'))
test_df = fl.load_dir(os.path.join(basedatadir,'test_20190530'))

<IPython.core.display.Javascript object>

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# combine datasets: tag every row with its batch label and the folder its file was loaded from
init_df['batch'] = 'initial'
supp_df['batch'] = 'supplementary'
test_df['batch'] = 'test'
init_df['batchdir']= os.path.join(basedatadir,'init_files_Cadigan')
supp_df['batchdir'] = os.path.join(basedatadir,'new_files_20190416')
test_df['batchdir'] = os.path.join(basedatadir,'test_20190530')

data = pd.concat([init_df,supp_df,test_df],ignore_index=True)
data = data.rename(columns={'max_NH3_Prod_rate':'max_rate'})
# maximum of the 'tot-flow' column of the file that fl.load_data returns for each row
data['max_flow'] = data.apply(lambda x: fl.load_data(os.path.join(x['batchdir'],x['filename']))['tot-flow'].max(),axis=1)

# axes for max rate ternaries
fig1, axes1 = plt.subplots(1,3,figsize=(9.5,2.5))
# axes for max ppm ternaries
fig2, axes2 = plt.subplots(1,3,figsize=(9.5,2.5))

# normalize to B2CA control
b2ca_norm_rate = 190
b2ca_norm_ppm = 5293
b2ca_formula = 'Ba2 Ca1 Al2 O6'
# float placeholder (NaN rather than int 0) so the float results assigned below do not
# force a dtype change of the column
data['norm_max_rate'] = np.nan
# groupby yields one frame per batch; zip pairs each batch with one rate axis and one ppm axis
for (batch, bdf),ax1,ax2 in zip(data.groupby('batch'),axes1,axes2):
    # normalize each batch based on the B2CA control
    is_control = bdf['formula']==b2ca_formula
    if not is_control.any():
        # without a control the factors would silently be NaN and wipe out the whole batch
        raise ValueError(f"No '{b2ca_formula}' control found in batch '{batch}'; cannot normalize")
    rate_factor = b2ca_norm_rate/bdf.loc[is_control,'max_rate'].max()
    ppm_factor = b2ca_norm_ppm/bdf.loc[is_control,'max_NH3_ppm'].max()
    if batch=='test':
        # very low B2CA control rate in test batch.  Believe this is due both to reactor setup and B2CA batch
        # apply square root to the factor to account for this (we shouldn't normalize for the bad B2CA batch, only for the reactor setup)
        applied_rate_factor = rate_factor**0.5
    else:
        applied_rate_factor = rate_factor
    # assign() returns a new frame, so the groupby slice is never written to
    # (avoids SettingWithCopyWarning)
    bdf = bdf.assign(norm_max_rate=bdf['max_rate']*applied_rate_factor,
                     norm_max_ppm=bdf['max_NH3_ppm']*ppm_factor)
    data.loc[bdf.index,'norm_max_rate'] = bdf['norm_max_rate']
    data.loc[bdf.index,'norm_max_ppm'] = bdf['norm_max_ppm']
    # note: the stored rate_factor is the full factor, also for the test batch
    data.loc[bdf.index,'rate_factor'] = rate_factor
    data.loc[bdf.index,'ppm_factor'] = ppm_factor

    plot_labeled_ternary(bdf['composition'],bdf['norm_max_rate'],ax=ax1,point_labelsize=6,corner_labelsize=9,s=10)
    ax1.set_title(batch,x=0.1,y=0.7,bbox=dict(facecolor='wheat',boxstyle='round'),size=10)

    plot_labeled_ternary(bdf['composition'],bdf['norm_max_ppm']/100,ax=ax2,point_labelsize=6,corner_labelsize=9,s=10)
    ax2.set_title(batch,x=0.1,y=0.7,bbox=dict(facecolor='wheat',boxstyle='round'),size=10)

fig1.suptitle('Max Production Rates (mmol/g*h)')
# trailing semicolon suppresses the Text(...) repr in the cell output
fig2.suptitle('Max Conversion Rates (ppm*100)');

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Text(0.5, 0.98, 'Max Conversion Rates (ppm*100)')

# Clean initial/supplementary data to produce training set

In [5]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(8,3.5))

# add missing rates based on Adam's ternary plot
add_BCA_rates = {'BCA413':138,'BCA100':8,'BCA210':15}
# Build all manual rows first and concatenate once: DataFrame.append is deprecated
# (removed in pandas 2.0) and growing a frame row by row is quadratic.
manual_rows = []
for bs, rate in add_BCA_rates.items():
    comp = mg.Composition(fl.BCA_formula_from_str(bs))
    manual_rows.append({'filename':'none', 'composition':comp, 'formula':comp.formula,
                        'BCA_str':fl.BCA_str_from_comp(comp), 'max_rate':rate, 'max_NH3_ppm':np.nan,
                        'batch':'manual', 'batchdir':'none',
                        # manual points are taken as already normalized (factors of 1)
                        'norm_max_rate':rate, 'norm_max_ppm':np.nan, 'rate_factor':1, 'ppm_factor':1})
# full_data = intermediate dataset (loaded data + manually added points); `data` is not modified
full_data = pd.concat([data, pd.DataFrame(manual_rows)], ignore_index=True)

# training/validation data
tv_data = full_data[full_data['batch'].isin(['initial','supplementary','manual'])]
# remove C5A3 - overlaps C12A7 and has anomalously low value
# remove B5C3A2? - not in Adam's ternary
# .copy() so that the column added below is written to an independent frame, not a slice
tv_data = tv_data[~tv_data['BCA_str'].isin(['B0C5A3'])].copy()#, 'B5C3A2'])]

# select max rate for each composition
tv_data['composition_max'] = tv_data.groupby('formula')['norm_max_rate'].transform('max')
tv_data = tv_data[tv_data['norm_max_rate']==tv_data['composition_max']]
# commented out line below - note that this leaves 2 B2CA data points in the training set! Did this to give B2CA more weight
# in case of multiple entries matching max, only keep one (only applies to B2CA at the moment)
# tv_data = tv_data[tv_data['filename']==tv_data.groupby('formula')['filename'].transform(min)]

# plot init/supp data as loaded
is_data = data[data['batch'].isin(['initial','supplementary'])]
plot_labeled_ternary(is_data['composition'],is_data['norm_max_rate'],ax=ax1,s=20,point_labeloffset=[0,0.015,0],
                     point_labelsize=7,corner_labelsize=9,add_labeloffset=0.02
                    )
# plot cleaned/added data
plot_labeled_ternary(tv_data['composition'],tv_data['norm_max_rate'],ax=ax2,s=20,point_labeloffset=[0,0.015,0],
                     point_labelsize=7,corner_labelsize=9,add_labeloffset=0.02
                    )

# dataset with fully processed training, validation, and test data
# (test batch rows excluding the B2C1A1 control)
all_data = pd.concat([tv_data,data[(data['batch']=='test') & (data['BCA_str']!='B2C1A1')]],sort=True)

#plot all data: normalized rate (left) vs. raw rate (right)
# NOTE(review): 'fgi2' looks like a typo for 'fig2'; the name is kept so that the
# 'fig2' created in the normalization cell is not overwritten.
fgi2, axes = plt.subplots(1,2,figsize=(8,3.5))
plot_labeled_ternary(all_data['composition'],all_data['norm_max_rate'],ax=axes[0],s=20,point_labeloffset=[0,0.015,0],
                     point_labelsize=7,corner_labelsize=9,add_labeloffset=0.02
                    )
# trailing semicolon suppresses the TernaryAxesSubplot repr in the cell output
plot_labeled_ternary(all_data['composition'],all_data['max_rate'],ax=axes[1],s=20,point_labeloffset=[0,0.015,0],
                     point_labelsize=7,corner_labelsize=9,add_labeloffset=0.02
                    );


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TernaryAxesSubplot: -9223371884288038980

# Featurize compositions

In [6]:
bf = BCA_Featurizer()
bf.set_n_jobs(1)

print('Number of available features:', len(bf.feature_labels()))

tv_features = bf.featurize_dataframe(tv_data,col_id="composition",inplace=False)

# --- Set up train and validation datasets ---
# Feature columns are everything from 'MO_ratio' onwards; targets are raw and normalized max rate.
# val_idx = [16, 12, 4, 22]
# split init set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(tv_features.loc[:,'MO_ratio':],tv_features[['max_rate','norm_max_rate']],
                                                  test_size=0.15,random_state=19)

# X_train = tv_features[~tv_features.index.isin(val_idx)].loc[:,'MO_ratio':]
# y_train = tv_features[~tv_features.index.isin(val_idx)].loc[:,['max_rate','norm_max_rate']]
# X_val = tv_features.loc[val_idx,'MO_ratio':]
# y_val = tv_features.loc[val_idx,['max_rate','norm_max_rate']]

# check all features are numeric
print("Feature datatypes:", X_train.dtypes.unique())
# remove zero-var features (judged on the training rows only)
zero_var = list(X_train.columns[X_train.var()==0])
X_train = X_train.drop(zero_var,axis=1)
print("Zero variance features removed:", zero_var)
# remove null features (any column with at least one null in the training rows)
nulls = list(X_train.columns[pd.isnull(X_train).max()])
X_train = X_train.drop(nulls,axis=1)
# X_val_gen keeps its original definition (only the null features removed)
X_val_gen = X_val.drop(nulls,axis=1)
# Give the validation features exactly the same columns as X_train / X_test. Previously the
# removed columns survived in X_val and re-appeared, NaN-padded, in Xy_tv and Xy_all.
X_val = X_val.drop(zero_var+nulls,axis=1)
print("Null features removed:", nulls)
print('\nFinal number of features:', len(X_train.columns))
Xy_train = pd.concat([X_train,y_train],axis=1)
Xy_val = pd.concat([X_val,y_val],axis=1)
Xy_tv = pd.concat([Xy_train,Xy_val],sort=True)

# --- Set up test dataset ---
# test batch rows excluding the B2C1A1 control
test_data = data[(data['batch']=='test') & (data['BCA_str']!='B2C1A1')]
test_features = bf.featurize_dataframe(test_data,col_id="composition",inplace=False)
# remove features that had zero variance (or nulls) in training data
X_test = test_features.loc[:,'MO_ratio':]
X_test = X_test.drop(zero_var+nulls,axis=1)
y_test = test_features[['max_rate','norm_max_rate']]
Xy_test = pd.concat([X_test,y_test],axis=1)

# --- Full dataset ---
Xy_all = pd.concat([Xy_tv,Xy_test],sort=True)

Number of available features: 104


HBox(children=(IntProgress(value=0, description='BCA_Featurizer', max=22, style=ProgressStyle(description_widt…


Feature datatypes: [dtype('float64')]
Zero variance features removed: ['M_ValenceElec_s_mean', 'M_ValenceElec_d_mean', 'M_ValenceElec_f_mean', 'M_ValenceElec_d_frac', 'M_ValenceElec_f_frac']
Null features removed: ['BCA_bulk_mod_mean', 'BCA_bulk_mod_std', 'BCA_shear_mod_mean', 'BCA_shear_mod_std']

Final number of features: 95


HBox(children=(IntProgress(value=0, description='BCA_Featurizer', max=4, style=ProgressStyle(description_width…




### <span style="color:Red"> Figure S2: Train, test, and validation data</span>

In [7]:
# Figure S2: one ternary scatter per data subset (train / validation / test),
# all sharing a 0-200 color scale.
fig, axes = plt.subplots(1,3,figsize=(7.25,2))

dfs = [tv_features.loc[X_train.index,:], tv_features.loc[X_val.index,:], test_features]
for ax,df,title in zip(axes,dfs,['Train','Validation','Test']):

    tax = plot_labeled_ternary(df['composition'],df['norm_max_rate'],
                               s=20,point_labeloffset=[-0.01,0.015,0],add_labeloffset=0.06,
                               label_points=False,corner_labelsize=8.5,vlim=(0,200),ax=ax,
                              )
    tax.ticks(multiple=0.25,tick_formats='',offset=0.03,lw=0.7)
    tax.ticks(multiple=0.25,tick_formats='',offset=0.03,lw=0.7,clockwise=True)
    # plot b2ca with outline (hard to see with light color fill)
    b2ca = df[df['BCA_str']=='B2C1A1']
    if len(b2ca) > 0:
        tax.scatter([get_coords_from_comp(c) for c in b2ca['composition']],c=b2ca['norm_max_rate'],vmin=0,vmax=200,
                    edgecolor='k',lw=0.5,s=25
                   )

    draw_guidelines(tax,color='k',alpha=0.4,lw=0.6,zorder=0)
    ax.set_title(title,x=-0.05,y=0.83,size=9,ha='left',va='center',bbox={'boxstyle':'round','facecolor':'white'})


# raw string: '\c' is not a valid Python escape sequence (DeprecationWarning/SyntaxWarning);
# the text handed to matplotlib is unchanged
add_colorbar(fig,label=r'Rate (mmol/g$\cdot$h)',vlim=(0,200),label_kwargs={'size':9},tick_params={'labelsize':8},
            subplots_adjust={'left':0.05,'right':0.85,'wspace':0.35},cbrect=[0.9,0.18,0.015,0.7])

# fig.savefig(os.path.join(plotdir,'FigS2_TrainValTestData.jpg'),dpi=500)

<IPython.core.display.Javascript object>

# Pairplots

In [27]:
# Pairwise scatter/histogram grid: response (normalized max rate) against the two
# physical-model features, using the train+validation rows.
ppvars = [
    'norm_max_rate',
    'ON_BondEnergyDelta_mean',
    'M_WorkFunction_mean',
]
sn.pairplot(data=Xy_tv, vars=ppvars)

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x1ba8e15e608>

# 1. Physical model

In [8]:
# Response column and the two descriptors used by the "physical" model
target = 'norm_max_rate'
model_features1 = [
    'ON_BondEnergyDelta_mean',
    'M_WorkFunction_mean',
]

# Pipeline: robust feature scaling followed by a random forest with a fixed seed
rs = RobustScaler()
rf1 = RandomForestRegressor(random_state=11)
rfp1 = Pipeline(steps=[('scale', rs), ('rf', rf1)])

## 1.1 Hyperparameter optimization via grid search

In [9]:
# Arrays for the grid search: model features, target, and log(target) as sample weights
X_gs = X_train[model_features1].values
y_gs = y_train[target].values
w_gs = np.log(y_gs)

# due to small sample size, CV is quite sensitive to train-validation splits
# use repeated CV with multiple different splits to get relatively unbiased results
rf1_param_grid = {
    'rf__n_estimators': [10,20,30,40,50],
    # every integer from 1 up to the number of model features
    'rf__max_features': np.arange(1,len(model_features1)+1,1).astype(int),
    'rf__max_depth': [3,5,10,25],
}
gs_rf1 = mv.GridSearchRepeatedCV(rfp1, param_grid=rf1_param_grid)
gs_rf1.fit(X_gs, y_gs, repeat=4, n_splits=4, random_state=17, sample_weight=w_gs)

In [10]:
# Show grid-search results with max_features pinned to its best value
best_max_features = gs_rf1.best_params_['rf__max_features']
gs_rf1.plot_grid_results(fixed_params={'rf__max_features': best_max_features})

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x2385b9b7608>

In [11]:
# plot grid search results
%matplotlib notebook
fig = plt.figure(figsize=(8,3))
ax1 = fig.add_subplot(121,projection='3d')
ax2 = fig.add_subplot(122)
#facecolors='none' seems to be breaking plt.scatter() in 3d n mpl 3.1.1
# gs_rf1.plot_grid_results(ax=ax1,colorbar=False)
gs_rf1.plot_grid_results(fixed_params={'rf__max_features':gs_rf1.best_params_['rf__max_features']},ax=ax2)
fig.tight_layout()
fig.subplots_adjust(wspace=0.4)

<IPython.core.display.Javascript object>

## 1.2 Cross validation with optimized hyperparameters

### <span style="color:Red"> Figure S3: Model validation PVA</span>

In [12]:
# Figure S3: (a) repeated 4-fold cross-validation on the training set and
# (b) predicted-vs-actual on the held-out validation set, using the best grid-search parameters.
rfp1.set_params(**gs_rf1.best_params_)

label_fs = 9
tick_fs = 8
fig, axes = plt.subplots(1,2,figsize=(7.25,3))#,sharex=True,sharey=True)

# sample weights = log of the target
w = np.log(y_train[target].values)

mv.repeated_KFold_pva(rfp1,X_train.loc[:,model_features1].values,y=y_train[target].values,repeat=4,
                      sample_weight=w,n_splits=4,plot_type='mean',ax=axes[0],random_state=17,show_metrics=['r2'],
                      marker='o',facecolors='white',edgecolors='k',s=25,zorder=1,line_kw={'zorder':0},
                      text_kw={'size':label_fs,'y':0.85}
                     )

# get actual and predicted y to calculate unweighted MAE
actuals, preds, test_r2, test_mae = mv.repeated_KFold_cv(rfp1,X_train.loc[:,model_features1].values,y=y_train[target].values,
                                                         repeat=4,sample_weight=w,n_splits=4,random_state=17)
custom_cv_score = np.mean(test_r2)

axes[0].text(0.05,0.76,'MAE: {}'.format(np.round(mean_absolute_error(actuals,preds),1)),
             transform=axes[0].transAxes,size=label_fs,va='top'
            )


# plot validation (holdout) pva
rfp1.fit(X_train.loc[:,model_features1].values,y_train[target].values,**{'rf__sample_weight':w})
mv.plot_pva(rfp1,X_val.loc[:,model_features1].values,y=y_val[target].values,ax=axes[1],
            sample_weight=np.log(y_val[target]),show_metrics=['r2'],legend=False,
            marker='o',facecolors='white',edgecolors='k',s=25,zorder=1,line_kw={'zorder':0},
            text_kw={'size':label_fs,'y':0.85}
           )
# get unweighted mae
# predict on a plain array, matching how the pipeline was fitted (avoids sklearn's
# "X has feature names" warning when a DataFrame is passed to an array-fitted estimator)
val_mae = mean_absolute_error(y_val[target],rfp1.predict(X_val.loc[:,model_features1].values))
axes[1].text(0.05,0.76,'MAE: {}'.format(np.round(val_mae,1)),
             transform=axes[1].transAxes,size=label_fs,va='top'
            )

# NOTE(review): this score is unweighted, whereas the plotted metric above is given
# sample weights - confirm which one is intended.
custom_val_score = rfp1.score(X_val.loc[:,model_features1].values,y=y_val[target].values)

for ax in axes:
    ax.tick_params(axis='both',labelsize=tick_fs)
    # raw strings: '\c' is not a valid Python escape sequence
    ax.set_xlabel(r'Actual rate (mmol/g$\cdot$h)',size=label_fs)
    ax.set_ylabel(r'Predicted rate (mmol/g$\cdot$h)',size=label_fs)

axes[0].legend(loc='lower right',fontsize=label_fs)

# axes[0].set_title('4-Fold Cross Validation',size=label_fs+1)
# axes[1].set_title('Holdout Validation',size=label_fs+1)
for ax, let in zip(axes,['a','b']):
    ax.text(0.05,0.95,f'({let})',weight='bold',size=label_fs+1,transform=ax.transAxes,va='top')


fig.tight_layout()
# fig.savefig(os.path.join(plotdir,'FigS3_ModelValidationPVA.jpg'),dpi=500)

<IPython.core.display.Javascript object>

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


## 1.3 Generate prediction heatmap 

In [15]:
# Fit the pipeline on train+validation data (log-target sample weights) and show
# predicted-vs-actual for those same rows.
fig, ax = plt.subplots()

X_tv_model = Xy_tv.loc[:,model_features1].values
y_tv_model = Xy_tv[target].values
w = np.log(Xy_tv[target].values)

rfp1.fit(X_tv_model, y_tv_model, rf__sample_weight=w)
mv.plot_pva(rfp1, X_tv_model, y_tv_model, ax=ax)
ax.set(xlim=(0,200), ylim=(0,200))

fig.tight_layout()
fig.subplots_adjust(top=0.88)
fig.suptitle('Training for Heatmap', size=14)

<IPython.core.display.Javascript object>

Text(0.5, 0.98, 'Training for Heatmap')

In [14]:
# Featurize the ternary simplex at the given scale with the featurizer `bf`,
# then keep only the model's feature columns for the heatmap.
scale = 50
coords, X_simplex = featurize_simplex(scale, bf)
X_hm1 = X_simplex[model_features1]

HBox(children=(IntProgress(value=0, description='BCA_Featurizer', max=1326, style=ProgressStyle(description_wi…




In [16]:
# Ternary heatmap of the fitted pipeline's predictions over the precomputed simplex features
fig, ax = plt.subplots(figsize=(3.54,3.54*7/9))
vlim = (0,200)
tax = estimator_ternary_heatmap(50,rfp1,use_X=X_hm1,ax=ax,add_labeloffset=0.02,vlim=vlim,cmap=plt.cm.inferno,
                               labelsize=10)

# raw string for the label: '\c' is not a valid Python escape sequence
add_colorbar(ax=ax,vlim=vlim,cbrect=[0.8,0.2,0.04,0.68],
             subplots_adjust={'left':0.07,'wspace':0.35, 'hspace':0.25, 'right':0.7},
             label=r'Predicted Rate (mmol/g$\cdot$h)',label_kwargs={'size':10},tick_params={'labelsize':10}
            )

<IPython.core.display.Javascript object>

## 1.4 Test set

In [17]:
# ensure model is fit to Xy_tv
w = np.log(Xy_tv[target].values)
rfp1.fit(Xy_tv.loc[:,model_features1].values,Xy_tv[target].values,**{'rf__sample_weight':w})

fig, axes = plt.subplots(1,2,figsize=(7,3))

# left panel: all test compositions
mv.plot_pva(rfp1,X_test.loc[:,model_features1].values,y_test[target],
            sample_weight=np.log(y_test[target]),ax=axes[0]
           )
custom_test_score = rfp1.score(X_test.loc[:,model_features1].values,y_test[target],sample_weight=np.log(y_test[target]))
# unweighted MAE
# predict on plain arrays, matching how the pipeline was fitted (avoids sklearn's
# feature-name mismatch warning when a DataFrame is passed)
mae = mean_absolute_error(y_test[target],rfp1.predict(X_test.loc[:,model_features1].values))
print('Unweighted mae:',mae)

# plot only the good points and get the MAE
# positional rows of the test set that are kept (selected with iloc)
good_index = [1,2,3]
X_good = X_test.iloc[good_index,:][model_features1].values
y_good = y_test.iloc[good_index,:][target]
mv.plot_pva(rfp1,X_good,y_good,
            sample_weight=np.log(y_good),ax=axes[1]
           )
mae = mean_absolute_error(y_good,rfp1.predict(X_good))
print('Unweighted mae:',mae)

fig.tight_layout()

<IPython.core.display.Javascript object>

Unweighted mae: 20.33447471484255
Unweighted mae: 9.472734376990921


### <span style="color:Red"> Figure S4: test compositions predicted vs. actual</span>

In [18]:
# Figure S4: (a) predicted vs. actual for the test compositions, (b) test points drawn over
# the prediction heatmap of the model fitted to train+validation data.
fig, axes = plt.subplots(1,2,figsize=(7.25,2.9))
label_fs = 9
tick_fs = 8

# ensure model is fit to Xy_tv
w = np.log(Xy_tv[target].values)
rfp1.fit(Xy_tv.loc[:,model_features1].values,Xy_tv[target].values,**{'rf__sample_weight':w})

# PVA
mv.plot_pva(rfp1,X_test.loc[:,model_features1].values,y_test[target].values,ax=axes[0],
            sample_weight=np.log(y_test[target]),show_metrics=['r2'],legend=True,
            marker='o',facecolors='white',edgecolors='k',s=25,zorder=1,line_kw={'zorder':0},
            text_kw={'size':label_fs,'y':0.85}
           )
# get unweighted mae
# predict on a plain array, matching how the pipeline was fitted
mae = mean_absolute_error(y_test[target],rfp1.predict(X_test.loc[:,model_features1].values))
axes[0].text(0.05,0.76,'MAE: {}'.format(np.round(mae,1)),
             transform=axes[0].transAxes,size=label_fs,va='top'
            )
axes[0].set_ylim(0,100)
axes[0].legend(fontsize=label_fs,loc='lower right')
# raw strings: '\c' is not a valid Python escape sequence
axes[0].set_xlabel(r'Actual Rate (mmol/g$\cdot$h)',size=label_fs)
axes[0].set_ylabel(r'Predicted Rate (mmol/g$\cdot$h)',size=label_fs)
axes[0].tick_params(labelsize=tick_fs)

axes[0].text(0.05,0.95,'(a)',va='top',transform=axes[0].transAxes,weight='bold',size=label_fs+1)
# hand-placed label; coordinates are in data units of the PVA axes
axes[0].annotate('B$_{21}$C$_{16}$A$_3$',(10,59),xytext=(16,53),size=label_fs,arrowprops={'arrowstyle':'-','lw':0.7})

# heatmap
vlim = (0,200)
# tax = plot_labeled_ternary(test_data['composition'],test_data['norm_max_rate'],ax=axes[1])
tax = scatter_over_heatmap(test_data['composition'],test_data['norm_max_rate'],50,rfp1,
                           hmap_use_X=X_hm1,point_labeloffset=[0,0.03,0],
                           scatter_labels=test_data['BCA_str'].map(lambda x:fl.pretty_string_from_comp(mg.Composition(fl.BCA_formula_from_str(x)))),
                           ax=axes[1],add_labeloffset=0.03,vlim=vlim,cmap=plt.cm.inferno,
                           corner_labelsize=label_fs,markersize=6,scatter_labelsize=tick_fs
                          )

tax.ticks(multiple=50/4,tick_formats='',offset=0.03,lw=0.7)
tax.ticks(multiple=50/4,tick_formats='',offset=0.03,lw=0.7,clockwise=True)

axes[1].text(0.05,0.95,'(b)',va='top',transform=axes[1].transAxes,weight='bold',size=label_fs+1)

add_colorbar(ax=axes[1],vlim=vlim,cbrect=[0.9,0.26,0.02,0.6],
             subplots_adjust={'left':0.1,'wspace':0.35, 'hspace':0.25, 'right':0.82,'bottom':0.2},
             label=r'Rate (mmol/g$\cdot$h)',label_kwargs={'size':label_fs},tick_params={'labelsize':tick_fs}
            )

# fig.savefig(os.path.join(plotdir,'FigS4_TestPVA.jpg'),dpi=500)

<IPython.core.display.Javascript object>

## 1.5 Re-fit to full dataset & make heatmap

In [19]:
# Re-fit the pipeline on the full dataset (train + validation + test), weighting by log(target).
# The predicted-vs-actual plot below is drawn for the train+validation rows (Xy_tv).
fig, ax = plt.subplots()

X_all_model = Xy_all.loc[:,model_features1].values
y_all_model = Xy_all[target].values
w = np.log(Xy_all[target].values)
rfp1.fit(X_all_model, y_all_model, rf__sample_weight=w)

mv.plot_pva(rfp1, Xy_tv.loc[:,model_features1].values, Xy_tv[target].values, ax=ax)
ax.set(xlim=(0,200), ylim=(0,200))

fig.tight_layout()
fig.subplots_adjust(top=0.88)
fig.suptitle('Training for Heatmap', size=14)

<IPython.core.display.Javascript object>

Text(0.5, 0.98, 'Training for Heatmap')

### <span style="color:Red"> Figure 1: Megafigure</span>

In [22]:
# Figure 1: four-panel figure
#   (a) ternary scatter of all measured data       (b) ternary heatmap of model predictions
#   (c) phase diagram drawn from BCA_PED_coords.csv (d) rate vs. bond-energy descriptor, with inset
label_fs = 9
tick_fs = 8
vlim = (0,200)

fig = plt.figure(figsize=(7.25,5.5))
gs = fig.add_gridspec(2,23)
ax1 = fig.add_subplot(gs[0,1:10])
ax2 = fig.add_subplot(gs[0,12:21])
# cax = fig.add_subplot(gs[0,21])
ax3 = fig.add_subplot(gs[1,1:10])
ax4 = fig.add_subplot(gs[1,12:])

fig.subplots_adjust(hspace=0.15,wspace=10,left = 0.05,top=0.92)

# ax1: Ternary scatter of data
#------------------------------
tax1 = plot_labeled_ternary(all_data['composition'],all_data['norm_max_rate'],add_labeloffset=0.05,
                            label_points=False,corner_labelsize=label_fs,edgecolor='k',ax=ax1,linewidth=0.,
                            cmap=plt.cm.plasma,marker='o',s=35#10*np.log(all_data['norm_max_rate']),vlim=vlim
                           )

# ax1.annotate('B2CA',(0.39,0.235),xytext=(0.37,0.39),transform=ax1.transAxes,arrowprops={'arrowstyle':'-'},
#              ha='left',va='top',size=label_fs,bbox={'facecolor':'white'})

# one extra, larger outlined marker at ternary coordinates (0.25,0.25,0.5) with color value 190
tax1.scatter([[0.25,0.25,0.5]],s=45,c=[190],edgecolor='k')
tax1.ticks(multiple=0.25,tick_formats='',offset=0.03,lw=0.7)
tax1.ticks(multiple=0.25,tick_formats='',offset=0.03,lw=0.7,clockwise=True)

draw_guidelines(tax1,color='k',alpha=0.5,lw=0.6,zorder=0)


# ax2: ternary heatmap of model predictions
#------------------------------------------
# placeholder for faster plotting
# tax2 = plot_labeled_ternary(all_data['composition'],all_data['norm_max_rate'],s=50,
#                      add_labeloffset=0.05,label_points=False,corner_labelsize=9,edgecolor='k',ax=ax2)


# ensure model is fit to Xy_all
w = np.log(Xy_all[target].values)
rfp1.fit(Xy_all.loc[:,model_features1].values,Xy_all[target].values,**{'rf__sample_weight':w})

tax2 = estimator_ternary_heatmap(50,rfp1,use_X=X_hm1,ax=ax2,add_labeloffset=0.05,vlim=vlim,cmap=plt.cm.inferno,
                               labelsize=label_fs)

draw_guidelines(tax2,color='white',lw=0.7,alpha=0.7)

tax2.ticks(multiple=tax2.get_scale()/4,tick_formats='',offset=0.03,lw=0.7)
tax2.ticks(multiple=tax2.get_scale()/4,tick_formats='',offset=0.03,lw=0.7,clockwise=True)

# raw strings are used for every label containing LaTeX: '\c', '\D', '\o' and '\m' are not
# valid Python escape sequences (DeprecationWarning/SyntaxWarning); the rendered text is unchanged
add_colorbar(fig,cbrect=[0.88,0.58,0.015,0.33],vlim=(0,200),subplots_adjust={},
             label=r'Production Rate (mmol/g$\cdot$h)',label_kwargs={'size':label_fs},tick_params={'labelsize':tick_fs})


# ax3: phase diagram
#---------------------------
tax3 = ternary.TernaryAxesSubplot(ax=ax3)
tern_labels = ['CaO','Al$_2$O$_3$','BaO']
add_labeloffset=0.05
tax3.right_corner_label(tern_labels[0],fontsize=label_fs,va='center',offset=0.08+add_labeloffset)
tax3.top_corner_label(tern_labels[1],fontsize=label_fs,va='center',offset=0.05+add_labeloffset)
tax3.left_corner_label(tern_labels[2],fontsize=label_fs,va='center',offset=0.08+add_labeloffset)
tax3.boundary(linewidth=1)
ax3.axis('off')
tax3._redraw_labels()

# load phase diagram data and plot
ped_data = pd.read_csv(os.path.join(basedatadir,'BCA_PED_coords.csv'),skipfooter=2,engine='python')
# convert the 'start'/'end' composition strings to compositions, then to ternary coordinates
for col in ['start','end']:
    ped_data[f'{col}_comp'] = ped_data[f'{col}'].map(lambda x: mg.Composition(fl.BCA_formula_from_str(x)))
    ped_data[f'{col}_coords'] = ped_data[f'{col}_comp'].map(get_coords_from_comp)
# line width by segment type; any other type falls back to the thin width instead of
# raising NameError (first row) or silently reusing the previous row's width
ped_linewidths = {'boundary':0.5,'tie':0.3}
for i, row in ped_data.iterrows():
    lw = ped_linewidths.get(row['type'],0.3)
    tax3.line(row['start_coords'],row['end_coords'],lw=lw,c='k')

# tax3.annotate('BA',(-0.03,0.5,0.5),ha='right',va='center',size=label_fs-0.5)
# tax3.annotate('B$_3$A',(-0.03,0.25,0.75),ha='right',va='center',size=label_fs-0.5)
# tax3.annotate('CaO + B$_3$A\n+ BA',(0.27,0.28,0.45),ha='center',va='center',size=label_fs-4)

# hand-placed labels along the left edge of the ternary
tax3.annotate('BA$_6$',(-0.015,0.857,0.143),ha='right',va='center',size=label_fs-0.5)
tax3.annotate('BA',(-0.03,0.5,0.5),ha='right',va='center',size=label_fs-0.5)
tax3.annotate('B$_3$A',(-0.03,0.25,0.75),ha='right',va='center',size=label_fs-0.5)
# tax3.annotate('B$_4$A',(-0.015,0.2,0.8),ha='right',va='center',size=label_fs-0.5)
tax3.annotate('B$_7$A',(-0.015,0.125,0.875),ha='right',va='center',size=label_fs-0.5)

# hand-placed labels along the right edge of the ternary
# tax3.annotate('CaO + B$_3$A\n+ BA',(0.27,0.28,0.45),ha='center',va='center',size=label_fs-0.5)
tax3.annotate('C$_3$A',(0.77,0.26,0),ha='left',va='center',size=label_fs-0.5)
tax3.annotate('C$_{12}$A$_7$',(0.63,0.385,0),ha='left',va='center',size=label_fs-0.5)
tax3.annotate('CA',(0.52,0.51,0),ha='left',va='center',size=label_fs-0.5)
tax3.annotate('CA$_2$',(0.34,0.67,0),ha='left',va='center',size=label_fs-0.5)
tax3.annotate('CA$_6$',(0.15,0.857,0),ha='left',va='center',size=label_fs-0.5)

tax3.ticks(multiple=0.25,tick_formats='',offset=0.03,lw=0.7)
tax3.ticks(multiple=0.25,tick_formats='',offset=0.03,lw=0.7,clockwise=True)


# ax4: volcano plot
#---------------------------
# error bars are drawn as 15% of the rate plus 5
ax4.errorbar(Xy_all['ON_BondEnergyDelta_mean'],Xy_all['norm_max_rate'],yerr=Xy_all['norm_max_rate']*0.15+5,
            marker='.',ms=8,ls='',capsize=0,c='k', elinewidth=1,markerfacecolor='white'
           )
ax4.set_xlim(385,430)
ax4.set_ylim(0,250)
ax4.set_xlabel(r'$\Delta \overline{\mathrm{BE}}_{\mathrm{MO-MN}}$ (kJ/mol)',size=label_fs)
ax4.set_ylabel(r'Production Rate (mmol/g$\cdot$h)',size=label_fs)

ax4.annotate('Ru/B$_2$CA',(399,189),xytext=(397,189),size=label_fs-1,arrowprops={'arrowstyle':'-'},va='center',ha='right')
ax4.annotate('Ru/CaO',(422,54),xytext=(420,52),size=label_fs-1,ha='right',arrowprops={'arrowstyle':'-'},va='center')
ax4.annotate('Ru/C$_3$A',(421,67),xytext=(419,72),size=label_fs-1,ha='right',arrowprops={'arrowstyle':'-'},va='center')
ax4.xaxis.set_minor_locator(MultipleLocator(2.5))
ax4.yaxis.set_minor_locator(MultipleLocator(10))
ax4.tick_params(axis='both',which='both',direction='in',labelsize=tick_fs)

# add a rough fit to highlight the shape
# exclude C3A, CaO, BaO
fit_data = Xy_all[(380<Xy_all['ON_BondEnergyDelta_mean']) & (Xy_all['ON_BondEnergyDelta_mean']<421.7)]
# exclude deviating points (hard-coded index labels of Xy_all)
ignore = [8,22,25]
fit_data = fit_data[~fit_data.index.isin(ignore)]
fit = np.polyfit(fit_data['ON_BondEnergyDelta_mean'],fit_data['norm_max_rate'],deg=4)
x_fit = np.arange(388,415,1)
y_fit = np.polyval(fit,x_fit)
# shaded band: fitted curve -15%-10 to +15%+10
ax4.fill_between(x_fit,y_fit*0.85-10,y_fit*1.15+10,alpha=0.25,edgecolor='none')

# ax4 inset: pure metal volcano plot
#-------------------------------------
ax4i = fig.add_axes([0.735,0.32,0.16,0.16])
volcano_data = pd.read_csv(os.path.join(basedatadir,'MetalVolcanoData.csv'),skipfooter=2,engine='python')

# rows whose 'Element' is 'Curve' hold the curve; all other rows are individual points
elmt_data = volcano_data[volcano_data['Element']!='Curve']
curve_data = volcano_data[volcano_data['Element']=='Curve']
# plot curve
ax4i.plot(curve_data['E'],curve_data['TOF'],c='k',zorder=0,lw=0.5)
# plot pure metals
ax4i.scatter(elmt_data[elmt_data['Element']!='CoMo']['E'],elmt_data[elmt_data['Element']!='CoMo']['TOF'],c='k',s=4)
# plot CoMo
# ax4i.scatter(elmt_data[elmt_data['Element']=='CoMo']['E'],elmt_data[elmt_data['Element']=='CoMo']['TOF'],edgecolors='k',
#            facecolors='white',zorder=1,s=5
#           )
# element labels: (element, x offset, horizontal alignment)
# .iloc[0] extracts the matching row so that scalar coordinates, not one-element Series,
# are handed to matplotlib
for el, dx, ha in [('Fe',5,'left'),('Mo',5,'left'),('Co',-5,'right'),('Ni',-5,'right')]:
    el_pt = elmt_data[elmt_data['Element']==el].iloc[0]
    ax4i.text(el_pt['E']+dx,el_pt['TOF'],el,size=7,ha=ha,va='top')
ax4i.text(10,20,'Ru',size=7,ha='left',va='top')
ax4i.text(15,3,'Os',size=7,ha='left',va='top')

ax4i.set_yscale('log')
ax4i.tick_params(axis='both',direction='in',length=2)
ax4i.set_xticks(np.arange(-100,76,50))
ax4i.set_yticks([1e-4,1e-2,1])
ax4i.set_xticklabels([])
ax4i.set_yticklabels([])
ax4i.set_xlabel(r'$\Delta E$ (kJ/mol)',size=7,x=0.75,labelpad=0)
ax4i.set_ylabel('TOF (s$^{-1}$)',size=7,y=0.75,labelpad=0)



# panel letters
for ax,let in zip([ax1,ax2,ax3,ax4],['a','b','c','d']):
    ax.text(0.05,0.95,f'({let})',transform=ax.transAxes,size=label_fs+1,weight='bold',ha='left',va='top')

fig.savefig(os.path.join(plotdir,'Fig1_Megafigure.jpg'),dpi=500)
# fig.savefig(os.path.join(plotdir,'Fig1_Megafigure.eps'),dpi=500,format='eps')

<IPython.core.display.Javascript object>

## 1.6 ALE plots

### <span style="color:Red"> Figure 4: ALE plots</span>

In [24]:
# ensure model is trained on Xy_all
w = np.log(Xy_all[target].values)
rfp1.fit(Xy_all.loc[:,model_features1].values,Xy_all[target].values,**{'rf__sample_weight':w})
# 'train_set' is actually full ternary feature map - get predictions for full composition space
train_set = X_hm1.loc[:,model_features1]
fig, axes = plt.subplots(1,2,figsize=(7.25,2.7))
bins = 25
# raw strings: '\D', '\o', '\m' and '\c' are not valid Python escape sequences
feature_labels = [r'$\Delta \overline{\mathrm{BE}}_{\mathrm{MO-MN}}$ (kJ/mol)', 'Mean BCA Work Function (eV)']
for feature,xlabel,ax in zip(model_features1,feature_labels,axes):

    # bin edges: bins+1 evenly spaced percentiles (0..100) of the feature over the simplex
    quantiles = np.percentile(train_set[feature], [1. / bins * i * 100 for i in range(0, bins + 1)])
    ALE = ale._first_order_ale_quant(rfp1.predict, train_set,feature,quantiles)

    # shift the curve so that its minimum sits at zero
    ale._first_order_quant_plot(ax,quantiles,ALE-np.min(ALE),c='k')
    # plot distribution of actual training points in rugplot
    sn.rugplot(Xy_all[feature],ax=ax,c='k',alpha=0.5)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(r'Rate ALE (mmol/g$\cdot$h)')
    # make room for rug plot at bottom and empty space at top
    ax.set_ylim(ax.get_ylim()[0]-10,ax.get_ylim()[1]+5)


# set same ylims for both panels (computed once, before either panel is changed)
shared_ylim = (np.min([ax.get_ylim()[0] for ax in axes]), np.max([ax.get_ylim()[1] for ax in axes]))
for ax,let in zip(axes,['(a)','(b)']):
    ax.set_ylim(*shared_ylim)
    ax.axhline(0,c='k',ls='-',lw=1,alpha=0.75)
    ax.text(0.03,0.95,let,transform=ax.transAxes,va='top',ha='left',size=11,weight='bold')

# mark B2CA feature values (first B2C1A1 row of the training/validation features)
b2ca_features = tv_features.loc[tv_features['BCA_str']=='B2C1A1',model_features1].iloc[0]
for ax,feature in zip(axes,model_features1):
    ax.axvline(b2ca_features[feature],ls=':',c='b')
# hand-placed labels next to the dotted lines (data coordinates)
axes[0].text(398,25,'Ru/B$_2$CA',ha='right',color='b')
axes[1].text(3.29,75,'Ru/B$_2$CA',ha='left',color='b')

fig.tight_layout()
# NOTE(review): the section header calls this Figure 4 but the file is saved as 'Fig2_...' - confirm
fig.savefig(os.path.join(plotdir,'Fig2_ALE_plots.jpg'),dpi=500)

<IPython.core.display.Javascript object>

In [43]:
"For presentation - ALE plots without rugplots"
# Presentation variant of the ALE figure: same curves, no rug plots, different x-label text.
# ensure model is trained on Xy_all
w = np.log(Xy_all[target].values)
rfp1.fit(Xy_all.loc[:,model_features1].values,Xy_all[target].values,**{'rf__sample_weight':w})
# 'train_set' is actually full ternary feature map - get predictions for full composition space
train_set = X_hm1.loc[:,model_features1]
fig, axes = plt.subplots(1,2,figsize=(7.25,2.7))
bins = 25
# raw strings: '\D', '\m' and '\c' are not valid Python escape sequences
feature_labels = [r'$\Delta \mathrm{BE}_{\mathrm{O-N}}$ (kJ/mol)', 'Mean BCA Work Function (eV)']
for feature,xlabel,ax in zip(model_features1,feature_labels,axes):

    # bin edges: bins+1 evenly spaced percentiles (0..100) of the feature over the simplex
    quantiles = np.percentile(train_set[feature], [1. / bins * i * 100 for i in range(0, bins + 1)])
    ALE = ale._first_order_ale_quant(rfp1.predict, train_set,feature,quantiles)

    # shift the curve so that its minimum sits at zero
    ale._first_order_quant_plot(ax,quantiles,ALE-np.min(ALE),c='k')
    # rug plot intentionally left out in this variant
#     sn.rugplot(Xy_all[feature],ax=ax,c='k',alpha=0.5)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(r'Rate ALE (mmol/g$\cdot$h)')
    # same padding as the rug-plot version: 10 below, 5 above
    ax.set_ylim(ax.get_ylim()[0]-10,ax.get_ylim()[1]+5)


# set same ylims for both panels (computed once, before either panel is changed)
shared_ylim = (np.min([ax.get_ylim()[0] for ax in axes]), np.max([ax.get_ylim()[1] for ax in axes]))
for ax,let in zip(axes,['(a)','(b)']):
    ax.set_ylim(*shared_ylim)
    ax.axhline(0,c='k',ls='-',lw=1,alpha=0.75)
    ax.text(0.03,0.95,let,transform=ax.transAxes,va='top',ha='left',size=11,weight='bold')

# mark B2CA feature values (first B2C1A1 row of the training/validation features)
b2ca_features = tv_features.loc[tv_features['BCA_str']=='B2C1A1',model_features1].iloc[0]
for ax,feature in zip(axes,model_features1):
    ax.axvline(b2ca_features[feature],ls=':',c='b')
# hand-placed labels next to the dotted lines (data coordinates)
axes[0].text(398,25,'Ru/B$_2$CA',ha='right',color='b')
axes[1].text(3.29,75,'Ru/B$_2$CA',ha='left',color='b')

fig.tight_layout()
fig.savefig(os.path.join(plotdir,'ALE_plots_norugplot.jpg'),dpi=500)

<IPython.core.display.Javascript object>

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


### <span style="color:Red"> Figure S5: ALE plots with and without test compositions</span>

In [25]:

fig, axes = plt.subplots(2, 3, figsize=(7.5, 4.5))
# 'train_set' is actually full ternary feature map
train_set = X_hm1.loc[:, model_features1]
bins = 25
# raw strings: LaTeX commands (\Delta, \overline, \mathrm) are not valid Python escape sequences
feature_labels = [r'$\Delta \overline{\mathrm{BE}}_{\mathrm{MO-MN}}$ (kJ/mol)', 'Mean Work Function (eV)']
# bins+1 evenly spaced percentiles (0..100) used as ALE bin edges; identical for every feature
percentile_grid = [1. / bins * i * 100 for i in range(0, bins + 1)]

# vlim for heatmaps
vlim = (0, 200)


def _plot_ale_row(row_axes, rug_sets):
    """Draw one ALE curve per model feature on `row_axes` using the current fit of `rfp1`.

    Parameters
    ----------
    row_axes : sequence of matplotlib Axes
        Axes to draw on; paired in order with `model_features1` (extra axes are ignored).
    rug_sets : sequence of (DataFrame, color) pairs
        Data whose feature values are shown as rug plots, drawn in the given order.
    """
    for feature, xlabel, ax in zip(model_features1, feature_labels, row_axes):
        quantiles = np.percentile(train_set[feature], percentile_grid)
        ALE = ale._first_order_ale_quant(rfp1.predict, train_set, feature, quantiles)
        # normalize min value to zero
        ale._first_order_quant_plot(ax, quantiles, ALE - np.min(ALE), c='k')
        for rug_df, rug_color in rug_sets:
            sn.rugplot(rug_df[feature], ax=ax, c=rug_color, alpha=0.5)
        ax.set_xlabel(xlabel, size=9)
        ax.set_ylabel(r'ALE (mmol/g$\cdot$h)', size=9)
        # make room for rug plot at bottom and empty space at top
        y_low, y_high = ax.get_ylim()
        ax.set_ylim(y_low - 10, y_high + 5)


def _draw_rate_heatmap(ax):
    """Draw the ternary heatmap for the current fit of `rfp1` on `ax` and return the ternary axes."""
    tax = estimator_ternary_heatmap(50, rfp1, use_X=X_hm1, ax=ax, add_labeloffset=0.1, vlim=vlim,
                                    cmap=plt.cm.inferno, labelsize=9)
    tax.ticks(multiple=50/4, tick_formats='', offset=0.04, lw=0.6)
    tax.ticks(multiple=50/4, tick_formats='', offset=0.04, lw=0.6, clockwise=True)
    return tax


# --- top row: model trained on train+validation (tv) data ---
w = np.log(Xy_tv[target].values)
rfp1.fit(Xy_tv.loc[:, model_features1].values, Xy_tv[target].values, rf__sample_weight=w)
_plot_ale_row(axes[0], rug_sets=[(Xy_tv, 'k')])
tax1 = _draw_rate_heatmap(axes[0, 2])

# --- bottom row: model retrained on all data (tv + test) ---
w = np.log(Xy_all[target].values)
rfp1.fit(Xy_all.loc[:, model_features1].values, Xy_all[target].values, rf__sample_weight=w)
# tv points in black, test points in red
_plot_ale_row(axes[1], rug_sets=[(Xy_tv, 'k'), (Xy_test, 'r')])

# set same ylims for all ALE plots
# NOTE(review): the range is taken over *all* axes, including the two heatmap axes, exactly as in the
# original cell; restrict to axes[:, 0:2] if the heatmap limits were not meant to contribute.
shared_ylim = (np.min([a.get_ylim()[0] for a in axes.ravel()]),
               np.max([a.get_ylim()[1] for a in axes.ravel()]))
for ax in axes[:, 0:2].ravel():
    ax.set_ylim(*shared_ylim)
    # set tick label size
    ax.tick_params(axis='both', labelsize=8)
    # zero line
    ax.axhline(0, c='k', ls='-', lw=1, alpha=0.75)

# heatmap based on all data
tax2 = _draw_rate_heatmap(axes[1, 2])

fig.tight_layout()
fig.subplots_adjust(hspace=0.65, top=0.9)

# create invisible full-width axes whose only purpose is to carry the row titles
big_axes = [fig.add_subplot(211), fig.add_subplot(212)]
for big_ax in big_axes:
    # Turn off axis lines and ticks of the big subplot
    big_ax.tick_params(labelcolor=(1., 1., 1., 0.0), top=False, bottom=False, left=False, right=False)
    # removes the white frame (public setter instead of writing the private _frameon attribute)
    big_ax.set_frame_on(False)

big_axes[0].set_title(r'$\bf{(a)}$ Initial Model', y=1.06, bbox={'facecolor': 'white', 'boxstyle': 'round'}, size=11)
big_axes[1].set_title(r'$\bf{(b)}$ Retrained with Test Compositions', y=1.06,
                      bbox={'facecolor': 'white', 'boxstyle': 'round'}, size=11)

add_colorbar(fig=fig, vlim=vlim, cbrect=[0.91, 0.155, 0.015, 0.75],
             subplots_adjust={'left': 0.1, 'wspace': 0.55, 'hspace': 0.65, 'right': 0.84},
             label=r'Predicted Rate (mmol/g$\cdot$h)', label_kwargs={'size': 9}, tick_params={'labelsize': 8}
             )

fig.savefig(os.path.join(plotdir, 'FigS5_ALE_retraining.jpg'), dpi=500)

<IPython.core.display.Javascript object>

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


## 1.7 Feature correlations

### <span style="color:Red"> Figure S7: Feature-feature correlations</span>

In [75]:
fig, axes = plt.subplots(1, 2, figsize=(7.25, 3))

tick_fs = 7
label_fs = 8

# Left panel: ON_BondEnergyDelta_mean followed by the first 9 features returned by fs.rank_correlation
onbed_ranked = fs.rank_correlation(X_simplex.loc[:, 'MO_ratio':], response_col='ON_BondEnergyDelta_mean')
onbed_order = ['ON_BondEnergyDelta_mean'] + onbed_ranked[:9]

# Right panel: M_WorkFunction_mean against a hand-picked candidate list, truncated to 10 entries
features = ['M_WorkFunction_mean',
            'MO_BondEnergy_mean',
            'MO_BondIonicity_mean',
            'MO_ratio',
            'M_BandCenter',
            'BCA_BandCenter',
            'M_CationX_mean',
            'M_H_vap_mean',
            'M_ValenceElec_p_frac',
            'M_ValenceElec_s_frac',
            'M_ValenceEnergy_mean',
            'M_X_mean',
            'M_bulk_mod_mean',
            'M_cohesive_energy_mean',
            'oxide_Hf_mean']
workfn_ranked = fs.rank_correlation(X_simplex.loc[:, features], response_col='M_WorkFunction_mean')
workfn_order = (['M_WorkFunction_mean'] + workfn_ranked)[:10]

# Both panels share one recipe: |corrcoef| image, labelled ticks, colorbar, white cell grid
for ax, sorted_features in zip(axes, (onbed_order, workfn_order)):
    n_shown = len(sorted_features)
    corr = np.corrcoef(X_simplex.loc[:, sorted_features], rowvar=False)

    mp = ax.imshow(np.abs(corr), aspect='equal', cmap=plt.cm.jet)

    # one labelled major tick per feature
    major_ticks = np.arange(0, n_shown)
    ax.set_xticks(major_ticks)
    ax.set_yticks(major_ticks)
    ax.set_xticklabels(sorted_features, rotation=45, ha='right')
    ax.set_yticklabels(sorted_features)
    ax.tick_params(labelsize=tick_fs)

    # colorbar in a thin axes appended to the right of the panel
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    cb = fig.colorbar(mp, cax=cax)
    cb.ax.tick_params(labelsize=tick_fs)
    cb.set_label('Abs. Correlation Coef.', size=label_fs)

    # invisible minor ticks on the cell borders carry the white grid lines
    cell_edges = np.arange(-0.5, n_shown, 1)
    ax.set_xticks(cell_edges, minor=True)
    ax.set_yticks(cell_edges, minor=True)
    ax.tick_params(which='minor', length=0)
    ax.grid(which='minor', c='w', lw=1.5, alpha=0.8)

fig.tight_layout()
# fig.savefig(os.path.join(plotdir,'FigS7_FeatureCorrelations.jpg'),dpi=500)

<IPython.core.display.Javascript object>


invalid value encountered in true_divide


invalid value encountered in true_divide



# 2. Models with automatically selected features

## 2a. Select from BCA feature set

In [76]:
# Forward floating sequential feature selection over the BCA_Featurizer columns.
# The fitted selector is cached in 'sfs_bf.pkl'; the search only runs when that file is missing.
try:
    sfs_bf = load_pickle('sfs_bf.pkl')
except FileNotFoundError:
    # candidate pool: every column of the training feature matrix
    sfs_bf_features = X_train.columns.tolist()

    # scaler + random forest pipeline evaluated by the selector
    rs = RobustScaler()
    rf_sfs = RandomForestRegressor(n_estimators=25, max_depth=10, random_state=11)
    rfp_sfs = Pipeline(steps=[('scale', rs), ('rf', rf_sfs)])

    # ensure that SFS instance uses same CV folds as everything else
    cv = mv.repeating_KFold(4, 4, random_state=17)

    sfs_settings = dict(k_features=10,
                        forward=True,
                        floating=True,
                        verbose=2,
                        scoring='r2',
                        cv=cv,
                        n_jobs=4)
    sfs_bf = SFS(rfp_sfs, **sfs_settings)

    sfs_bf.fit(X_train.loc[:, sfs_bf_features].values,
               y_train['norm_max_rate'].values,
               custom_feature_names=sfs_bf_features)
    save_pickle(sfs_bf, 'sfs_bf.pkl')


Trying to unpickle estimator RobustScaler from version 0.19.1 when using version 0.21.3. This might lead to breaking code or invalid results. Use at your own risk.


Trying to unpickle estimator DecisionTreeRegressor from version 0.19.1 when using version 0.21.3. This might lead to breaking code or invalid results. Use at your own risk.


Trying to unpickle estimator RandomForestRegressor from version 0.19.1 when using version 0.21.3. This might lead to breaking code or invalid results. Use at your own risk.


Trying to unpickle estimator Pipeline from version 0.19.1 when using version 0.21.3. This might lead to breaking code or invalid results. Use at your own risk.



In [79]:
# perform grid search optimization for each selected k-feature set
rf2a = RandomForestRegressor(random_state=11)
# Use a dedicated scaler: a Pipeline refits its steps in place, so sharing one scaler object with
# other pipelines would let this fit overwrite their scaling state. It also removes the dependence
# on a global `rs` that this cell does not define.
# NOTE(review): assumes the shared `rs` used previously was a default RobustScaler() - confirm.
rfp2a = Pipeline([('scale', RobustScaler()), ('rf', rf2a)])

try:
    results_bf = load_pickle('sfs_results_bf.pkl')
except FileNotFoundError:
    hypers = []
    cv_scores = []
    val_scores = []
    test_scores = []
    for k, v in sfs_bf.subsets_.items():
        # index with a list: the subset's feature names are used as a column list in every call below
        feats = list(v['feature_names'])

        # grid search hyperparams
        X_gs = X_train.loc[:, feats].values
        y_gs = y_train[target].values
        w_gs = np.log(y_gs)

        gs_rf2 = mv.GridSearchRepeatedCV(rfp2a, param_grid={'rf__n_estimators': [10, 20, 30, 40, 50],
                                                            'rf__max_features': np.arange(1, k+1, 1).astype(int),
                                                            'rf__max_depth': [3, 5, 10, 25]
                                                            }
                                         )
        gs_rf2.fit(X_gs, y_gs, repeat=4, n_splits=4, random_state=17,
                   sample_weight=w_gs
                   )

        rfp2a.set_params(**gs_rf2.best_params_)
        hypers.append(gs_rf2.best_params_)
        print(f'Best hyperparams for {k} features:', gs_rf2.best_params_)

        # get cv scores (same folds as the grid search: repeat=4, n_splits=4, random_state=17)
        act, pred, agg_r2, agg_mae = mv.repeated_KFold_cv(rfp2a, X_gs, y_gs,
                                                          repeat=4, sample_weight=w_gs,
                                                          n_splits=4, random_state=17
                                                          )
        cv_scores.append(np.mean(agg_r2))

        # get holdout score: fit on train, score on validation
        rfp2a.fit(X_gs, y_gs, rf__sample_weight=w_gs)
        val_scores.append(rfp2a.score(X_val.loc[:, feats].values, y_val[target].values,
                                      sample_weight=np.log(y_val[target]).values
                                      )
                          )

        # get test score: refit on train+validation, score on test
        rfp2a.fit(Xy_tv.loc[:, feats].values, Xy_tv[target].values,
                  rf__sample_weight=np.log(Xy_tv[target]).values
                  )
        test_scores.append(rfp2a.score(X_test.loc[:, feats].values, y_test[target].values,
                                       sample_weight=np.log(y_test[target]).values
                                       )
                           )

    # Build the table column-by-column. Going through np.array([...]).T requires a ragged object
    # array (feature tuples of different lengths), which recent NumPy refuses to create implicitly
    # and which would store every column as dtype=object.
    result_columns = ['n_features', 'features', 'hypers', 'cv_score', 'val_score', 'test_score']
    results_bf = pd.DataFrame({'n_features': list(sfs_bf.subsets_.keys()),
                               'features': [sub['feature_names'] for sub in sfs_bf.subsets_.values()],
                               'hypers': hypers,
                               'cv_score': cv_scores,
                               'val_score': val_scores,
                               'test_score': test_scores},
                              columns=result_columns
                              )
    save_pickle(results_bf, 'sfs_results_bf.pkl')

In [80]:
# compare performance of the auto-selected BCA feature sets to the physical model
fig, axes = plt.subplots(1, 3, figsize=(9, 2.5), sharey=True)

# one panel per score type; dashed line = corresponding score of the physical (custom) model
score_panels = zip(axes,
                   ['cv_score', 'val_score', 'test_score'],
                   [custom_cv_score, custom_val_score, custom_test_score])
for ax, score_col, physical_score in score_panels:
    ax.plot(results_bf['n_features'], results_bf[score_col], marker='o', fillstyle='none')
    ax.axhline(physical_score, c='k', ls='--')

fig.tight_layout()

<IPython.core.display.Javascript object>

## 2b. Select from generic feature set

In [81]:
gf = GenericFeaturizer()
gf.set_n_jobs(1)
print('Total generic features:', len(gf.feature_labels()))

# ---- Set up train and validation datasets ----
tv_features_gen = gf.featurize_dataframe(tv_data, col_id="composition", inplace=False)
# feature columns are taken as everything from 'M_HOMO_energy' to the last column;
# rows are split using the indexes of the existing X_train / X_val frames
first_feature_col = 'M_HOMO_energy'
X_train_gen = tv_features_gen.loc[X_train.index, first_feature_col:]
X_val_gen = tv_features_gen.loc[X_val.index, first_feature_col:]

# check all features are numeric
print("Feature datatypes:", X_train_gen.dtypes.unique())

# remove zero-var features (decided on the training rows only, applied to both splits)
zero_var = list(X_train_gen.columns[X_train_gen.var() == 0])
X_train_gen = X_train_gen.drop(columns=zero_var)
X_val_gen = X_val_gen.drop(columns=zero_var)
print("Zero variance features removed:", zero_var)

# remove null features (any missing value in the training rows)
nulls = list(X_train_gen.columns[X_train_gen.isnull().any()])
X_train_gen = X_train_gen.drop(columns=nulls)
X_val_gen = X_val_gen.drop(columns=nulls)
print("Null features removed:", nulls)
print('\nFinal number of features:', len(X_train_gen.columns))

# features + targets side by side, then train and validation stacked
Xy_train_gen = pd.concat([X_train_gen, y_train], axis=1)
Xy_val_gen = pd.concat([X_val_gen, y_val], axis=1)
Xy_tv_gen = pd.concat([Xy_train_gen, Xy_val_gen])

# ---- Set up test dataset ----
test_features_gen = gf.featurize_dataframe(test_data, col_id="composition", inplace=False)
# drop the same columns that were removed from the training data
X_test_gen = test_features_gen.loc[:, first_feature_col:].drop(columns=zero_var + nulls)
# note: rebinds the notebook-level y_test
y_test = test_features_gen[['max_rate', 'norm_max_rate']]
Xy_test_gen = pd.concat([X_test_gen, y_test], axis=1)

# ---- Full dataset ----
Xy_all_gen = pd.concat([Xy_tv_gen, Xy_test_gen])

Total generic features: 380


HBox(children=(IntProgress(value=0, description='GenericFeaturizer', max=22, style=ProgressStyle(description_w…


Feature datatypes: [dtype('float64')]
Zero variance features removed: ['M MagpieData minimum Column', 'M MagpieData minimum NsValence', 'M MagpieData maximum NsValence', 'M MagpieData range NsValence', 'M MagpieData mean NsValence', 'M MagpieData avg_dev NsValence', 'M MagpieData mode NsValence', 'M MagpieData minimum NpValence', 'M MagpieData minimum NdValence', 'M MagpieData maximum NdValence', 'M MagpieData range NdValence', 'M MagpieData mean NdValence', 'M MagpieData avg_dev NdValence', 'M MagpieData mode NdValence', 'M MagpieData minimum NfValence', 'M MagpieData maximum NfValence', 'M MagpieData range NfValence', 'M MagpieData mean NfValence', 'M MagpieData avg_dev NfValence', 'M MagpieData mode NfValence', 'M MagpieData minimum NValence', 'M MagpieData minimum NsUnfilled', 'M MagpieData maximum NsUnfilled', 'M MagpieData range NsUnfilled', 'M MagpieData mean NsUnfilled', 'M MagpieData avg_dev NsUnfilled', 'M MagpieData mode NsUnfilled', 'M MagpieData minimum NpUnfilled', 'M Ma

HBox(children=(IntProgress(value=0, description='GenericFeaturizer', max=4, style=ProgressStyle(description_wi…




In [88]:
# Forward floating sequential feature selection over the generic feature set.
# Load and save must use the SAME cache file: previously the cell tried to load 'sfs_dgen.pkl'
# but saved 'sfs_gen.pkl', so the cache could never hit and the search re-ran on every execution.
# Delete the pickle to force a fresh search.
sfs_gen_cache = 'sfs_gen.pkl'
try:
    sfs_gen = load_pickle(sfs_gen_cache)
except FileNotFoundError:
    # candidate pool: every column of the generic training feature matrix
    sfs_gen_features = list(X_train_gen.columns.values)

    rs = RobustScaler()
    rf_sfs_gen = RandomForestRegressor(n_estimators=25, max_depth=10, random_state=11)
    rfp_sfs_gen = Pipeline([('scale', rs), ('rf', rf_sfs_gen)])

    # same CV settings as the BCA-feature selector (4, 4, random_state=17)
    cv = mv.repeating_KFold(4, 4, random_state=17)

    sfs_gen = SFS(rfp_sfs_gen,
                  k_features=10,
                  forward=True,
                  floating=True,
                  verbose=2,
                  scoring='r2',
                  cv=cv,
                  n_jobs=4)

    sfs_gen.fit(X_train_gen.loc[:, sfs_gen_features].values, y_train['norm_max_rate'].values,
                custom_feature_names=sfs_gen_features)
    save_pickle(sfs_gen, sfs_gen_cache)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    6.5s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   21.0s
[Parallel(n_jobs=4)]: Done 245 out of 245 | elapsed:   31.8s finished

[2020-01-25 21:28:43] Features: 1/10 -- score: 0.14125245270973735[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   20.7s
[Parallel(n_jobs=4)]: Done 244 out of 244 | elapsed:   32.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 out of   1 | elapsed:    0.5s finished

[2020-01-25 21:29:16] Features: 2/10 -- score: 0.28168694929840904[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed

Dumped pickle to sfs_gen.pkl


In [91]:
# perform grid search optimization for each selected k-feature set

# Build the pipeline outside the try block so `rfp2b` exists even when the cached results load;
# later cells call rfp2b.set_params()/fit() and would otherwise hit a NameError on a cache hit.
rf2b = RandomForestRegressor(random_state=11)
# Dedicated scaler: a Pipeline refits its steps in place, so a scaler object shared with other
# pipelines would be overwritten by this fit.
# NOTE(review): assumes the shared `rs` used previously was a default RobustScaler() - confirm.
rfp2b = Pipeline([('scale', RobustScaler()), ('rf', rf2b)])

try:
    results_gen = load_pickle('sfs_results_gen.pkl')
except FileNotFoundError:
    hypers = []
    cv_scores = []
    val_scores = []
    test_scores = []
    for k, v in sfs_gen.subsets_.items():
        # index with a list: the subset's feature names are used as a column list in every call below
        feats = list(v['feature_names'])

        # grid search hyperparams
        X_gs = X_train_gen.loc[:, feats].values
        y_gs = y_train[target].values
        w_gs = np.log(y_gs)

        gs_rf2 = mv.GridSearchRepeatedCV(rfp2b, param_grid={'rf__n_estimators': [10, 20, 30, 40, 50],
                                                            'rf__max_features': np.arange(1, k+1, 1).astype(int),
                                                            'rf__max_depth': [3, 5, 10, 25]
                                                            }
                                         )
        gs_rf2.fit(X_gs, y_gs, repeat=4, n_splits=4, random_state=17,
                   sample_weight=w_gs
                   )

        rfp2b.set_params(**gs_rf2.best_params_)
        hypers.append(gs_rf2.best_params_)
        print(f'Best hyperparams for {k} features:', gs_rf2.best_params_)

        # get cv scores (same folds as the grid search: repeat=4, n_splits=4, random_state=17)
        act, pred, agg_r2, agg_mae = mv.repeated_KFold_cv(rfp2b, X_gs, y_gs,
                                                          repeat=4, sample_weight=w_gs,
                                                          n_splits=4, random_state=17
                                                          )
        cv_scores.append(np.mean(agg_r2))

        # get holdout score: fit on train, score on validation
        rfp2b.fit(X_gs, y_gs, rf__sample_weight=w_gs)
        val_scores.append(rfp2b.score(X_val_gen.loc[:, feats].values, y_val[target].values,
                                      sample_weight=np.log(y_val[target]).values
                                      )
                          )

        # get test score: refit on train+validation, score on test.
        # Features, targets and weights are all read from Xy_tv_gen so the rows cannot be misaligned
        # (previously X came from Xy_tv_gen while y and weights came from Xy_tv).
        y_tv_gen = Xy_tv_gen[target].values
        rfp2b.fit(Xy_tv_gen.loc[:, feats].values, y_tv_gen,
                  rf__sample_weight=np.log(y_tv_gen)
                  )
        test_scores.append(rfp2b.score(X_test_gen.loc[:, feats].values, y_test[target].values,
                                       sample_weight=np.log(y_test[target]).values
                                       )
                           )

    # Build the table column-by-column. Going through np.array([...]).T requires a ragged object
    # array (feature tuples of different lengths), which recent NumPy refuses to create implicitly
    # and which would store every column as dtype=object.
    result_columns = ['n_features', 'features', 'hypers', 'cv_score', 'val_score', 'test_score']
    results_gen = pd.DataFrame({'n_features': list(sfs_gen.subsets_.keys()),
                                'features': [sub['feature_names'] for sub in sfs_gen.subsets_.values()],
                                'hypers': hypers,
                                'cv_score': cv_scores,
                                'val_score': val_scores,
                                'test_score': test_scores},
                               columns=result_columns
                               )

    save_pickle(results_gen, 'sfs_results_gen.pkl')

Best hyperparams for 1 features: {'rf__n_estimators': 40, 'rf__max_features': 1, 'rf__max_depth': 5}
Best hyperparams for 2 features: {'rf__n_estimators': 40, 'rf__max_features': 1, 'rf__max_depth': 10}
Best hyperparams for 3 features: {'rf__n_estimators': 40, 'rf__max_features': 1, 'rf__max_depth': 10}
Best hyperparams for 4 features: {'rf__n_estimators': 40, 'rf__max_features': 2, 'rf__max_depth': 5}
Best hyperparams for 5 features: {'rf__n_estimators': 40, 'rf__max_features': 2, 'rf__max_depth': 10}
Best hyperparams for 6 features: {'rf__n_estimators': 40, 'rf__max_features': 2, 'rf__max_depth': 5}
Best hyperparams for 7 features: {'rf__n_estimators': 40, 'rf__max_features': 3, 'rf__max_depth': 10}
Best hyperparams for 8 features: {'rf__n_estimators': 40, 'rf__max_features': 3, 'rf__max_depth': 5}
Best hyperparams for 9 features: {'rf__n_estimators': 40, 'rf__max_features': 3, 'rf__max_depth': 10}
Best hyperparams for 10 features: {'rf__n_estimators': 40, 'rf__max_features': 5, 'rf_

### <span style="color:Red"> Figure S6: Physical vs. empirical model scores </span>

In [92]:
tick_fs, label_fs = 8, 9
fig, axes = plt.subplots(1, 3, figsize=(7.25, 2))

# one panel per score type; one series per feature family
score_columns = ['cv_score', 'val_score', 'test_score']
feature_families = [(results_bf, 'BCA features'), (results_gen, 'Generic features')]
for result, label in feature_families:
    for col, ax in zip(score_columns, axes):
        # connecting line first, then white-filled markers in the same color on top of it
        connector = ax.plot(result['n_features'], result[col].values, zorder=0)
        ax.plot(result['n_features'], result[col], label=label,
                ls='', marker='o', markerfacecolor='white', ms=5,
                c=connector[0].get_color(), zorder=1)

# dashed reference lines: scores of the physical model (only the first panel feeds the legend)
axes[0].axhline(custom_cv_score, c='k', ls='--', zorder=0, label='Physical model')
for ax, physical_score in zip(axes[1:], [custom_val_score, custom_test_score]):
    ax.axhline(physical_score, c='k', ls='--', zorder=0)

# axis cosmetics and bold panel letters placed to the left of each panel (axes coordinates)
for ax, let in zip(axes, ['a', 'b', 'c']):
    ax.set_xlabel('Number of features', size=label_fs)
    ax.set_ylabel('$r^2$', size=label_fs)
    ax.set_xticks(np.arange(2, 11, 2))
    ax.tick_params(axis='both', labelsize=tick_fs)
    ax.text(-0.4, 0.95, f'({let})', weight='bold', size=label_fs+1, transform=ax.transAxes, va='top')

# only the cross-validation panel gets a fixed y-range
axes[0].set_ylim(0.5, 0.85)

axes[0].legend(fontsize=tick_fs)

fig.tight_layout()
# fig.savefig(os.path.join(plotdir,'FigS6_EmpiricalModelScores.jpg'),dpi=500)

<IPython.core.display.Javascript object>

In [93]:
# Featurize the composition simplex with the generic featurizer; the resulting
# X_simplex_gen frame supplies the inputs for the generic-feature heatmap.
scale = 50
simplex_features_gen = featurize_simplex(scale, gf)
coords, X_simplex_gen = simplex_features_gen

HBox(children=(IntProgress(value=0, description='GenericFeaturizer', max=1326, style=ProgressStyle(description…




In [94]:
# make heatmaps for empirical models
fig, axes = plt.subplots(1, 2, figsize=(7.25, 2.75))
label_fs = 9
tick_fs = 8

vlim = (0, 200)
n_features = 10


def _fit_and_draw_heatmap(pipe, results, selector, Xy_fit, X_simplex_src, ax):
    """Refit `pipe` on the selected feature subset and draw its ternary heatmap on `ax`.

    Parameters
    ----------
    pipe : Pipeline with an 'rf' step
        Receives the stored hyperparameters and is refitted in place.
    results : DataFrame
        Grid-search results; the row at label ``n_features-1`` supplies the 'hypers' dict.
    selector : fitted SFS object
        ``selector.subsets_[n_features]['feature_names']`` gives the columns to use.
    Xy_fit : DataFrame
        Holds the selected feature columns AND the `target` column. Features, targets and sample
        weights are all read from this one frame so the rows cannot be misaligned.
    X_simplex_src : DataFrame
        Simplex feature frame passed to the heatmap (restricted to the selected columns).
    ax : matplotlib Axes
        Axes to draw on.

    Returns
    -------
    (tax, X_hm) : the ternary axes returned by estimator_ternary_heatmap and the heatmap inputs.
    """
    feats = list(selector.subsets_[n_features]['feature_names'])
    pipe.set_params(**results.loc[n_features-1, 'hypers'])
    y_fit = Xy_fit[target].values
    pipe.fit(Xy_fit.loc[:, feats].values, y_fit, rf__sample_weight=np.log(y_fit))

    X_hm = X_simplex_src.loc[:, feats]
    tax = estimator_ternary_heatmap(50, pipe, use_X=X_hm, ax=ax, add_labeloffset=0.02, vlim=vlim,
                                    cmap=plt.cm.inferno, labelsize=label_fs
                                    )
    tax.ticks(multiple=50/4, tick_formats='', offset=0.03, lw=0.7)
    tax.ticks(multiple=50/4, tick_formats='', offset=0.03, lw=0.7, clockwise=True)
    draw_guidelines(tax, c='white', lw=0.7, alpha=0.7)
    return tax, X_hm


# BCA features (previously weighted with Xy_tv_gen[target] although X and y came from Xy_tv)
tax1, X_hm2a = _fit_and_draw_heatmap(rfp2a, results_bf, sfs_bf, Xy_tv, X_simplex, axes[0])

# generic features (previously y came from Xy_tv although X and weights came from Xy_tv_gen)
tax2, X_hm2b = _fit_and_draw_heatmap(rfp2b, results_gen, sfs_gen, Xy_tv_gen, X_simplex_gen, axes[1])

# raw string: \cdot is a LaTeX command, not a Python escape sequence
add_colorbar(fig=fig, vlim=vlim, cbrect=[0.9, 0.19, 0.02, 0.68],
             subplots_adjust={'left': 0.07, 'wspace': 0.35, 'hspace': 0.25, 'right': 0.82},
             label=r'Predicted Rate (mmol/g$\cdot$h)', label_kwargs={'size': label_fs},
             tick_params={'labelsize': tick_fs}
             )

<IPython.core.display.Javascript object>

# Citations for packages used

In [None]:
# packages found by makecite:
# matplotlib, numpy, pandas, scipy, seaborn, sklearn

In [None]:
#need to cite manually: alepython, ternary, mlxtend, pymatgen, matminer, featurizers

In [26]:
# BibTeX entries for packages that had to be cited manually.
# Each list element is ONE entry assembled by implicit concatenation of adjacent string literals;
# the literals contain no newline characters, so every entry is a single line of text.
citations = [
    # alepython
    "@software{alepython,"
      "author       = {Maxime Jumelle},"
      "title        = {ALEPython: Python Accumulated Local Effects package},"
      "month        = sep,"
      "year         = 2019,"
      "version      = {0.1.0},"
      "url          = {https://github.com/MaximeJumelle/alepython/}"
    "}",
    # ternary (python-ternary)
    "@software{marc_2019_2628066,"
      "author       = {Marc Harper and "
                      "Bryan Weinstein and "
                      "tgwoodcock and "
                      "Cory Simon and "
                      "chebee7i and "
                      "Wiley Morgan and "
                      "Vince Knight and "
                      "Nick Swanson-Hysell and "
                      "Matthew Evans and "
                      "jl-bernal and "
                      "ZGainsforth and "
                      "The Gitter Badger and "
                      "SaxonAnglo and "
                      "Maximiliano Greco and "
                      "Guido Zuidhof},"
      "title        = {python-ternary: Ternary plots in Python},"
      "month        = apr,"
      "year         = 2019,"
      "publisher    = {Zenodo},"
      "version      = {1.0.6},"
      "doi          = {10.5281/zenodo.2628066},"
      "url          = {https://doi.org/10.5281/zenodo.2628066}"
    "}",
    # mlxtend (note: the title contains a non-ASCII apostrophe)
    "@article{raschkas_2018_mlxtend,"
      "author       = {Sebastian Raschka},"
      "title        = {MLxtend: Providing machine learning and data science "
                      "utilities and extensions to Python’s  "
                      "scientific computing stack},"
      "journal      = {The Journal of Open Source Software},"
      "volume       = {3},"
      "number       = {24},"
      "month        = apr,"
      "year         = 2018,"
      "publisher    = {The Open Journal},"
      "doi          = {10.21105/joss.00638},"
      "url          = {http://joss.theoj.org/papers/10.21105/joss.00638}"
    "}",
    # matminer
    "@article{WARD201860,"
        "title = {Matminer: An open source toolkit for materials data mining},"
        "journal = {Computational Materials Science},"
        "volume = {152},"
        "pages = {60 - 69},"
        "year = {2018},"
        "issn = {0927-0256},"
        "doi = {https://doi.org/10.1016/j.commatsci.2018.05.018},"
        "url = {http://www.sciencedirect.com/science/article/pii/S0927025618303252},"
        "author = {Logan Ward and Alexander Dunn and Alireza Faghaninia and Nils E.R. Zimmermann and Saurabh Bajaj and Qi Wang and Joseph Montoya and Jiming Chen and Kyle Bystrom and Maxwell Dylla and Kyle Chard and Mark Asta and Kristin A. Persson and G. Jeffrey Snyder and Ian Foster and Anubhav Jain},"
        "keywords = {Data mining, Open source software, Machine learning, Materials informatics},"
    "}",
    #
]
# featurizer citations (includes pymatgen)
# NOTE(review): `bf` is not defined in this cell; presumably a BCA_Featurizer instance created in an
# earlier cell - confirm it exists on a fresh kernel (this cell's execution count is out of order).
citations += bf.citations()
# makecite
citations += [
    "@software{makecite,"
      "author       = {Adrian Price-Whelan and "
                      "Alexandar Mechev and "
                      "Brigitta Sipocz and "
                      "Griffin Hosseinzadeh and "
                      "jumeroag and "
                      "Eric Bellm},"
      "title        = {adrn/makecite v0.5},"
      "month        = nov,"
      "year         = 2019,"
      "publisher    = {Zenodo},"
      "version      = {v0.5},"
      "doi          = {10.5281/zenodo.3533303},"
      "url          = {https://doi.org/10.5281/zenodo.3533303}"
    "}"]

In [28]:
# Join first, so a bad (non-string) entry fails before the existing file is truncated.
bib_text = '\n\n'.join(citations)
# Explicit encoding: the entries contain non-ASCII characters, and the platform default encoding
# differs between systems (and can fail outright under an ASCII locale).
with open('../citations/notebook_package_citations.bib', 'w', encoding='utf-8') as f:
    f.write(bib_text)

In [24]:
# Print a \cite{...} command for every entry key found in the combined .bib file.
# The parsing is deliberately crude (split on '@', slice between the first '{' and the first ','),
# so the generated list still needs manual cleanup afterwards.
def get_tag(x):
    """Return the text between the first '{' and the first ',' of a raw entry string."""
    key_start = x.find('{') + 1
    key_end = x.find(',')
    return x[key_start:key_end]


with open('../citations/package_citations.bib', 'r') as f:
    cite_text = f.read()

# keep only chunks that contain a '{' somewhere after their first character
entries = []
for chunk in cite_text.split('@'):
    if chunk.find('{') > 0:
        entries.append(chunk)

tags = list(map(get_tag, entries))

cite_commands = ['\\cite{{{}}}'.format(tag) for tag in tags if len(tag) > 0]
print('\n'.join(cite_commands))

# entries whose extracted key is empty
bad_entries = [e for e in entries if len(get_tag(e)) == 0]
print('\nBad entries:', bad_entries)

Created MatProjCalc instance
Created MatProjCalc instance
\cite{Hunter:2007}
\cite{numpy:2011}
\cite{pandas:2010}
\cite{scipy:2001}
\cite{seaborn:2018}
\cite{scikit-learn:2011}
\cite{alepython}
\cite{marc_2019_2628066}
\cite{raschkas_2018_mlxtend}
\cite{WARD201860}
\cite{Holzl1979}
\cite{doi:10.1080/00222346908205102}
\cite{Butler1978}
\cite{Jain2013}
\cite{Ong2012b}
\cite{Ong_2015}
\cite{PhysRevA.55.191}
\cite{Ward2016}
\cite{Zohourian2017}
\cite{deml_ohayre_wolverton_stevanovic_2016}
\cite{doi:10.1063/1.323539}
\cite{ward_agrawal_choudary_wolverton_2016}
\cite{Kittel}
\cite{AngstromSciences}

Bad entries: ['misc{, title = {{Knowledgedoor Cohesive energy handbook}}, url = {http://www.knowledgedoor.com/2/elements{\\_}handbook/cohesive{\\_}energy.html}}\n\n']
