In [None]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt

import matplotlib.font_manager as fm

# Names of every font registered with matplotlib's font manager; useful for
# checking that the family requested below is actually available.
font_names = [f.name for f in fm.fontManager.ttflist]

# Global figure styling, applied once for the whole notebook.
plt.rcParams['font.family'] = 'DejaVu Serif'
plt.rcParams['font.size'] = 14
plt.rcParams['axes.linewidth'] = 2
plt.style.use('tableau-colorblind10')

In [None]:
def get_figure(figsize=(5.5,4)):
    """Create a single-axes figure with inward ticks on all four sides.

    Parameters
    ----------
    figsize : tuple of float
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    fig, ax
        The figure and axes returned by ``plt.subplots``, with major ticks of
        length 7 and minor ticks of length 4 (width 1.5), all pointing inwards
        and mirrored on the top and right spines. Tick labels are shown on the
        bottom and left only.
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Booleans rather than the strings 'on'/'off': any non-empty string is
    # truthy, so 'off' would silently switch the ticks on as well.
    for which, length in (('major', 7), ('minor', 4)):
        ax.xaxis.set_tick_params(which=which, size=length, width=1.5, direction='in', top=True)
        ax.yaxis.set_tick_params(which=which, size=length, width=1.5, direction='in', right=True)

    ax.tick_params(bottom=True, top=True, left=True, right=True)
    ax.tick_params(labelbottom=True, labeltop=False, labelleft=True, labelright=False)

    return fig, ax

In [None]:
# Five-fold cross-validation summary table. Later cells select rows by integer
# index label (0-12) and read the 'model' column plus the rmse/mae/r2
# '*_avg' and '*_std' columns.
fivefold_df = pd.read_csv('./fivefold_CV_results_updated.csv')

In [None]:
# Grouped bar chart of the five-fold CV RMSE: a lone grey bar at x = -1 for
# row 0 of fivefold_df, then one group per x tick (0..3) holding one bar per
# uESE variant, with error bars taken from the matching '*_std' column.
fig, ax = get_figure(figsize=(6.5,4.5))

avg_col, std_col = 'rmse_avg', 'rmse_std'

cmap = plt.get_cmap('viridis', 5)
colors = [cmap(i) for i in range(0,3)]
width = 0.25

# (fivefold_df row labels, legend label) for each uESE variant. The four rows
# of a variant are drawn at x = 0, 1, 2, 3, i.e. in the order of the tick
# labels set further down.
variants = [
    ([1,4,7,10], 'uESE-1'),
    ([2,5,8,11], 'uESE-2'),
    ([3,6,9,12], 'uESE-2,3'),
]
for k, (rows, label) in enumerate(variants):
    subset = fivefold_df.loc[rows]
    # Shift each variant by one bar width so the three bars sit side by side
    # at -width, 0 and +width around the tick.
    xpos = np.arange(len(subset)) + (k-1)*width
    ax.bar(xpos,subset[avg_col],color=colors[k],width=width, edgecolor='black', linewidth=2,label=label)
    ax.errorbar(xpos,subset[avg_col],yerr=subset[std_col],fmt='none',capsize=5,c='black')

# Baseline (row 0) as plain scalars instead of one-element Series.
baseline_avg = fivefold_df.loc[0, avg_col]
baseline_std = fivefold_df.loc[0, std_col]
ax.bar(-1,baseline_avg,color='gray',width=width, edgecolor='black', linewidth=2)
ax.errorbar(-1,baseline_avg,yerr=baseline_std,fmt='none',capsize=5,c='black')

# Dashed horizontal reference at the baseline value, spanning every group.
xvalues_line = np.linspace(-1.25,3.5)
ax.plot(xvalues_line,np.full_like(xvalues_line,baseline_avg),c='black',linestyle='--',linewidth=2)

ax.set_xticks([-1,0,1,2,3])
ax.set_xticklabels(['Baseline',r'Theory ($\Delta G^{calc}_\mathrm{solv}$)','Feature-informed','Difference', 'Ratio'])
# Categorical x axis: outward ticks on the bottom only.
ax.tick_params(bottom=True, top=False, left=True, right=False)
ax.xaxis.set_tick_params(which='major', size=7, width=2, direction='out')
plt.xticks(rotation=45,ha='right')
ax.set_xlabel('Model',fontweight='bold')
ax.set_ylabel('RMSE (kcal/mol)',fontweight='bold')
ax.set_ylim(0.0,1.3)
plt.legend(frameon=False)
plt.savefig('./Figures/comparison_RMSE_fivefold_CV_updated.pdf',bbox_inches='tight')
plt.show()

In [None]:
# Grouped bar chart of the five-fold CV MAE: a lone grey bar at x = -1 for
# row 0 of fivefold_df, then one group per x tick (0..3) holding one bar per
# uESE variant, with error bars taken from the matching '*_std' column.
fig, ax = get_figure(figsize=(6.5,4.5))

avg_col, std_col = 'mae_avg', 'mae_std'

cmap = plt.get_cmap('viridis', 5)
colors = [cmap(i) for i in range(0,3)]
width = 0.25

# (fivefold_df row labels, legend label) for each uESE variant. The four rows
# of a variant are drawn at x = 0, 1, 2, 3, i.e. in the order of the tick
# labels set further down.
variants = [
    ([1,4,7,10], 'uESE-1'),
    ([2,5,8,11], 'uESE-2'),
    ([3,6,9,12], 'uESE-2,3'),
]
for k, (rows, label) in enumerate(variants):
    subset = fivefold_df.loc[rows]
    # Shift each variant by one bar width so the three bars sit side by side
    # at -width, 0 and +width around the tick.
    xpos = np.arange(len(subset)) + (k-1)*width
    ax.bar(xpos,subset[avg_col],color=colors[k],width=width, edgecolor='black', linewidth=2,label=label)
    ax.errorbar(xpos,subset[avg_col],yerr=subset[std_col],fmt='none',capsize=5,c='black')

# Baseline (row 0) as plain scalars instead of one-element Series.
baseline_avg = fivefold_df.loc[0, avg_col]
baseline_std = fivefold_df.loc[0, std_col]
ax.bar(-1,baseline_avg,color='gray',width=width, edgecolor='black', linewidth=2)
ax.errorbar(-1,baseline_avg,yerr=baseline_std,fmt='none',capsize=5,c='black')

# Dashed horizontal reference at the baseline value, spanning every group.
xvalues_line = np.linspace(-1.25,3.5)
ax.plot(xvalues_line,np.full_like(xvalues_line,baseline_avg),c='black',linestyle='--',linewidth=2)

ax.set_xticks([-1,0,1,2,3])
ax.set_xticklabels(['Baseline',r'Theory ($\Delta G^{calc}_\mathrm{solv}$)','Feature-informed','Difference', 'Ratio'])
# Categorical x axis: outward ticks on the bottom only.
ax.tick_params(bottom=True, top=False, left=True, right=False)
ax.xaxis.set_tick_params(which='major', size=7, width=2, direction='out')
plt.xticks(rotation=45,ha='right')
ax.set_xlabel('Model',fontweight='bold')
ax.set_ylabel('MAE (kcal/mol)',fontweight='bold')
ax.set_ylim(0.0,0.8)
plt.legend(frameon=False)
plt.savefig('./Figures/comparison_MAE_fivefold_CV_updated.pdf',bbox_inches='tight')
plt.show()

In [None]:
# Grouped bar chart of the five-fold CV R^2: a lone grey bar at x = -1 for
# row 0 of fivefold_df, then one group per x tick (0..3) holding one bar per
# uESE variant, with error bars taken from the matching '*_std' column.
fig, ax = get_figure(figsize=(6.5,4.5))

avg_col, std_col = 'r2_avg', 'r2_std'

cmap = plt.get_cmap('viridis', 5)
colors = [cmap(i) for i in range(0,3)]
width = 0.25

# (fivefold_df row labels, legend label) for each uESE variant. The four rows
# of a variant are drawn at x = 0, 1, 2, 3, i.e. in the order of the tick
# labels set further down.
variants = [
    ([1,4,7,10], 'uESE-1'),
    ([2,5,8,11], 'uESE-2'),
    ([3,6,9,12], 'uESE-2,3'),
]
for k, (rows, label) in enumerate(variants):
    subset = fivefold_df.loc[rows]
    # Shift each variant by one bar width so the three bars sit side by side
    # at -width, 0 and +width around the tick.
    xpos = np.arange(len(subset)) + (k-1)*width
    ax.bar(xpos,subset[avg_col],color=colors[k],width=width, edgecolor='black', linewidth=2,label=label)
    ax.errorbar(xpos,subset[avg_col],yerr=subset[std_col],fmt='none',capsize=5,c='black')

# Baseline (row 0) as plain scalars instead of one-element Series.
baseline_avg = fivefold_df.loc[0, avg_col]
baseline_std = fivefold_df.loc[0, std_col]
ax.bar(-1,baseline_avg,color='gray',width=width, edgecolor='black', linewidth=2)
ax.errorbar(-1,baseline_avg,yerr=baseline_std,fmt='none',capsize=5,c='black')

# Dashed horizontal reference at the baseline value, spanning every group.
xvalues_line = np.linspace(-1.25,3.5)
ax.plot(xvalues_line,np.full_like(xvalues_line,baseline_avg),c='black',linestyle='--',linewidth=2)

ax.set_xticks([-1,0,1,2,3])
ax.set_xticklabels(['Baseline',r'Theory ($\Delta G^{calc}_\mathrm{solv}$)','Feature-informed','Difference', 'Ratio'])
# Categorical x axis: outward ticks on the bottom only.
ax.tick_params(bottom=True, top=False, left=True, right=False)
ax.xaxis.set_tick_params(which='major', size=7, width=2, direction='out')
plt.xticks(rotation=45,ha='right')
ax.set_xlabel('Model',fontweight='bold')
ax.set_ylabel(r'$\mathbf{R^2}$',fontweight='bold')
ax.set_ylim(0.7,1.1)
plt.legend(frameon=False,loc='upper right')
plt.savefig('./Figures/comparison_R2_fivefold_CV_updated.pdf',bbox_inches='tight')
plt.show()

In [None]:
def _load_loss_curves(stem):
    """Read './Loss_vs_Data_Results/<stem>_no_noise.csv' (no header row) as a 2-D array."""
    return pd.read_csv(f'./Loss_vs_Data_Results/{stem}_no_noise.csv',header=None).to_numpy()

# One array per (model, metric). The plotting cells average over axis 0 and
# plot the result against num_data, so columns are expected to line up with
# the training-set sizes defined below.
calconly_rmse = _load_loss_curves('calc_only_calc_rmse')
mlonly_rmse = _load_loss_curves('MLonly_calc_rmse')
informed_rmse = _load_loss_curves('informed_calc_rmse')
diff_calc_rmse = _load_loss_curves('diff_calc_rmse')
quot_calc_rmse = _load_loss_curves('quot_calc_rmse')

calconly_mae = _load_loss_curves('calc_only_calc_mae')
mlonly_mae = _load_loss_curves('MLonly_calc_mae')
informed_mae = _load_loss_curves('informed_calc_mae')
diff_calc_mae = _load_loss_curves('diff_calc_mae')
quot_calc_mae = _load_loss_curves('quot_calc_mae')

calconly_r2 = _load_loss_curves('calc_only_calc_r2')
mlonly_r2 = _load_loss_curves('MLonly_calc_r2')
informed_r2 = _load_loss_curves('informed_calc_r2')
diff_calc_r2 = _load_loss_curves('diff_calc_r2')
quot_calc_r2 = _load_loss_curves('quot_calc_r2')

# Training-set sizes on the x axis: two fixed small sizes, then 5 %, 10 %, ...,
# 95 % of the full data set.
# NOTE(review): the percentage sizes are left as non-integer floats (e.g.
# 125.95); a later cell hard-codes the truncated integers (125, 251, ...) for
# the same axis - confirm which one matches the sizes actually used.
max_data_size = 2519
num_data_list = [20, 50]
percent_data_list = [i/100*max_data_size for i in range(5,100,5)]
num_data = np.concatenate([num_data_list,percent_data_list])


In [None]:
# RMSE learning curves: mean over axis 0 of each results array with a
# +/- one standard deviation band, plotted against the training-set size.
fig, ax = get_figure(figsize=(6,4))

# (results array, label, colour kwargs). The first three curves take their
# colours from the default cycle; the ratio curve is pinned to 'C5'.
# Labels match the R^2 panel; in particular the ratio curve is no longer
# labelled 'Difference'.
curves = [
    (mlonly_rmse,    'Baseline',         {}),
    (informed_rmse,  'Feature-informed', {}),
    (diff_calc_rmse, 'Difference',       {}),
    (quot_calc_rmse, 'Ratio',            {'color': 'C5'}),
]
for runs, label, color_kw in curves:
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,marker='s',label=label,**color_kw)
    ax.fill_between(num_data,mean+std,mean-std,alpha=0.4,**color_kw)

# Dashed horizontal reference: five-fold CV RMSE stored in row 1 of fivefold_df.
ax.plot(num_data,np.full(len(num_data),fivefold_df.loc[1,'rmse_avg']),'--',c='black',label='uESE-1')

ax.set_xlim(0,2500)
ax.set_ylim(0.5,1.5)

ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel('RMSE (kcal/mol)',fontweight='bold')

# Hand-placed marker arrows, coloured like the first three curves:
# (x, y, dy, colour).
for x0, y0, dy, color in [(1075,0.96,-0.04,'C0'), (400,0.96,-0.04,'C1'), (145,0.75,0.04,'C2')]:
    ax.arrow(x0,y0,0.0,dy,head_width=65,head_length=0.04,linewidth=1.75,facecolor=color)

plt.tight_layout()
plt.savefig('./Figures/RMSE_vs_numdata.pdf',bbox_inches='tight')
plt.show()


In [None]:
# MAE learning curves: mean over axis 0 of each results array with a
# +/- one standard deviation band, plotted against the training-set size.
fig, ax = get_figure(figsize=(6,4))

# (results array, label, colour kwargs). The first three curves take their
# colours from the default cycle; the ratio curve is pinned to 'C5'.
# Labels match the R^2 panel ('Baseline' / 'Feature-informed').
curves = [
    (mlonly_mae,    'Baseline',         {}),
    (informed_mae,  'Feature-informed', {}),
    (diff_calc_mae, 'Difference',       {}),
    (quot_calc_mae, 'Ratio',            {'color': 'C5'}),
]
for runs, label, color_kw in curves:
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,marker='s',label=label,**color_kw)
    ax.fill_between(num_data,mean+std,mean-std,alpha=0.4,**color_kw)

# Dashed horizontal reference: five-fold CV MAE stored in row 1 of fivefold_df.
ax.plot(num_data,np.full(len(num_data),fivefold_df.loc[1,'mae_avg']),'--',c='black',label='uESE-1')

ax.set_xlim(0,2500)
ax.set_ylim(0.2,1.2)
ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel('MAE (kcal/mol)',fontweight='bold')

# Hand-placed marker arrows, coloured like the first three curves:
# (x, y, dy, colour).
for x0, y0, dy, color in [(775,0.68,-0.04,'C0'), (300,0.68,-0.04,'C1'), (140,0.45,0.04,'C2')]:
    ax.arrow(x0,y0,0.0,dy,head_width=65,head_length=0.04,linewidth=1.75,facecolor=color)

plt.tight_layout()
plt.savefig('./Figures/MAE_vs_numdata.pdf',bbox_inches='tight')
plt.show()

In [None]:
# R^2 learning curves: mean over axis 0 of each results array with a
# +/- one standard deviation band, plotted against the training-set size.
fig, ax = get_figure(figsize=(6,4))

# (results array, legend label, colour kwargs). The first three curves take
# their colours from the default cycle; the ratio curve is pinned to 'C5'.
curves = [
    (mlonly_r2,    'Baseline',         {}),
    (informed_r2,  'Feature-informed', {}),
    (diff_calc_r2, 'Difference',       {}),
    (quot_calc_r2, 'Ratio',            {'color': 'C5'}),
]
for runs, label, color_kw in curves:
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,marker='s',label=label,**color_kw)
    ax.fill_between(num_data,mean+std,mean-std,alpha=0.4,**color_kw)

# Dashed horizontal reference: five-fold CV R^2 stored in row 1 of fivefold_df.
reference_r2 = fivefold_df.loc[1]['r2_avg']
ax.plot(num_data,np.full(len(num_data),reference_r2),'--',c='black',label='uESE-1')

ax.set_xlim(0,2500)
ax.set_ylim(0.6,1.0)
ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel(r'$\mathbf{R^2}$',fontweight='bold')

# Hand-placed marker arrows, coloured like the first three curves:
# (x, y, dy, colour).
for x0, y0, dy, color in [(1075,0.86,0.015,'C0'), (325,0.86,0.015,'C1'), (140,0.95,-0.015,'C2')]:
    ax.arrow(x0,y0,0.0,dy,head_width=65,head_length=0.02,linewidth=1.75,facecolor=color)

ax.legend(frameon=False,loc='lower right')
plt.tight_layout()
fig.savefig('./Figures/R2_vs_numdata.pdf',bbox_inches='tight')
plt.show()

## Effect of theory approximations

In [None]:
def _read_matrix(path):
    """Read a CSV without a header row and return its contents as a NumPy array."""
    return pd.read_csv(path,header=None).to_numpy()

# Feature-informed model: results for the default theory input plus the
# 'uESE2min' and 'uESE23min' variants. The un-suffixed files are the same
# ones the earlier loss-vs-data cell reads.
informed_rmse = _read_matrix('./Loss_vs_Data_Results/informed_calc_rmse_no_noise.csv')
informed_rmse_uESE2 = _read_matrix('./Loss_vs_Data_Results/informed_calc_rmse_uESE2min_no_noise.csv')
informed_rmse_uESE23 = _read_matrix('./Loss_vs_Data_Results/informed_calc_rmse_uESE23min_no_noise.csv')

informed_mae = _read_matrix('./Loss_vs_Data_Results/informed_calc_mae_no_noise.csv')
informed_mae_uESE2 = _read_matrix('./Loss_vs_Data_Results/informed_calc_mae_uESE2min_no_noise.csv')
informed_mae_uESE23 = _read_matrix('./Loss_vs_Data_Results/informed_calc_mae_uESE23min_no_noise.csv')

informed_r2 = _read_matrix('./Loss_vs_Data_Results/informed_calc_r2_no_noise.csv')
informed_r2_uESE2 = _read_matrix('./Loss_vs_Data_Results/informed_calc_r2_uESE2min_no_noise.csv')
informed_r2_uESE23 = _read_matrix('./Loss_vs_Data_Results/informed_calc_r2_uESE23min_no_noise.csv')

# Difference model: same three variants.
diff_rmse = _read_matrix('./Loss_vs_Data_Results/diff_calc_rmse_no_noise.csv')
diff_rmse_uESE2 = _read_matrix('./Loss_vs_Data_Results/diff_calc_rmse_uESE2min_no_noise.csv')
diff_rmse_uESE23 = _read_matrix('./Loss_vs_Data_Results/diff_calc_rmse_uESE23min_no_noise.csv')

diff_mae = _read_matrix('./Loss_vs_Data_Results/diff_calc_mae_no_noise.csv')
diff_mae_uESE2 = _read_matrix('./Loss_vs_Data_Results/diff_calc_mae_uESE2min_no_noise.csv')
diff_mae_uESE23 = _read_matrix('./Loss_vs_Data_Results/diff_calc_mae_uESE23min_no_noise.csv')

diff_r2 = _read_matrix('./Loss_vs_Data_Results/diff_calc_r2_no_noise.csv')
diff_r2_uESE2 = _read_matrix('./Loss_vs_Data_Results/diff_calc_r2_uESE2min_no_noise.csv')
diff_r2_uESE23 = _read_matrix('./Loss_vs_Data_Results/diff_calc_r2_uESE23min_no_noise.csv')

# Training-set sizes used as the x axis of the plots below (21 points).
num_data = [
    20, 50,
    125, 251, 377, 503, 629, 755, 881, 1007, 1133, 1259,
    1385, 1511, 1637, 1763, 1889, 2015, 2141, 2267, 2393,
]

# NOTE(review): not read by any cell shown here, and it differs from the 2519
# assigned to the same name in the earlier loss-vs-data cell - confirm which
# value is intended.
max_data_size = 2526

In [None]:
# Feature-informed model: RMSE learning curves for the three uESE variants
# (mean over axis 0 with a +/- one standard deviation band), each with a
# dashed horizontal reference taken from fivefold_df.
cmap = plt.get_cmap('viridis', 5)
colors = [cmap(i) for i in range(0,3)]

fig, ax = get_figure(figsize=(6,4))

# (results array, legend label); list position doubles as the colour index.
curves = [
    (informed_rmse,        'uESE-1'),
    (informed_rmse_uESE2,  'uESE-2'),
    (informed_rmse_uESE23, 'uESE-2,3'),
]
for k, (runs, label) in enumerate(curves):
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,marker='s',c=colors[k],label=label)
    ax.fill_between(num_data,mean+std,mean-std,alpha=0.4,color=colors[k])

# Rows 1, 2 and 3 of fivefold_df supply the reference level for the
# first, second and third colour respectively.
for k, row in enumerate((1, 2, 3)):
    level = fivefold_df.loc[row]['rmse_avg']
    ax.plot(num_data,np.full(len(num_data),level),'--',c=colors[k],linewidth=2)

ax.set_xlim(0,2500)
ax.set_ylim(0.1,3)

ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel('RMSE (kcal/mol)',fontweight='bold')
ax.legend(frameon=False)
plt.tight_layout()
fig.savefig('./Figures/RMSE_vs_numdata_FI_uESE.pdf',bbox_inches='tight')
plt.show()

In [None]:
# Difference model: RMSE learning curves for the three uESE variants
# (mean over axis 0 with a +/- one standard deviation band), each with a
# dashed horizontal reference taken from fivefold_df.
cmap = plt.get_cmap('viridis', 5)
colors = [cmap(i) for i in range(0,3)]

fig, ax = get_figure(figsize=(6,4))

# (results array, label); list position doubles as the colour index.
# Every curve carries a label, including uESE-2,3, so a legend can be
# added without one series going missing. No legend is drawn here.
curves = [
    (diff_rmse,        'uESE-1'),
    (diff_rmse_uESE2,  'uESE-2'),
    (diff_rmse_uESE23, 'uESE-2,3'),
]
for k, (runs, label) in enumerate(curves):
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,marker='s',c=colors[k],label=label)
    ax.fill_between(num_data,mean+std,mean-std,alpha=0.4,color=colors[k])

# Rows 1, 2 and 3 of fivefold_df supply the reference level for the
# first, second and third colour respectively.
for k, row in enumerate((1, 2, 3)):
    level = fivefold_df.loc[row,'rmse_avg']
    ax.plot(num_data,np.full(len(num_data),level),'--',c=colors[k],linewidth=2)

ax.set_xlim(0,2500)
ax.set_ylim(0.4,1.3)

ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel('RMSE (kcal/mol)',fontweight='bold')

plt.tight_layout()
plt.savefig('./Figures/RMSE_vs_numdata_Diff_uESE.pdf',bbox_inches='tight')
plt.show()

## Active Learning

In [None]:
# Active-learning RMSE results, one header-less CSV per (model, acquisition
# strategy) pair, read in the order listed and converted to NumPy arrays.
(
    dGsolv_MLonly_rmse_var,
    dGsolv_MLonly_rmse_random,
    dGsolv_informed_rmse_var,
    dGsolv_informed_rmse_diff,
    dGsolv_diff_rmse_var,
    dGsolv_diff_rmse_diff,
) = (
    pd.read_csv(path,header=None).to_numpy()
    for path in (
        './MLonly_GPR_calc_rmse_MaxVar_no_noise.csv',
        './MLonly_GPR_calc_rmse_Random_no_noise.csv',
        './informed_GPR_calc_rmse_MaxVar_no_noise.csv',
        './informed_GPR_calc_rmse_MaxDiff_no_noise.csv',
        './diff_GPR_calc_rmse_MaxVar_no_noise.csv',
        './diff_GPR_calc_rmse_MaxDiff_no_noise.csv',
    )
)

In [None]:
# Active learning, MaxVar acquisition: RMSE (mean over axis 0 with a
# +/- one standard deviation band) against the number of training points,
# compared with random selection for the descriptor-only model.
fig, ax = get_figure(figsize=(6,4))

# x axis: 201 points, starting at 20 and increasing in steps of 5.
num_data = np.arange(0,201)*5.+20

# (results array, legend label, line colour, extra fill_between kwargs).
# Only the random-selection band names its colour; the other three bands
# take theirs from the default colour cycle.
curves = [
    (dGsolv_MLonly_rmse_random, 'Descriptors - Random',      'black', {'color': 'black'}),
    (dGsolv_MLonly_rmse_var,    'Descriptors - MaxVar',      'C0',    {}),
    (dGsolv_informed_rmse_var,  'Feature Informed - MaxVar', 'C1',    {}),
    (dGsolv_diff_rmse_var,      'Difference - MaxVar',       'C2',    {}),
]
for runs, label, line_color, fill_kw in curves:
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,c=line_color,label=label)
    ax.fill_between(num_data,mean+std,mean-std,alpha=0.4,**fill_kw)

# Hard-coded reference level drawn as a dashed horizontal line.
uese1_reference = 1.61
ax.plot(num_data,np.full(len(num_data),uese1_reference),'--',c='black',label='uESE - 1')

ax.set_ylim(0.5,7)

ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel('RMSE (kcal/mol)',fontweight='bold')
ax.legend(frameon=False)
fig.savefig('./Figures/GPR_activelearning_compareMaxVar_RMSE_vs_numdata.pdf',bbox_inches='tight')
plt.show()

In [None]:
# Active learning, MaxVar vs MaxDiff acquisition: RMSE (mean over axis 0 with
# a +/- one standard deviation band) against the number of training points.
fig, ax = get_figure(figsize=(6,4))

# x axis: 201 points, starting at 20 and increasing in steps of 5.
num_data = np.arange(0,201)*5.+20

# (results array, colour, line style, legend label). MaxVar curves are solid,
# MaxDiff curves dash-dotted.
curves = [
    (dGsolv_informed_rmse_var,  'C1', '-',  'Feature Informed - MaxVar'),
    (dGsolv_informed_rmse_diff, 'C5', '-.', 'Feature Informed - MaxDiff'),
    (dGsolv_diff_rmse_var,      'C2', '-',  'Difference - MaxVar'),
    (dGsolv_diff_rmse_diff,     'C3', '-.', 'Difference - MaxDiff'),
]
for runs, color, style, label in curves:
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,c=color,linestyle=style,label=label)
    ax.fill_between(num_data,mean+std,mean-std,color=color,alpha=0.4)

# Hard-coded reference level drawn as a dashed horizontal line.
uese1_reference = 1.61
ax.plot(num_data,np.full(len(num_data),uese1_reference),'--',c='black',label='uESE - 1',linewidth=2)

ax.set_ylim(0.5,7.5)

ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel('RMSE (kcal/mol)',fontweight='bold')
ax.legend(frameon=False)
fig.savefig('./Figures/GPR_activelearning_compareMaxDiff_RMSE_vs_numdata.pdf',bbox_inches='tight')
plt.show()

In [None]:
# Active-learning MAE and R^2 results for the feature-informed model under
# both acquisition strategies; header-less CSVs read in the order listed.
(
    dGsolv_informed_mae_diff,
    dGsolv_informed_r2_diff,
    dGsolv_informed_mae_var,
    dGsolv_informed_r2_var,
) = (
    pd.read_csv(path,header=None).to_numpy()
    for path in (
        './informed_GPR_calc_mae_MaxDiff_no_noise.csv',
        './informed_GPR_calc_r2_MaxDiff_no_noise.csv',
        './informed_GPR_calc_mae_MaxVar_no_noise.csv',
        './informed_GPR_calc_r2_MaxVar_no_noise.csv',
    )
)

In [None]:
# Active learning, feature-informed model: MAE (mean over axis 0 with a
# +/- one standard deviation band) for MaxVar vs MaxDiff acquisition.
fig, ax = get_figure(figsize=(6,4))

# x axis: 201 points, starting at 20 and increasing in steps of 5.
num_data = np.arange(0,201)*5.+20

# (results array, colour, line style, legend label).
curves = [
    (dGsolv_informed_mae_var,  'C1', '-',  'Feature Informed - MaxVar'),
    (dGsolv_informed_mae_diff, 'C5', '-.', 'Feature Informed - MaxDiff'),
]
for runs, color, style, label in curves:
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,c=color,linestyle=style,label=label)
    ax.fill_between(num_data,mean+std,mean-std,color=color,alpha=0.4)

# NOTE(review): 1.61 is the same constant the RMSE cells draw as the
# 'uESE - 1' reference - confirm it is also the intended level for MAE.
uese1_reference = 1.61
ax.plot(num_data,np.full(len(num_data),uese1_reference),'--',c='black',label='uESE - 1',linewidth=2)

ax.set_ylim(0.5,7.5)

ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel('MAE (kcal/mol)',fontweight='bold')
ax.legend(frameon=False)
#plt.savefig('./Figures/GPR_activelearning_compareMaxDiff_MAE_vs_numdata.pdf',bbox_inches='tight')
plt.show()

In [None]:
# Active learning, feature-informed model: R^2 (mean over axis 0 with a
# +/- one standard deviation band) for MaxVar vs MaxDiff acquisition.
fig, ax = get_figure(figsize=(6,4))

# x axis: 201 points, starting at 20 and increasing in steps of 5.
num_data = np.arange(0,201)*5.+20

# (results array, colour, line style, legend label).
curves = [
    (dGsolv_informed_r2_var,  'C1', '-',  'Feature Informed - MaxVar'),
    (dGsolv_informed_r2_diff, 'C5', '-.', 'Feature Informed - MaxDiff'),
]
for runs, color, style, label in curves:
    mean = np.mean(runs,axis=0)
    std = np.std(runs,axis=0)
    ax.plot(num_data,mean,linewidth=2,c=color,linestyle=style,label=label)
    ax.fill_between(num_data,mean+std,mean-std,color=color,alpha=0.4)

# The 'uESE - 1' reference line copied from the RMSE cells (y = 1.61) is
# deliberately not drawn: R^2 cannot exceed 1 and the value lies outside the
# y-limits below, so it only added a legend entry for an invisible line.
# TODO: add a reference line at the actual uESE-1 R^2 once that value is known.

ax.set_ylim(0.5,1.0)

ax.set_xlabel('# of data',fontweight='bold')
ax.set_ylabel(r'$\mathbf{R^2}$',fontweight='bold')
plt.legend(frameon=False)
#plt.savefig('./Figures/GPR_activelearning_compareMaxDiff_r2_vs_numdata.pdf',bbox_inches='tight')
plt.show()