This notebook analyzes how well fitted and transferred DFAs perform when the evaluation is restricted to TMCs containing a single, specific metal.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import minimize
from scipy.stats import linregress
from tqdm import tqdm

In [2]:
# --- CSD-76 tables ------------------------------------------------------------
# SSE table, HFX target table (values limited to the 0-100 range) and the
# table that carries the 'dlpno-CCSD_T.vertsse' reference column.
csd_sse_df = pd.read_csv('../data/cleaned_csd76_sse.csv').set_index('Unnamed: 0')
csd_hfx_df = (
    pd.read_csv('../data/CSD76targets.csv')
    .set_index('Unnamed: 0')
    .clip(lower=0, upper=100)
)
csd_76 = pd.read_csv('../data/CSD-76.csv').set_index('name')

# --- VSS-452 tables -----------------------------------------------------------
# Index labels in the VSS files are '/'-separated; only the last component is
# kept so the labels line up with the 'name' index of VSS-452.csv
# (presumably -- verify against the data files).
vss_sse_df = pd.read_csv('../data/cleaned_vss452_sse.csv').set_index('Unnamed: 0')
names = {label: label.split('/')[-1] for label in vss_sse_df.index}
vss_sse_df = vss_sse_df.rename(index=names)

vss_hfx_df = pd.read_csv('../data/VSS452targets.csv').set_index('Unnamed: 0')
names = {label: label.split('/')[-1] for label in vss_hfx_df.index}
vss_hfx_df = vss_hfx_df.rename(index=names).clip(lower=0, upper=100)

vss_452 = pd.read_csv('../data/VSS-452.csv').set_index('name')

# --- Per-functional views -----------------------------------------------------
# Split each SSE table into a PBE-only and a SCAN-only frame by removing the
# columns whose name mentions the other functional.
vss_pbe_sse = vss_sse_df.drop(columns=[col for col in vss_sse_df.columns if 'scan' in col])
vss_scan_sse = vss_sse_df.drop(columns=[col for col in vss_sse_df.columns if 'pbe' in col])
csd_pbe_sse = csd_sse_df.drop(columns=[col for col in csd_sse_df.columns if 'scan' in col])
csd_scan_sse = csd_sse_df.drop(columns=[col for col in csd_sse_df.columns if 'pbe' in col])

# Map every column name to the integer after its last underscore, so the frames
# can be indexed directly by that number. The maps are built from the VSS
# columns and reused for the CSD frames.
scan_map = {col: int(col.split('_')[-1]) for col in vss_scan_sse.columns}
pbe_map = {col: int(col.split('_')[-1]) for col in vss_pbe_sse.columns}

vss_scan_sse = vss_scan_sse.rename(columns=scan_map).sort_index(axis=1)
vss_pbe_sse = vss_pbe_sse.rename(columns=pbe_map).sort_index(axis=1)
csd_scan_sse = csd_scan_sse.rename(columns=scan_map).sort_index(axis=1)
csd_pbe_sse = csd_pbe_sse.rename(columns=pbe_map).sort_index(axis=1)

def single_HFX_AE(dataset, pred_hfx):
    """Absolute errors of the PBE- and SCAN-based predictions at one HFX value.

    For every structure in the selected dataset's HFX target table, the
    tabulated ``<functional>_hfx_<n>`` values (n = 0, 5, ..., 100) are
    linearly interpolated -- and extrapolated outside the available range --
    at ``pred_hfx``. The result is compared with the structure's
    ``'dlpno-CCSD_T.vertsse'`` reference value.

    Parameters
    ----------
    dataset : str
        ``'csd'`` or ``'vss'``; selects which module-level tables are used.
    pred_hfx : float or array-like
        HFX value(s) at which to evaluate the interpolant; passed to it
        unchanged.

    Returns
    -------
    tuple of numpy.ndarray
        ``(pbe_abs_errors, scan_abs_errors)``. A structure is skipped for a
        functional when fewer than five of its tabulated values are non-NaN,
        so the two arrays may have different lengths.

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``'csd'`` nor ``'vss'``.
    """
    # Resolve the tables once and reject unknown dataset names up front;
    # otherwise the failure only surfaces later as an unbound local variable.
    if dataset == 'csd':
        ref_df, sse_df, hfx_df = csd_76, csd_sse_df, csd_hfx_df
    elif dataset == 'vss':
        ref_df, sse_df, hfx_df = vss_452, vss_sse_df, vss_hfx_df
    else:
        raise ValueError("Specify a valid dataset!")

    all_increments = np.arange(0, 101, 5)
    min_points = 5  # minimum number of non-NaN grid points required to interpolate

    def pred_energy(structure, functional):
        """Interpolated value for one structure/functional, or None if too sparse."""
        row = sse_df.loc[structure]
        increments = []
        sses = []
        for increment in all_increments:
            sse = row[functional + '_hfx_' + str(increment)]
            if not np.isnan(sse):
                increments.append(increment)
                sses.append(sse)
        if len(increments) < min_points:
            return None

        line = interp1d(increments, sses, kind='linear', fill_value='extrapolate')
        return line(pred_hfx)

    pbe_errors = []
    scan_errors = []

    for struct in hfx_df.index.to_list():
        reference = ref_df.loc[struct]['dlpno-CCSD_T.vertsse']
        pbe_energy = pred_energy(struct, 'pbe')
        if pbe_energy is not None:
            pbe_errors.append(reference - pbe_energy)
        scan_energy = pred_energy(struct, 'scan')
        if scan_energy is not None:
            scan_errors.append(reference - scan_energy)

    return np.abs(np.array(pbe_errors)), np.abs(np.array(scan_errors))

In [3]:
# Fit one HFX value per (functional, dataset) pair by minimising the mean
# absolute error returned by single_HFX_AE over the whole dataset.
functionals = ['pbe', 'scan']

single_hfx_pbe_csd = np.nan
single_hfx_scan_csd = np.nan
single_hfx_pbe_vss = np.nan
single_hfx_scan_vss = np.nan

# Position of each functional in the (pbe, scan) tuple returned by single_HFX_AE.
error_slot = {'pbe': 0, 'scan': 1}


def _single_hfx_mae(hfx, dataset, slot):
    """Mean absolute error of one functional on one dataset at HFX value ``hfx``."""
    return single_HFX_AE(dataset, hfx)[slot].mean()


single_hfx_mae = {}
for base_functional in functionals:
    for datasetname in ('csd', 'vss'):
        print(f'Functional {base_functional} dataset {datasetname}:')
        # 40 is the initial HFX guess; the dataset and functional are passed
        # explicitly through `args` instead of being captured from loop variables.
        res = minimize(
            _single_hfx_mae,
            40,
            args=(datasetname, error_slot[base_functional]),
            method='Nelder-Mead',
        )
        print(res)
        single_hfx_mae[(base_functional, datasetname)] = res.fun

# Minimised mean absolute errors under their original variable names.
single_hfx_pbe_csd = single_hfx_mae[('pbe', 'csd')]
single_hfx_scan_csd = single_hfx_mae[('scan', 'csd')]
single_hfx_pbe_vss = single_hfx_mae[('pbe', 'vss')]
single_hfx_scan_vss = single_hfx_mae[('scan', 'vss')]

Functional pbe dataset csd:
       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 5.006685965037144
             x: [ 3.127e+01]
           nit: 19
          nfev: 38
 final_simplex: (array([[ 3.127e+01],
                       [ 3.127e+01]]), array([ 5.007e+00,  5.007e+00]))
Functional pbe dataset vss:
       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 6.929096577429319
             x: [ 3.512e+01]
           nit: 18
          nfev: 36
 final_simplex: (array([[ 3.512e+01],
                       [ 3.512e+01]]), array([ 6.929e+00,  6.929e+00]))
Functional scan dataset csd:
       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 5.679388418962995
             x: [ 3.307e+01]
           nit: 23
          nfev: 46
 final_simplex: (array([[ 3.307e+01],
                       [ 3.307e+01]]), array([ 5.679e+00,  5.679e+00]))
Fun

In [4]:
def single_metal_HFX_AE(dataset, metal, pred_hfx):
    """Absolute PBE/SCAN prediction errors at one HFX value for a single metal.

    Only structures whose index label starts with ``metal`` are evaluated.
    For each of them the tabulated ``<functional>_hfx_<n>`` values
    (n = 0, 5, ..., 100) are linearly interpolated -- and extrapolated outside
    the available range -- at ``pred_hfx`` and compared with the structure's
    ``'dlpno-CCSD_T.vertsse'`` reference value.

    Parameters
    ----------
    dataset : str
        ``'csd'`` or ``'vss'``; selects which module-level tables are used.
    metal : str
        Label prefix selecting the structures (matched with ``str.startswith``,
        so it is case-sensitive).
    pred_hfx : float or array-like
        HFX value(s) at which to evaluate the interpolant; passed to it
        unchanged.

    Returns
    -------
    tuple of numpy.ndarray
        ``(pbe_abs_errors, scan_abs_errors)``. A structure is skipped for a
        functional when fewer than five of its tabulated values are non-NaN,
        so the two arrays may have different lengths. Both are empty when no
        label matches ``metal``.

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``'csd'`` nor ``'vss'``.
    """
    # Resolve the tables once and reject unknown dataset names up front;
    # otherwise the failure only surfaces later as an unbound local variable.
    if dataset == 'csd':
        ref_df, sse_df, hfx_df = csd_76, csd_sse_df, csd_hfx_df
    elif dataset == 'vss':
        ref_df, sse_df, hfx_df = vss_452, vss_sse_df, vss_hfx_df
    else:
        raise ValueError("Specify a valid dataset!")

    # Select the metal's structures a single time instead of re-filtering the
    # tables for every structure/functional lookup.
    structures = [x for x in hfx_df.index if x.startswith(metal)]

    all_increments = np.arange(0, 101, 5)
    min_points = 5  # minimum number of non-NaN grid points required to interpolate

    def pred_energy(structure, functional):
        """Interpolated value for one structure/functional, or None if too sparse."""
        row = sse_df.loc[structure]
        increments = []
        sses = []
        for increment in all_increments:
            sse = row[functional + '_hfx_' + str(increment)]
            if not np.isnan(sse):
                increments.append(increment)
                sses.append(sse)
        if len(increments) < min_points:
            return None

        line = interp1d(increments, sses, kind='linear', fill_value='extrapolate')
        return line(pred_hfx)

    pbe_errors = []
    scan_errors = []

    for struct in structures:
        reference = ref_df.loc[struct]['dlpno-CCSD_T.vertsse']
        pbe_energy = pred_energy(struct, 'pbe')
        if pbe_energy is not None:
            pbe_errors.append(reference - pbe_energy)
        scan_energy = pred_energy(struct, 'scan')
        if scan_energy is not None:
            scan_errors.append(reference - scan_energy)

    return np.abs(np.array(pbe_errors)), np.abs(np.array(scan_errors))

In [5]:
# Fit one HFX value per (functional, dataset, metal) by minimising the mean
# absolute error returned by single_metal_HFX_AE.
functionals = ['pbe', 'scan']

single_hfx_pbe_csd = np.nan
single_hfx_scan_csd = np.nan
single_hfx_pbe_vss = np.nan
single_hfx_scan_vss = np.nan

csd_opt_hfx_pbe = {}
vss_opt_hfx_pbe = {}
csd_opt_hfx_scan = {}
vss_opt_hfx_scan = {}

# Minimised MAE for every (functional, dataset, metal) combination.
single_metal_mae = {}

# Position of each functional in the (pbe, scan) tuple returned by single_metal_HFX_AE.
error_slot = {'pbe': 0, 'scan': 1}

# Per dataset: the HFX target table (used only to count structures) and the
# metal label prefixes, which are capitalised differently in the two datasets.
dataset_setup = {
    'csd': (csd_hfx_df, ['Co', 'Fe', 'Mn']),
    'vss': (vss_hfx_df, ['cr', 'co', 'fe', 'mn']),
}

# Where the optimal HFX of each (functional, dataset) pair is stored.
opt_hfx_store = {
    ('pbe', 'csd'): csd_opt_hfx_pbe,
    ('scan', 'csd'): csd_opt_hfx_scan,
    ('pbe', 'vss'): vss_opt_hfx_pbe,
    ('scan', 'vss'): vss_opt_hfx_scan,
}


def _single_metal_mae(hfx, dataset, metal, slot):
    """Mean absolute error of one functional for one metal at HFX value ``hfx``."""
    return single_metal_HFX_AE(dataset, metal, hfx)[slot].mean()


for base_functional in functionals:
    for datasetname, (target_df, metals) in dataset_setup.items():
        print(f'Functional {base_functional} dataset {datasetname}:')
        for metal in metals:
            print(f'Metal: {metal}')
            metal_rows = target_df.loc[[x for x in target_df.index if x.startswith(metal)]]
            n_structures = len(metal_rows['hfx_' + base_functional].dropna())
            print(f"Number of Structures with {metal}: {n_structures}")
            # 40 is the initial HFX guess. The metal is passed explicitly via
            # `args`, so the objective does not depend on a loop variable that
            # is only defined after the objective itself.
            res = minimize(
                _single_metal_mae,
                40,
                args=(datasetname, metal, error_slot[base_functional]),
                method='Nelder-Mead',
            )
            print(res)
            opt_hfx_store[(base_functional, datasetname)][metal] = res.x[0]
            single_metal_mae[(base_functional, datasetname, metal)] = res.fun

            # NOTE(review): these scalars are overwritten on every metal, so
            # after the loop they hold only the value for the last metal of
            # each dataset and replace any whole-dataset values stored under
            # the same names earlier. Kept for backward compatibility; use
            # `single_metal_mae` for per-metal results.
            if datasetname == 'csd':
                if base_functional == 'pbe':
                    single_hfx_pbe_csd = res.fun
                else:
                    single_hfx_scan_csd = res.fun
            else:
                if base_functional == 'pbe':
                    single_hfx_pbe_vss = res.fun
                else:
                    single_hfx_scan_vss = res.fun

Functional pbe dataset csd:
Metal: Co
Number of Structures with Co: 22
       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 2.478110133103952
             x: [ 2.693e+01]
           nit: 21
          nfev: 42
 final_simplex: (array([[ 2.693e+01],
                       [ 2.693e+01]]), array([ 2.478e+00,  2.478e+00]))
Metal: Fe
Number of Structures with Fe: 39
       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 4.572572014729041
             x: [ 3.419e+01]
           nit: 19
          nfev: 38
 final_simplex: (array([[ 3.419e+01],
                       [ 3.419e+01]]), array([ 4.573e+00,  4.573e+00]))
Metal: Mn
Number of Structures with Mn: 8
       message: Optimization terminated successfully.
       success: True
        status: 0
           fun: 5.087010066663526
             x: [ 2.654e+01]
           nit: 21
          nfev: 42
 final_simplex: (array([[ 2.654e+01],
    

In [6]:
#transferred error
# Evaluate each dataset at the per-metal optimal HFX that was fitted on the
# *other* dataset, and report the resulting mean absolute error.

# The two datasets capitalise metal labels differently ('Co' vs 'co'), so each
# dictionary of optimal HFX values is re-keyed to the convention of the
# dataset it will be applied to.
vss_opt_hfx_scan = {metal.capitalize(): hfx for metal, hfx in vss_opt_hfx_scan.items()}
csd_opt_hfx_scan = {metal.lower(): hfx for metal, hfx in csd_opt_hfx_scan.items()}
vss_opt_hfx_pbe = {metal.capitalize(): hfx for metal, hfx in vss_opt_hfx_pbe.items()}
csd_opt_hfx_pbe = {metal.lower(): hfx for metal, hfx in csd_opt_hfx_pbe.items()}


functionals = ['pbe', 'scan']

# Position of each functional in the (pbe, scan) tuple returned by single_metal_HFX_AE.
error_slot = {'pbe': 0, 'scan': 1}

# Per target dataset: its HFX target table (used only to count structures),
# the metal label prefixes to evaluate, and the optimal-HFX dictionaries
# fitted on the other dataset.
transfer_setup = {
    'csd': (csd_hfx_df, ['Co', 'Fe', 'Mn'],
            {'pbe': vss_opt_hfx_pbe, 'scan': vss_opt_hfx_scan}),
    'vss': (vss_hfx_df, ['co', 'fe', 'mn'],
            {'pbe': csd_opt_hfx_pbe, 'scan': csd_opt_hfx_scan}),
}

for base_functional in functionals:
    slot = error_slot[base_functional]
    for datasetname, (target_df, metals, donor_hfx) in transfer_setup.items():
        opt_hfx = donor_hfx[base_functional]
        print(f'Functional {base_functional} dataset {datasetname}:')
        for metal in metals:
            print(f'Metal: {metal}')
            metal_rows = target_df.loc[[x for x in target_df.index if x.startswith(metal)]]
            n_structures = len(metal_rows['hfx_' + base_functional].dropna())
            print(f"Number of Structures with {metal}: {n_structures}")
            # Direct evaluation at the transferred HFX; nothing is optimised here.
            res = single_metal_HFX_AE(datasetname, metal, opt_hfx[metal])[slot].mean()
            print(f"Transferred error: {res:1.3f} kcal/mol")

Functional pbe dataset csd:
Metal: Co
Number of Structures with Co: 22
Transferred error: 8.982 kcal/mol
Metal: Fe
Number of Structures with Fe: 39
Transferred error: 4.577 kcal/mol
Metal: Mn
Number of Structures with Mn: 8
Transferred error: 9.912 kcal/mol
Functional pbe dataset vss:
Metal: co
Number of Structures with co: 91
Transferred error: 8.448 kcal/mol
Metal: fe
Number of Structures with fe: 162
Transferred error: 3.714 kcal/mol
Metal: mn
Number of Structures with mn: 78
Transferred error: 15.394 kcal/mol
Functional scan dataset csd:
Metal: Co
Number of Structures with Co: 22
Transferred error: 8.974 kcal/mol
Metal: Fe
Number of Structures with Fe: 37
Transferred error: 3.238 kcal/mol
Metal: Mn
Number of Structures with Mn: 8
Transferred error: 11.706 kcal/mol
Functional scan dataset vss:
Metal: co
Number of Structures with co: 88
Transferred error: 9.084 kcal/mol
Metal: fe
Number of Structures with fe: 155
Transferred error: 2.738 kcal/mol
Metal: mn
Number of Structures with m