In [1]:
import numpy as np
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from scipy.optimize import minimize
import os

This notebook demonstrates how to convert HFX% predictions to predictions of the SSE by linearly interpolating each structure's SSE vs. HFX% curve at the predicted HFX%. It also generates the `energy-*.csv` files, in which the target and prediction values are in kcal/mol instead of HFX%.

In [2]:
def _strip_directories(df):
    '''
    Returns a copy of df in which every index label is reduced to the text
    after its last '/'. Labels that contain no '/' are left unchanged.
    '''
    return df.rename(index=lambda label: label.split('/')[-1])


# CSD-76 tables
csd_sse_df = pd.read_csv('../data/cleaned_csd76_sse.csv').set_index('Unnamed: 0')
csd_hfx_df = pd.read_csv('../data/CSD76targets.csv').set_index('Unnamed: 0')
csd_76 = pd.read_csv('../data/CSD-76.csv').set_index('name')

# VSS-452 tables. The SSE and HFX-target tables have '/'-separated index
# labels; only the last element is kept so that rows can be looked up with
# the bare structure keys used in the rest of this notebook.
vss_sse_df = _strip_directories(
    pd.read_csv('../data/cleaned_vss452_sse.csv').set_index('Unnamed: 0')
)
vss_hfx_df = _strip_directories(
    pd.read_csv('../data/VSS452targets.csv').set_index('Unnamed: 0')
)
vss_452 = pd.read_csv('../data/VSS-452.csv').set_index('name')

def pred_energy(structure, functional, dataset, pred_df, pred_name):
    '''
    Converts a prediction of HFX% to a prediction of the VSSE through linear interpolation of the VSSE vs. HFX curve.
    Predictions outside the range of available HFX increments are linearly extrapolated.

    structure is the index of the structure of interest
    functional is the prefix of the SSE column names, which are read as
        functional + '_hfx_' + increment for increment = 0, 5, ..., 100
    dataset is either 'csd' for CSD-76-HFX or 'vss' for VSS-452-HFX
    pred_df is a dataframe of predicted HFX%
    pred_name is the name of the column containing HFX predictions in pred_df

    Returns the interpolated value as a float, or None when the HFX prediction
    is NaN or fewer than 5 increments have a non-NaN SSE for this structure.
    Raises ValueError when dataset is neither 'csd' nor 'vss'.
    '''
    if dataset == 'csd':
        sse_df = csd_sse_df
    elif dataset == 'vss':
        sse_df = vss_sse_df
    else:
        raise ValueError("Specify a valid dataset!")

    # Scalar lookup; avoids materialising the whole prediction row.
    pred_hfx = pred_df.loc[structure, pred_name]
    if np.isnan(pred_hfx):
        return None

    # Fetch the structure's SSE row once instead of once per increment.
    sse_row = sse_df.loc[structure]

    # Keep only the increments that have an SSE value.
    increments = []
    sses = []
    for increment in range(0, 101, 5):
        sse = sse_row[functional + '_hfx_' + str(increment)]
        if not np.isnan(sse):
            increments.append(increment)
            sses.append(sse)
    if len(increments) < 5:
        # Not enough points on the curve to interpolate.
        return None

    line = interp1d(increments, sses, kind='linear', fill_value='extrapolate')
    # interp1d returns a 0-d array for scalar input; hand back a plain float
    # so that columns built from these values are numeric rather than object.
    return float(line(pred_hfx))

# Generating files with energy predictions

In [3]:
# DF-BP conversions
# For every model folder and every split, read the HFX% predictions, attach the
# DLPNO-CCSD(T) reference values and the interpolated PBE/SCAN values, and
# write the result next to the input with an 'energy-' prefix.

path = '../ml_training/DF-BP/'
# Model folders: entries without a '.' in their name that are really
# directories. Sorted so that the processing order is reproducible.
model_folders = [
    path + x
    for x in sorted(os.listdir(path))
    if '.' not in x and os.path.isdir(path + x)
]
for folder in tqdm(model_folders):
    for split in ['train', 'val', 'test']:
        # The test split is looked up in the CSD-76 tables; train and val in VSS-452.
        if split == 'test':
            dataset = 'csd'
            ref_df = csd_76
        else:
            dataset = 'vss'
            ref_df = vss_452
        pred_df = pd.read_csv(folder + '/BP_predictions_hyperparams-' + split + '.csv').set_index('Unnamed: 0')
        # One label-based lookup for all structures; to_numpy() avoids index alignment on assignment.
        pred_df['DLPNO-CCSD(T) SSE'] = ref_df.loc[pred_df.index, 'dlpno-CCSD_T.vertsse'].to_numpy()
        pred_df['PBEx SSE'] = [pred_energy(x, 'pbe', dataset, pred_df, 'PBE Prediction') for x in pred_df.index]
        pred_df['SCANx SSE'] = [pred_energy(x, 'scan', dataset, pred_df, 'SCAN Prediction') for x in pred_df.index]
        pred_df.to_csv(folder + '/energy-BP_predictions_hyperparams-' + split + '.csv')

100%|███████████████████████████████████████████| 21/21 [00:39<00:00,  1.88s/it]


In [4]:
# RACs conversions
# Each prediction column is named '<functional>-...'; the part before the first
# '-' selects the SSE curve that pred_energy interpolates.

racs_dir = '../ml_training/RACs/'
for dataset, ref_df in [('csd', csd_76), ('vss', vss_452)]:
    pred_df = pd.read_csv(racs_dir + dataset + '_racs_predictions.csv').set_index('Unnamed: 0')
    # Snapshot the input columns before any output column is added, so the
    # loop below only visits columns that came from the file.
    input_cols = list(pred_df.columns)
    # The reference column does not depend on the prediction column: build it once.
    pred_df['DLPNO-CCSD(T) SSE'] = ref_df.loc[pred_df.index, 'dlpno-CCSD_T.vertsse'].to_numpy()
    for col in tqdm(input_cols):
        # Skip the HFX target columns and, for the VSS file only, any column containing 'set'.
        if col in ['hfx_pbe', 'hfx_scan'] or (dataset == 'vss' and 'set' in col):
            continue
        pred_df[col + ' SSE'] = [pred_energy(x, col.split('-')[0], dataset, pred_df, col) for x in pred_df.index]
    # Write once, after all columns have been converted.
    pred_df.to_csv(racs_dir + 'energy-' + dataset + '_racs_predictions.csv')

100%|███████████████████████████████████████████| 38/38 [00:06<00:00,  5.76it/s]
100%|███████████████████████████████████████████| 74/74 [00:44<00:00,  1.65it/s]


In [5]:
# RACs replicating DF-BP conversions
# Same conversion as for the RACs predictions, with one CSD and one VSS file per functional.

replicate_dir = '../ml_training/racs_replicate_dfbp_set/'
for functional in ['pbe', 'scan']:
    for dataset, ref_df in [('csd', csd_76), ('vss', vss_452)]:
        pred_df = pd.read_csv(replicate_dir + functional + '-' + dataset + '_racs_predictions.csv').set_index('Unnamed: 0')
        # Snapshot the input columns before any output column is added, so the
        # loop below only visits columns that came from the file.
        input_cols = list(pred_df.columns)
        # The reference column does not depend on the prediction column: build it once.
        pred_df['DLPNO-CCSD(T) SSE'] = ref_df.loc[pred_df.index, 'dlpno-CCSD_T.vertsse'].to_numpy()
        for col in tqdm(input_cols):
            # Skip the HFX target columns and, for the VSS file only, any column containing 'set'.
            if col in ['hfx_pbe', 'hfx_scan'] or (dataset == 'vss' and 'set' in col):
                continue
            pred_df[col + ' SSE'] = [pred_energy(x, col.split('-')[0], dataset, pred_df, col) for x in pred_df.index]
        # Write once, after all columns have been converted.
        pred_df.to_csv(replicate_dir + 'energy-' + functional + '-' + dataset + '_racs_predictions.csv')

100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 15.53it/s]
100%|█████████████████████████████████████████████| 4/4 [00:00<00:00,  4.00it/s]
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 15.67it/s]
100%|█████████████████████████████████████████████| 4/4 [00:00<00:00,  4.41it/s]


In [6]:
# IP tuning targets
# The last whitespace-separated word of each column name, lower-cased, selects
# the SSE curve that pred_energy interpolates on the CSD tables.

pred_df = pd.read_csv('../data/tuned_targets.csv').set_index('Unnamed: 0')
# Snapshot the input columns before any output column is added, so the loop
# below only visits columns that came from the file.
target_cols = list(pred_df.columns)
# The reference column does not depend on the target column: build it once.
pred_df['DLPNO-CCSD(T) SSE'] = csd_76.loc[pred_df.index, 'dlpno-CCSD_T.vertsse'].to_numpy()
for col in tqdm(target_cols):
    pred_df[col + ' SSE'] = [pred_energy(x, col.split()[-1].lower(), 'csd', pred_df, col) for x in pred_df.index]
# Write once, after all columns have been converted.
pred_df.to_csv('../data/energy-tuned_targets.csv')

100%|█████████████████████████████████████████████| 4/4 [00:00<00:00,  5.04it/s]
