In [1]:
import pickle
import numpy as np
import sys
import os
import pandas as pd

This notebook generates the density-fitted features for the structures in VSS-452 and CSD-76. Generating these features requires a Psi4 wave function file for both the high-spin and the low-spin state of each structure. The features are then obtained by postprocessing a density fitting calculation on those wave functions; an example can be seen in the `density_fitting` subdirectory. Due to their large size, the wave function files are not provided in this repository. However, the `density_fitting` subdirectory and this notebook together demonstrate how to convert a Psi4 spin-splitting calculation into the features used in this work.

# VSS-452

In [2]:
functional = 'scan'

# Directory expected to contain one sub-folder per VSS-452 structure.
path = '../../vss_data/wfns/'+functional+'0/vss452/'

# Directory entries without a '.' in their name are treated as structure folders.
folders = [x for x in os.listdir(path) if '.' not in x]
output_df = pd.DataFrame(index=folders, columns=['features'])
all_data = {'symbol': [], 'spec': []}
failed = 0
removed = 0
load_errors = {}  # folder -> repr of the exception that made loading fail

energies = pd.read_csv('../data/cleaned_vss452_sse.csv')
# Keep only the last path component so the index matches the folder names.
energies['Unnamed: 0'] = energies['Unnamed: 0'].apply(lambda x: x.split('/')[-1])
energies = energies.set_index('Unnamed: 0')

dropped = []  # folders to remove from output_df, dropped once after the loop
for folder in folders:
    # Structures whose energy entry is NaN are excluded.
    if np.isnan(energies.loc[folder, functional+'_hfx_25']):
        removed += 1
        dropped.append(folder)
        continue
    try:
        # NOTE: allow_pickle=True unpickles the file, which can execute arbitrary
        # code; only load df_output.pkl files from a trusted source.
        data = np.load(path + folder + '/df_output.pkl', allow_pickle=True)
        # Read both entries before appending so that a file missing one of the
        # keys cannot leave all_data['symbol'] and all_data['spec'] misaligned.
        symbol, spec = data['symbol'], data['spec']
    except Exception as err:
        # Best-effort: skip structures that cannot be loaded, but keep the reason
        # for inspection. Unlike a bare except, KeyboardInterrupt still propagates.
        failed += 1
        load_errors[folder] = repr(err)
        dropped.append(folder)
        continue
    all_data['symbol'].append(symbol)
    all_data['spec'].append(spec)

output_df = output_df.drop(dropped)

len(folders), failed, removed, len(output_df), len(all_data['symbol'])

(408, 32, 53, 323, 323)

In [3]:
def standardize_over_all(x: list, padding_len: int = 1) -> dict:
    '''
    Compute the standardization statistics of one atom type.

    The mean and standard deviation are taken over the first axis of ``x``.
    Standard deviations below 1e-6 are replaced by 1, and both vectors are
    extended by ``padding_len`` extra entries (mean 0, std. dev. 1).

    Params:
    --------
        x: list,
            a list of density fitting features for a specific atom type
        padding_len: int, default as 1,
            the number of extra padding entries; default as 1 for the atom type

    Returns:
    --------
        mean_std: dict,
            {"mean": ..., "std": ...} for the density fitting features,
            each of length (number of features + padding_len)
    '''
    features = np.array(x)

    # Mean per feature, followed by zeros for the padded entries.
    padded_mean = np.concatenate(
        [np.mean(features, axis=0), np.zeros(padding_len)], axis=-1
    )

    # Std. dev. per feature; (near-)constant features get 1, as do the padded entries.
    raw_std = np.std(features, axis=0)
    safe_std = np.where(raw_std < 1e-6, 1.0, raw_std)
    padded_std = np.concatenate([safe_std, np.ones(padding_len)], axis=-1)

    return {"mean": padded_mean, "std": padded_std}
    
atoms  = ["X", "H", "C", "N", "O", "F", "Cr", "Mn", "Fe", "Co"] # X represents a "vacuum" atom
# Integer atom-type label appended as the last feature of every atom.
atom_maps = {"X": 0, "H": 1, "C": 2, "N": 3, "O": 4, "F": 5, "P": 3, "S": 4, "Cl": 5, "Cr": 6, "Mn": 7, "Fe": 8, "Co": 9}
ele_group = {"H": "H", "C": "C", "N": "N", "O": "O", "F": "F", "P": "N", "S": "O", "Cl": "F", 
             "Cr": "Cr", "Mn": "Mn", "Fe": "Fe", "Co": "Co"} # here we use the same local network for 2p/3p pairs.
max_size = 65 # largest number of atoms in the complexes of VSS-452. The trained models are not limited to this size.
spec_len = 58 # each spin channel is zero-padded to this many density fitting features
n_features = 2*spec_len + 1 # alpha + beta + atom-type label

res_tot = all_data
tot_sample = len(res_tot["symbol"])

# ---normalize density fitting features---
# Collect the padded [alpha, beta] vectors of every atom, grouped by element group.
arrs = {ele: [] for ele in atoms[1:]}
for ii in range(tot_sample):
    spec = res_tot["spec"][ii]
    for jj, ele in enumerate(res_tot["symbol"][ii]):
        _alpha = np.array(spec['alpha'][jj])
        _beta = np.array(spec['beta'][jj])
        # Pad each channel by its OWN length (np.pad zero-pads by default).
        _den_alpha = np.pad(_alpha, (0, spec_len - len(_alpha)))
        _den_beta = np.pad(_beta, (0, spec_len - len(_beta)))
        arrs[ele_group[ele]].append(np.concatenate([_den_alpha, _den_beta], axis=-1))

# padding_len=1 leaves the trailing atom-type label unscaled (mean 0, std 1).
standard_dict = {ele: standardize_over_all(arrs[ele], padding_len=1) for ele in atoms[1:]}
standard_dict["X"] = {"mean": np.zeros(shape=(n_features, )), "std": np.ones(shape=(n_features, ))}

# ---get normalized features---
# Rows beyond the number of atoms in a structure stay all-zero.
X = np.zeros(shape=(tot_sample, max_size, n_features))
for ii in range(tot_sample):
    spec = res_tot["spec"][ii]
    for jj, ele in enumerate(res_tot['symbol'][ii]):
        _alpha = np.array(spec['alpha'][jj])
        _beta = np.array(spec['beta'][jj])
        _den_alpha = np.pad(_alpha, (0, spec_len - len(_alpha)), 'constant', constant_values=(0, 0))
        _den_beta = np.pad(_beta, (0, spec_len - len(_beta)), 'constant', constant_values=(0, 0))
        _tot = np.concatenate([_den_alpha, _den_beta, np.array([atom_maps[ele]])], axis=-1)
        _stats = standard_dict[ele_group[ele]]
        X[ii, jj, :] = (_tot - _stats["mean"])/_stats["std"]

X.shape

(323, 65, 117)

In [4]:
#save files
# Create the output directory if it is missing so the writes below do not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs("BP_features", exist_ok=True)

# Normalized feature array for VSS-452.
with open("BP_features/"+functional+"0-vss452_X.pkl", "wb") as fo:
    pickle.dump(X, fo)

# Per-element mean/std used for the normalization.
with open("BP_features/"+functional+"0-standard_dict.pkl", "wb") as fo:
    pickle.dump(standard_dict, fo)

# Structure names, in the same order as the rows of X.
output_df.to_csv("BP_features/"+functional+"0-vss452_structures.csv")

# CSD-76

In [5]:
# Directory expected to contain one sub-folder per CSD-76 structure.
path = '../../vss_data/wfns/'+functional+'0/csd76/'

# Keep directory entries without a '.' in their name, skipping any whose name contains 'Cr'.
folders = [x for x in os.listdir(path) if '.' not in x and 'Cr' not in x]
output_df = pd.DataFrame(index=folders, columns=['features'])
all_data = {'symbol': [], 'spec': []}
failed = 0
removed = 0
load_errors = {}  # folder -> repr of the exception that made loading fail

energies = pd.read_csv('../data/cleaned_csd76_sse.csv')
energies = energies.set_index('Unnamed: 0')

dropped = []  # folders to remove from output_df, dropped once after the loop
for folder in folders:
    # Structures whose energy entry is NaN are excluded.
    if np.isnan(energies.loc[folder, functional+'_hfx_25']):
        removed += 1
        dropped.append(folder)
        continue
    try:
        # NOTE: allow_pickle=True unpickles the file, which can execute arbitrary
        # code; only load df_output.pkl files from a trusted source.
        data = np.load(path + folder + '/df_output.pkl', allow_pickle=True)
        # Read both entries before appending so that a file missing one of the
        # keys cannot leave all_data['symbol'] and all_data['spec'] misaligned.
        symbol, spec = data['symbol'], data['spec']
    except Exception as err:
        # Best-effort: skip structures that cannot be loaded, but keep the reason
        # for inspection. Unlike a bare except, KeyboardInterrupt still propagates.
        failed += 1
        load_errors[folder] = repr(err)
        dropped.append(folder)
        continue
    all_data['symbol'].append(symbol)
    all_data['spec'].append(spec)

output_df = output_df.drop(dropped)

len(folders), failed, removed, len(output_df), len(all_data['symbol'])

(68, 1, 3, 64, 64)

In [6]:
# Load the saved per-element mean/std so CSD-76 is scaled with those statistics.
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open("BP_features/"+functional+"0-standard_dict.pkl", "rb") as fi:
    standard_dict = pickle.load(fi)

atoms  = ["X", "H", "C", "N", "O", "F", "Cr", "Mn", "Fe", "Co"] # X represents a "vacuum" atom
# Integer atom-type label appended as the last feature of every atom.
atom_maps = {"X": 0, "H": 1, "C": 2, "N": 3, "O": 4, "F": 5, "P": 3, "S": 4, "Cl": 5, "Cr": 6, "Mn": 7, "Fe": 8, "Co": 9}
ele_group = {"H": "H", "C": "C", "N": "N", "O": "O", "F": "F", "P": "N", "S": "O", "Cl": "F", 
             "Cr": "Cr", "Mn": "Mn", "Fe": "Fe", "Co": "Co"} # here we use the same local network for 2p/3p pairs.
max_size = 65 # largest number of atoms in the complexes of VSS-452. The trained models are not limited to this size.
spec_len = 58 # each spin channel is zero-padded to this many density fitting features
n_features = 2*spec_len + 1 # alpha + beta + atom-type label

res_tot = all_data
tot_sample = len(res_tot["symbol"])

# ---get normalized features---
# Rows beyond the number of atoms in a structure stay all-zero.
X = np.zeros(shape=(tot_sample, max_size, n_features))
for ii in range(tot_sample):
    spec = res_tot["spec"][ii]
    for jj, ele in enumerate(res_tot['symbol'][ii]):
        _alpha = np.array(spec['alpha'][jj])
        _beta = np.array(spec['beta'][jj])
        # Pad each channel by its OWN length.
        _den_alpha = np.pad(_alpha, (0, spec_len - len(_alpha)), 'constant', constant_values=(0, 0))
        _den_beta = np.pad(_beta, (0, spec_len - len(_beta)), 'constant', constant_values=(0, 0))
        _tot = np.concatenate([_den_alpha, _den_beta, np.array([atom_maps[ele]])], axis=-1)
        _stats = standard_dict[ele_group[ele]]
        X[ii, jj, :] = (_tot - _stats["mean"])/_stats["std"]

X.shape

(64, 65, 117)

In [7]:
# Write the CSD-76 outputs: the normalized feature array and the structure list.
features_file = "BP_features/"+functional+"0-csd76_X.pkl"
structures_file = "BP_features/"+functional+"0-csd76_structures.csv"

with open(features_file, "wb") as handle:
    pickle.dump(X, handle)

output_df.to_csv(structures_file)