In [1]:
import h5py, os, glob, json
from block2 import SU2
from pyblock2.driver.core import DMRGDriver, SymmetryTypes
import pandas as pd
import numpy as np
from dmrg_utils import load_mps, max_det_coeff, expand_csf
from cas_utils import log_hilbert_space_size
from matplotlib import pyplot as plt
from pathlib import Path
import urllib.request
import zipfile

In [2]:
# General parameters
# Zip archive of the benchmark repository (main branch).
repository_url = "https://github.com/isi-usc-edu/qb-gsee-benchmark/archive/refs/heads/main.zip"
# Sub-directory of the repository that is searched for problem instance JSON files.
problem_instance_files_repository_path = "problem_instances"

In [3]:
# Download problem instance files
repository_filepath = Path("repository.zip")

repository_path = Path("qb-gsee-benchmark-main")
if not repository_path.exists():
    # Download the repository archive to the same path we extract from below.
    urllib.request.urlretrieve(repository_url, repository_filepath)
    # Extract with the standard library rather than shelling out to `unzip`:
    # this works on systems without the binary and raises on a corrupt or
    # missing archive instead of silently continuing with no files.
    with zipfile.ZipFile(repository_filepath) as archive:
        archive.extractall()

# Collect every problem instance description shipped with the repository.
problem_instance_files_path = repository_path / problem_instance_files_repository_path
instance_files = list(problem_instance_files_path.glob('problem_instance.*.json'))

In [4]:
# Retrieve relevant metadata
# One record per task: its features, the first FCIDUMP supporting file (if
# any), the reference energy information and the task UUID.
task_records = []

for instance_file in instance_files:
    with open(instance_file, 'r') as handle:
        instance = json.load(handle)

    for task_data in instance['tasks']:
        record = dict(task_data['features'])

        # Pick the first supporting file whose URL mentions an FCIDUMP.
        fcidump_info = next(
            (
                info
                for info in task_data['supporting_files']
                if 'fcidump' in info['instance_data_object_url']
                or 'FCIDUMP' in info['instance_data_object_url']
            ),
            None,
        )
        if fcidump_info is not None:
            record = {**record, **fcidump_info}

        # Report tasks for which no FCIDUMP URL ended up in the record.
        if 'instance_data_object_url' not in record:
            print(f'{task_data["task_uuid"]} does not have an fcidump!')
            print(task_data['supporting_files'])
            print(10*'-')

        requirements = task_data['requirements']
        record['reference_energy'] = requirements.get('reference_energy')
        record['reference_energy_units'] = requirements.get('reference_energy_units')
        record['task_uuid'] = task_data['task_uuid']
        task_records.append(record)

metadata = pd.DataFrame(task_records)

In [5]:
# Rows that carry AVAS values use them as num_electrons / num_orbitals;
# the remaining rows keep their original columns. Both parts are then
# stacked back into a single frame with a fresh index.
has_avas = ~metadata.avas_ne.isna()
metadata_0 = (
    metadata.loc[has_avas]
    .drop(columns=['num_electrons', 'num_orbitals'])
    .rename(columns={'avas_ne' : 'num_electrons', 'avas_no' : 'num_orbitals'})
)
metadata_1 = metadata.loc[~has_avas].drop(columns=['avas_ne', 'avas_no'])
metadata = pd.concat([metadata_0, metadata_1], ignore_index=True)

In [6]:
# Show the column names of the combined metadata frame.
metadata.columns

Index(['molecule_name', 'geometry', 'basis_set', 'charge', 'multiplicity',
       'instance_data_object_uuid', 'instance_data_object_url',
       'instance_data_checksum', 'instance_data_checksum_type',
       'reference_energy', 'reference_energy_units', 'task_uuid',
       'utility_scale', 'software_used', 'avas_atomic_orbitals', 'avas_minao',
       'num_electrons', 'num_orbitals', 'nbasis', 'correlated_electrons'],
      dtype='object')

In [7]:
#metadata.loc[metadata['instance_data_object_url'].isna()]

In [8]:
# UUIDs of tasks whose metadata has no electron count.
lacks_electron_count = metadata.num_electrons.isna()
task_uuid_incomp = metadata.loc[lacks_electron_count, 'task_uuid'].to_list()
print(task_uuid_incomp)

['9030e9c9-0323-413c-a98e-aba16b180ba7', 'c7c653b0-4440-4a36-a66b-8b81a9a351c7', '82f35801-6276-4090-a9aa-b821d94708d4', '362386c9-f0c2-483e-a64d-52208ab9bbc1', '61ad4cfe-b057-4dd8-bd06-ed56347745c1', 'fc25a232-d249-4e1c-a9ec-360cab33f779', 'b99ec02f-b46a-43fe-8a87-1b3286e072d8', '8874bf11-f594-41df-b8a0-d2a7441904a1', '202e6184-1e7d-4c02-a82a-f588b6abf809', '7c8f0a36-cda5-4b33-bb46-4e58a15b37a1', 'cbab5937-5e2f-4f3f-819e-ede8b66d7238', 'c28e09e1-5983-42f7-9ecb-b271438ebcc8', '881f5275-83eb-470c-80fe-eebb7ced30e9', 'e0de65ed-e45e-46e0-a57b-c6ad4efbdafe', '6918e02b-1e04-40bf-a800-1c9ed100d261', 'a2ad41b8-37bb-44e1-be3e-df1052952e9c', 'b4968ccd-6711-4173-8f80-f16094b3e0cd', 'ee2925f1-a302-4c21-b33c-5cb30b10270f', 'b86457a4-abf4-43c9-9c08-2397e95b7c0f', '7148b36a-0571-4a41-9239-19a97339f68a']


In [9]:
from utils import retrieve_fcidump_from_sftp
# SFTP credentials used to fetch FCIDUMP files. Both values can be overridden
# through environment variables so the notebook is not tied to one user's
# machine; the key defaults to ~/.ssh/darpa-qb-zapata-key.ppk.
username = os.environ.get('QB_GSEE_SFTP_USERNAME', 'darpa-qb-zapata')
key_path = os.environ.get(
    'QB_GSEE_SFTP_KEY_PATH',
    str(Path.home() / '.ssh' / 'darpa-qb-zapata-key.ppk'),
)
def process_fcidump(r):
    """Return ``(num_electrons, num_orbitals, multiplicity)`` for one metadata row.

    Rows whose ``task_uuid`` is not listed in ``task_uuid_incomp`` return the
    values already stored in the row. For the listed rows the FCIDUMP is
    fetched with ``retrieve_fcidump_from_sftp`` (using the module-level
    ``username`` and ``key_path``) and the tuple is built from its ``NELEC``,
    ``NORB`` and ``MS2 + 1`` entries.
    """
    if r['task_uuid'] not in task_uuid_incomp:
        return (r['num_electrons'], r['num_orbitals'], r['multiplicity'])

    fcidump = retrieve_fcidump_from_sftp(r['instance_data_object_url'], username, key_path)
    n_elec = fcidump['NELEC']
    n_orb = fcidump['NORB']
    multiplicity = fcidump['MS2'] + 1
    return (n_elec, n_orb, multiplicity)

In [10]:
# Run process_fcidump on every row and write the resulting three values back
# into the electron count, orbital count and multiplicity columns.
fcidump_fields = metadata.apply(process_fcidump, axis=1, result_type='expand')
metadata[['num_electrons', 'num_orbitals', 'multiplicity']] = fcidump_fields

Downloading gsee/FCIDUMP_d_1.68_b_sto-3g_ne_12.3092dd74-660d-4c7a-9d43-16d1436e084b.gz to FCIDUMP_d_1.68_b_sto-3g_ne_12.3092dd74-660d-4c7a-9d43-16d1436e084b.gz...
Parsing FCIDUMP_d_1.68_b_sto-3g_ne_12.3092dd74-660d-4c7a-9d43-16d1436e084b
Downloading gsee/FCIDUMP_d_1.68_b_sto-3g_ne_28.96843098-e69d-4d1f-8a88-5b24826f7390.gz to FCIDUMP_d_1.68_b_sto-3g_ne_28.96843098-e69d-4d1f-8a88-5b24826f7390.gz...
Parsing FCIDUMP_d_1.68_b_sto-3g_ne_28.96843098-e69d-4d1f-8a88-5b24826f7390
Downloading gsee/FCIDUMP_d_1.68_b_cc-pvdz-dk_ne_12.673dfe91-d90e-4ecd-8560-d6d74de11070.gz to FCIDUMP_d_1.68_b_cc-pvdz-dk_ne_12.673dfe91-d90e-4ecd-8560-d6d74de11070.gz...
Parsing FCIDUMP_d_1.68_b_cc-pvdz-dk_ne_12.673dfe91-d90e-4ecd-8560-d6d74de11070
Downloading gsee/FCIDUMP_d_1.68_b_cc-pvdz-dk_ne_28.4412b7d6-86db-4616-9dd2-2c32ee02560f.gz to FCIDUMP_d_1.68_b_cc-pvdz-dk_ne_28.4412b7d6-86db-4616-9dd2-2c32ee02560f.gz...
Parsing FCIDUMP_d_1.68_b_cc-pvdz-dk_ne_28.4412b7d6-86db-4616-9dd2-2c32ee02560f
Downloading gsee/FCIDUMP

In [11]:
#metadata.multiplicity.unique()

In [12]:
# We will exclude Cr2 instances here since they are not properly formatted
#metadata = metadata.loc[~metadata.num_electrons.isna()]

In [13]:
mps_data_dir = Path('../data/data_storage/') # Location of MPSs;
list_of_solution_dirs = list(mps_data_dir.glob('*'))
# The task UUID is taken as the part of each entry name after the last underscore.
solution_paths = [
    {'task_uuid' : sol_dir.name.split('_')[-1], 'solution_dir' : sol_dir}
    for sol_dir in list_of_solution_dirs
]
# Fix the column schema explicitly so that an empty data directory still
# yields a frame with a `task_uuid` column that later lookups can query.
sol_paths = pd.DataFrame(solution_paths, columns=['task_uuid', 'solution_dir'])
print(sol_paths.head(1))

                              task_uuid  \
0  0f0c8766-4bf5-4c1e-991a-9dafe9bc8f27   

                                        solution_dir  
0  ../data/data_storage/V0_vdz_a1436084-b348-4c11...  


In [14]:
# Task UUIDs already present in overlaps.csv (if that file exists); an empty
# list otherwise.
if os.path.isfile('overlaps.csv'):
    data = pd.read_csv('overlaps.csv')
    processed_uuids = data['task_uuid'].to_list()
else:
    processed_uuids = []

In [15]:
# For every task that is not already in processed_uuids: locate its stored
# MPS, load it, and record the coefficient/CSF pair returned by max_det_coeff.
# Tasks without a solution directory are collected in missing_solutions.
overlap_data = []
missing_solutions = []
for _, row in metadata.iterrows():
    task_uuid = row["task_uuid"]
    if task_uuid in processed_uuids:
        continue

    n_electrons = int(row["num_electrons"])
    spin = int(row["multiplicity"]) - 1  # multiplicity - 1 is passed as `spin`
    n_cas = int(row["num_orbitals"])

    path_to_solution = sol_paths.query(f'task_uuid == "{task_uuid}"')
    if path_to_solution.empty:
        missing_solutions.append({'task_uuid' : task_uuid, 'molecule' : row['molecule_name']})
        continue

    # A fresh driver is set up for each task with that task's system size.
    driver = DMRGDriver(scratch="./tmp", symm_type=SymmetryTypes.SU2, n_threads=4)
    driver.initialize_system(n_sites=n_cas, n_elec=n_electrons, spin=spin)

    mps_storage_dir = path_to_solution['solution_dir'].item() / 'mps_storage'
    stored_states = list(mps_storage_dir.glob("*"))
    assert len(stored_states) == 1 # we should have just one solution per instance
    dmrg_loop_dir = stored_states[0]
    print(dmrg_loop_dir)

    assert os.path.isfile(dmrg_loop_dir/'mps_info.bin')

    ket = load_mps(dmrg_loop_dir)
    coeff, csf = max_det_coeff(driver, ket, cutoff=0.0005)

    # Only the magnitude of the returned coefficient is stored.
    overlap_data.append(
        dict(
            task_uuid=task_uuid,
            num_orbitals=row['num_orbitals'],
            num_electrons=row['num_electrons'],
            hf_coeff=np.abs(coeff),
            csf=csf,
        )
    )
        

../data/data_storage/38_1_ts_noncan_0_466245ef-7efa-480e-9724-1dcb4534b820_9da97ef6-4bad-4c82-8576-9b8e539a7ba8/mps_storage/dmrg_loop_027_ket_optimized
../data/data_storage/39_1_ts_noncan_0_4ded7042-dd68-4fa9-8e23-56d839c26910_f20db1b1-86e3-4ddf-96ff-a6d37f331935/mps_storage/dmrg_loop_033_ket_optimized
../data/data_storage/40_1_ts_noncan_0_05efedcd-ca71-4522-a5b9-8ffeb945fb78_742be33c-0982-4b06-b324-1d86242e28d9/mps_storage/first_preloop_calc_ket_optimized
../data/data_storage/41_1_ts_noncan_0_b1147cc6-f8d8-4fb3-a037-d6ed92203ccc_3b49563d-948a-4ecb-84a0-d3c0c2c0d217/mps_storage/first_preloop_calc_ket_optimized
../data/data_storage/42_1_star_noncan_0_9ca331da-b0f6-4bc8-8387-3fffc02d9672_950da653-13ce-4cd7-95d8-5045ab03d4bc/mps_storage/dmrg_loop_036_ket_optimized
../data/data_storage/43_1_star_noncan_0_881d2a46-e6b4-4fd5-bce4-b539222bf7ee_13821f5c-78a6-485c-9006-a17a234cde9f/mps_storage/dmrg_loop_048_ket_optimized
../data/data_storage/44_1_star_noncan_0_a847b2fc-f824-4b5c-9bd2-44e1756f94

In [16]:
# Build the result frames with an explicit column schema. Without it, an empty
# list (no missing solutions, or nothing new to process) produces a frame with
# no columns and the following cells fail on `.molecule` / `['overlap', 'task_uuid']`.
missing_solutions_df = pd.DataFrame(missing_solutions, columns=['task_uuid', 'molecule'])
overlap_data_df = pd.DataFrame(
    overlap_data,
    columns=['task_uuid', 'num_orbitals', 'num_electrons', 'hf_coeff', 'csf'],
)

In [17]:
# Distinct molecule names among the tasks recorded as missing a solution.
missing_solutions_df.molecule.unique()

array(['be_cc-pVDZ', 'V1_vtz'], dtype=object)

In [18]:
# Preview the first rows of the collected overlap table.
overlap_data_df.head()

Unnamed: 0,task_uuid,num_orbitals,num_electrons,hf_coeff,csf
0,9da97ef6-4bad-4c82-8576-9b8e539a7ba8,18.0,22.0,0.736407,"[3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, 1, 3, 3, 0, ..."
1,f20db1b1-86e3-4ddf-96ff-a6d37f331935,22.0,30.0,0.654199,"[1, 3, 3, 3, 3, 3, 0, 2, 3, 3, 0, 3, 0, 0, 0, ..."
2,742be33c-0982-4b06-b324-1d86242e28d9,34.0,50.0,0.252078,"[3, 3, 0, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
3,3b49563d-948a-4ecb-84a0-d3c0c2c0d217,36.0,50.0,0.179748,"[0, 3, 1, 0, 3, 0, 3, 3, 3, 3, 3, 3, 2, 3, 1, ..."
4,950da653-13ce-4cd7-95d8-5045ab03d4bc,18.0,22.0,0.540153,"[1, 3, 1, 0, 0, 0, 0, 3, 0, 3, 2, 3, 3, 3, 3, ..."


In [19]:
# (rows, columns) of the overlap table.
overlap_data_df.shape

(36, 5)

In [20]:
# Store the coefficient under the name 'overlap' and save it next to the task UUID.
# NOTE(review): this writes 'overlaps_.csv', while the resume check earlier in
# the notebook reads 'overlaps.csv' — confirm whether the differing names are intentional.
overlap_data_df = overlap_data_df.rename(columns={'hf_coeff': 'overlap'})
overlap_export = overlap_data_df[['overlap', 'task_uuid']]
overlap_export.to_csv('overlaps_.csv', index=False)