In [1]:
import io
from pathlib import Path

import h5py
import numpy as np
import openpyxl as px
import openpyxl.chart as px_chart
import pandas as pd

import dmrghandler.data_processing as dp
import dmrghandler.hdf5_io as hdf5_io

This notebook collects the DMRG calculation data and writes an Excel file containing the data and plots (one sheet per calculation run), as well as CSV files containing the performance-metric data (bond dimension, DMRG energy, time, discarded weights, etc.).

To run this notebook and collect the data, the following are needed:
- catalysis_instances.csv
    
    This contains the metadata for the catalyst instances.
    Put into `catalyst_meta_data_path`

- DMRG output datafolders (i.e. those that were output with the UUIDs)
    
    Put folder of these datafolders into `data_file_path`

- The run commands returned by the `prepare_calcs_*.py` files
    
    Put into `run_commands`

- SLURM emails, saved as `.eml` files (only files matching `*.eml` in the folder are read)
    
    The notebook will automatically collect the run times from the email subjects.
    Put folder of these emails into `slurm_emails_folder`

- The FCIDUMP files
    
    Put the folder of these files into `fci_dump_folder`

- Excel output save folder
    
    Put into `excel_storage_path`
    
- Name of the output Excel file
    
    Put into `excel_name`

- Performance metrics folder location
    
    Put into `csv_storage_path`

In [2]:
# Inputs and output name for this collection run.
# CSV with the metadata of the catalyst instances.
catalyst_meta_data_path = Path("catalysis_instances.csv")
# File name (not the full path) of the Excel workbook that will be written.
excel_name = Path("20240502_catalyst_up_to_h_space_20.xlsx")
# Folder that holds the DMRG output data folders.
data_file_path = Path(
    "/mnt/d/ZapataCalcs/dmrg_data/data_storage_catalyst_up_to_h_space_20"
)

# Submission commands for the calculations to collect, pasted as text.
# The text is parsed line by line further down in this notebook:
#   - a line starting with "# fcidumps_catalysts/" names the FCIDUMP of a run,
#   - the following "sbatch ../<uuid>/..." line supplies that run's UUID,
#   - lines starting with "### " are skipped (runs that are excluded).
# Everything between the triple quotes is data, not Python comments.
run_commands = \
"""
# up to 20 log10 hilbert space
#######################
# fcidumps_catalysts/fcidump.3_ts_ru_macho_co2_{'Ru': 'cc-pVTZ-PP', 'default': '6-311++G**'}
sbatch ../201317d3-180a-4dc3-a79f-2c8a99c05a51/submit_201317d3-180a-4dc3-a79f-2c8a99c05a51.sh
# fcidumps_catalysts/fcidump.45_1_star_{'default' : 'cc-pVTZ', 'Ru' : 'LANL2TZ'}
sbatch ../2ce09dc8-f33c-4182-a642-6f441ab5bfd9/submit_2ce09dc8-f33c-4182-a642-6f441ab5bfd9.sh
# fcidumps_catalysts/fcidump.49_2_{'default' : 'cc-pVTZ', 'Ru' : 'LANL2TZ'}
sbatch ../149bb248-1537-4fa2-b289-3000312cff96/submit_149bb248-1537-4fa2-b289-3000312cff96.sh
# fcidumps_catalysts/fcidump.40_1_ts_{'default' : 'cc-pVTZ', 'Ru' : 'LANL2TZ'}
sbatch ../89133007-eb5c-4fa0-825e-bdd5f349d7e8/submit_89133007-eb5c-4fa0-825e-bdd5f349d7e8.sh
# fcidumps_catalysts/fcidump.41_1_ts_{'default' : 'cc-pVTZ', 'Ru' : 'LANL2TZ'}
sbatch ../731b2260-39bc-4660-a780-9e176d3096c1/submit_731b2260-39bc-4660-a780-9e176d3096c1.sh
# fcidumps_catalysts/fcidump.56_5_{'default' : '6-31++G(d,p)'}
sbatch ../bda65fe2-de8c-4de2-b482-fb3d91f5070f/submit_bda65fe2-de8c-4de2-b482-fb3d91f5070f.sh
# fcidumps_catalysts/fcidump.50_6acme_{'default' : '6-31++G(d,p)'}
sbatch ../788a6027-f7bd-4070-8cd7-977d215671a6/submit_788a6027-f7bd-4070-8cd7-977d215671a6.sh
# fcidumps_catalysts/fcidump.1_ru_macho_{'Ru': 'cc-pVTZ-PP', 'default': '6-311++G**'}
sbatch ../1940e8e0-3863-412e-a508-0dd2ce5a1cd7/submit_1940e8e0-3863-412e-a508-0dd2ce5a1cd7.sh
# fcidumps_catalysts/fcidump.32_2ru_III_3pl_{'default' : '6-31+G(d,p)', 'Ru' : 'lanl2tz' }
sbatch ../0a6caecd-d18d-41d9-a73b-3db288a6bd9e/submit_0a6caecd-d18d-41d9-a73b-3db288a6bd9e.sh
# fcidumps_catalysts/fcidump.33_2ru_III_3pl_{'default' : '6-31+G(d,p)', 'Ru' : 'lanl2tz' }
sbatch ../5842b252-6f65-4700-9e6f-53ee80ae8c37/submit_5842b252-6f65-4700-9e6f-53ee80ae8c37.sh
# fcidumps_catalysts/fcidump.34_3ruo_IV_2pl_{'Ru' : 'lanl2tz', 'default' : '6-31+G(d,p)'}
sbatch ../9938a711-5f29-4714-a1a3-ae87af1c55d7/submit_9938a711-5f29-4714-a1a3-ae87af1c55d7.sh
# fcidumps_catalysts/fcidump.61_3_15_af_{'default' : 'cc-pVTZ', 'Mn' : 'lanl2tz'}
sbatch ../ee71204e-6eb8-4df5-b72c-40f69a304d5b/submit_ee71204e-6eb8-4df5-b72c-40f69a304d5b.sh
# fcidumps_catalysts/fcidump.35_3ruo_IV_2pl_{'Ru' : 'lanl2tz', 'default' : '6-31+G(d,p)'}
sbatch ../b7fa5052-dc87-475a-bf86-51a08b4d8e2c/submit_b7fa5052-dc87-475a-bf86-51a08b4d8e2c.sh
# fcidumps_catalysts/fcidump.59_5_16_{'default' : 'cc-pVTZ', 'Mn' : 'lanl2tz'}
sbatch ../7f6e14b8-1a30-4bda-b5fc-fbef80575cac/submit_7f6e14b8-1a30-4bda-b5fc-fbef80575cac.sh
# fcidumps_catalysts/fcidump.63_5_15_af_ts_{'default' : 'cc-pVTZ', 'Mn' : 'lanl2tz'}
sbatch ../7210148a-d49c-439a-80eb-4f1f8175cc2b/submit_7210148a-d49c-439a-80eb-4f1f8175cc2b.sh
### # fcidumps_catalysts/fcidump.65_5_15_af_{'default' : 'cc-pVTZ', 'Mn' : 'lanl2tz'}
### sbatch ../b2abb1dc-08bf-4ba2-9bd6-f65430aea44c/submit_b2abb1dc-08bf-4ba2-9bd6-f65430aea44c.sh
# fcidumps_catalysts/fcidump.60_5_16_{'default' : 'cc-pVTZ', 'Mn' : 'lanl2tz'}
sbatch ../36d92b6f-2ca5-43af-a7d1-4303f6349ca1/submit_36d92b6f-2ca5-43af-a7d1-4303f6349ca1.sh

"""

In [3]:
# Folder containing the SLURM notification emails.
slurm_emails_folder = Path(
    "/mnt/d/OneDrive - University of Toronto/zapata/homogeneous_catalysts_dmrg_results_private/small_catalysts_slurm_emails"
)
# Folder containing the FCIDUMP files.
fci_dump_folder = Path("/mnt/d/ZapataCalcs/GSEE/fcidumps_catalysts")
# Folder the Excel workbook is saved into.
excel_storage_path = Path(
    "/mnt/d/OneDrive - University of Toronto/zapata/homogeneous_catalysts_dmrg_results_private/"
)
# The performance-metric CSV files go into a sub-folder of the Excel folder.
csv_storage_path = excel_storage_path / "performance_metrics_store"


In [4]:
# Load the catalyst-instance metadata table. Later cells look rows up through
# its 'fcidump' column and read 'instance ID', 'molecule', 'charge',
# 'multiplicity' and 'log10_hilbert_space_size' from the matching row.
catalyst_meta_data = pd.read_csv(catalyst_meta_data_path)

In [5]:

# Map each calculation UUID to the run time reported in its SLURM email.
cc_wall_time_dict = {}
# Get all eml files in the folder. They are sorted so that, if a UUID shows up
# in more than one email, the entry that ends up in the dict does not depend on
# the (arbitrary) order in which the filesystem lists the files.
slurm_email_files = sorted(slurm_emails_folder.glob('*.eml'))
for email_file in slurm_email_files:
    # Explicit encoding: without it the decoding depends on the machine's locale.
    # errors='replace' keeps a stray undecodable byte somewhere in the message
    # from aborting the whole collection; the text searched for below
    # ('Subject: ', '   Command=../', the run time) is plain ASCII.
    with open(email_file, 'r', encoding='utf-8', errors='replace') as f:
        use_next_line_subject = False
        obtained_run_time = False
        for line in f:
            if use_next_line_subject:
                # The line right after "Subject: " is taken to be the folded
                # continuation of the subject; the run time is the last
                # space-separated token before its first ", ".
                # NOTE(review): this assumes the subject is always folded just
                # before the run time -- confirm against the mail format.
                line = line.strip()
                run_time = line.split(', ')[0].split(' ')[-1]
                obtained_run_time = True
                use_next_line_subject = False
            if line.startswith('Subject: '):
                use_next_line_subject = True
            elif line.startswith('   Command=../'):
                # "Command=../<uuid>/submit_<uuid>.sh" -> the UUID is the path
                # component right after "..".
                line = line.strip()
                uuid = line.split('/')[1]
                assert obtained_run_time, 'Run time not yet obtained'
                cc_wall_time_dict[uuid] = run_time

print(cc_wall_time_dict)

{'a6d74014-60d7-489b-90a4-42d488a86836': '00:08:25', '53503986-79fe-4eb4-a52c-8a78d2808746': '07:12:14', 'fc3d3e2d-e5e9-467f-a5e3-e84f9e9f5e16': '01:56:42', '9bc04ea9-6225-4854-8619-b64709df6ab3': '01:22:37', '6a62e336-fd78-4981-b1df-fe5dd77e983f': '12:57:46', '2a259e78-6b5f-4500-b680-35484a27bee3': '00:55:20', '4bf5704f-82dd-452b-b8ce-d7035e4fec43': '00:26:51', 'fe9f283a-3813-4c7f-823b-89d0db32c65b': '00:47:05', 'e9d726a5-cd3c-49a7-a3ed-408579d54446': '14:57:50', '189a415c-8a33-455c-afc0-2c24821b5ee1': '07:14:28', 'ea207f47-74ce-4c29-8661-0f89b87aa657': '06:20:33', '834f65c2-43ee-4ed3-baca-e815484b1e82': '05:31:33', 'e5d9a6e3-473c-4ab4-a7a0-c7133b1d3284': '06:49:06', '2eb73a36-19d7-406b-a436-3c28e6d9eb37': '00:00:27', 'b745eddc-955b-4cf7-8938-d07c04392bed': '04:03:45', 'e067a6b7-b8d8-4b35-8886-85c5bb9ce7fd': '05:03:25', '2baf3e03-efda-4079-acc5-e352446677f2': '05:31:52', 'ae193cf7-e649-48bd-88b4-705122fde28b': '00:00:27', '7477aa8f-50b0-4e16-bf2a-2c6854aa0174': '00:09:57', '01a306c7-f

In [6]:
# Load all fcidump file names.
# Sorted because the cell that parses `run_commands` takes the first name that
# starts with a given "fcidump.<id>_" prefix; without a fixed order the chosen
# file could differ between machines or runs when several files share a prefix.
fcidump_files = sorted(filename.name for filename in fci_dump_folder.glob('fcidump.*'))
# fcidump_files


In [7]:

# Pair every "# fcidumps_catalysts/<name>" line of `run_commands` with the
# "sbatch ../<uuid>/..." line that follows it, giving one record per calculation.
buf = io.StringIO(run_commands)
orig_data_dict_list = []
# Record currently being assembled; None until an fcidump line has been seen
# and again after the record has been completed by its sbatch line.
dict_entry = None
for line in buf:
    if line.startswith('### '):
        # Lines prefixed with "### " are runs that were switched off.
        continue
    if line.startswith('# fcidumps_catalysts/'):
        fcidump_name_temp = line.split('fcidumps_catalysts/')[1].strip()
        # Only the "fcidump.<id>_" prefix is compared: the file names on disk
        # are not identical to the names written in `run_commands` (characters
        # such as ':' and '*' appear as '_' in the listed files).
        fcidump_test_string = fcidump_name_temp.split('_')[0] + '_'
        fcidump_name = next(
            (name for name in fcidump_files if name.startswith(fcidump_test_string)),
            None,
        )
        if fcidump_name is None:
            # Previously the loop variable was left on the last file in the
            # list, silently attaching the wrong FCIDUMP to this calculation.
            raise FileNotFoundError(
                f"No FCIDUMP file starting with {fcidump_test_string!r} found in {fci_dump_folder}"
            )
        print(fcidump_name)
        dict_entry = {
            'fcidump': fcidump_name,
            'fcidump_test_string': fcidump_test_string,
        }
    elif line.startswith('sbatch ../'):
        if dict_entry is None:
            # An sbatch line without its own fcidump line would otherwise
            # overwrite and re-append the previous calculation's record.
            raise ValueError(
                f"sbatch line without a preceding '# fcidumps_catalysts/' line: {line.strip()!r}"
            )
        calc_uuid = line.split('sbatch ../')[1].split('/')[0]
        dict_entry['Calc UUID'] = calc_uuid
        # Raises KeyError when no SLURM email was found for this UUID.
        dict_entry['CC Wall Time'] = cc_wall_time_dict[calc_uuid]
        dict_entry['Attempt Result'] = ""
        orig_data_dict_list.append(dict_entry)
        dict_entry = None

print(orig_data_dict_list)

fcidump.3_ts_ru_macho_co2_{'Ru'_ 'cc-pVTZ-PP', 'default'_ '6-311++G__'}
fcidump.45_1_star_{'default' _ 'cc-pVTZ', 'Ru' _ 'LANL2TZ'}
fcidump.49_2_{'default' _ 'cc-pVTZ', 'Ru' _ 'LANL2TZ'}
fcidump.40_1_ts_{'default' _ 'cc-pVTZ', 'Ru' _ 'LANL2TZ'}
fcidump.41_1_ts_{'default' _ 'cc-pVTZ', 'Ru' _ 'LANL2TZ'}
fcidump.56_5_{'default' _ '6-31++G(d,p)'}
fcidump.50_6acme_{'default' _ '6-31++G(d,p)'}
fcidump.1_ru_macho_{'Ru'_ 'cc-pVTZ-PP', 'default'_ '6-311++G__'}
fcidump.32_2ru_III_3pl_{'default' _ '6-31+G(d,p)', 'Ru' _ 'lanl2tz' }
fcidump.33_2ru_III_3pl_{'default' _ '6-31+G(d,p)', 'Ru' _ 'lanl2tz' }
fcidump.34_3ruo_IV_2pl_{'Ru' _ 'lanl2tz', 'default' _ '6-31+G(d,p)'}
fcidump.61_3_15_af_{'default' _ 'cc-pVTZ', 'Mn' _ 'lanl2tz'}
fcidump.35_3ruo_IV_2pl_{'Ru' _ 'lanl2tz', 'default' _ '6-31+G(d,p)'}
fcidump.59_5_16_{'default' _ 'cc-pVTZ', 'Mn' _ 'lanl2tz'}
fcidump.63_5_15_af_ts_{'default' _ 'cc-pVTZ', 'Mn' _ 'lanl2tz'}
fcidump.60_5_16_{'default' _ 'cc-pVTZ', 'Mn' _ 'lanl2tz'}
[{'fcidump': "fcidump.3_t

In [8]:
# catalyst_meta_data.columns

In [9]:
# Combine each calculation record with the matching row of the catalyst
# metadata table.
data_dict_list = []
for data_dict in orig_data_dict_list:
    new_data_dict = {}
    fcidump_test_string = data_dict['fcidump_test_string']
    # Get metadata row.
    # regex=False: the search string contains '.', which as a regular
    # expression would match any character; a literal substring is intended.
    # na=False: rows without an 'fcidump' entry count as "no match" instead of
    # putting NaN into the boolean mask (which cannot be used for indexing).
    row_mask = catalyst_meta_data['fcidump'].str.contains(
        fcidump_test_string, regex=False, na=False
    )
    data_row = catalyst_meta_data[row_mask]
    if data_row.empty:
        # Same exception type that `.values[0]` raised on an empty selection,
        # but saying which FCIDUMP is missing from the metadata.
        raise IndexError(
            f"No catalyst metadata row has an 'fcidump' entry containing {fcidump_test_string!r}"
        )
    # If several rows match, the first one is used (unchanged behaviour).
    # The keys are inserted in the same order as before.
    new_data_dict['instance ID'] = data_row['instance ID'].values[0]
    new_data_dict['molecule'] = data_row['molecule'].values[0]
    new_data_dict['charge'] = data_row['charge'].values[0]
    new_data_dict['multiplicity'] = data_row['multiplicity'].values[0]
    new_data_dict['fcidump'] = data_dict['fcidump']
    new_data_dict['log10_hilbert_space_size'] = data_row['log10_hilbert_space_size'].values[0]
    new_data_dict['Attempt Result'] = data_dict['Attempt Result']
    new_data_dict['Calc UUID'] = data_dict['Calc UUID']
    new_data_dict['CC Wall Time'] = data_dict['CC Wall Time']
    # Optional fields, copied only when the record carries them.
    if 'Calc UUID Small BD' in data_dict:
        new_data_dict['Calc UUID Small BD'] = data_dict['Calc UUID Small BD']
        new_data_dict['CC Wall Time Small BD'] = data_dict['CC Wall Time Small BD']
    data_dict_list.append(new_data_dict)



    

In [10]:
# data_dict_list

In [11]:
# Create an empty workbook and hand it, together with the collected records,
# to dmrghandler's `setup_workbook`.
# NOTE(review): per the notebook introduction this presumably fills one sheet
# per calculation run and writes the performance-metric CSVs -- confirm in
# dmrghandler.data_processing.
wb = px.Workbook()

dp.setup_workbook(
    data_file_path=data_file_path,
    data_dict_list=data_dict_list,
    workbook=wb,
    csv_storage_path=csv_storage_path,
)

# Write the workbook to <excel_storage_path>/<excel_name>.
wb.save(excel_storage_path / excel_name)

Last loop included = 35
Processed results available
Checking that processed results match raw results.
Last loop included = 35
Processed results available
Checking that processed results match raw results.
Last loop included = 38
Processed results available
Checking that processed results match raw results.
Last loop included = 21
Processed results available
Checking that processed results match raw results.
Last loop included = 20
Processed results available
Checking that processed results match raw results.
Last loop included = 45
Last loop included = 28
Processed results available
Checking that processed results match raw results.
Last loop included = 44
Last loop included = 1
Processed results available
Checking that processed results match raw results.
Last loop included = 43
Last loop included = 1
Processed results available
Checking that processed results match raw results.
Last loop included = 25
Processed results available
Checking that processed results match raw results.
Las