In [1]:
import numpy as np
import h5py
import dmrghandler.hdf5_io as hdf5_io
import dmrghandler.data_processing as dp
from pathlib import Path
import openpyxl as px
import openpyxl.chart as px_chart
import pandas as pd
import re
import io

This notebook collects the DMRG calculation data and writes an Excel file containing the data and plots (one sheet per calculation run), as well as CSV files containing the performance metric data (bond dimension, DMRG energy, time, discarded weights, etc.).

To run this notebook and collect the data, the following are needed:
- catalysis_instances.csv
    
    This contains the metadata for the catalyst instances.
    Put into `catalyst_meta_data_path`

- DMRG output datafolders (i.e. those that were output with the UUIDs)
    
    Put folder of these datafolders into `data_file_path`

- The run commands returned by the `prepare_calcs_*.py` files
    
    Put into `run_commands`

- SLURM emails (.eml or other text format should work)
    
    The notebook will automatically collect the run times from the email subjects.
    Put folder of these emails into `slurm_emails_folder`

- The FCIDUMP files
    
    Put the folder of these files into `fci_dump_folder`

- Excel output save folder
    
    Put into `excel_storage_path`
    
- Name of the output Excel file
    
    Put into `excel_name`

- Performance metrics folder location
    
    Put into `csv_storage_path`

In [2]:


# Run commands as returned by the `prepare_calcs_*.py` scripts, pasted verbatim.
# The parsing cell further down reads this text line by line:
#   - lines starting with "### " are skipped (used here to disable the first,
#     <150 BD max, run of fcidump.24),
#   - lines starting with "# fcidumps_catalysts/" name the FCIDUMP file that the
#     following submission belongs to,
#   - lines starting with "sbatch ../" supply the calculation UUID (the folder
#     name directly after "../").
# Every other line (section titles, "cd ..." commands, blank lines) is ignored.
run_commands = \
"""
# 4a, fixed orb Symm
#####################################
# fcidumps_catalysts/fcidump.30_4a_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../a6d74014-60d7-489b-90a4-42d488a86836/submit_a6d74014-60d7-489b-90a4-42d488a86836.sh
# fcidumps_catalysts/fcidump.31_4a_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../53503986-79fe-4eb4-a52c-8a78d2808746/submit_53503986-79fe-4eb4-a52c-8a78d2808746.sh
# Pincer Reac 1
#####################################
# fcidumps_catalysts/fcidump.20_rc_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../1a3f8996-3341-4332-b600-954068e54591/submit_1a3f8996-3341-4332-b600-954068e54591.sh
# fcidumps_catalysts/fcidump.21_rc_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../5aa0be63-2ac1-4c77-9a1c-865fb33dd575/submit_5aa0be63-2ac1-4c77-9a1c-865fb33dd575.sh
### First run, <150 BD max: # fcidumps_catalysts/fcidump.24_ts_1over2_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
### First run, <150 BD max: cd config_store/submit_dir
### First run, <150 BD max: sbatch ../6ff1893b-d72e-40a3-942d-0ddb0559657e/submit_6ff1893b-d72e-40a3-942d-0ddb0559657e.sh
# fcidumps_catalysts/fcidump.24_ts_1over2_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../01da1863-a53b-49dd-b702-5390d5880638/submit_01da1863-a53b-49dd-b702-5390d5880638.sh
# fcidumps_catalysts/fcidump.25_ts_1over2_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../e6e51673-7231-4d34-9e51-8c4d887dcf89/submit_e6e51673-7231-4d34-9e51-8c4d887dcf89.sh
# fcidumps_catalysts/fcidump.26_pc_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../de3c9cf7-4396-4e20-988b-c4f5d4e1cd43/submit_de3c9cf7-4396-4e20-988b-c4f5d4e1cd43.sh
# fcidumps_catalysts/fcidump.27_pc_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../c87fa609-2ffd-4389-8758-b13b53238648/submit_c87fa609-2ffd-4389-8758-b13b53238648.sh
# fcidumps_catalysts/fcidump.28_2_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../9b9f1025-786e-42ba-8918-9aa040013b37/submit_9b9f1025-786e-42ba-8918-9aa040013b37.sh
# fcidumps_catalysts/fcidump.29_2_{'Mo': 'def2-SVP', 'I': 'def2-SVP', 'Cl': 'def2-SVP', 'default': '6-311+G(d,p)'}
cd config_store/submit_dir
sbatch ../75c5729f-cca8-49df-b3f6-8bb797d2eb62/submit_75c5729f-cca8-49df-b3f6-8bb797d2eb62.sh
# Schrock
#####################################
# fcidumps_catalysts/fcidump.9_mo_n2-_{'Mo': 'def2-TZVP', 'default': 'def2-SVP'}
sbatch ../9bbe8ef0-cc44-46a4-8545-b5e082bb4b89/submit_9bbe8ef0-cc44-46a4-8545-b5e082bb4b89.sh
# fcidumps_catalysts/fcidump.10_fecp2+_s0.5_def2-tzvp
sbatch ../9344fcc2-5a14-48e5-ab35-fc5f96abdf59/submit_9344fcc2-5a14-48e5-ab35-fc5f96abdf59.sh
# fcidumps_catalysts/fcidump.11_fecp2_s0_def2-tzvp
sbatch ../f0569b40-69a5-4f03-8f6b-31b650bde489/submit_f0569b40-69a5-4f03-8f6b-31b650bde489.sh
# fcidumps_catalysts/fcidump.12_mo_n2_{'Mo': 'def2-TZVP', 'default': 'def2-SVP'}
sbatch ../8c88d12b-5ca4-41d7-bca6-dcaa99fd27c5/submit_8c88d12b-5ca4-41d7-bca6-dcaa99fd27c5.sh

"""

In [3]:
# Optional per-instance override that is forwarded unchanged to
# `dp.setup_workbook(bd_extrapolation_dict=...)`; `None` disables it.
# The commented-out template below maps FCIDUMP file names (as they appear on
# disk, with ':' replaced by '_') to an integer.
# NOTE(review): the integers are presumably bond dimensions used for the
# extrapolation — confirm against `dmrghandler.data_processing.setup_workbook`.
bd_extrapolation_dict = None
# bd_extrapolation_dict = {
# "fcidump.30_4a_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":23,
# "fcidump.31_4a_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":1906,
# "fcidump.20_rc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":299,
# "fcidump.21_rc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":4126,
# "fcidump.24_ts_1over2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)":92,
# "fcidump.25_ts_1over2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)":4076,
# "fcidump.26_pc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":332,
# "fcidump.27_pc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":4888,
# "fcidump.28_2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":583,
# "fcidump.29_2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}":6434,
# # "fcidump.9_mo_n2-_{'Mo'_ 'def2-TZVP', 'default'_ 'def2-SVP'}":,
# # "fcidump.10_fecp2+_s0.5_def2-tzvp":,
# # "fcidump.11_fecp2_s0_def2-tzvp":,
# # "fcidump.12_mo_n2_{'Mo'_ 'def2-TZVP', 'default'_ 'def2-SVP'}":,
# }

In [4]:


# ---------------------------------------------------------------------------
# Input locations
# ---------------------------------------------------------------------------
# CSV with the metadata of the catalyst instances.
catalyst_meta_data_path = Path('/mnt/d/ZapataCalcs/GSEE/catalysis_instances.csv')
# Folder holding the FCIDUMP files.
fci_dump_folder = Path("/mnt/d/ZapataCalcs/GSEE/fcidumps_catalysts")
# Folder holding the per-UUID DMRG output data folders.
data_file_path = Path("/mnt/d/ZapataCalcs/dmrg_data/data_storage_paper_data_orig")
# Folder holding the SLURM notification emails (*.eml).
slurm_emails_folder = Path(
    '/mnt/d/OneDrive - University of Toronto/zapata/homogeneous_catalysts_dmrg_results_private/small_catalysts_slurm_emails'
)

# ---------------------------------------------------------------------------
# Output locations
# ---------------------------------------------------------------------------
# Folder the Excel workbook is saved into.
excel_storage_path = Path(
    "/mnt/d/OneDrive - University of Toronto/zapata/homogeneous_catalysts_dmrg_results_private/"
)
# File name of the Excel workbook (joined onto excel_storage_path when saving).
excel_name = Path('dmrg_results_app_paper.xlsx')
# Folder for the performance-metric CSV files.
csv_storage_path = excel_storage_path / "performance_metrics_store_app_paper"
# CSV file for the memory summary.
memory_summary_csv_filename = excel_storage_path / 'app_paper_memory_summary.csv'





In [5]:
# Load the catalyst instance metadata table. Later cells read the columns
# 'fcidump', 'instance ID', 'molecule', 'charge', 'multiplicity' and
# 'log10_hilbert_space_size' from it.
catalyst_meta_data = pd.read_csv(catalyst_meta_data_path)

In [6]:

# Harvest the wall-clock run time and the memory figures of every calculation
# from the SLURM notification emails.
#
# Layout of cc_wall_time_dict (consumed by the run-command parsing cell):
#   "<uuid>"             -> run time string "HH:MM:SS" from the email subject
#   "<uuid>_virtual_mem" -> first "<digits>K" field of the batch-step row, in bytes
#   "<uuid>_rss_mem"     -> second "<digits>K" field of the batch-step row, in bytes

_RUN_TIME_PATTERN = re.compile(r'(?<=Run time )\d+:\d+:\d+')
_MEMORY_PATTERN = re.compile(r'\d+K')


def _parse_slurm_email(lines, source="<unknown>"):
    """Extract run time and memory usage from the lines of one SLURM email.

    Parameters
    ----------
    lines : list of str
        The email content, one entry per physical line.
    source : object, optional
        Identifier (e.g. the file path) used only in error messages.

    Returns
    -------
    dict
        Entries to merge into ``cc_wall_time_dict`` (see the layout above).

    Raises
    ------
    ValueError
        If the subject carries no ``Job_id=`` / ``Run time``, if the batch-step
        row has fewer than two ``<digits>K`` fields, or if that row is seen
        before the ``Command=../<uuid>/...`` line.
    AssertionError
        If a ``Command=`` line precedes the subject, or RSS exceeds virtual memory.
    """
    parsed = {}
    # All state is local to one email, so nothing can leak from a previously
    # parsed file (the UUID and run time used to survive across files).
    run_time = None
    job_id = None
    calc_uuid = None

    for line_idx, line in enumerate(lines):
        if line.startswith('Subject: '):
            # The subject may be folded over several physical lines, e.g.
            #   "Subject: Niagara slurm Job_id=12645129 Name=dmrg_thresholding Failed, Run"
            #   " time 06:42:25, FAILED, ExitCode 1"
            # so join it with (up to) the next two lines before searching.
            # Slicing keeps this safe when the subject is near the end of the file.
            subject = " ".join(part.strip() for part in lines[line_idx:line_idx + 3])
            run_time_match = _RUN_TIME_PATTERN.search(subject)
            if run_time_match is None:
                raise ValueError(f"No 'Run time HH:MM:SS' found in the subject of {source}: {subject!r}")
            run_time = run_time_match.group()

            first_subject_line = line.strip()
            if 'Job_id=' not in first_subject_line:
                raise ValueError(f"No 'Job_id=' found in the subject of {source}: {subject!r}")
            job_id = first_subject_line.split('Job_id=')[1].split(' ')[0]
        elif line.startswith('   Command=../'):
            # "Command=../<uuid>/submit_<uuid>.sh" -> the UUID is the second path component.
            calc_uuid = line.strip().split('/')[1]
            assert run_time is not None, 'Run time not yet obtained'
            parsed[calc_uuid] = run_time
        elif job_id is not None and line.startswith(f"{job_id}.ba+"):
            # Batch-step accounting row, e.g.
            #   12645129.ba+      batch rrg-izmay+   06:42:25 219926900K  30851368K   04:03:30 8-12:22:49      1:0
            # The first "<digits>K" field is the virtual memory, the second the RSS.
            sizes_kib = _MEMORY_PATTERN.findall(line)
            if len(sizes_kib) < 2:
                raise ValueError(f"Expected two '<digits>K' memory fields in {source}: {line.strip()!r}")
            if calc_uuid is None:
                raise ValueError(f"Memory row found before the 'Command=../<uuid>' line in {source}")
            memory_usage_virtual = int(sizes_kib[0][:-1]) * 1024
            memory_usage_rss = int(sizes_kib[1][:-1]) * 1024
            assert memory_usage_rss <= memory_usage_virtual, 'RSS memory usage is greater than virtual memory usage'
            parsed[f"{calc_uuid}_virtual_mem"] = memory_usage_virtual
            parsed[f"{calc_uuid}_rss_mem"] = memory_usage_rss

    return parsed


cc_wall_time_dict = {}
# glob() yields files in arbitrary order; sort so repeated runs are reproducible.
for email_file in sorted(slurm_emails_folder.glob('*.eml')):
    # errors='replace' keeps a stray undecodable byte in an email body from aborting the run.
    with open(email_file, 'r', errors='replace') as f:
        cc_wall_time_dict.update(_parse_slurm_email(f.readlines(), source=email_file))

print(cc_wall_time_dict)

{'de3c9cf7-4396-4e20-988b-c4f5d4e1cd43': '01:13:16', 'de3c9cf7-4396-4e20-988b-c4f5d4e1cd43_virtual_mem': 10852290560, 'de3c9cf7-4396-4e20-988b-c4f5d4e1cd43_rss_mem': 7343116288, '1a3f8996-3341-4332-b600-954068e54591': '00:43:12', '1a3f8996-3341-4332-b600-954068e54591_virtual_mem': 11189825536, '1a3f8996-3341-4332-b600-954068e54591_rss_mem': 7223762944, '6ff1893b-d72e-40a3-942d-0ddb0559657e': '00:10:05', '6ff1893b-d72e-40a3-942d-0ddb0559657e_virtual_mem': 5296214016, '6ff1893b-d72e-40a3-942d-0ddb0559657e_rss_mem': 1829195776, 'c87fa609-2ffd-4389-8758-b13b53238648': '04:20:48', 'c87fa609-2ffd-4389-8758-b13b53238648_virtual_mem': 206097448960, 'c87fa609-2ffd-4389-8758-b13b53238648_rss_mem': 22891057152, '5aa0be63-2ac1-4c77-9a1c-865fb33dd575': '06:36:32', '5aa0be63-2ac1-4c77-9a1c-865fb33dd575_virtual_mem': 214961160192, '5aa0be63-2ac1-4c77-9a1c-865fb33dd575_rss_mem': 31902253056, 'e6e51673-7231-4d34-9e51-8c4d887dcf89': '07:24:39', 'e6e51673-7231-4d34-9e51-8c4d887dcf89_virtual_mem': 2151464

In [7]:
# File names (not full paths) of all FCIDUMP files in fci_dump_folder.
# glob() yields entries in arbitrary order; the names are sorted because the
# run-command parsing cell takes the first name that matches a prefix, and
# that choice should not depend on directory listing order.
fcidump_files = sorted(path.name for path in fci_dump_folder.glob('fcidump.*'))


In [8]:


# Turn `run_commands` into one record per submitted calculation, pairing each
# "sbatch" line with the FCIDUMP named in the "# fcidumps_catalysts/..." comment
# above it and with the SLURM run time / memory figures in cc_wall_time_dict.
orig_data_dict_list = []
current_fcidump_info = None  # FCIDUMP info from the most recent header line
for line in run_commands.splitlines():
    if line.startswith('### '):
        # Deliberately disabled entries.
        continue
    if line.startswith('# fcidumps_catalysts/'):
        fcidump_name_temp = line.split('fcidumps_catalysts/')[1].strip()
        # "fcidump.<id>_" identifies the instance; the rest of the name on disk
        # differs from the name in the command (e.g. ':' is stored as '_').
        fcidump_test_string = fcidump_name_temp.split('_')[0] + '_'
        fcidump_name = next(
            (name for name in fcidump_files if name.startswith(fcidump_test_string)),
            None,
        )
        if fcidump_name is None:
            # Previously the last file in the list was silently used when nothing matched.
            raise FileNotFoundError(
                f"No FCIDUMP file starting with {fcidump_test_string!r} found for {fcidump_name_temp!r}"
            )
        print(fcidump_name)
        current_fcidump_info = {
            'fcidump': fcidump_name,
            'fcidump_test_string': fcidump_test_string,
        }
    elif line.startswith('sbatch ../'):
        if current_fcidump_info is None:
            raise ValueError(f"'sbatch' line without a preceding '# fcidumps_catalysts/' line: {line!r}")
        calc_uuid = line.split('sbatch ../')[1].split('/')[0]
        required_keys = (calc_uuid, f"{calc_uuid}_virtual_mem", f"{calc_uuid}_rss_mem")
        missing_keys = [key for key in required_keys if key not in cc_wall_time_dict]
        if missing_keys:
            raise KeyError(
                f"No SLURM email data for calculation {calc_uuid} (missing entries: {missing_keys})"
            )
        # Copy the header info so every record is an independent dict, even if
        # several sbatch lines were to follow one header.
        dict_entry = dict(current_fcidump_info)
        dict_entry['Calc UUID'] = calc_uuid
        dict_entry['CC Wall Time'] = cc_wall_time_dict[calc_uuid]
        dict_entry['Attempt Result'] = ""
        dict_entry['Virtual Memory Usage'] = cc_wall_time_dict[f"{calc_uuid}_virtual_mem"]
        dict_entry['RSS Memory Usage'] = cc_wall_time_dict[f"{calc_uuid}_rss_mem"]
        orig_data_dict_list.append(dict_entry)

print(orig_data_dict_list)

fcidump.30_4a_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.31_4a_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.20_rc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.21_rc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.24_ts_1over2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)
fcidump.25_ts_1over2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)
fcidump.26_pc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.27_pc_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.28_2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.29_2_{'Mo'_ 'def2-SVP', 'I'_ 'def2-SVP', 'Cl'_ 'def2-SVP', 'default'_ '6-311+G(d,p)'}
fcidump.9_mo_n2-_{'Mo'_ 'def2-TZVP', 'defa

In [9]:
# catalyst_meta_data.columns

In [10]:
# Merge each calculation record with its row of the catalyst metadata table and
# convert the memory figures from bytes to GiB. The key order of every record
# is kept as before.
BYTES_PER_GIB = 1024 ** 3
METADATA_COLUMNS_BEFORE_FCIDUMP = ('instance ID', 'molecule', 'charge', 'multiplicity')

data_dict_list = []
for data_dict in orig_data_dict_list:
    fcidump_test_string = data_dict['fcidump_test_string']
    # Literal substring match: with the default regex=True the '.' in
    # "fcidump.<id>_" would match any character. na=False keeps missing
    # entries in the column from breaking the boolean mask.
    row_mask = catalyst_meta_data['fcidump'].str.contains(
        fcidump_test_string, regex=False, na=False
    )
    data_row = catalyst_meta_data[row_mask]
    if data_row.empty:
        raise LookupError(
            f"No row of the catalyst metadata has an 'fcidump' entry containing {fcidump_test_string!r}"
        )

    # As before, the first matching row is used if several rows match.
    new_data_dict = {
        column: data_row[column].values[0] for column in METADATA_COLUMNS_BEFORE_FCIDUMP
    }
    new_data_dict['fcidump'] = data_dict['fcidump']
    new_data_dict['log10_hilbert_space_size'] = data_row['log10_hilbert_space_size'].values[0]
    new_data_dict['Attempt Result'] = data_dict['Attempt Result']
    new_data_dict['Calc UUID'] = data_dict['Calc UUID']
    new_data_dict['CC Wall Time'] = data_dict['CC Wall Time']
    new_data_dict['Virtual Memory Usage (GiB)'] = data_dict['Virtual Memory Usage'] / BYTES_PER_GIB
    new_data_dict['RSS Memory Usage (GiB)'] = data_dict['RSS Memory Usage'] / BYTES_PER_GIB
    # Optional companion run; only copied when the record provides it.
    if 'Calc UUID Small BD' in data_dict:
        new_data_dict['Calc UUID Small BD'] = data_dict['Calc UUID Small BD']
        new_data_dict['CC Wall Time Small BD'] = data_dict['CC Wall Time Small BD']
    data_dict_list.append(new_data_dict)



    



    

In [11]:
# data_dict_list

In [12]:
# Create a fresh openpyxl workbook and hand it, together with the collected
# per-calculation records, to dmrghandler for filling.
wb = px.Workbook()

# NOTE(review): per the notebook introduction this step produces one sheet per
# calculation run plus the performance-metric CSV files; the details live in
# `dmrghandler.data_processing.setup_workbook` — confirm there.
dp.setup_workbook(
    data_file_path=data_file_path,
    data_dict_list=data_dict_list,
    workbook=wb,
    csv_storage_path=csv_storage_path,
    bd_extrapolation_dict=bd_extrapolation_dict,
    memory_summary_csv_filename=memory_summary_csv_filename,
)

# Write the workbook to <excel_storage_path>/<excel_name>.
wb.save(excel_storage_path / excel_name)

Last loop included = 24
Processed results available
Checking that processed results match raw results.
Last loop included = 44
Processed results available
Checking that processed results match raw results.
Last loop included = 10
Processed results available
Checking that processed results match raw results.
Last loop included = 13
Last loop included = 29
Processed results available
Checking that processed results match raw results.
Last loop included = 13
Last loop included = 35
Processed results available
Checking that processed results match raw results.
Last loop included = 11
Last loop included = 17
Processed results available
Checking that processed results match raw results.
Last loop included = 12
Last loop included = 53
Last loop included = 43
Last loop included = 43
Last loop included = 47
