In [9]:
import numpy as np
import orquestra.sdk as sdk
import pandas as pd
import s3fs, os

In [10]:
# Settings that identify where the shared-data credentials are stored.
config_name = 'darpa-ta1'
workspace_id = 'ta1-chemistry-1a2981'

# Both secrets are looked up with the same config/workspace pair, so the
# keyword arguments are built once and reused for each call.
secret_lookup = {"config_name": config_name, "workspace_id": workspace_id}
key = sdk.secrets.get("darpa-shared-data-access-key-id", **secret_lookup)
secret = sdk.secrets.get("darpa-shared-data-secret-access-key", **secret_lookup)

# S3 filesystem handle created from the fetched key/secret pair.
s3 = s3fs.S3FileSystem(key=key, secret=secret)

In [11]:
# Load the catalysis metadata table from CSV and preview its first two rows.
cat_meta = pd.read_csv('catalysis_metadata_new_fcidump.csv')
cat_meta.head(2)

Unnamed: 0,geometry,basis_set,charge,multiplicity,avas_atomic_orbitals,avas_minao,method_type,molecule_name,ref,cat_tag,...,mean_field_obejct_from_fcidump,converged,stable_int,mult_mf,s2_mf,e_tot,avas_ne,avas_no,utility_scale,software_used
0,"['C 0.00000 0.00000 -0.00004', 'O 0.00000 0.00...",6-311++G**,0,1.0,"['C 2s', 'C 2p']",STO-3G,Catalysis,co2,"J. Am. Chem. Soc. 2017, 139 (3), 1245–1260",ru_macho,...,fcidump.2_co2_6-311++G**,,,,,,8,6,False,PySCF 2.2.1
1,"['C@1 -0.29337 -0.35620 0.23059', 'O@1 -0.6755...",6-311++G**,0,1.0,"['C@1 2s', 'C@1 2p', 'O@1 2s', 'O@1 2p']",STO-3G,Catalysis,melact,"J. Am. Chem. Soc. 2017, 139 (3), 1245–1260",ru_macho,...,fcidump.7_melact_6-311++G**,,,,,,12,9,False,PySCF 2.2.1


In [12]:
# Remote path prefix (bucket plus key prefix, ending in "/") that FCIDUMP file
# names are appended to by plain string concatenation when downloading.
S3_BUCKET = "darpa-benchmarking-backup-2/zap-storage-zapata-darpa-benchmarking-426lh/scf_outputs/"

In [14]:
# Download the FCIDUMP file of every metadata row into the working directory.
#
# Rows tagged 'fe_red', and the 'co2' / 'melact' rows tagged 'ru_macho', are
# fetched with a wildcard on their "fcidump.<id>_<molecule_name>_" prefix and
# then stored locally under the "..._0.2_old" name. Every other row is fetched
# under the file name recorded in the metadata.
for _, row in cat_meta.iterrows():
    recorded_name = row['mean_field_obejct_from_fcidump']
    ru_macho_non_tm = (
        row['cat_tag'] == 'ru_macho' and row['molecule_name'] in ('co2', 'melact')
    )
    if ru_macho_non_tm or row['cat_tag'] == 'fe_red':
        file_prefix = f'fcidump.{row["id"]}_{row["molecule_name"]}_'
        fcidump_name = f'{file_prefix}0.2_old'
        if os.path.exists(fcidump_name):
            # Already fetched and renamed by an earlier run. Skipping keeps the
            # cell re-runnable, including after the metadata column itself has
            # been rewritten to the "_0.2_old" names (in which case the rename
            # source below would no longer be the downloaded file).
            continue
        s3.download(f'{S3_BUCKET}{file_prefix}*', ".")
        # NOTE(review): assumes the wildcard download left a local file named
        # exactly as recorded in the metadata column -- confirm.
        os.rename(recorded_name, fcidump_name)
    else:
        s3.download(S3_BUCKET + recorded_name, recorded_name)

In [15]:
#cat_meta.query('cat_tag == "ru_macho"')

In [16]:
def rename_old_instances(row):
    """Return the FCIDUMP file name to record for one metadata row.

    Rows tagged 'fe_red', and rows tagged 'ru_macho' whose molecule name is
    'co2' or 'melact', get the name 'fcidump.<id>_<molecule_name>_0.2_old'.
    Every other row keeps the value already stored in its
    'mean_field_obejct_from_fcidump' field.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column name.

    Returns:
        The file name string for this row.
    """
    tag = row['cat_tag']
    if tag == 'ru_macho':
        uses_old_name = row['molecule_name'] in ('co2', 'melact')
    else:
        uses_old_name = tag == 'fe_red'

    # Guard clause: rows outside the renamed set keep their recorded name.
    if not uses_old_name:
        return row['mean_field_obejct_from_fcidump']
    return f'fcidump.{row["id"]}_{row["molecule_name"]}_0.2_old'

In [17]:
# Overwrite the file-name column, row by row, with the value returned by
# rename_old_instances (axis=1 passes each row to the function).
cat_meta.loc[:, 'mean_field_obejct_from_fcidump'] = cat_meta.apply(rename_old_instances, axis=1)

In [18]:
# Show the molecule name and FCIDUMP file-name columns for rows tagged "ru_macho".
cat_meta.query('cat_tag == "ru_macho"')[['molecule_name', 'mean_field_obejct_from_fcidump']]

Unnamed: 0,molecule_name,mean_field_obejct_from_fcidump
0,co2,fcidump.2_co2_0.2_old
1,melact,fcidump.7_melact_0.2_old
2,melact,fcidump.8_melact_0.2_old
12,ru_macho,fcidump.0_ru_macho_noncan_0.2_new
23,ru_macho,fcidump.1_ru_macho_noncan_0.2_new
44,ts_ru_macho_co2,fcidump.3_ts_ru_macho_co2_noncan_0.2_new
55,ts_ru_macho_co2,fcidump.4_ts_ru_macho_co2_noncan_0.2_new
57,ts_ru_macho_melact,fcidump.5_ts_ru_macho_melact_noncan_0.2_new
65,ts_ru_macho_melact,fcidump.6_ts_ru_macho_melact_noncan_0.2_new


In [19]:
# NOTE(review): this writes to the same path the metadata was read from, so the
# file-name column stored on disk is replaced by the updated one. index=False
# leaves the row index out of the file.
cat_meta.to_csv('catalysis_metadata_new_fcidump.csv', index=False)