In [1]:
import json
import pandas as pd
import uuid
import hashlib
import paramiko
from pathlib import Path
import ast
import os

In [2]:
#utility functions
def create_sftp_session(host, port, username, key_file_path):
    """Create an SFTP session using SSH credentials."""
    key = paramiko.RSAKey.from_private_key_file(key_file_path)
    transport = paramiko.Transport((host, port))
    transport.connect(username=username, pkey=key)
    sftp = paramiko.SFTPClient.from_transport(transport)
    return sftp

def generate_uuid():
    return str(uuid.uuid4())

def calculate_sha1(file_path):
    sha1 = hashlib.sha1()
    with open(file_path, 'rb') as f:
        while True:
            data = f.read(65536)  # Read in 64k chunks
            if not data:
                break
            sha1.update(data)
    return sha1.hexdigest()


In [3]:
def update_instance_data_and_upload(sftp, csv_file_path, json_file_path, upload_directory, target_directory):
    # Load JSON data
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)

    # Extract the instance name from JSON
    short_name = json_data.get('short_name', '')
    
    # Load CSV data
    csv_data = pd.read_csv(csv_file_path)
    
    # Filter for the specific instance name using the extracted short name
    filtered_csv_data = csv_data[csv_data['cat_tag'] == short_name]
    
    # Select only the desired columns
    columns = [
        'software_used', 'molecule_name', 'geometry', 'basis_set', 'charge',
        'multiplicity', 'avas_atomic_orbitals', 'avas_minao', 'avas_ne', 
        'avas_no', 'nbasis', 'utility_scale', 'mean_field_obejct_from_fcidump'
    ]
    filtered_csv_data = filtered_csv_data[columns]

    # Process each entry and upload files
    instance_data = []
    for _, row in filtered_csv_data.iterrows():
        new_uuid = generate_uuid()
        file_name = row['mean_field_obejct_from_fcidump']
        local_file_path = Path(upload_directory) / (file_name + '.gz')
        target_file_path = f"{target_directory}{file_name}.{new_uuid}.gz"

        # Upload the file
        sftp.put(local_file_path.as_posix(), target_file_path)
        
        # Proper URL for external access
        full_url = f"sftp://sftp.l3harris.com{target_file_path}"
        print(full_url)
        checksum = calculate_sha1(local_file_path)

        # Prepare the entry for JSON, excluding 'mean_field_obejct_from_fcidump'
        # Apply ast.literal_eval then convert to string
        if isinstance(row['geometry'], str):
            row['geometry'] = ast.literal_eval(row['geometry'])
        if isinstance(row['basis_set'], str):
            try:
                row['basis_set'] = ast.literal_eval(row['basis_set'])
            except (SyntaxError, ValueError):
                row['basis_set'] = row['basis_set'].strip()  # Handle whitespace or other issues

        if isinstance(row['avas_atomic_orbitals'], str):
            row['avas_atomic_orbitals'] = ast.literal_eval(row['avas_atomic_orbitals'])
        
        entry_features = row.to_dict()
        entry_features.pop('mean_field_obejct_from_fcidump', None)  # Remove the file name info
        instance_entry = {
            "instance_data_object_uuid": new_uuid,
            "instance_data_object_url": full_url,
            "instance_datta_checksum_type": "sha1sum",
            "instance_data_checksum": checksum,
            "features": entry_features,
            "requirements": {
               "probability_of_success": 0.99,
               "time_limit_seconds": 172800,
               "accuracy": 1.0,
               "enery_units": "millihartree",
               "energy_target": 0.99
            }
        }
        instance_data.append(instance_entry)

    # Update the JSON data structure
    json_data['instance_data'] = instance_data

    # Optionally, write back the updated JSON data locally
    with open(json_file_path, 'w') as file:
        json.dump(json_data, file, indent=4)


In [4]:
host = 'sftp.l3harris.com'
port = 22
username = 'darpa-qb-zapata'
key_file_path = '../../darpa-qb-zapata-key.ppk'
directory = '../problem_instances'  # Directory containing the JSON files
upload_directory = '../../fcidumps_catalysts_new'  # Local directory containing the new fcidump files
csv_file_path = upload_directory + '/catalysis_metadata_new_fcidump.csv'  # Path to the CSV file containing new data
target_directory = '/gsee/'  # Target directory on the SFTP server

# Create SFTP session
sftp = create_sftp_session(host, port, username, key_file_path)

# Iterate over all files in the problem_instances directory
for file in os.listdir(directory):
    if file.endswith('.json'):
        json_file_path = os.path.join(directory, file)  # Full path to the JSON file
        print(f"Processing {json_file_path}...")
        update_instance_data_and_upload(sftp, csv_file_path, json_file_path, upload_directory, target_directory)

# Close the SFTP session
sftp.close()


Processing ../problem_instances/problem_instance.blue_dimer.dc8bdc56-5ebd-4996-6b81-81b1a06d8c76.json...
sftp://sftp.l3harris.com/gsee/fcidump.38_1_ts_noncan_0.2_new.46967cfc-d867-40e8-bef0-9655c46cef29.gz
sftp://sftp.l3harris.com/gsee/fcidump.39_1_ts_noncan_0.2_new.85d9d818-501e-4a0e-8fd8-c216ab5e3cb5.gz
sftp://sftp.l3harris.com/gsee/fcidump.40_1_ts_noncan_0.2_new.7db0f859-073f-46af-9325-016a67229a4b.gz
sftp://sftp.l3harris.com/gsee/fcidump.41_1_ts_noncan_0.2_new.2f0b6ced-7831-4fe0-b95e-ce4d456c9c6b.gz
sftp://sftp.l3harris.com/gsee/fcidump.42_1_star_noncan_0.2_new.a722dd32-93c7-4fbd-9cf5-f1bf44b2cdae.gz
sftp://sftp.l3harris.com/gsee/fcidump.43_1_star_noncan_0.2_new.d9707874-28d5-48b6-824e-a2e6c8753437.gz
sftp://sftp.l3harris.com/gsee/fcidump.44_1_star_noncan_0.2_new.727ab190-e399-4bb6-a9b3-20ae57789bac.gz
sftp://sftp.l3harris.com/gsee/fcidump.45_1_star_noncan_0.2_new.e986eb83-6b9d-439e-85ca-a9b931eef4a2.gz
sftp://sftp.l3harris.com/gsee/fcidump.46_2_noncan_0.2_new.6bf3207c-664b-4ed3-8e

In [5]:
def list_files_recursively(sftp, directory, prefix=''):
    """Recursively list all files in specified SFTP directory."""
    try:
        # List all items in the directory
        items = sftp.listdir_attr(directory)
        for item in items:
            # Construct full path
            full_path = f"{directory}/{item.filename}"
            if item.longname.startswith('d'):  # Check if it's a directory
                print(f"{prefix}Directory: {full_path}")
                # Recurse into the directory
                list_files_recursively(sftp, full_path, prefix + '  ')
            else:
                print(f"{prefix}File: {full_path}")
    except Exception as e:
        print(f"Failed to list directory {directory}: {e}")

def main():
    host = 'sftp.l3harris.com'
    port = 22
    username = 'darpa-qb-zapata'
    key_file_path = '../../darpa-qb-zapata-key.ppk'
    directory = '/'  # Directory to list files from

    # Create SFTP session
    sftp = create_sftp_session(host, port, username, key_file_path)
    
    # Recursively list files in the specified directory
    print("Starting directory listing:")
    list_files_recursively(sftp, directory)
    
    # Close the SFTP session
    sftp.close()

if __name__ == '__main__':
    main()


Starting directory listing:
Directory: //diff-eq
Directory: //dynamics
Directory: //gsee
  File: //gsee/fcidump.0_ru_macho_noncan_0.2_new.5b0ddeb4-d577-45f1-bf68-74b8e7b1073a.gz
  File: //gsee/fcidump.10_fecp2+_s0.5_noncan_0.2_new.e10bd99a-435c-41cd-84f1-41bd67890fc7.gz
  File: //gsee/fcidump.11_fecp2_s0_noncan_0.2_new.b71baf3c-d5f0-4f8d-aead-648ba69a058e.gz
  File: //gsee/fcidump.12_mo_n2_noncan_0.2_new.77c12db8-c32e-4f88-8552-9762a0fff763.gz
  File: //gsee/fcidump.13_1_lut_ts_noncan_0.2_new.09c3ddd5-0187-46e8-95c3-157c470cb69a.gz
  File: //gsee/fcidump.14_1_lut_prod_noncan_0.2_new.f36a9dbc-c34a-401f-8ea6-dc996d785edf.gz
  File: //gsee/fcidump.15_1_lut_react_noncan_0.2_new.9cba211b-820c-4ad4-a050-336e8049e1c7.gz
  File: //gsee/fcidump.16_ts_1over4a_noncan_0.2_new.4476d0ff-9618-4774-9619-e3f223249cea.gz
  File: //gsee/fcidump.17_ts_1over4a_noncan_0.2_new.d74d0c01-8507-445b-bec1-f02bafbbfaab.gz
  File: //gsee/fcidump.18_I_noncan_0.2_new.f6516937-6182-4dae-a39c-f94c0d72bf70.gz
  File: //