In [299]:
import pandas as pd
import os
import logging
# Configure the logging system
logging.basicConfig(level=logging.INFO, filename='app.log', filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('batch_test')
import ast
from pathlib import Path
import glob2
import tqdm
import numpy as np

# Kate's Function
1. Per batch
2. Start with the base compound and then the elaborations
3. For each of those iterate through the **reaction steps** starting from step 1
4. Metadata needed per reactant/product:
    - smiles
5. Reaction:
    - Type / reaction name
6. Poses:
    - **Path to the minimised .mol**

In [301]:
def parse_syndirella_products_csv(df: pd.DataFrame,
                                  product_csv: bool,
                                  base_output_path: str):
    """
    Parse a products csv within a base compound. 
    :param df: dataframe of the products csv (could be internal step or final step). 
    :param product_csv: bool if it is the final products csv or not.
    :param base_output_path: path to output folder for the base compound. 
    :return to_hippo_df: pd.DataFrame with all of the reactants and products within that batch folder.
    """
    to_hippo_df: pd.DataFrame = pd.DataFrame(columns=['reaction', 'r1_smiles', 'r2_smiles', 'product', 'inspiration1', 'inspiration2', 'pose_path', 'metadata'])
    for i, row in df.iterrows():
        metadata: dict = {}
        if product_csv:
            try:
                if np.isnan(row['∆∆G']):
                    continue # there was no successful placement
                else:
                    metadata['placement_ddG'] = row['∆∆G']
                    metadata['placement_mRMSD'] = row.comRMSD
                # inspiration
                inspiration1: str = ast.literal_eval(row.regarded)[0]
                inspiration2: str = ast.literal_eval(row.regarded)[1]
                # pose_path
                name = row['name']
                pose_path = os.path.join(base_output_path, name, f'{name}.minimised.mol')
            except Exception as e:
                continue # if this product was not placed, skip it
        else:
            inspiration1 = None
            inspiration2 = None
            pose_path = None
        # register the reactants
        r1_smiles: str = row.r1_smiles
        # r2 reactant could not exist yet
        try:
            r2_smiles: str = row.r2_smiles
        except Exception:
            r2_smiles: str = None        
        # register the product
        product_smiles: str = row.smiles
        # register the reaction
        reaction_name: str = row.reaction
        try:
            metadata['flag'] = row.flag
        except Exception:
            pass
        # add all info to df
        to_hippo_df.loc[i] = [reaction_name, r1_smiles, r2_smiles, product_smiles, inspiration1, inspiration2, pose_path, metadata]
    return to_hippo_df

def parse_syndirella_products_batch(path: str):
    """
    Parse all base compounds within a batch. 
    :param path: path to batch folder. 
    :return batch_df with all of the reactants and products within that batch folder. 
    """
    # Iterate over all directories directly under the specified path
    all_product_dfs: list(pd.DataFrame) = []
    
    for root, dirs, files in os.walk(path, topdown=True):
        if root==path:
            for dir in dirs:
                if root != path:
                    continue
                current_dir_path = os.path.join(root, dir)
                print(f"Current dir path: {current_dir_path}")
                csv_files: list(str) = glob2.glob(os.path.join(current_dir_path, '**/*products*.csv'), recursive=True)
                for file in csv_files:
                    print(f"Processing file: {file}")
                    df = pd.read_csv(file, low_memory=False)
                    product_csv: bool = 'regarded' in df.columns
                    base_output_path: str = os.path.join(current_dir_path, 'output')
                    print(f"Base output path: {base_output_path}")
                    new_df = parse_syndirella_products_csv(df, 
                                                           product_csv, 
                                                           base_output_path)
                    all_product_dfs.append(new_df)
            batch_df = pd.concat(all_product_dfs)
            batch_df.reset_index(drop=True, inplace=True)
        else:
            # Clear the dirs list to prevent os.walk from going any deeper
            dirs[:] = []  # This stops os.walk from descending into subdirs beyond the first level
        
    return batch_df

In [302]:
batch_df = parse_syndirella_products_batch('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/testing/batch0')

Current dir path: /Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/testing/batch0/BCPKNOZHZDDVQN-UHFFFAOYSA-N
Processing file: /Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/testing/batch0/BCPKNOZHZDDVQN-UHFFFAOYSA-N/BCPKNOZHZDDVQN-UHFFFAOYSA-N_Amidation_products_3of3_placements.csv
Base output path: /Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/testing/batch0/BCPKNOZHZDDVQN-UHFFFAOYSA-N/output
Processing file: /Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/testing/batch0/BCPKNOZHZDDVQN-UHFFFAOYSA-N/extra/BCPKNOZHZDDVQN-UHFFFAOYSA-N_N-Boc_deprotection_products_2of3.csv
Base output path: /Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/testing/batch0/BCPKNOZHZDDVQN-UHFFFAOYSA-N/output
Processing file: /Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run/testing/batch0/BCPKNOZHZDDVQN-UHFFFAOYSA-N/extra/BCPKNOZHZDDVQN-UHFFFAOYSA-N_Amidation_products_1of3.csv
Base output path: /Users/kate_fieseler/PycharmProje

In [303]:
batch_df

Unnamed: 0,reaction,r1_smiles,r2_smiles,product,inspiration1,inspiration2,pose_path,metadata
0,Amidation,N#CCC(=O)O,Cc1ccc(NC(=O)C2CCCO2)cc1N,Cc1ccc(NC(=O)[C@@H]2CCCO2)cc1NC(=O)CC#N,A71EV2A-x0310_0A,A71EV2A-x0416_0A,/Users/kate_fieseler/PycharmProjects/EV-A71-2A...,"{'placement_ddG': -7.088948016603069, 'placeme..."
1,Amidation,N#CCC(=O)O,Cc1ccc(NC(=O)C2CCCO2)cc1N,Cc1ccc(NC(=O)[C@H]2CCCO2)cc1NC(=O)CC#N,A71EV2A-x0310_0A,A71EV2A-x0416_0A,/Users/kate_fieseler/PycharmProjects/EV-A71-2A...,"{'placement_ddG': -17.49852954868254, 'placeme..."
2,Amidation,CC(C#N)C(=O)O,Cc1ccc(NC(=O)C2CCCO2)cc1N,Cc1ccc(NC(=O)[C@@H]2CCCO2)cc1NC(=O)[C@H](C)C#N,A71EV2A-x0310_0A,A71EV2A-x0416_0A,/Users/kate_fieseler/PycharmProjects/EV-A71-2A...,"{'placement_ddG': -3.6648069897823343, 'placem..."
3,Amidation,CC(C#N)C(=O)O,Cc1ccc(NC(=O)C2CCCO2)cc1N,Cc1ccc(NC(=O)[C@H]2CCCO2)cc1NC(=O)[C@H](C)C#N,A71EV2A-x0310_0A,A71EV2A-x0416_0A,/Users/kate_fieseler/PycharmProjects/EV-A71-2A...,"{'placement_ddG': -3.2878999159478894, 'placem..."
4,Amidation,CC(C#N)C(=O)O,Cc1ccc(NC(=O)C2CCCO2)cc1N,Cc1ccc(NC(=O)[C@@H]2CCCO2)cc1NC(=O)[C@@H](C)C#N,A71EV2A-x0310_0A,A71EV2A-x0416_0A,/Users/kate_fieseler/PycharmProjects/EV-A71-2A...,"{'placement_ddG': -10.314010490964392, 'placem..."
...,...,...,...,...,...,...,...,...
340325,N-Boc_deprotection,COC(=O)c1cc(NC(=O)C2CCC(CN3C(=O)C=CC3=O)O2)c(N...,,COC(=O)c1cc(NC(=O)[C@@H]2CC[C@@H](CN3C(=O)C=CC...,,,,{}
340326,N-Boc_deprotection,COC(=O)c1cc(NC(=O)C2CCC(CN3C(=O)C=CC3=O)O2)c(N...,,COC(=O)c1cc(NC(=O)[C@H]2CC[C@@H](CN3C(=O)C=CC3...,,,,{}
340327,N-Boc_deprotection,COC(=O)c1cc(NC(=O)C23COC(C(=O)OC(C)(C)C)(C2)C3...,,COC(=O)c1cc(NC(=O)C23COC(C(=O)OC(C)(C)C)(C2)C3...,,,,{}
340328,N-Boc_deprotection,COC(=O)c1cc(NC(=O)C2(Cc3ccc(F)cc3)CCCO2)c(N(C(...,,COC(=O)c1cc(NC(=O)[C@]2(Cc3ccc(F)cc3)CCCO2)c(N...,,,,{}
