I need to parse through .log files to collate information on the status of compounds. 

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import TanimotoSimilarity

def parse_log_file(log_file_path):
    """Filter a log file down to its status-relevant lines.

    A line is kept (with surrounding whitespace stripped) when the first
    matching rule below applies; rules are tried in this order:

    1. the line contains ``ERROR``;
    2. the line contains ``WARNING`` and mentions neither
       ``syndirella.slipper.intra_geometry`` nor ``matplotlib``;
    3. the line contains ``INFO`` together with at least one of the progress
       phrases listed in ``info_phrases``.

    Parameters
    ----------
    log_file_path : str or path-like
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        A single-column frame (``LogEntry``) holding the kept lines in file
        order. The frame is empty when no line matches.
    """
    # Sources whose WARNING lines are dropped.
    warning_exclusions = ('syndirella.slipper.intra_geometry', 'matplotlib')
    # An INFO line is only kept when it also carries one of these phrases.
    info_phrases = (
        'successful placements',
        'Saved HIPPO',
        'Finished elaborating',
        'Placing',
        'Running retrosynthesis',
        'Elaborating compound',
    )

    def is_relevant(line):
        if 'ERROR' in line:
            return True
        if 'WARNING' in line and not any(source in line for source in warning_exclusions):
            return True
        # An excluded WARNING line can still qualify through the INFO rule.
        return 'INFO' in line and any(phrase in line for phrase in info_phrases)

    with open(log_file_path, 'r') as log_file:
        log_entries = [line.strip() for line in log_file if is_relevant(line)]

    return pd.DataFrame(log_entries, columns=['LogEntry'])

In [2]:
# Filter the Ax0926a manual-set log with parse_log_file and display the kept entries.
# The path is relative, so it resolves against the notebook's working directory.
log_file_path = '../logs/slurm-error_Ax0926a_manual_70033.log'
df = parse_log_file(log_file_path)
df

Unnamed: 0,LogEntry
0,"2024-08-06 16:57:45,954 - syndirella.pipeline ..."
1,"2024-08-06 17:05:39,490 - syndirella.slipper.S..."
2,"2024-08-07 01:16:53,268 - syndirella.slipper.S..."
3,"2024-08-07 01:22:51,564 - syndirella.slipper.S..."
4,"2024-08-07 01:22:54,391 - syndirella.slipper.S..."
5,"2024-08-07 01:23:18,987 - syndirella.pipeline ..."


In [3]:
# Filter the exact_hits_3 log with parse_log_file and display the kept entries.
# Rebinds `log_file_path` and `df`, replacing the values from the previous cell.
log_file_path = '../logs/slurm-error_A71EV2A_exact_hits_3_71665.log'
df = parse_log_file(log_file_path)
df

Unnamed: 0,LogEntry
0,"2024-08-07 13:48:34,477 - syndirella.Cobbler.C..."
1,"2024-08-07 13:57:36,023 - syndirella.slipper.S..."
2,"2024-08-07 16:51:31,190 - syndirella.slipper.S..."
3,"2024-08-07 16:54:12,906 - syndirella.slipper.S..."
4,"2024-08-07 16:54:13,393 - syndirella.slipper.S..."
5,"2024-08-07 16:54:21,097 - syndirella.pipeline ..."
6,"2024-08-07 16:54:24,498 - syndirella.Cobbler.C..."
7,"2024-08-07 16:58:37,670 - syndirella.slipper.S..."
8,"2024-08-07 17:41:32,880 - syndirella.slipper.S..."
9,"2024-08-07 17:42:12,523 - syndirella.slipper.S..."


In [4]:
# Filter the ryan_merges_24 log with parse_log_file and display the kept entries.
# Rebinds `log_file_path` and `df`, replacing the values from the previous cell.
log_file_path = '../logs/slurm-error_A71EV2A_ryan_merges_24_71338.log'
df = parse_log_file(log_file_path)
df

Unnamed: 0,LogEntry
0,"2024-08-07 11:06:09,815 - syndirella.pipeline ..."
1,"2024-08-07 11:06:09,815 - syndirella.pipeline ..."
2,"2024-08-07 11:06:13,959 - syndirella.Cobbler.C..."
3,"2024-08-07 11:10:56,667 - syndirella.slipper.S..."
4,"2024-08-07 13:14:05,510 - syndirella.slipper.S..."
...,...
67,"2024-08-08 06:07:26,940 - syndirella.pipeline ..."
68,"2024-08-08 06:07:50,380 - syndirella.pipeline ..."
69,"2024-08-08 06:07:50,380 - syndirella.pipeline ..."
70,"2024-08-08 06:07:56,336 - syndirella.Cobbler.C..."


In [5]:
# adding information to input csv
import re

def tanimoto_similarity(smiles1: str, smiles2: str):
    """Return the Tanimoto similarity between two molecules given as SMILES.

    Both SMILES strings are parsed with RDKit, each molecule is converted to a
    Morgan bit-vector fingerprint (radius 2, 2048 bits), and the two
    fingerprints are compared with ``TanimotoSimilarity``.
    """
    # Parse both inputs first, then fingerprint them, as two separate passes.
    molecules = [Chem.MolFromSmiles(smi) for smi in (smiles1, smiles2)]
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(molecule, radius=2, nBits=2048)
        for molecule in molecules
    ]
    first_fp, second_fp = fingerprints
    return TanimotoSimilarity(first_fp, second_fp)

def output_info(smiles: str, log_df: pd.DataFrame) -> dict:
    """Classify the logged outcome for one compound.

    The log is searched for entries that contain ``smiles`` as a literal
    substring. The entries from the first to the last such match (inclusive)
    are then read in order, and the first outcome marker found decides the
    status.

    Note that matching is by substring, so a SMILES that is contained in a
    longer SMILES also matches the longer compound's log lines.

    Parameters
    ----------
    smiles : str
        SMILES string to look up.
    log_df : pandas.DataFrame
        Frame with a ``LogEntry`` column of log lines, as returned by
        ``parse_log_file``.

    Returns
    -------
    dict
        ``{'smiles': smiles, 'output': status}`` where ``status`` is one of
        ``'base-failed'``, ``'successful'``, ``'no-synthesis-routes'``,
        ``'general-error'`` or ``'not-found'``. ``'not-found'`` covers both
        "SMILES never mentioned" and "mentioned, but no outcome marker seen".
    """
    # (marker text, status) pairs; within one log entry the earlier pair wins.
    outcome_markers = (
        ('could not be placed successfully. Skipping...', 'base-failed'),
        ('Saved HIPPO', 'successful'),
        ('There are no final routes', 'no-synthesis-routes'),
        ('Error elaborating compound', 'general-error'),
    )

    # One literal scan of the log; regex=False treats SMILES punctuation
    # ("(", ")", "=", "+", ...) as plain text and na=False ignores missing rows.
    mentions = log_df['LogEntry'].str.contains(smiles, regex=False, na=False)
    hit_labels = log_df.index[mentions.to_numpy(dtype=bool)]
    if len(hit_labels) == 0:
        return {'smiles': smiles, 'output': 'not-found'}

    # Label-based slice: both end points are included.
    window = log_df.loc[hit_labels[0]:hit_labels[-1], 'LogEntry']
    for entry in window:
        if not isinstance(entry, str):
            # Missing values cannot carry a marker.
            continue
        for marker, status in outcome_markers:
            if marker in entry:
                return {'smiles': smiles, 'output': status}
    return {'smiles': smiles, 'output': 'not-found'}
    
def add_output(input_file_path: str, log_file_path: str):
    """Annotate an input compound CSV with the outcome found in a log file.

    Parameters
    ----------
    input_file_path : str
        CSV file that has a ``smiles`` column.
    log_file_path : str
        Log file to filter with ``parse_log_file``.

    Returns
    -------
    pandas.DataFrame
        The input rows (same count and order) with two extra columns:
        ``output`` (status from ``output_info``) and ``date_collected``
        (today's date as ``YYYY-MM-DD``). The columns ``Unnamed: 0`` and
        ``Unnamed: 4`` are dropped when present.
    """
    input_df = pd.read_csv(input_file_path)
    log_df = parse_log_file(log_file_path)

    # Look each distinct SMILES up once. Keeping the right-hand side of the
    # merge unique on 'smiles' stops a left merge from multiplying rows when the
    # input repeats a SMILES; dict.fromkeys de-duplicates in first-seen order.
    unique_smiles = list(dict.fromkeys(input_df['smiles'].tolist()))
    # Explicit columns keep the merge key present even for an empty input.
    output_df = pd.DataFrame(
        [output_info(smiles, log_df) for smiles in unique_smiles],
        columns=['smiles', 'output'],
    )

    annotated_df = input_df.merge(output_df, on='smiles', how='left')
    # Index columns left over from earlier CSV round-trips.
    annotated_df = annotated_df.drop(columns=['Unnamed: 0', 'Unnamed: 4'], errors='ignore')
    # add date_collected column
    annotated_df['date_collected'] = pd.to_datetime('today').strftime('%Y-%m-%d')
    return annotated_df

In [6]:
# Spot-check a single SMILES against `df`, i.e. whichever filtered log was assigned last.
# NOTE(review): which log that is depends on cell execution order — confirm before trusting the result.
output_info('CC(=O)Nc1c(C(C)CS(=O)(=O)NCC2CCCC2)cnnc1F', df)

{'smiles': 'CC(=O)Nc1c(C(C)CS(=O)(=O)NCC2CCCC2)cnnc1F', 'output': 'not-found'}

In [7]:
# Annotate the ryan_merges_24 input set with its log outcome and save it under syndirella_output/.
# NOTE: rebinds `df` (previously a filtered log) to the annotated compound table.
df = add_output('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_input/molecule-set-A71_2A_ryan_merges_24.csv', 
           '/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/logs/slurm-error_A71EV2A_ryan_merges_24_71338.log')
df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_2A_ryan_merges_24.csv', index=False)

In [8]:
# Annotate the exact_hits_3 input set with its log outcome and save it under syndirella_output/.
df = add_output('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_input/molecule-set-A71_EV_2A_exact_hits_3.csv',
              '/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/logs/slurm-error_A71EV2A_exact_hits_3_71665.log')
df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_EV_2A_exact_hits_3.csv', index=False)

In [9]:
# Annotate the Ax0926a manual input set with its log outcome and save it under syndirella_output/.
df = add_output('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_input/molecule-set-A71_EV_2A_Ax0926a_manual.csv',
              '/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/logs/slurm-error_Ax0926a_manual_70033.log')
df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_EV_2A_Ax0926a_manual.csv', index=False)

In [12]:
# Annotate the covalent_and_noncovalents_16 input set with its log outcome and save it under syndirella_output/.
# NOTE: the log path here is relative (resolved against the working directory), whereas the
# input and output paths are absolute.
df = add_output('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_input/molecule-set-A71_EV_2A_covalent_and_noncovalents_16.csv',
                '../logs/slurm-error_A71EV2A_covalent_noncovalents_16_71339.log')
df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_EV_2A_covalent_and_noncovalents_16.csv', index=False)

In [49]:
# Annotate the Knitwork_June_42 input set with its log outcome, save it under
# syndirella_output/, and display the annotated table.
df = add_output('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_input/A71EV2A_Knitwork_June_42.csv',
                '/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/logs/slurm-error_A71EV2A_Knitwork_71641.log')
df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/A71EV2A_Knitwork_June_42.csv', index=False)
df

Unnamed: 0,smiles,template,hits,compound_set,short_hits,output,date_collected
0,N#CCC(=O)Nc1ccc(C(N)=O)nc1,Ax0310a,A71EV2A-x0310_A_147_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2024,Ax0310a Ax0556a,successful,2024-08-12
1,CC(=O)Nc1ccc(NCc2ccc(F)cc2)nc1,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2025,Ax0365a Ax0310a,general-error,2024-08-12
2,COc1cccc(CNc2ccc(NC(C)=O)cn2)c1,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2026,Ax0365a Ax0310a,successful,2024-08-12
3,CC(=O)Nc1ccc(NCc2ccc(C)cc2F)nc1,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2027,Ax0365a Ax0310a,general-error,2024-08-12
4,CC(=O)Nc1ccc(NCc2cccc(C(C)C)c2)nc1,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2028,Ax0365a Ax0310a,general-error,2024-08-12
5,Cn1nc(C2CCCOC2)cc1NC(=O)CC#N,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2029,Ax0365a Ax0310a,successful,2024-08-12
6,CC(=O)Nc1ccc(NCc2cc(F)ccc2C)nc1,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2030,Ax0365a Ax0310a,general-error,2024-08-12
7,CC(=O)Nc1ccc(NCc2ccccc2C2CC2)nc1,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2031,Ax0365a Ax0310a,general-error,2024-08-12
8,CC(=O)Nc1ccc(NCc2c(C)cccc2F)nc1,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+1 A7...,knitwork_june_2032,Ax0365a Ax0310a,general-error,2024-08-12
9,Cn1ccc2ccc(NCc3ccccc3)cc21,Ax0310a,A71EV2A-x0365_A_201_1_A71EV2A-x0526+A+147+9 A7...,knitwork_june_2033,Ax0365a Ax0451a,successful,2024-08-12


In [23]:
# Annotate the x1346_analogues_38 input set with its log outcome and save it under syndirella_output/.
df = add_output('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_input/molecule-set-aug_2024_x1346_analogues_38.csv','/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/logs/slurm-error_A71EV2A_x1346_analogues_38_71603.log')
df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-aug_2024_x1346_analogues_38.csv', index=False)

In [42]:
# make master df with all outputs as columns and rows as the csv name
ryan = pd.read_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_2A_ryan_merges_24.csv')
exact = pd.read_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_EV_2A_exact_hits_3.csv')
manual = pd.read_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_EV_2A_Ax0926a_manual.csv')
covalent = pd.read_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-A71_EV_2A_covalent_and_noncovalents_16.csv')
knitwork = pd.read_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/A71EV2A_Knitwork_June_42.csv')
x1346 = pd.read_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/molecule-set-aug_2024_x1346_analogues_38.csv')

master_dfs = [ryan, exact, manual, covalent, knitwork, x1346]
names = ['ryan', 'exact', 'manual_exact', 'covalents', 'knitwork', 'x1346_analogues']
# zip() would silently drop trailing items if the two lists got out of step.
assert len(master_dfs) == len(names), 'master_dfs and names must line up one-to-one'

# Status labels to tabulate, in summary-column order; a label absent from a set counts as 0.
outcomes = ['successful', 'base-failed', 'no-synthesis-routes', 'general-error', 'not-found']

master = []
# The loop variable is deliberately not called `df`, so the notebook-level
# `df` from earlier cells is left untouched.
for set_df, name in zip(master_dfs, names):
    outcome_counts = set_df.value_counts('output')
    print(outcome_counts)
    to_add = {'name': name}
    to_add.update({outcome: outcome_counts.get(outcome, 0) for outcome in outcomes})
    master.append(to_add)

master_df = pd.DataFrame(master)

output
base-failed            9
no-synthesis-routes    8
general-error          4
successful             3
not-found              2
Name: count, dtype: int64
output
successful    3
Name: count, dtype: int64
output
successful    1
Name: count, dtype: int64
output
successful       12
not-found         3
general-error     1
Name: count, dtype: int64
output
successful       15
base-failed      11
not-found         9
general-error     7
Name: count, dtype: int64
output
not-found        23
general-error     8
successful        5
base-failed       2
Name: count, dtype: int64


In [43]:
# Display the per-set outcome counts (one row per compound set).
master_df

Unnamed: 0,name,successful,base-failed,no-synthesis-routes,general-error,not-found
0,ryan,3,9,8,4,2
1,exact,3,0,0,0,0
2,manual_exact,1,0,0,0,0
3,covalents,12,0,0,1,3
4,knitwork,15,11,0,7,9
5,x1346_analogues,5,2,0,8,23


In [44]:
# Save the per-set outcome counts, without the index column.
master_df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/master_output.csv', index=False)

In [45]:
master_df = pd.read_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/master_output.csv')
# A later cell saves the table, totals included, back to this same path. Strip any
# previously saved total row/column so that re-running this cell never double-counts.
master_df = master_df.loc[master_df['name'] != 'total'].drop(columns=['total'], errors='ignore')
# add total row at end of each column
# numeric_only=True sums just the count columns instead of also concatenating the names
total = master_df.sum(numeric_only=True)
# Building the row as its own frame keeps the count columns integer-typed, and
# ignore_index=True gives the new row a fresh label instead of a duplicate 0.
total_row = pd.DataFrame([{'name': 'total', **total.to_dict()}])
master_df = pd.concat([master_df, total_row], ignore_index=True)
master_df

Unnamed: 0,name,successful,base-failed,no-synthesis-routes,general-error,not-found
0,ryan,3,9,8,4,2
1,exact,3,0,0,0,0
2,manual_exact,1,0,0,0,0
3,covalents,12,0,0,1,3
4,knitwork,15,11,0,7,9
5,x1346_analogues,5,2,0,8,23
0,total,39,22,8,20,37


In [46]:
# get all count columns: everything except the label column and a previously
# computed 'total', so re-running this cell does not add the old total into the new one
cols = [col for col in master_df.columns if col not in ('name', 'total')]
# get total per row; assign() builds a new frame rather than mutating the displayed one
master_df = master_df.assign(total=master_df[cols].sum(axis=1))
master_df

Unnamed: 0,name,successful,base-failed,no-synthesis-routes,general-error,not-found,total
0,ryan,3,9,8,4,2,26
1,exact,3,0,0,0,0,3
2,manual_exact,1,0,0,0,0,1
3,covalents,12,0,0,1,3,16
4,knitwork,15,11,0,7,9,42
5,x1346_analogues,5,2,0,8,23,38
0,total,39,22,8,20,37,126


In [47]:
# Save the summary again, now including the total row and total column.
# NOTE: this overwrites the master_output.csv written earlier in the notebook (same path).
master_df.to_csv('/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_output/master_output.csv', index=False)