In [61]:
import ast
import pandas as pd
import glob2
import os
import numpy as np

In [20]:
# expand each short hit into hit1 hit2 hit3 columns
def add_hit_columns(df):
    try:
        hits_contents = df['short_hits'].str.split(' ')
    except KeyError:
        hits_contents = df['hits'].str.split(' ')
    # map hits to new columns
    hits = {i:[] for i in range(3)}
    for hit in hits_contents:
        for i in range(3):
            try:
                hits[i].append(hit[i])
            except IndexError:
                hits[i].append('')
    # add columns to df
    for i in range(3):
        df[f'hit{i+1}'] = hits[i]
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])
    if 'Unnamed: 4' in df.columns:
        df = df.drop(columns=['Unnamed: 4'])
    df = df.drop(columns=['hits'])
    return df

In [27]:
csvs = glob2.glob('../syndirella_input/*.csv')
csvs

['../syndirella_input/molecule-set-aug_2024_x1346_analogues_221024_38.csv',
 '../syndirella_input/molecule-set-A71_EV_2A_exact_hits_221024_3.csv',
 '../syndirella_input/molecule-set-A71_EV_2A_Ax0926a_221024_manual.csv',
 '../syndirella_input/A71EV2A_Knitwork_June_221024_42.csv',
 '../syndirella_input/molecule-set-A71_2A_ryan_merges_221024_24.csv',
 '../syndirella_input/molecule-set-A71_EV_2A_covalent_and_noncovalents_221024_16.csv']

In [5]:
df = pd.read_csv(csvs[0])
df

Unnamed: 0,smiles,hits,template,compound_set,Unnamed: 4,short_hits
0,NC(=O)C1CCCN(C(=O)c2ccc(Cl)c(C(=O)CN3CCCC3)c2)C1,A71EV2A-x0926_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0926a,x0926 merge into p1’,,Ax0926a Ax1140a
1,NC(=O)C1CCCN(C(=O)c2cccc(O)c2C(=O)NC2CC2)C1,A71EV2A-x0926_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0926a,x0926 going down p1,,Ax0926a Ax1292a
2,NC(=O)C1CCCN(C(=O)c2cccc(O)c2C(=O)NCc2ccccc2)C1,A71EV2A-x0926_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0926a,x0926 going down p1,,Ax0926a Ax0836a
3,C[n+]1ccc(C(=O)N2CCCC(C(N)=O)C2Cl)[nH]1,A71EV2A-x0926_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0926a,X0926 and x1346,,Ax0926a Ax1346a
4,NC(=O)C1CCCN(C(=O)Cc2[nH]ncc2CC(=O)NC2CC2)C1,A71EV2A-x0926_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0926a,X0926 and x1346,,Ax0926a Ax1346a
5,O=C(Cc1cn[nH]c1CC(=O)N1CCCC(C(F)(F)F)C1)NC1CC1,A71EV2A-x0926_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0926a,X0926 and x1346,,Ax0926a Ax1346a
6,O=C(Cc1cn[nH]c1CC(=O)N1CCCC(C(F)(F)F)C1)NC1CC1,A71EV2A-x0926_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0926a,X0926 and x1347and x1019,,Ax0926a Ax1346a Ax1019a
7,CC(=O)Nc1ccccc1CCS(N)(=O)=O,A71EV2A-x0487_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0487a,x0487 P2 extension,,Ax0487a Ax0719a
8,CC(=O)Nc1c(F)cncc1CCS(N)(=O)=O,A71EV2A-x0487_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0487a,x0487 P2 extension,,Ax0487a Ax0719a
9,CC(=O)Nc1c(F)cncc1C(C)CS(N)(=O)=O,A71EV2A-x0487_A_250_1_A71EV2A-x0526+A+147+1 A7...,Ax0487a,x0487 P2 extension,,Ax0487a Ax0719a


In [24]:
for csv in csvs:
    df = pd.read_csv(csv)
    df = add_hit_columns(df)
    # save to new csv asking for input
    basename = os.path.basename(csv)
    new_basename = f'{basename.rsplit('_', maxsplit=1)[0]}_221024_{basename.rsplit('_', maxsplit=1)[1]}'
    csv = f'../syndirella_input/{new_basename}'
    df.to_csv(csv, index=False)

In [68]:
# make master csv
csvs = glob2.glob('../syndirella_input/*.csv')
dfs = []
for csv in csvs:
    df = pd.read_csv(csv)
    # need to change reactants and reaction_names to seperate columns
    try:
        reactants: list[tuple[str,str]] = ast.literal_eval(df['reactants'].values[0])
        reaction_names = ast.literal_eval(df['reaction_names'].values[0])
        for i in range(len(reactants)):
            df[f'reactant_step{i+1}'] = reactants[i][0]
            df[f'reactant2_step{i+1}'] = reactants[i][1] # WILL NOT WORK FOR REACTIONS WITH MORE THAN 1 STEP
            df[f'reaction_name_step{i+1}'] = reaction_names[i]
        df.drop(columns=['reactants', 'reaction_names', 'num_steps'], inplace=True)
    except KeyError:
        pass
    dfs.append(df)
master_df = pd.concat(dfs)
master_df.reset_index(drop=True, inplace=True)
master_df.to_csv('../syndirella_input/master_syndirella_input_221024.csv', index=False)

In [70]:
for i, row in master_df.iterrows():
    # keep row names
    row_df = row.to_frame().T
    # add name of csv to compound set
    orig_cmpd_set = row_df['compound_set'].values[0]
    row_df['compound_set'] = f'{orig_cmpd_set}_{os.path.basename(csv).split('.')[0]}'
    # check if need to remove reactants and reaction columns
    if pd.isna(row_df['reaction_name_step1'].values[0]):
        row_df = row_df.drop(columns=['reaction_name_step1', 'reactant_step1', 'reactant2_step1'])
    row_df.to_csv(f'/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/syndirella_input/final/syndirella_input{i}.csv', index=True)
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123


In [71]:
# write jobs
# define the file path
file_path = '/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/jobs/template.sh'

# Define the string to search and the replacement string
search_string = 'NUM'

for i in range(len(master_df)):
    replacement_string = str(i)
    
    # Open the file, read its contents, replace the occurrences, and write it back
    with open(file_path, 'r') as file:
        file_contents = file.read()
    
    # Replace the occurrences of the search string with the replacement string
    new_contents = file_contents.replace(search_string, replacement_string)
    
    new_path = f'/Users/kate_fieseler/PycharmProjects/EV-A71-2A-syndirella-run-2/jobs/job{i}.sh'
    
    # If you want to write to a new file, you can do:
    with open(new_path, 'w') as new_file:
        new_file.write(new_contents)

print("Replacement complete.")

Replacement complete.
