In [1]:
import pandas as pd

from paroutes import PaRoutesInventory
from rdkit.Chem import DataStructs, AllChem

import pickle
import numpy as np

from tqdm import tqdm

## Read input file

In [2]:
# Routes dataframe
run_id = '202305-2911-2320-5a95df0e-3008-4ebe-acd8-ecb3b50607c7'
input_file = f'Runs/{run_id}/routes_df.csv'
routes_df = pd.read_csv(input_file)

output_file_routes = f'Runs/{run_id}/targ_routes.pickle'
output_file_distances = f'Runs/{run_id}/targ_to_purch_distances.pickle'


# Inventory
inventory=PaRoutesInventory(n=5)


## Create dataframe

In [3]:
purch_smiles = [mol.smiles for mol in inventory.purchasable_mols()]
len(purch_smiles)


13325

In [4]:
target_smiles = routes_df['target_smiles'].unique()
cols_to_keep_renamed = {
    'route_rank': 'label',
    'intermediate_smiles': 'smiles',
    'intermediate_depth': 'depth',
}

def fingerprint_from_smiles(mol_smiles):
    return AllChem.GetMorganFingerprint(AllChem.MolFromSmiles(mol_smiles), radius=3)

# Save 2 separate data dict: one with distances and rank (for every target), 
# one with sets of purchasable molecules, plus distance and rank (for every target)

# Create distances dfs
distances_df_dict = {}
routes_data_dict = {}
for target in tqdm(target_smiles):
    # 1 - Distances dfs
    # Compute distances
    target_fingerprint = fingerprint_from_smiles(target)
    purch_fingerprints = list(map(fingerprint_from_smiles, purch_smiles))
    purch_target_distance = [1 - sim for sim in DataStructs.BulkTanimotoSimilarity(target_fingerprint, purch_fingerprints)]
    distance_df = pd.DataFrame({'smiles': purch_smiles, 'Tanimoto_distance_from_target':purch_target_distance})
    
    # Add rank
    distance_df_sorted = distance_df.sort_values(['Tanimoto_distance_from_target', 'smiles'], ascending=True).reset_index(drop=True)
    distance_df_sorted['distance_to_target_rank'] = distance_df_sorted.index +1    
    
    distances_df_dict[target] = distance_df_sorted
    
    # 2 - Routes df
    target_df = routes_df.loc[routes_df['target_smiles']==target]
    routes_data_dict[target] = {}
    
    for route_rank in target_df['route_rank'].dropna().unique():
        target_route_df =  target_df.loc[((target_df['intermediate_is_purchasable']) | 
                                          (target_df['intermediate_smiles']==target)) & 
                                         (target_df['route_rank']==route_rank), 
                                         cols_to_keep_renamed.keys()].drop_duplicates()
        route_name = 'route_' + str(int(route_rank))
        target_route_df['route_rank'] = route_name
#         target_route_df['route_rank'] = 'route_' + target_route_df['route_rank'].astype(int).astype(str)
        
        target_route_df = target_route_df.rename(columns=cols_to_keep_renamed)
        
        target_route_df = pd.merge(target_route_df, distance_df_sorted, how='left', on='smiles')
        
        target_mask = target_route_df['smiles']==target
        target_route_df.loc[target_mask, 'label'] = 'Target'
        target_route_df.loc[target_mask, 'Tanimoto_distance_from_target'] = 0
        
        routes_data_dict[target].update({route_name: target_route_df})
    
    

100%|█████████████████████████████████████████████████████████████████| 9895/9895 [4:05:19<00:00,  1.49s/it]


In [5]:
routes_data_dict['CCc1cc2nncc(N3CCc4[nH]nc(C(=O)NC5CC5)c4C3)c2cc1OC']

{'route_1':      label                                             smiles  depth   
 0  route_1                                           CC(=O)Cl   12.0  \
 1  route_1                                      O=[N+]([O-])O   10.0   
 2   Target  CCc1cc2nncc(N3CCc4[nH]nc(C(=O)NC5CC5)c4C3)c2cc1OC    0.0   
 3  route_1                        O=C(NC1CC1)c1n[nH]c2c1CNCC2    2.0   
 4  route_1                                      O=P(Br)(Br)Br    4.0   
 5  route_1                                               O=NO    6.0   
 6  route_1                                       CCc1ccccc1OC   12.0   
 
    Tanimoto_distance_from_target  distance_to_target_rank  
 0                       0.963303                  12315.0  
 1                       0.991071                  13094.0  
 2                       0.000000                      NaN  
 3                       0.677686                      1.0  
 4                       0.991228                  13132.0  
 5                       0.990909    

In [6]:
with open(output_file_routes, 'wb') as handle:
    pickle.dump(routes_data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


with open(output_file_distances, 'wb') as handle:
    pickle.dump(distances_df_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)





In [7]:
# data_dict.keys()

In [8]:
# data_dict['CCc1cc2nncc(N3CCc4[nH]nc(C(=O)NC5CC5)c4C3)c2cc1OC']['route_1']['label'].unique()