<a href="https://colab.research.google.com/github/jyryu3161/chembounce/blob/main/chembounce_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Anaconda installation
# Conda installation: download the Anaconda installer and install it
# non-interactively into `conda_loc`. `conda_loc` is reused by later cells to
# locate the `conda` executable and the environment's Python interpreter.
import os

conda_loc = "/usr/local"
_installer = "Anaconda3-2024.02-1-Linux-x86_64.sh"


def _run_step(command):
    """Run a shell command and stop the cell if it fails.

    Returns the exit status reported by `os.system` (always 0 on success).
    Raises RuntimeError on a non-zero status, so that a failed download is
    not followed by an attempt to execute a missing or partial installer.
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError(f"Command failed with status {status}: {command}")
    return status


_run_step(f"wget https://repo.anaconda.com/archive/{_installer}")
_run_step(f"chmod +x {_installer}")
# -b: batch (non-interactive) mode, -f: do not fail if the prefix exists,
# -p: installation prefix.
_run_step(f"bash ./{_installer} -b -f -p {conda_loc}")

0

In [None]:
#@title Environment setting for Chembounce
#@markdown This process takes apprx. 3mins
import os
import sys
# git clone
git_plf_n = 'chembounce'
if not os.path.isdir(git_plf_n) and os.path.split(os.getcwd())[1] != git_plf_n:
    print("Downloading git repository...")
    !git clone https://github.com/jyryu3161/chembounce.git
if os.path.split(os.getcwd())[1] != git_plf_n:
    os.chdir(git_plf_n)
sys.path.append(os.path.abspath("./"))
# environment setting
if not os.path.isfile(f'{conda_loc}/envs/chembounce/bin/python'):
    print("Installing package dependencies...")
    os.system(f"{conda_loc}/bin/conda env create -n chembounce -f ./environment.yml --quiet -y")
# !pip3 install -q --no-warn-conflicts scaffoldgraph==1.1.2 rdkit==2020.09.5 scipy pandas numpy oddt pubchempy molvs

# Dataset
data_installed=False
if os.path.isdir('./data'):
    if len(set(os.listdir('./data')) & set(['fragment_data.pickle','Scaffolds_processed.txt']))==2:
        data_installed = True
if not data_installed:
    print("Downloading data...")
    os.system('bash install.sh')
from google.colab import files
import math
import datetime

# pre-defined default settings
lipinski=True
fragments=[]
replace_scaffold_files=[]

Downloading git repository...
Cloning into 'chembounce'...
remote: Enumerating objects: 200, done.[K
remote: Counting objects: 100% (200/200), done.[K
remote: Compressing objects: 100% (125/125), done.[K
remote: Total 200 (delta 110), reused 147 (delta 70), pack-reused 0 (from 0)[K
Receiving objects: 100% (200/200), 923.56 KiB | 9.72 MiB/s, done.
Resolving deltas: 100% (110/110), done.
Installing package dependencies...
Downloading data...


In [None]:
#@title Basic input parameters {run:'auto'}
#@markdown ##Structures
#@markdown ####input_SMILES
#@markdown #####Input SMILES, the target molecular structure
# Example inputs:
#   imatinib:  CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C)NC4=NC=CC(=N4)C5=CN=CC=C5
#   Gefitinib: COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4
#   Losartan:  CCCCC1=NC(=C(N1CC2=CC=C(C=C2)C3=CC=CC=C3C4=NNN=N4)CO)Cl
input_smiles="COC1=C(C=C2C(=C1)N=CN=C2NC3=CC(=C(C=C3)F)Cl)OCCCN4CCOCC4" #@param {type:'string'}

#@markdown ####core_smiles (optional)
#@markdown #####(Optional) Core SMILES which should not be altered while scaffold hopping
core_smiles="" #@param {type:'string'}

#@markdown ##Parameters
#@markdown ####frag_max_n
#@markdown #####Maximal number of scaffold-hopped candidates for a fragment
frag_max_n=100 #@param {type:'integer'}

#@markdown ####tanimoto_threshold
#@markdown #####Tanimoto similarity threshold, between 0 and 1: used to exclude unrelated molecular structures, based on the similarity between the original structure and the scaffold-hopped one. Default is 0.5
tanimoto_threshold=0.5 #@param {type:'slider',min:0.0, max:1.0, step:0.01}

#@markdown ####output_dir
#@markdown #####Output location
output_dir = './output_Gefitinib' #@param {type:'string'}

#@markdown ####low_mem
#@markdown #####Low memory mode (recommended): Note that ordinary version requires at 55GB RAM, but less than 2GB RAM for low memory mode.
#@markdown #####`use_fp_db`: Use pre-calculated fingerprint DB, but `low_mem` option would be ignored.
low_mem=True #@param {type:'boolean'}
use_fp_db=False #@param {type:'boolean'}


# Make sure the output location exists and point out a result left by a
# previous run.
os.makedirs(output_dir, exist_ok=True)
_previous_result = os.path.join(output_dir, 'result.txt')
if os.path.isfile(_previous_result):
    print('Warning: A result file already exists')

# Echo the chosen settings so they can be checked before the `Run` cell.
_settings_echo = (
    f"Input SMILES\t:{input_smiles}",
    f"\nCore SMILES\t:{core_smiles}",
    f"\nTanimoto threshold\t:{tanimoto_threshold}",
    f"\nTop fragments\t:{frag_max_n}",
    f"\nLow memory mode\t:{low_mem}",
    f"\nFingerprint DB mode\t:{use_fp_db}",
    f"\nOutput location\t:{os.path.abspath(output_dir)}",
)
print(*_settings_echo)

## Optional parameters
To apply optional parameters, run the corresponding cells below, and then run the `Run` cell.

### Limit maximal iteration and number of candidates
##### In addition to `frag_max_n`, which limits the number of scaffold-hopped candidates for a fragment, the following options are available:
- `overall_max_n`: Maximal number of scaffold-hopped candidates across all fragments
- `scaffold_top_n`: Number of scaffolds to test for a fragment.
- `cand_max_n__rplc`: Maximal number of candidates for a replaced scaffold

### Threshold options
##### Min/max for QED, SAscore, logP, MW, number of H donors and number of H acceptors can be limited.

### Fragments and replace scaffold structures
- Target fragment can be defined.
- If not imposed, `ChemBounce` will automatically look for possible fragments of the input structure.
- Corresponding replacement structures can also be provided as a file, either in TSV format with scores (the higher the score, the higher the priority of the candidate) or as plain SMILES.




In [None]:
#@title (Optional) Maximal iteration and number of candidates {run:'auto'}
#@markdown ####overall_max_n
#@markdown #####Maximal number of scaffold-hopped candidates for overall fragments
overall_max_n = None #@param {type:'integer'}

#@markdown ####scaffold_top_n
#@markdown #####Number of scaffolds to test for a fragment
scaffold_top_n = None #@param {type:'integer'}

#@markdown ####cand_max_n__rplc
#@markdown #####Maximal number of candidates for a replaced scaffold
cand_max_n__rplc=10 #@param {type:'integer'}

# Build the extra command-line options that the `Run` cell appends to its
# command. Only real numbers are forwarded: None, NaN and values of any
# other type (including booleans) are skipped.
_cnt_opt_cmd_ = ""
for _opt_name in ('overall_max_n', 'scaffold_top_n', 'cand_max_n__rplc'):
    # Plain namespace lookup; eval() is not needed to read a variable by name.
    _opt_value = globals()[_opt_name]
    if type(_opt_value) in (float, int) and not math.isnan(_opt_value):
        _cnt_opt_cmd_ += f" --{_opt_name} {_opt_value} "

# Echo the settings; `frag_max_n` is defined in the basic input cell.
print(
    f"overall_max_n\t:{overall_max_n}",
    f"\nfrag_max_n\t:{frag_max_n}",
    f"\nscaffold_top_n\t:{scaffold_top_n}",
    f"\ncand_max_n__rplc\t:{cand_max_n__rplc}",
)

In [None]:
#@title (Optional) Threshold options {run:'auto'}

#@markdown #### Lipinski's rule of Five
#@markdown ##### Application of Lipinski's rule of five :
#@markdown ##### logp_max=5, mw_max=500, h_donor_max=5, h_acceptor_max=10
lipinski=True #@param {type:'boolean'}

#@markdown #### Molecular properties
#@markdown ##### Min/max for QED, SAscore, logP, MW, number of H donors and number of H acceptors
#@markdown ##### User-defined options take precedence over Lipinski's rule.
qed_min=None #@param {type:'number'}
qed_max=None #@param {type:'number'}
sa_min=None #@param {type:'number'}
sa_max=None #@param {type:'number'}
logp_min=None #@param {type:'number'}
logp_max=None #@param {type:'number'}
mw_min=None #@param {type:'number'}
mw_max=None #@param {type:'number'}
h_donor_min=None #@param {type:'number'}
h_donor_max=None #@param {type:'number'}
h_acceptor_min=None #@param {type:'number'}
h_acceptor_max=None #@param {type:'number'}

print(f'Lipinski : {lipinski}')

# Build the threshold options that the `Run` cell appends to its command.
# Only real numbers are forwarded: None, NaN and values of any other type
# (including booleans) are skipped.
_thr_opt_cmd_ = ""
print('Threshold options:')
for _thr_name in (
    'qed_min', 'qed_max', 'sa_min', 'sa_max', 'logp_min', 'logp_max',
    'mw_min', 'mw_max', 'h_donor_min', 'h_donor_max',
    'h_acceptor_min', 'h_acceptor_max',
):
    # Plain namespace lookup; eval() is not needed to read a variable by name.
    _thr_value = globals()[_thr_name]
    if type(_thr_value) in (float, int) and not math.isnan(_thr_value):
        _thr_opt_cmd_ += f" --{_thr_name} {_thr_value} "
        print(f"{_thr_name}\t:{_thr_value}")
if not _thr_opt_cmd_:
    print('None defined')

In [None]:
#@title (Optional) Fragments and replace scaffolds {run:'auto'}
#@markdown ### Note: for multiple fragments and their corresponding replace scaffolds, duplicate this cell and run

#@markdown ####Fragment SMILES
#@markdown #####Fragment SMILES, the substructure of the target molecular structure. For multiple SMILES, delimit with comma ,
fragment_smiles="" #@param {type:'string'}
# Split on commas, trim surrounding whitespace and drop empty entries.
fragments=[_smi.strip() for _smi in fragment_smiles.split(',') if _smi.strip()]
#@markdown ####Replace scaffold SMILES
#@markdown #####Files for replace scaffold SMILES. Uploaded file for the list of replace mol. For multiple files, delimit with comma ,. If there is no list file for a matched fragment, leave this empty or put nothing between the commas (for multiple files)
replace_scaffold_file="" #@param {type:'string'}

if not replace_scaffold_file:
    _tmp_f_=[]
else:
    _tmp_f_ = [_f.strip() for _f in replace_scaffold_file.split(',')]

# One entry per position; '' means "no replacement file for this fragment".
replace_scaffold_files = []
for _f in _tmp_f_:
    if _f and os.path.isfile(_f):
        replace_scaffold_files.append(_f)
    else:
        if _f:
            # A name was given but no such file exists: report it instead of
            # blanking it silently.
            print(f"Warning: replace scaffold file not found, ignored\t:{_f}")
        replace_scaffold_files.append('')

# Fragments given without a file get an empty entry. The `Run` cell pairs the
# two lists with zip(), so a shorter file list would otherwise drop them.
if len(replace_scaffold_files) < len(fragments):
    replace_scaffold_files += [''] * (len(fragments) - len(replace_scaffold_files))

if len(fragments) != len(replace_scaffold_files):
    print(f"Warning: number of fragments and corresponding files are not matched!")
else:
    print(f"Number of input fragments\t:{len(fragments)}")

# Run

In [None]:
#@title Run
#@markdown Once this process is finished, the result will be downloaded : OUTPUT_DIR.tar.gz

#@markdown For a test run, Gefitinib took approximately 1.5hrs with the default parameters.
import shlex  # shell-safe quoting of the user-supplied strings below

print(
    f"Input SMILES\t:{input_smiles}",
    f"\nCore SMILES\t:{core_smiles}",
    f"\nThreshold\t:{tanimoto_threshold}",
    f"\nNumber of maximal candidates for a fragments\t:{frag_max_n}",
    f"\nLow memory mode\t:{low_mem}",
    f"\nOutput location\t:{os.path.abspath(output_dir)}"
)
# command line - defaults
cmd = "/usr/local/envs/chembounce/bin/python chembounce.py "
# IO, Tanimoto threshold, top_n
# SMILES and paths come from form fields and may contain characters that are
# special to the shell, so each one is quoted as a single argument.
cmd += f" -o {shlex.quote(os.path.abspath(output_dir))} -i {shlex.quote(input_smiles)} "
cmd += f" --tanimoto_threshold {tanimoto_threshold} "
cmd += f" --frag_max_n {frag_max_n} "
if core_smiles and core_smiles!="None":
    cmd += f" --core_smiles {shlex.quote(core_smiles)} "
if low_mem:
    cmd += " -l "
if type(use_fp_db) == bool:
    if use_fp_db:
        cmd += " --use-fingerprint-db "
    else:
        cmd += " --no-fingerprint-db "
# Lipinski
if not lipinski:
    cmd += " --wo_lipinski "
# command line - optionals
# The two option strings exist only if the corresponding optional cells ran.
# iteration cmd
cmd += globals().get('_cnt_opt_cmd_', "")
# Threshold
cmd += globals().get('_thr_opt_cmd_', "")
# fragments and replace mols ('' is passed as an explicit empty argument)
for frag, rplc in zip(fragments,replace_scaffold_files):
    cmd += f" --fragments {shlex.quote(frag)} --replace_scaffold_file {shlex.quote(rplc)} "

# Process
start = datetime.datetime.now()
print("Started:\t",start)
print(cmd)
_status = os.system(cmd)
end = datetime.datetime.now()
print("Finished:\t",end)
print("Time cost:\t",end-start)
if _status != 0:
    # Still archive whatever was produced, but make the failure visible.
    print(f"Warning: the run exited with a non-zero status ({_status}); the output may be incomplete")

# normpath removes a trailing slash, which would otherwise place the archive
# inside the directory that is being archived.
_out_dir = os.path.normpath(output_dir)
_archive = f"{_out_dir}.tar.gz"
os.system(f'tar -czvf {shlex.quote(_archive)} {shlex.quote(_out_dir)}')
files.download(_archive)

In [None]:
#@title (Optional) Uploading replace_scaffold_file
#@markdown Uploading a file for replace_scaffold_file.
#@markdown Currently, this code is commented out.

# To enable the upload, uncomment the line below. `files` is imported from
# `google.colab` in the environment-setting cell.
#f = files.upload()