In [5]:
from rdkit import Chem
from CombineMols.CombineMols import CombineMols
import os

In [6]:
# Substructure queries used by has_crf (each is compiled there with
# Chem.MolFromSmarts). CRF_PATTERN is the complete motif: a three-membered
# C-N=N ring whose carbon also carries a but-3-ynyl chain. CRF_PATTERN_0 and
# CRF_PATTERN_1 are the looser alkyne / N=N fragments used as a fallback.
CRF_PATTERN = "CC1(CCC#C)N=N1"
CRF_PATTERN_0 = "C#CC"
CRF_PATTERN_1 = "N=N"

# SMILES of the two CRF reagents.
# NOTE: an earlier crf_0 variant, "C#CCC1(N=N1)CCNC(CC)=O", used to be assigned
# here and was immediately overwritten by the value below; it never took effect.
CRF_0_SMILES = "ONC(=O)CCC1(CCC#C)N=N1"
CRF_1_SMILES = "C#CCC1(N=N1)CCC(=O)[N]"

# Reagent molecules consumed by attach_crf.
crf_0 = Chem.MolFromSmiles(CRF_0_SMILES)
crf_1 = Chem.MolFromSmiles(CRF_1_SMILES)

# Chem.MolFromSmiles returns None instead of raising on an invalid SMILES;
# fail here with a clear message rather than deep inside CombineMols.
if crf_0 is None or crf_1 is None:
    raise ValueError("Could not parse one of the CRF reagent SMILES")

In [7]:
def has_crf(mol):
    """Return True when `mol` contains the CRF motif, False otherwise.

    The molecule qualifies if it matches the full CRF_PATTERN query. When it
    does not, it still qualifies if it matches both CRF_PATTERN_0 and
    CRF_PATTERN_1; the two fragments are tested independently, so they may
    match anywhere in the molecule.
    """
    # Preferred check: the complete motif in one piece.
    if mol.HasSubstructMatch(Chem.MolFromSmarts(CRF_PATTERN)):
        return True

    # Fallback: both fragments must be present. The generator keeps the
    # evaluation lazy, so the second query is only built if the first matched.
    return all(
        mol.HasSubstructMatch(Chem.MolFromSmarts(fragment))
        for fragment in (CRF_PATTERN_0, CRF_PATTERN_1)
    )

In [8]:
def attach_crf(smiles):
    """Attach the crf_0 reagent to a molecule and return the product SMILES.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule to modify.

    Returns
    -------
    list of str
        One SMILES (as written by Chem.MolToSmiles) per retained product.
        A product is retained only if it is a single fragment, can be
        re-parsed by RDKit, and passes has_crf. The list is empty when the
        input SMILES cannot be parsed or when no product survives.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None (it does not raise) for an unparseable SMILES;
        # there is nothing to combine, so report "no products".
        return []

    # Only the crf_0 / "O" combination is active. The crf_1 / "N" combination
    # was already disabled (it contributed an empty list) in the original.
    # NOTE(review): the third argument is presumably the attachment atom
    # symbol - confirm against CombineMols.
    candidates = CombineMols(mol, crf_0, "O")

    result = []
    for candidate in candidates:
        candidate_smiles = Chem.MolToSmiles(candidate)
        # A "." in a SMILES separates disconnected fragments; skip those.
        if "." in candidate_smiles:
            continue
        # Round-trip through SMILES and drop anything RDKit cannot re-parse.
        product = Chem.MolFromSmiles(candidate_smiles)
        if product is None:
            continue
        if not has_crf(product):
            continue
        result.append(Chem.MolToSmiles(product))
    return result


# Smoke-test attach_crf on a single hand-picked molecule.
cases = attach_crf(
    smiles="CC1(C)CC(OC(=O)CN2CN(c3ccccc3)C3(CCN(C(=O)c4ccc(C5CCCCC5)cc4)CC3)C2=O)CC(C)(C)N1"
)

## Background ChEMBL

In [14]:
import pandas as pd
from tqdm import tqdm


def attach_crf_to_chembl_smiles(idx):
    """Run attach_crf over one ChEMBL sample file and save the results.

    Reads one SMILES per line from
    ``../data/chembl33_sample_20k_{idx}.smi`` and writes
    ``../data/chembl33_sample_20k_with_crf_{idx}.csv`` with two columns:
    ``smiles`` (the input) and ``smiles_with_crf`` (the products joined by
    ``"; "``). Inputs that yield no product, or for which attach_crf raises,
    are left out of the output.

    Parameters
    ----------
    idx : int or str
        Value substituted into the input and output file names.

    Warns
    -----
    UserWarning
        Emitted after the CSV is written if attach_crf raised for any input,
        reporting how many inputs were skipped for that reason.
    """
    import warnings

    input_name = "../data/chembl33_sample_20k_{0}.smi".format(idx)
    file_name = "../data/chembl33_sample_20k_with_crf_{0}.csv".format(idx)

    with open(input_name, "r") as f:
        smiles_list = [line.rstrip(os.linesep) for line in f]

    R = []
    n_failed = 0
    for smi in tqdm(smiles_list):
        try:
            cases = attach_crf(smi)
        except Exception:
            # Best-effort: one bad molecule must not abort a multi-hour run.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt
            # and SystemExit working, so the loop can still be interrupted.
            n_failed += 1
            continue
        if len(cases) == 0:
            continue
        R.append((smi, "; ".join(cases)))

    d = pd.DataFrame(R, columns=["smiles", "smiles_with_crf"])
    d.to_csv(file_name, index=False)

    # Report failures only after the results are safely on disk.
    if n_failed:
        warnings.warn(
            "attach_crf raised for {0} of {1} SMILES from {2}; "
            "those inputs were skipped".format(
                n_failed, len(smiles_list), input_name
            )
        )


# Process the three ChEMBL sample files (indices 0, 1 and 2) in order.
for sample_idx in range(3):
    attach_crf_to_chembl_smiles(sample_idx)

100%|██████████| 20000/20000 [1:59:29<00:00,  2.79it/s]   
100%|██████████| 20000/20000 [2:04:17<00:00,  2.68it/s]     
100%|██████████| 20000/20000 [1:53:50<00:00,  2.93it/s]   


## GSF Lab

In [21]:
import pandas as pd

# Load the GSF inhibitor table and keep only rows that have both a structure
# and a primary_SLC value.
df = pd.read_csv("../data/slc_inhibitor_collection_gsf.tsv", sep="\t").dropna(
    subset=["structure", "primary_SLC"]
)

In [22]:
from tqdm import tqdm

# Run attach_crf on every GSF compound, keeping only those with a product.
R = []
for gene_name, structure in tqdm(df[["primary_SLC", "structure"]].values):
    cases = attach_crf(structure)
    if len(cases) == 0:
        continue
    R.append((gene_name, structure, "; ".join(cases)))

dr = pd.DataFrame(R, columns=["gene_name", "smiles", "smiles_with_crf"])
# NOTE(review): the file is named .tsv but to_csv is called with its default
# comma separator, so the content is comma-delimited. Left unchanged because
# downstream readers may rely on it - confirm, then pass sep="\t" or rename.
dr.to_csv("../data/slc_inhibitor_collection_gsf_with_auto_crf.tsv", index=False)

# Write the distinct gene names that received a product, one per line.
unique_slcs = sorted(set(dr["gene_name"]))
with open("../data/examples/slc_from_gsf.txt", "w") as f:
    for slc_name in unique_slcs:
        # Text-mode files translate "\n" to the platform line ending on write;
        # writing os.linesep here would produce "\r\r\n" on Windows.
        f.write(slc_name + "\n")

100%|██████████| 20235/20235 [1:30:20<00:00,  3.73it/s]  
