In [1]:
!git clone --branch bert-phase-2 "https://github.com/nguyenrtm/DDI-KT-2024.git"
!pip install rdkit torch_geometric

Cloning into 'DDI-KT-2024'...
remote: Enumerating objects: 4761, done.[K
remote: Counting objects: 100% (1685/1685), done.[K
remote: Compressing objects: 100% (521/521), done.[K
remote: Total 4761 (delta 1235), reused 1559 (delta 1160), pack-reused 3076[K
Receiving objects: 100% (4761/4761), 106.64 MiB | 20.30 MiB/s, done.
Resolving deltas: 100% (3097/3097), done.
Updating files: 100% (1612/1612), done.
Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting torch_geometric
  Downloading torch_geometric-2.5.2-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloadi

In [2]:
import os

# Enter the cloned repository so the `ddi_kt_2024` package and the relative
# `cache/...` paths used below resolve. The guard makes the cell safe to re-run:
# once the kernel is already inside the clone, a second chdir would fail because
# there is no nested 'DDI-KT-2024' directory.
if os.path.basename(os.getcwd()) != 'DDI-KT-2024':
    os.chdir('DDI-KT-2024')

In [3]:
from ddi_kt_2024.utils import get_lookup, load_pkl, dump_pkl, get_labels, rm_no_smiles, read_index, convert_to_label_list
from ddi_kt_2024.text.model.word_embedding import WordEmbedding
from ddi_kt_2024.text.model.custom_dataset import CustomDataset
from ddi_kt_2024.mol.preprocess import mapped_property_reader, get_property_dict, find_drug_property, candidate_property

In [4]:
# Load the cached train / test candidate pickles.
all_candidates_train, all_candidates_test = (
    load_pkl('cache/pkl/v1/candidates.train.pkl'),
    load_pkl('cache/pkl/v1/candidates.test.pkl'),
)

# Read the mapped-drug CSV and build a property dict for its 'smiles' property.
df = mapped_property_reader('cache/mapped_drugs/DDI/full.csv')
mapped_formula = get_property_dict(df, property_name='smiles')

# Apply `candidate_property` to each split in turn (train first, then test);
# every call's result is unpacked into an (x, y) pair.
(x_train, y_train), (x_test, y_test) = (
    candidate_property(candidates, mapped_formula)
    for candidates in (all_candidates_train, all_candidates_test)
)

In [5]:
import rdkit
from rdkit import Chem
from rdkit.Chem import BRICS, FunctionalGroups, Fragments, Recap
from rdkit.Chem import Draw

def decompose(smiles):
    """Split a molecule into its BRICS fragments.

    Args:
        smiles: SMILES string of the molecule to fragment.

    Returns:
        A ``(fragment_smiles, fragment_mols)`` tuple. ``fragment_mols`` is the
        list of fragment molecules returned by ``BRICSDecompose`` and
        ``fragment_smiles`` holds ``Chem.MolToSmiles`` of each of them, in the
        same order.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside the BRICS code.
        raise ValueError(f"could not parse SMILES: {smiles!r}")
    fragment_mols = list(Chem.BRICS.BRICSDecompose(mol, minFragmentSize=1, returnMols=True))
    fragment_smiles = [Chem.MolToSmiles(frag) for frag in fragment_mols]
    return fragment_smiles, fragment_mols

def find_brics_bonds(smiles):
    """List the bonds BRICS would cleave in a molecule.

    Args:
        smiles: SMILES string of the molecule to inspect.

    Returns:
        A list with every item yielded by ``FindBRICSBonds`` for the molecule.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; surface the
        # offending input rather than an opaque error from the BRICS search.
        raise ValueError(f"could not parse SMILES: {smiles!r}")
    return list(Chem.BRICS.FindBRICSBonds(mol))

def plot_chem(smiles):
    """Draw the molecule parsed from ``smiles`` with ``Draw.MolToImage``.

    Args:
        smiles: SMILES string of the molecule to draw.

    Returns:
        Whatever ``Draw.MolToImage`` returns for a 600x300 rendering.
    """
    return Draw.MolToImage(Chem.MolFromSmiles(smiles), size=(600, 300))
    
def plot_brics(smiles):
    """Draw the BRICS fragments of a molecule as an image grid.

    Args:
        smiles: SMILES string of the molecule to fragment and draw.

    Returns:
        Whatever ``Draw.MolsToGridImage`` returns for the fragment molecules,
        laid out four per row in 300x300 cells.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with the
        # offending input instead of an opaque error from BRICSDecompose.
        raise ValueError(f"could not parse SMILES: {smiles!r}")
    fragment_mols = list(Chem.BRICS.BRICSDecompose(mol, minFragmentSize=1, returnMols=True))
    return Draw.MolsToGridImage(fragment_mols, molsPerRow=4, subImgSize=(300, 300))

In [8]:
from tqdm import tqdm

# Decompose every row of `df` whose 'smiles' value is a string.
# brics_full[k] is the fragment-SMILES list and bonds_full[k] the BRICS bond list
# of the k-th such row; rows with a non-string value are skipped, so k is NOT the
# row position in `df`.
brics_full = list()
bonds_full = list()
# Iterate the column directly: the previous `df.iloc[i]['smiles']` built a full
# row Series twice per iteration just to read one cell.
for smiles in tqdm(df['smiles']):
    if not isinstance(smiles, str):
        continue
    # decompose() returns (fragment SMILES, fragment mols); only the SMILES are kept.
    fragment_smiles, _ = decompose(smiles)
    bonds = find_brics_bonds(smiles)
    brics_full.append(fragment_smiles)
    bonds_full.append(bonds)

100%|██████████| 2673/2673 [03:03<00:00, 14.53it/s] 


In [12]:
# Flatten the per-molecule fragment lists into one list, keeping order and duplicates.
brics_all = [fragment for fragments in brics_full for fragment in fragments]

In [17]:
# NOTE(review): `brics_distinct` is not defined in any cell of this notebook (the
# execution counter jumps from 12 to 17), so this cell raised NameError on a
# fresh kernel. It is rebuilt here as the set of unique entries of `brics_all`,
# matching the "set" in the output filename — confirm against the removed cell.
brics_distinct = set(brics_all)
dump_pkl(brics_distinct, 'brics.set.full.pkl')