In [23]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import Descriptors
import pandas as pd
import numpy as np
import torch
import os
import yaml
from tqdm.notebook import tqdm


In [33]:
with open('../config/filepath.yml', 'r') as f:
    path_config = yaml.safe_load(f)

dude_dir = os.path.join('..', path_config['data']['DUD-E'])
alphafold_dir = os.path.join('..', path_config['data']['alphafold'])
smiles_dir = os.path.join('..', path_config['data']['smiles'])
output_dir = os.path.join('..', path_config['data']['output'])
hist_dir = os.path.join('..', path_config['data']['hist'])
preprocessed_dir = os.path.join('..', path_config['data']['preprocessed'])
sample_dir = os.path.join('..', path_config['data']['samples'])
test_dir = os.path.join('..', path_config['data']['test'])

# filter smiles
生成モデルの事前学習に使うためのリガンドファイルを用意するためのスクリプト
### read smiles from tsv

In [8]:
smiles_file = os.path.join(smiles_dir, 'chembl_35.tsv')

smiles_df = pd.read_csv(smiles_file, sep='\t')

print(smiles_df.head())
print(smiles_df.shape)

      chembl_id                                   canonical_smiles  \
0  CHEMBL153534                       Cc1cc(-c2csc(N=C(N)N)n2)cn1C   
1  CHEMBL440060  CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...   
2  CHEMBL440245  CCCC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](CC(C)C)NC(...   
3  CHEMBL440249  CC(C)C[C@@H]1NC(=O)CNC(=O)[C@H](c2ccc(O)cc2)NC...   
4  CHEMBL405398             Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1   

                                      standard_inchi  \
0  InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10...   
1  InChI=1S/C123H212N44O34S/c1-19-63(12)96(164-11...   
2  InChI=1S/C160H268N50O41/c1-23-27-41-95-134(228...   
3  InChI=1S/C124H154ClN21O39/c1-57(2)48-81-112(17...   
4  InChI=1S/C19H21BrN6O/c20-15-2-1-3-17(18(15)22-...   

            standard_inchi_key  
0  MFRNFCWYPYSFQQ-UHFFFAOYSA-N  
1  RSEQNZQKBMRQNM-VRGFNVLHSA-N  
2  FTKBTEIKPOYCEX-OZSLQWTKSA-N  
3  UYSXXKGACMHPIM-KFGDMSGDSA-N  
4  VDSXZXJEWIWBCG-UHFFFAOYSA-N  
(2474590, 4)


### preprocess and filter smiles 
やること
1. 複合体の場合、最も分子量の大きいものを対象リガンドとして扱う
2. リピンスキーの法則にならって分子量 600 以上の化合物をフィルター
3. 重複を排除

In [26]:
def extract_higher_MW(s_list: list) -> list:
    out = []
    for s in s_list:
        smis = s.split('.')
        mols = []
        for smi in smis:
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)

        if len(mols) == 0:
            continue

        Mws = [Descriptors.MolWt(mol) for mol in mols]
        max_Mw_idx = Mws.index(max(Mws))

        # 分子量600以上のものを排除
        if Mws[max_Mw_idx] > 600:
            continue
        
        out.append(Chem.MolToSmiles(mols[max_Mw_idx]))
    return out

def remove_dup(smis: list) -> list:
    smis = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smis]
    smis = list(set(smis))
    return smis

def process_and_filter_smiles(df: pd.DataFrame, smiles_col: str) -> pd.DataFrame:
    tqdm.pandas(desc="Processing SMILES")

    # smiles列を処理し、有効な結果のみを保持
    def process_smiles(smiles):
        processed = remove_dup(extract_higher_MW([smiles]))
        return ".".join(processed) if processed else None

    df[smiles_col] = df[smiles_col].progress_apply(process_smiles)
    
    # smiles列がNoneまたは空の行を削除
    df = df[df[smiles_col].notna() & (df[smiles_col] != "")]
    
    return df

In [27]:
filtered_df = process_and_filter_smiles(smiles_df, 'canonical_smiles')

Processing SMILES:   0%|          | 0/2474590 [00:00<?, ?it/s]

[16:53:25] Can't kekulize mol.  Unkekulized atoms: 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
[16:53:34] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:53:36] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:53:37] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:54:54] Can't kekulize mol.  Unkekulized atoms: 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
[16:57:09] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:57:09] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:57:09] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:57:12] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:58:05] Explicit valence for atom # 1 P, 7, is greater than permitted
[16:58:05] Explicit valenc

In [32]:
print(filtered_df.head())
print(filtered_df.shape)

max_smi_len = max([len(smi) for smi in filtered_df['canonical_smiles']])
print(smiles_df[smiles_df['canonical_smiles'].str.len() == max_smi_len])

      chembl_id                                   canonical_smiles  \
0  CHEMBL153534                       Cc1cc(-c2csc(N=C(N)N)n2)cn1C   
4  CHEMBL405398             Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1   
5  CHEMBL503634               COc1c(O)cc(O)c(C(=N)Cc2ccc(O)cc2)c1O   
6  CHEMBL503643                   CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1   
7  CHEMBL503865  CC(=O)O[C@@H]1[C@@H](OC(C)=O)/C(C)=C\[C@@H]2OC...   

                                      standard_inchi  \
0  InChI=1S/C10H13N5S/c1-6-3-7(4-15(6)2)8-5-16-10...   
4  InChI=1S/C19H21BrN6O/c20-15-2-1-3-17(18(15)22-...   
5  InChI=1S/C15H15NO5/c1-21-15-12(19)7-11(18)13(1...   
6  InChI=1S/C12H11NO4/c1-2-17-12(16)10-6-8-5-7(11...   
7  InChI=1S/C28H36O13/c1-12-11-18-28(27(8,41-28)2...   

            standard_inchi_key  
0  MFRNFCWYPYSFQQ-UHFFFAOYSA-N  
4  VDSXZXJEWIWBCG-UHFFFAOYSA-N  
5  OPELSESCRGGKAM-UHFFFAOYSA-N  
6  CAVYPAYXEMVXMS-UHFFFAOYSA-N  
7  NMFRJERNUSBMLR-BOVHOEAXSA-N  
(2264749, 4)
             chembl_id         

### save filtered df

In [31]:
filtered_df.to_csv(os.path.join(preprocessed_dir, 'filtered_chembl_35.csv'), index=False)

ドッキングスコア予測のテストに用いているリガンドを除く

In [38]:
regression_test_df = pd.read_csv(os.path.join(test_dir, 'test.csv'))

filterd_df_data_num = filtered_df.shape[0]
print(f"before process data num: {filterd_df_data_num}")
print(f"test data num: {regression_test_df.shape[0]}") # ZINC のデータも含まれているのでこの数値の分だけデータが減るわけではない

filtered_df = filtered_df[~filtered_df['canonical_smiles'].isin(regression_test_df['Canonical_SMILES'])]

filterd_df_data_num = filtered_df.shape[0]
print(f"after process data num: {filterd_df_data_num}")

before process data num: 2264259
test data num: 32092
after process data num: 2263982


### save df

In [39]:
filtered_df.to_csv(os.path.join(preprocessed_dir, 'filtered_chembl_35_no_test.csv'), index=False)