In [68]:
import os

import deepchem as dc
import hickle as hkl
import pandas as pd
from rdkit import Chem, RDLogger
from tqdm import tqdm

In [69]:
# Keep synonym rows that carry an NCI-60 id and also appear in at least one of
# CTRP, GDSC1 or GDSC2.
tmp = pd.read_csv("../data/drugSynonym.csv")
in_nci60 = tmp.nci60.notna()
in_other_panel = tmp.ctrp.notna() | tmp.gdsc1.notna() | tmp.gdsc2.notna()
tmp = tmp[in_nci60 & in_other_panel]

# An nci60 cell may hold several "|"-separated ids; flatten them, de-duplicate
# the strings, then convert each one to int.
unique_nsc_strings = set(tmp["nci60"].str.split("|").explode())
tmp = list(map(int, unique_nsc_strings))

# Candidate drugs: rows whose NSC is in the cross-listed ids, plus every row
# whose MECHANISM is not "Other" (exact duplicate rows removed afterwards).
df = pd.read_csv("../data/nsc_cid_smiles_class_name.csv")
cross_listed = df[df.NSC.isin(tmp)]
with_mechanism = df[df.MECHANISM != "Other"]
df = pd.concat([cross_listed, with_mechanism]).drop_duplicates()

# Keep only drugs whose NSC appears in the index of drugAct.csv.
drugAct = pd.read_csv('../nci_data/drugAct.csv', index_col=0)
df = df[df.NSC.isin(drugAct.index)]

In [70]:
save_dir = "../nci_data/drug_graph_feat/"
os.makedirs(save_dir, exist_ok=True)  # no error if the directory already exists

# Map each NSC id to its SMILES string (a repeated NSC keeps its last SMILES).
nsc2smile = dict(zip(df["NSC"], df["SMILES"]))

featurizer = dc.feat.graph_features.ConvMolFeaturizer()

for nsc, smile in tqdm(nsc2smile.items()):
    try:
        # Parse the SMILES; RDKit returns None when it cannot build a molecule.
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            print(f"分子の生成に失敗: NSC {nsc}")
            continue

        # Workaround for tin compounds: force a -1 formal charge on every Sn atom
        # before featurizing.
        for atom in mol.GetAtoms():
            if atom.GetSymbol() == "Sn":
                atom.SetFormalCharge(-1)

        mol_object = featurizer.featurize([mol])
        # A length check alone never fires for a one-molecule input: DeepChem
        # reports a failed molecule by placing an empty array in its slot, so
        # also verify that the slot really carries ConvMol attributes.
        if len(mol_object) == 0 or not hasattr(mol_object[0], "atom_features"):
            print(f"特徴量抽出失敗: NSC {nsc}")
            continue

        # Persist the three graph components as one .hkl file named after the NSC id.
        conv_mol = mol_object[0]
        hkl.dump(
            [
                conv_mol.atom_features,
                conv_mol.canon_adj_list,
                conv_mol.deg_list,
            ],
            os.path.join(save_dir, f"{nsc}.hkl"),
        )

    except Exception as e:
        # Best-effort batch job: report the failing drug and move on to the next one.
        print(f"エラー発生: NSC {nsc} - {str(e)}")

100%|████████████████████████████████████████████████████████████████████████████████████| 1005/1005 [00:06<00:00, 159.09it/s]


In [55]:
def get_feature(df, save_dir, id_col="drugs"):
    """Featurize every drug in ``df`` and write one ``.hkl`` file per drug.

    For each (id, SMILES) pair the SMILES is parsed with RDKit, converted with
    DeepChem's ``ConvMolFeaturizer`` and the list
    ``[atom_features, canon_adj_list, deg_list]`` is dumped with hickle to
    ``<save_dir>/<id>.hkl``. Drugs that cannot be parsed, featurized or saved
    are reported with ``print`` and skipped; the function never raises for a
    single bad drug.

    Args:
        df: DataFrame with a ``"SMILES"`` column and an identifier column.
            When an identifier occurs more than once, its last SMILES is used.
        save_dir: Output directory; created if it does not exist.
        id_col: Name of the identifier column. Defaults to ``"drugs"``; pass
            e.g. ``"NSC"`` for tables keyed by NSC id.

    Returns:
        None. Results are written to ``save_dir``.
    """
    os.makedirs(save_dir, exist_ok=True)

    nsc2smile = dict(zip(df[id_col], df["SMILES"]))

    featurizer = dc.feat.graph_features.ConvMolFeaturizer()

    for nsc, smile in tqdm(nsc2smile.items()):
        try:
            # Parse the SMILES; RDKit returns None when it cannot build a molecule.
            mol = Chem.MolFromSmiles(smile)
            if mol is None:
                print(f"分子の生成に失敗: NSC {nsc}")
                continue

            # Workaround for tin compounds: force a -1 formal charge on every
            # Sn atom before featurizing.
            for atom in mol.GetAtoms():
                if atom.GetSymbol() == "Sn":
                    atom.SetFormalCharge(-1)

            mol_object = featurizer.featurize([mol])
            # A length check alone never fires for a one-molecule input:
            # DeepChem reports a failed molecule by placing an empty array in
            # its slot, so also verify the slot carries ConvMol attributes.
            if len(mol_object) == 0 or not hasattr(mol_object[0], "atom_features"):
                print(f"特徴量抽出失敗: NSC {nsc}")
                continue

            # Persist the three graph components as one file named after the id.
            conv_mol = mol_object[0]
            hkl.dump(
                [
                    conv_mol.atom_features,
                    conv_mol.canon_adj_list,
                    conv_mol.deg_list,
                ],
                os.path.join(save_dir, f"{nsc}.hkl"),
            )

        except Exception as e:
            # Best-effort batch job: report the failing drug and keep going.
            print(f"エラー発生: NSC {nsc} - {str(e)}")

In [52]:
# GDSC1: load the drug -> SMILES table, then make sure the output directory exists.
PATH = '../gdsc1_data/'
df = pd.read_csv(os.path.join(PATH, "drug2smiles.csv"))
save_dir = os.path.join(PATH, "drug_graph_feat/")
os.makedirs(save_dir, exist_ok=True)  # no error if the directory already exists

In [53]:
# Write one .hkl graph-feature file per drug in df into save_dir
# (df / save_dir as set by the preceding GDSC1 cell).
get_feature(df, save_dir)

100%|██████████████████████████████████████████████████████████████████████████████████████| 331/331 [00:02<00:00, 155.37it/s]


In [56]:
# GDSC2: load the drug -> SMILES table, then make sure the output directory exists.
PATH = '../gdsc2_data/'
df = pd.read_csv(os.path.join(PATH, "drug2smiles.csv"))
save_dir = os.path.join(PATH, "drug_graph_feat/")
os.makedirs(save_dir, exist_ok=True)  # no error if the directory already exists

In [57]:
# Write one .hkl graph-feature file per drug in df into save_dir
# (df / save_dir as set by the preceding GDSC2 cell).
get_feature(df, save_dir)

100%|██████████████████████████████████████████████████████████████████████████████████████| 240/240 [00:01<00:00, 134.53it/s]


In [58]:
# CTRP: load the drug -> SMILES table, then make sure the output directory exists.
PATH = '../ctrp_data/'
df = pd.read_csv(os.path.join(PATH, "drug2smiles.csv"))
save_dir = os.path.join(PATH, "drug_graph_feat/")
os.makedirs(save_dir, exist_ok=True)  # no error if the directory already exists

In [59]:
# Write one .hkl graph-feature file per drug in df into save_dir
# (df / save_dir as set by the preceding CTRP cell).
get_feature(df, save_dir)

100%|██████████████████████████████████████████████████████████████████████████████████████| 460/460 [00:02<00:00, 162.79it/s]
