In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp
from joblib import Memory
from tqdm import tqdm

In [2]:
# Cache configuration: joblib stores results on disk under cache/ so that
# re-running the notebook does not repeat PubChem requests already answered.
memory = Memory("cache/", verbose=0)

# Width of every vector returned by get_fingerprint_from_smiles. The all-zero
# fallback has this many entries and decoded fingerprints must match it.
# NOTE(review): 920 = 230 hex characters x 4 bits; assumes PubChem keeps
# returning a 230-character hex fingerprint -- confirm if the API changes.
FINGERPRINT_BITS = 920


@memory.cache
def _lookup_fingerprint_bits(smiles):
    """Fetch and decode the PubChem fingerprint for one SMILES string.

    This is the only part that is cached. Exceptions (network errors, PubChem
    errors, malformed fingerprints) are deliberately NOT caught here: joblib
    stores nothing for a call that raises, so a failed lookup is retried on
    the next run instead of being frozen into the cache.

    Parameters
    ----------
    smiles : str
        SMILES string sent to PubChem.

    Returns
    -------
    numpy.ndarray or None
        float32 vector of 0/1 values, four bits per hex character of
        ``Compound.fingerprint`` (most significant bit first), or ``None``
        when PubChem returns no compound with a CID, or the compound has no
        fingerprint.

    Raises
    ------
    ValueError
        If the fingerprint contains a non-hex character or does not decode to
        exactly ``FINGERPRINT_BITS`` bits.
    """
    # PubChem lookup -> CID
    compounds = pcp.get_compounds(smiles, namespace="smiles")
    if not compounds or not compounds[0].cid:
        return None

    # NOTE(review): get_compounds may already return the full record, which
    # would make this second request redundant -- kept to leave the data path
    # unchanged; confirm before removing.
    drug_data = pcp.Compound.from_cid(compounds[0].cid)
    if not drug_data.fingerprint:
        return None

    # Each hex character expands to four bits. An invalid character raises
    # ValueError instead of being skipped, because skipping would shorten the
    # vector and shift every following bit to the wrong column.
    bits = [
        int(bit)
        for hex_char in drug_data.fingerprint
        for bit in f"{int(hex_char, 16):04b}"
    ]
    if len(bits) != FINGERPRINT_BITS:
        raise ValueError(
            f"decoded fingerprint has {len(bits)} bits, expected {FINGERPRINT_BITS}"
        )
    return np.array(bits, dtype=np.float32)


def get_fingerprint_from_smiles(smiles):
    """Return the PubChem fingerprint of ``smiles`` as a fixed-width vector.

    Successful lookups (including "PubChem has no such compound") are served
    from the on-disk joblib cache. Failures are reported on stdout and answered
    with the zero fallback, but are not cached, so they are retried next run.

    Note: this name is now a plain function; the joblib-memoized object is
    ``_lookup_fingerprint_bits``.

    Parameters
    ----------
    smiles : str
        SMILES string to look up.

    Returns
    -------
    numpy.ndarray
        float32 array of length ``FINGERPRINT_BITS``; all zeros when no
        fingerprint could be obtained.
    """
    try:
        bits = _lookup_fingerprint_bits(smiles)
    except Exception as e:
        print(f"[PubChem FAIL] {smiles}: {e}")
        bits = None

    if bits is None:
        # Fallback: fill with zeros (a fair baseline)
        return np.zeros(FINGERPRINT_BITS, dtype=np.float32)
    return bits

In [3]:
# Build the fingerprint feature table for the drugs listed under gdsc1_data.
PATH = "../gdsc1_data/"

# Read the drug/SMILES table. The first CSV column is loaded as the index and
# immediately discarded; rows are then ordered by the "Drug" column.
SMILES = pd.read_csv(PATH + "drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values("Drug")

# Fingerprints are computed in the sorted row order, so they line up with the
# drug names used as the output index below.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# One row per drug (labelled by name), one column per fingerprint position.
pd.DataFrame(data=tmp, index=SMILES["Drug"].tolist()).to_csv(
    PATH + "nih_drug_feature.csv"
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 300/300 [02:45<00:00,  1.81it/s]


In [4]:
# Build the fingerprint feature table for the drugs listed under gdsc2_data.
PATH = "../gdsc2_data/"

# Read the drug/SMILES table. The first CSV column is loaded as the index and
# immediately discarded; rows are then ordered by the "Drug" column.
SMILES = pd.read_csv(PATH + "drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values("Drug")

# Fingerprints are computed in the sorted row order, so they line up with the
# drug names used as the output index below.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# One row per drug (labelled by name), one column per fingerprint position.
pd.DataFrame(data=tmp, index=SMILES["Drug"].tolist()).to_csv(
    PATH + "nih_drug_feature.csv"
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 154/154 [00:37<00:00,  4.14it/s]


In [5]:
# Build the fingerprint feature table for the drugs listed under ctrp_data.
PATH = "../ctrp_data/"

# Read the drug/SMILES table. The first CSV column is loaded as the index and
# immediately discarded; rows are then ordered by the "Drug" column.
SMILES = pd.read_csv(PATH + "drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values("Drug")

# Fingerprints are computed in the sorted row order, so they line up with the
# drug names used as the output index below.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# One row per drug (labelled by name), one column per fingerprint position.
pd.DataFrame(data=tmp, index=SMILES["Drug"].tolist()).to_csv(
    PATH + "nih_drug_feature.csv"
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 494/494 [04:29<00:00,  1.84it/s]


In [6]:
# Build the fingerprint feature table for the selected drugs under nci_data.
PATH = "../nci_data/"

# Drug activity table (its index is matched against NSC ids below) and the
# mechanism-of-action (MOA) table, which carries NSC, MECHANISM and SMILES.
drugAct = pd.read_csv(PATH + "drugAct.csv", index_col=0)
moa = pd.read_csv("../data/nsc_cid_smiles_class_name.csv", index_col=0)

# Keep only the drugs that have an entry (and therefore a SMILES) in moa.
has_moa_entry = drugAct.index.isin(moa.NSC)
drugAct = drugAct[has_moa_entry]

# From the synonym table, collect the nci60 ids of drugs that are also
# present in at least one of the ctrp / gdsc1 / gdsc2 columns.
synonyms = pd.read_csv("../data/drugSynonym.csv")
in_nci60 = synonyms.nci60.notna()
in_other_dataset = (
    synonyms.ctrp.notna() | synonyms.gdsc1.notna() | synonyms.gdsc2.notna()
)
shared = synonyms[in_nci60 & in_other_dataset]
# A single nci60 cell may hold several ids separated by "|".
shared_nsc = {int(nsc) for nsc in shared["nci60"].str.split("|").explode()}

# Keep drugs whose mechanism is not "Other", plus every shared drug, and
# order the activity table by id.
specific_moa_nsc = set(moa.loc[moa["MECHANISM"] != "Other", "NSC"])
selected_nsc = set(drugAct.index) & (specific_moa_nsc | shared_nsc)
drugAct = drugAct.loc[sorted(selected_nsc)]
SMILES = moa[moa.NSC.isin(drugAct.index)]

# Special case: for this exact SMILES only the part before the first "." is
# sent to PubChem, i.e. the trailing ".[Na+]" fragment is dropped.
# NOTE(review): presumably the salt form is not resolvable -- confirm.
target_smiles = "C(C(C(=O)O)N)[N+](=NO)[O-].[Na+]"
queries = [
    entry.split(".")[0] if entry == target_smiles else entry
    for entry in SMILES["SMILES"]
]

tmp = [
    get_fingerprint_from_smiles(query)
    for query in tqdm(queries, desc="Processing SMILES")
]

# One row per selected drug (labelled by NSC), one column per fingerprint
# position.
pd.DataFrame(data=tmp, index=SMILES["NSC"].tolist()).to_csv(
    PATH + "nih_drug_feature.csv"
)

Processing SMILES: 100%|████████████████████████████████████████████████████████████████████| 976/976 [06:36<00:00,  2.46it/s]
