In [68]:
import numpy as np
import pandas as pd
import pubchempy as pcp
from joblib import Memory
from tqdm import tqdm

In [69]:
# Cache configuration: memoize fingerprint lookups on disk under cache/.
memory = Memory("cache/", verbose=0)

# Length of the fingerprint vector returned by both branches below.
PUBCHEM_FP_BITS = 881


def _decode_pubchem_fingerprint(hex_fingerprint):
    """Turn PubChem's hex-encoded fingerprint into a 0/1 integer array.

    The hex string starts with a 4-byte (8 hex digit) length prefix and ends
    with padding bits; only the PUBCHEM_FP_BITS bits in between are returned.

    Raises:
        ValueError: if the string is too short to hold a full fingerprint.
    """
    payload = hex_fingerprint[8:]  # skip the 4-byte length prefix
    if 4 * len(payload) < PUBCHEM_FP_BITS:
        raise ValueError(
            f"PubChem fingerprint too short: {len(hex_fingerprint)} hex digits"
        )
    # Fixed-width formatting keeps leading zero bits, which int() would drop.
    bit_string = f"{int(payload, 16):0{4 * len(payload)}b}"
    # Slicing to the fingerprint length discards the trailing padding bits.
    return np.array([int(bit) for bit in bit_string[:PUBCHEM_FP_BITS]])


@memory.cache
def get_fingerprint_from_smiles(smiles):
    """Return an 881-element 0/1 fingerprint array for a SMILES string.

    The compound is first looked up on PubChem and its fingerprint is decoded
    bit by bit. If PubChem rejects the request, or returns no compound or no
    fingerprint, an RDKit Morgan bit vector (radius 2, 881 bits) is computed
    locally instead.

    NOTE(review): the fallback only matches the PubChem vector in length; the
    individual bits are defined differently. Confirm that mixing both kinds of
    rows in one feature table is intended.

    Args:
        smiles: SMILES string of the compound.

    Returns:
        numpy.ndarray of 881 integers, each 0 or 1.

    Raises:
        ValueError: if PubChem yields no usable fingerprint and RDKit cannot
            parse the SMILES, or if the PubChem fingerprint is malformed.
    """
    try:
        compounds = pcp.get_compounds(smiles, namespace="smiles")
        if compounds and compounds[0].fingerprint:
            return _decode_pubchem_fingerprint(compounds[0].fingerprint)
    except pcp.BadRequestError:
        pass  # ignore and fall back to RDKit

    # Imported here because the notebook's import cell does not bring RDKit
    # into scope; without this the fallback fails with NameError.
    from rdkit import Chem
    from rdkit.Chem import AllChem

    mol = Chem.MolFromSmiles(smiles)
    if mol:
        fp = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=2, nBits=PUBCHEM_FP_BITS
        )
        return np.array([int(b) for b in fp.ToBitString()])

    raise ValueError(f"Invalid SMILES: {smiles}")

In [71]:
# GDSC1: read the drug/SMILES table, fetch one fingerprint per row and save
# the resulting feature table next to the input file.
PATH = "../gdsc1_data/"

# The first CSV column is read as the index and then discarded; rows are
# ordered by the "Drug" column.
SMILES = pd.read_csv(PATH + "drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values("Drug")

# One fingerprint per SMILES string, in the same row order as SMILES.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# Label each fingerprint row with its drug name before writing.
drug_feature = pd.DataFrame(tmp, index=SMILES["Drug"].tolist())
drug_feature.to_csv(PATH + "nih_drug_feature.csv")





  0%|                                                                                                 | 0/300 [00:00<?, ?it/s][A[A[A[A



  0%|▎                                                                                        | 1/300 [00:00<02:07,  2.35it/s][A[A[A[A



  1%|▌                                                                                        | 2/300 [00:00<02:07,  2.33it/s][A[A[A[A



  1%|▉                                                                                        | 3/300 [00:01<02:18,  2.15it/s][A[A[A[A



  1%|█▏                                                                                       | 4/300 [00:01<02:13,  2.22it/s][A[A[A[A



  2%|█▍                                                                                       | 5/300 [00:02<02:08,  2.30it/s][A[A[A[A



  2%|█▊                                                                                       | 6/300 [00:02<02:01,  2.42it/s][A[A[A[A



  

In [72]:
# GDSC2: read the drug/SMILES table, fetch one fingerprint per row and save
# the resulting feature table next to the input file.
PATH = "../gdsc2_data/"

# The first CSV column is read as the index and then discarded; rows are
# ordered by the "Drug" column.
SMILES = pd.read_csv(PATH + "drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values("Drug")

# One fingerprint per SMILES string, in the same row order as SMILES.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# Label each fingerprint row with its drug name before writing.
drug_feature = pd.DataFrame(tmp, index=SMILES["Drug"].tolist())
drug_feature.to_csv(PATH + "nih_drug_feature.csv")





  0%|                                                                                                 | 0/154 [00:00<?, ?it/s][A[A[A[A



  1%|▌                                                                                        | 1/154 [00:00<00:54,  2.83it/s][A[A[A[A



  1%|█▏                                                                                       | 2/154 [00:00<00:55,  2.75it/s][A[A[A[A



  3%|██▉                                                                                      | 5/154 [00:01<00:29,  5.13it/s][A[A[A[A



  4%|███▍                                                                                     | 6/154 [00:01<00:37,  3.96it/s][A[A[A[A



  5%|████                                                                                     | 7/154 [00:01<00:43,  3.35it/s][A[A[A[A



  5%|████▌                                                                                    | 8/154 [00:02<00:46,  3.12it/s][A[A[A[A



  

In [73]:
# CTRP: read the drug/SMILES table, fetch one fingerprint per row and save
# the resulting feature table next to the input file.
PATH = "../ctrp_data/"

# The first CSV column is read as the index and then discarded; rows are
# ordered by the "Drug" column.
SMILES = pd.read_csv(PATH + "drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values("Drug")

# One fingerprint per SMILES string, in the same row order as SMILES.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# Label each fingerprint row with its drug name before writing.
drug_feature = pd.DataFrame(tmp, index=SMILES["Drug"].tolist())
drug_feature.to_csv(PATH + "nih_drug_feature.csv")





  0%|                                                                                                 | 0/494 [00:00<?, ?it/s][A[A[A[A



  0%|▏                                                                                        | 1/494 [00:00<03:11,  2.57it/s][A[A[A[A



  0%|▎                                                                                        | 2/494 [00:00<03:21,  2.44it/s][A[A[A[A



  1%|▌                                                                                        | 3/494 [00:01<03:19,  2.46it/s][A[A[A[A



  1%|▋                                                                                        | 4/494 [00:01<03:39,  2.23it/s][A[A[A[A



  1%|▉                                                                                        | 5/494 [00:02<03:24,  2.40it/s][A[A[A[A



  1%|█                                                                                        | 6/494 [00:02<03:17,  2.47it/s][A[A[A[A



  

In [67]:
# NCI-60: choose the drugs to featurize, then compute and save their fingerprints.
PATH = "../nci_data/"
drugAct = pd.read_csv(PATH + "drugAct.csv", index_col=0)

# Mechanism-of-action (MOA) table; it also supplies the NSC ids and SMILES strings.
moa = pd.read_csv("../data/nsc_cid_smiles_class_name.csv", index_col=0)

# Keep only the drugs whose NSC id appears in the MOA table.
has_moa_entry = drugAct.index.isin(moa.NSC)
drugAct = drugAct[has_moa_entry]

# Synonym rows that have an nci60 entry and an entry in at least one of
# ctrp / gdsc1 / gdsc2.
synonyms = pd.read_csv("../data/drugSynonym.csv")
in_other_dataset = synonyms[["ctrp", "gdsc1", "gdsc2"]].notna().any(axis=1)
synonyms = synonyms[synonyms["nci60"].notna() & in_other_dataset]

# An nci60 cell may hold several ids separated by "|"; collect them all as ints.
shared_nsc = {int(nsc) for nsc in synonyms["nci60"].str.split("|").explode()}

# Keep drugs whose MOA is not "Other" or that also occur in another dataset,
# ordered by NSC id.
not_other_nsc = set(moa.loc[moa["MECHANISM"] != "Other", "NSC"])
selected_nsc = sorted(set(drugAct.index) & (not_other_nsc | shared_nsc))
drugAct = drugAct.loc[selected_nsc]
SMILES = moa[moa.NSC.isin(drugAct.index)]

# This one SMILES is queried without its ".[Na+]" component.
# NOTE(review): presumably the lookup fails with the counter-ion attached; confirm.
target_smiles = "C(C(C(=O)O)N)[N+](=NO)[O-].[Na+]"
target_without_sodium = target_smiles.split(".")[0]

tmp = []
for smiles in tqdm(SMILES["SMILES"], desc="Processing SMILES"):
    query = target_without_sodium if smiles == target_smiles else smiles
    tmp.append(get_fingerprint_from_smiles(query))

# Rows are labelled with NSC ids, in the row order of the filtered MOA table.
drug_feature = pd.DataFrame(tmp, index=SMILES["NSC"].tolist())
drug_feature.to_csv(PATH + "nih_drug_feature.csv")





Processing SMILES:   0%|                                                                              | 0/976 [00:00<?, ?it/s][A[A[A[A



Processing SMILES:  20%|█████████████▎                                                    | 197/976 [00:00<00:00, 1963.85it/s][A[A[A[A



Processing SMILES:  42%|███████████████████████████▉                                      | 413/976 [00:00<00:00, 2078.38it/s][A[A[A[A



Processing SMILES:  64%|███████████████████████████████████████████▎                        | 621/976 [00:09<00:07, 46.57it/s][A[A[A[A



Processing SMILES:  64%|███████████████████████████████████████████▎                        | 621/976 [00:25<00:07, 46.57it/s][A[A[A[A



Processing SMILES:  68%|█████████████████████████████████████████████▉                      | 660/976 [00:25<00:20, 15.66it/s][A[A[A[A



Processing SMILES:  68%|██████████████████████████████████████████████                      | 661/976 [00:26<00:20, 15.28it/s][A[A[A[A



Pr