In [1]:
import numpy as np
import pandas as pd
import pubchempy as pcp
from joblib import Memory
from tqdm import tqdm

In [4]:
# Cache settings: joblib stores results of functions decorated with
# `memory.cache` on disk under "cache/"; verbose=0 is the quietest setting.
memory = Memory("cache/", verbose=0)


@memory.cache
def get_fingerprint_from_smiles(smiles):
    """Look up a SMILES string on PubChem and return its fingerprint as bits.

    The first compound returned by PubChem for the SMILES is used. Each
    hexadecimal character of that compound's ``fingerprint`` attribute is
    expanded into four binary digits (most significant bit first), so the
    result has ``4 * len(compound.fingerprint)`` elements.

    NOTE(review): pubchempy documents ``Compound.fingerprint`` as the raw,
    padded, hex-encoded string, which would mean the leading length prefix
    and trailing padding bits are part of this output, whereas
    ``Compound.cactvs_fingerprint`` is the decoded 881-bit form. The layout
    is kept as-is so previously written feature files stay comparable --
    confirm whether the extra bits are intended.

    Parameters
    ----------
    smiles : str
        SMILES string to search for.

    Returns
    -------
    numpy.ndarray
        1-D integer array of 0/1 values.

    Raises
    ------
    ValueError
        If PubChem returns no compound for the SMILES, or the first compound
        carries no fingerprint.
    """
    compounds = pcp.get_compounds(smiles, namespace="smiles")
    if not compounds:
        raise ValueError("No compounds related to the SMILES")

    hex_fingerprint = compounds[0].fingerprint
    # A record without a fingerprint property yields None; fail with a clear
    # message instead of an opaque TypeError (None) or an empty feature row ("").
    if not hex_fingerprint:
        raise ValueError(f"No fingerprint available for the SMILES: {smiles!r}")

    # One hex digit -> four bits, zero-padded, most significant bit first.
    bits = "".join(f"{int(hex_char, 16):04b}" for hex_char in hex_fingerprint)

    return np.array([int(bit) for bit in bits])

In [6]:
# GDSC1: fetch a PubChem fingerprint for every drug and save the feature table.
PATH = "../gdsc1_data/"

# The first CSV column is read as the index and then discarded; rows are
# sorted by the "drugs" column so the output follows that order.
SMILES = pd.read_csv(f"{PATH}drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values(by="drugs")

# One fingerprint array per entry of the "SMILES" column, with a progress bar.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# One row per drug, labelled with the "drugs" values, written next to the input.
pd.DataFrame(tmp, index=SMILES["drugs"].tolist()).to_csv(
    f"{PATH}nih_drug_feature.csv"
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 331/331 [02:04<00:00,  2.67it/s]


In [7]:
# GDSC2: fetch a PubChem fingerprint for every drug and save the feature table.
PATH = "../gdsc2_data/"

# The first CSV column is read as the index and then discarded; rows are
# sorted by the "drugs" column so the output follows that order.
SMILES = pd.read_csv(f"{PATH}drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values(by="drugs")

# One fingerprint array per entry of the "SMILES" column, with a progress bar.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# One row per drug, labelled with the "drugs" values, written next to the input.
pd.DataFrame(tmp, index=SMILES["drugs"].tolist()).to_csv(
    f"{PATH}nih_drug_feature.csv"
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 240/240 [00:43<00:00,  5.46it/s]


In [8]:
# CTRP: fetch a PubChem fingerprint for every drug and save the feature table.
PATH = "../ctrp_data/"

# The first CSV column is read as the index and then discarded; rows are
# sorted by the "drugs" column so the output follows that order.
SMILES = pd.read_csv(f"{PATH}drug2smiles.csv", index_col=0)
SMILES = SMILES.reset_index(drop=True).sort_values(by="drugs")

# One fingerprint array per entry of the "SMILES" column, with a progress bar.
tmp = list(map(get_fingerprint_from_smiles, tqdm(SMILES["SMILES"])))

# One row per drug, labelled with the "drugs" values, written next to the input.
pd.DataFrame(tmp, index=SMILES["drugs"].tolist()).to_csv(
    f"{PATH}nih_drug_feature.csv"
)

100%|███████████████████████████████████████████████████████████████████████████████████████| 460/460 [02:10<00:00,  3.53it/s]


In [29]:
# NCI-60: choose the drugs to featurise, then fetch and save their fingerprints.
PATH = "../nci_data/"
drugAct = pd.read_csv(f"{PATH}drugAct.csv", index_col=0)

# Mechanism-of-action (moa) table; its NSC, MECHANISM and SMILES columns are used below.
moa = pd.read_csv("../data/nsc_cid_smiles_class_name.csv", index_col=0)

# Keep only the drugAct rows whose index value is listed in moa's NSC column.
drugAct = drugAct.loc[drugAct.index.isin(moa["NSC"])]

# Synonym rows that have an nci60 entry AND an entry in at least one of
# ctrp / gdsc1 / gdsc2.
tmp = pd.read_csv("../data/drugSynonym.csv")
tmp = tmp.loc[
    tmp["nci60"].notna()
    & (tmp["ctrp"].notna() | tmp["gdsc1"].notna() | tmp["gdsc2"].notna())
]
# nci60 values are split on "|" and flattened into unique integer ids.
tmp = list(map(int, set(tmp["nci60"].str.split("|").explode())))

# Of the remaining drugs, retain those whose mechanism is not 'Other' or whose
# id also appears in another dataset; rows end up sorted by index.
drugAct = drugAct.loc[
    sorted(
        set(drugAct.index).intersection(
            set(moa.loc[moa["MECHANISM"] != "Other", "NSC"]).union(tmp)
        )
    )
]
SMILES = moa.loc[moa["NSC"].isin(drugAct.index)]
target_smiles = "C(C(C(=O)O)N)[N+](=NO)[O-].[Na+]"

# For target_smiles only, everything from the first "." onward is dropped
# before the lookup; every other SMILES is passed through unchanged.
tmp = [
    get_fingerprint_from_smiles(
        smi.partition(".")[0] if smi == target_smiles else smi
    )
    for smi in tqdm(SMILES["SMILES"], desc="Processing SMILES")
]

# One row per selected moa entry, labelled with its NSC value.
pd.DataFrame(tmp, index=SMILES["NSC"].tolist()).to_csv(
    f"{PATH}nih_drug_feature.csv"
)

Processing SMILES: 100%|████████████████████████████████████████████████████████████████| 1005/1005 [00:00<00:00, 2644.36it/s]
