In [1]:
%load_ext autoreload
%autoreload 2

import time

import numpy as np
import pandas as pd
import requests
import requests_cache
import rpy2.robjects as ro
import torch
from rdkit import Chem
from rdkit.Chem import AllChem
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector
from smiles2graph import getDrugGraph
from tqdm import tqdm

from utils import (get_canonical_smiles, get_pubchem_info, get_smiles_from_cid,
                   get_smiles_from_nsc)

In [2]:
# Load the current list of drug identifiers (compared against the NSC column
# in the cells below).
# NOTE(review): `weights_only=False` performs a full unpickle of the file, so
# it must only be used on a trusted, locally produced `drug_list.pt`.
# NOTE(review): this same path is overwritten by the `torch.save` call near the
# end of the notebook, so a second run starts from the already-filtered list.
drugs = torch.load(f="../data/drug_list.pt", weights_only=False)
len(drugs)

952

In [3]:
# Read the NSC -> SMILES table and discard any row with a missing value.
# The path is written relative to the home directory ("~" is expanded by
# pandas) instead of a hard-coded /Users/<name>/... path, so the notebook is
# not tied to one machine and does not embed a username.
# NOTE(review): this file still lives outside the repository; consider moving
# it next to the other inputs under ../data.
df = pd.read_csv("~/Downloads/nsc_smiles.csv").dropna()
df

Unnamed: 0,NSC,SMILES
0,1,CC1=CC(=O)C=CC1=O
1,2,S(Sc1nc2ccccc2s1)c3nc4ccccc4s3
2,3,Oc1c(Cl)cc(cc1[N+](=O)[O-])[N+](=O)[O-]
3,4,Nc1ncc(s1)[N+](=O)[O-]
4,5,Nc1ccc2C(=O)c3ccccc3C(=O)c2c1
...,...,...
321745,853988,CN1CCN(C[C@@H]2COc3ccc(OCc4ccnc(n4)c5ccc(OC[C@...
321746,854009,C[C@H]1CN(C)CCN1c2cc(NC(=O)Cn3cc(c4cc(C(=O)N)c...
321747,854010,CCNCCc1ccc(CN(CC)c2cc(OC)ccc2[C@@H]3CCc4cc(O)c...
321748,854043,O.CCn1c(c2CC(C)(C)COC(=O)[C@@H]3CCCN(N3)C(=O)[...


In [4]:
# Retain only the rows whose NSC identifier appears in the loaded drug list.
df = df.loc[lambda frame: frame["NSC"].isin(drugs)]
df

Unnamed: 0,NSC,SMILES
185,186,C[C@H]1OC=C2C(=C(C(=O)O)C(=O)C(=C2[C@@H]1C)C)O
294,295,OC(=O)CCCc1ccccc1
720,721,NC(=S)N\N=C\1/C(=O)Nc2ccccc12
739,740,CN(Cc1cnc2nc(N)nc(N)c2n1)c3ccc(cc3)C(=O)N[C@@H...
749,750,CS(=O)(=O)OCCCCOS(=O)(=O)C
...,...,...
304824,804962,COCCNc1cc(NC(=O)N2CCCc3cc(CN4CCN(C)CC4=O)c(C=O...
306602,807579,COc1cc(ccc1O)c2ccc3\C(=C\c4ccc[nH]4)\C(=O)Nc3c2
307764,809693,COc1c(OCCCN2CCOCC2)ccc3C4=NCCN4C(=Nc13)NC(=O)c...
310462,813783,COc1cc2c(CCN[C@]23CS[C@H]4[C@H]5[C@@H]6N(C)[C@...


In [5]:
# Identifiers from the drug list that have no row in `df` yet, in ascending order.
missing = sorted(set(drugs).difference(df["NSC"]))

# Pull the rows for exactly those identifiers from the hand-gathered table
# (its first CSV column is used as the index).
t = pd.read_csv("../data/hand_gathering_data.csv", index_col=0).loc[
    lambda frame: frame["NSC"].isin(missing)
]

In [6]:
# Append the hand-gathered rows, remove rows that are exact duplicates, and
# renumber the index from zero.
df = pd.concat([df, t], axis=0).drop_duplicates(ignore_index=True)
df

Unnamed: 0,NSC,SMILES
0,186.0,C[C@H]1OC=C2C(=C(C(=O)O)C(=O)C(=C2[C@@H]1C)C)O
1,295.0,OC(=O)CCCc1ccccc1
2,721.0,NC(=S)N\N=C\1/C(=O)Nc2ccccc12
3,740.0,CN(Cc1cnc2nc(N)nc(N)c2n1)c3ccc(cc3)C(=O)N[C@@H...
4,750.0,CS(=O)(=O)OCCCCOS(=O)(=O)C
...,...,...
947,804962.0,COCCNc1cc(NC(=O)N2CCCc3cc(CN4CCN(C)CC4=O)c(C=O...
948,807579.0,COc1cc(ccc1O)c2ccc3\C(=C\c4ccc[nH]4)\C(=O)Nc3c2
949,809693.0,COc1c(OCCCN2CCOCC2)ccc3C4=NCCN4C(=Nc13)NC(=O)c...
950,813783.0,COc1cc2c(CCN[C@]23CS[C@H]4[C@H]5[C@@H]6N(C)[C@...


In [7]:
# genes.csv is transposed on load, so its column labels become the index;
# that index is the set of gene names kept below.
genes = pd.read_csv("../data/genes.csv").transpose()

# Drug-gene score table, restricted to rows whose `gene` is in that index.
# NOTE(review): this input lives in a sibling checkout (../../DTI-quantification),
# so the notebook depends on that directory layout.
dti = pd.read_csv("../../DTI-quantification/data/drug_gene_score.csv.gz").loc[
    lambda scores: scores["gene"].isin(genes.index)
]

In [8]:
# Keep only the compounds whose NSC identifier occurs in the gene-filtered
# `dti` table.
df = df.loc[lambda frame: frame["NSC"].isin(set(dti["NSC"]))]
df

Unnamed: 0,NSC,SMILES
0,186.0,C[C@H]1OC=C2C(=C(C(=O)O)C(=O)C(=C2[C@@H]1C)C)O
1,295.0,OC(=O)CCCc1ccccc1
2,721.0,NC(=S)N\N=C\1/C(=O)Nc2ccccc12
3,740.0,CN(Cc1cnc2nc(N)nc(N)c2n1)c3ccc(cc3)C(=O)N[C@@H...
4,750.0,CS(=O)(=O)OCCCCOS(=O)(=O)C
...,...,...
947,804962.0,COCCNc1cc(NC(=O)N2CCCc3cc(CN4CCN(C)CC4=O)c(C=O...
948,807579.0,COc1cc(ccc1O)c2ccc3\C(=C\c4ccc[nH]4)\C(=O)Nc3c2
949,809693.0,COc1c(OCCCN2CCOCC2)ccc3C4=NCCN4C(=Nc13)NC(=O)c...
950,813783.0,COc1cc2c(CCN[C@]23CS[C@H]4[C@H]5[C@@H]6N(C)[C@...


In [9]:
# Build the lookup from NSC identifier to the object returned by
# `getDrugGraph` for that compound's SMILES string.
# The two columns are iterated directly instead of via `iterrows()` (no
# per-row Series, no positional `i[1]` indexing), and `total` is passed so
# tqdm can draw a real progress bar rather than a bare iteration counter.
drug_dict = {}
for nsc, smiles in tqdm(zip(df["NSC"], df["SMILES"]), total=len(df)):
    # NSC is displayed as a float column at this point (e.g. 186.0), so it is
    # cast back to a plain int before being used as the dictionary key.
    drug_dict[int(nsc)] = getDrugGraph(smiles)

952it [00:00, 2717.73it/s]


In [10]:
# Persist the ascending list of NSC identifiers that received a graph.
# NOTE(review): this writes to the same path that is loaded at the top of the
# notebook, so the input list is replaced by this output on every run.
torch.save(sorted(drug_dict), "../data/drug_list.pt")

In [11]:
# Persist the full NSC -> graph dictionary built above.
torch.save(obj=drug_dict, f="../data/data_dict.pt")