In [6]:
import sys
print('python', sys.version)

import numpy as np
print('numpy', np.__version__)

import pandas as pd
print('pandas', pd.__version__)

import matplotlib as mpl
print('matplotlib', mpl.__version__)

import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sci
import glob
import networkx as nx
import Bio.KEGG.KGML.KGML_parser as keg

import pickle

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import cmapPy.pandasGEXpress.parse as cp
import cmapPy.pandasGEXpress.write_gctx as cw

python 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:27:36) [GCC 11.2.0]
numpy 1.26.4
pandas 2.2.3
matplotlib 3.10.0


# entrez to sym gene mapping

In [7]:
# mapping human gene symbol to entrez
dat=pd.read_table('data/Sym2Entrez.txt',sep='\t')
dat.columns=['sym','entrez']
dat=dat.loc[dat['entrez'].notnull()]
dat=dat.loc[dat['sym'].notnull()]
dat['entrez']=dat['entrez'].astype(int).astype(str)
s2e_dic=dat.set_index('sym')['entrez']
e2s_dic=dat.set_index('entrez')['sym']

# gold standard

## TTD gold standard

In [8]:
available_cells= ['A549','A375','BICR6','HT29','PC3','U251MG','MCF7']

In [9]:
cell_line_keywords = {
    "A549": ["lung cancer","lung adenocarcinoma","non-small cell lung cancer","NSCLC","pulmonary carcinoma","alveolar basal epithelial carcinoma",
        "lung carcinoma","bronchogenic carcinoma","human alveolar carcinoma","lung epithelial cancer"],
    "A375": ["melanoma","malignant melanoma","skin cancer","cutaneous melanoma","melanocytic carcinoma","skin malignant melanoma","melanocytic tumor",
        "skin carcinoma (melanoma type)","dermal melanoma","cutaneous malignant melanoma"],
    "BICR6": ["head and neck cancer","tongue squamous cell carcinoma","oral cavity carcinoma","head and neck squamous cell carcinoma",
        "HNSCC","oropharyngeal carcinoma","oral squamous carcinoma","head and neck carcinoma","oral epithelial cancer","tongue carcinoma"],
    "HT29": ["colon cancer","colorectal adenocarcinoma","colon carcinoma","colorectal cancer","large intestine carcinoma","intestinal adenocarcinoma",
        "colon adenocarcinoma","colorectal malignant tumor","colonic carcinoma","intestinal carcinoma"],
    "PC3": ["prostate cancer","prostate adenocarcinoma","prostatic carcinoma","prostate carcinoma (androgen-independent)", "advanced prostate cancer",
        "hormone-refractory prostate cancer","androgen-independent prostate carcinoma","metastatic prostate cancer","castration-resistant prostate cancer",
        "prostate malignant tumor"],
    "U251MG": ["glioblastoma","glioblastoma multiforme","GBM","brain cancer","astrocytoma","high-grade glioma","malignant glioma","glioblastoma astrocytoma",
        "brain glioma","cerebral glioblastoma"],
    "MCF7": ["breast cancer","breast adenocarcinoma","breast carcinoma","mammary carcinoma","estrogen-receptor positive breast cancer",
        "luminal A breast cancer","invasive ductal carcinoma","ER+ breast cancer","mammary adenocarcinoma","hormone-dependent breast cancer"]
}

In [10]:
ttd2sym={}
for line in open('data/P1-01-TTD_target_download.txt'):
    line=line.strip()
    if line=='':
        continue
    line_list=line.split('\t')
    if line_list[1]=='TARGETID':
        target_id=line_list[2]
    if line_list[1]=='GENENAME':
        gene_sym=line_list[2]
        ttd2sym[target_id]=gene_sym
len(ttd2sym)

3669

In [11]:
TTD_gs_dic={}
for cell in available_cells:
    TTD_gs_dic[cell]=[]


def get_cell_from_disease_info(disease_info):
    if disease_info=='solid tumour/cancer':
        return 'all_cell'
    
    for cell, cell_related_cancers in cell_line_keywords.items():
        if any(cancer_name.lower() in disease_info for cancer_name in cell_related_cancers):
            return cell
    
    return 'N/A'

approved_phase=[
'Phase 2',
'Phase 2a',
'Phase 2b',
'Phase 2/3',
'Phase 3',
'phase 3',
'Phase 4',
'NDA filed',
'BLA submitted',
'Application submitted',
'Approval submitted',
'Preregistration',
'Registered',
'Approved',
'Approved in EU',
'Approved in China',
'Approved (orphan drug)'
]

valid_cnt=0
for line in open('data/P1-06-Target_disease.txt'):
    line=line.strip()
    if line=='':
        continue
        
    line_list=line.split('\t')
    
    if line_list[1]=='TARGETID':
        target_id=line_list[2]
        
    if len(line_list)==5:
        if ('N/A' in line_list[-1]) or ('N.A.' in line_list[-1]):
            continue
         
        phase_info=line_list[2]
        if phase_info not in approved_phase:
            continue

        ICD11_code=line_list[-1].split('ICD-11: ')[1][:-1]
        #if any(cancer_code in ICD11_code for cancer_code in ['2A','2B','2C','2D','2E0','2E2']):
        disease_info=line_list[-2].lower()
        cell_selected=get_cell_from_disease_info(disease_info)
        
        if cell_selected=='N/A':
            continue

        valid_cnt+=1
        ## mode1
        if cell_selected=='all_cell':
            for cell in available_cells:
                TTD_gs_dic[cell].append(target_id)
        else:
            TTD_gs_dic[cell_selected].append(target_id)
        ## mode1 end
        
        # ## mode2
        # if cell_selected=='all_cell':
        #     continue
        # TTD_gs_dic[cell_selected].append(target_id)    
        # ## mode2 end

valid_cnt
for cell in TTD_gs_dic.keys():
    TTD_gs_dic[cell]=[s2e_dic[ttd2sym[ttd_id]] for ttd_id in TTD_gs_dic[cell] if (ttd_id in ttd2sym.keys()) and (ttd2sym[ttd_id] in s2e_dic.keys())]
    TTD_gs_dic[cell]=list(set(TTD_gs_dic[cell]))
    cell, len(TTD_gs_dic[cell])
with open("./data_preproc/gs_therTarget_TTD.pickle", 'wb') as file:
    pickle.dump(TTD_gs_dic,file)
    

734

('A549', 227)

('A375', 209)

('BICR6', 192)

('HT29', 208)

('PC3', 206)

('U251MG', 206)

('MCF7', 226)

## drugbank

In [12]:
from lxml import etree
import re

# UniProt accession pattern
UNIPROT_RE = re.compile(r"^[A-NR-Z][0-9][A-Z0-9]{3}[0-9]$|^[OPQ][0-9][A-Z0-9]{3}[0-9]$")

def extract_drug_targets_indications(xml_path, humans_only=True):
    """
    DrugBank XML → {drug_id: {"targets":[uniprot_id...], "indications":[text...]}}
    - drug_id : <drugbank-id primary="true">
    - targets : UniProt ID list
    - indications : <indication> free text list
    - including only when drug_id, targets, indications exist
    """
    result = {}

    context = etree.iterparse(xml_path, events=("end",), tag="{*}drug", recover=True, huge_tree=True)

    for _, drug in context:
        # 1) drug_id
        dbid_el = drug.find("./{*}drugbank-id[@primary='true']")
        if dbid_el is None or not dbid_el.text:
            drug.clear()
            while drug.getprevious() is not None:
                del drug.getparent()[0]
            continue
        drug_id = dbid_el.text.strip()

        # 2) targets (UniProt IDs)
        tgt_ids = set()
        for tgt in drug.findall("./{*}targets/{*}target"):
            if humans_only:
                org = (tgt.findtext("./{*}organism") or "").strip().lower()
                if org and org != "humans":
                    continue
            for poly in tgt.findall("./{*}polypeptide"):
                src = (poly.get("source") or "").strip().lower()
                uni = (poly.get("id") or "").strip()
                if src in {"swiss-prot", "trembl", "uniprot", "uniprotkb"} and UNIPROT_RE.match(uni):
                    tgt_ids.add(uni)
        tgt_ids = list(tgt_ids)

        # 3) indications (free text)
        indications = []
        for ind in drug.findall("./{*}indication"):
            if ind.text and ind.text.strip():
                indications.append(ind.text.strip())

        # 4) 모두 있을 때만 추가
        if drug_id and tgt_ids and indications:
            result[drug_id] = {
                "targets": tgt_ids,
                "indications": indications
            }

        # 메모리 해제
        drug.clear()
        while drug.getprevious() is not None:
            del drug.getparent()[0]

    return result


# ==== 사용 예시 ====
xml_path = "data/full_database.xml"
drug_map = extract_drug_targets_indications(xml_path, humans_only=True)
print("총 entries:", len(drug_map))

총 entries: 3036


In [13]:
uniprot_df=pd.read_table('data/HUMAN_9606_idmapping_selected.tab', header=None)
uniprot_df=uniprot_df[[0,2]].dropna()
uniprot_df.columns=['uniprot','entrez']
upt2enz_mapping=uniprot_df.set_index('uniprot')['entrez']

  uniprot_df=pd.read_table('data/HUMAN_9606_idmapping_selected.tab', header=None)


In [14]:
drugbank_gs_dic={}
for cell in cell_line_keywords.keys():
    drugbank_gs_dic[cell]=[]
    
for k, v in list(drug_map.items()):
    ind1=v['indications'][0].lower()
    targets=v['targets']
    for cell, disease_list in cell_line_keywords.items():
        for disease in disease_list:
            if disease.lower() in ind1:
                drugbank_gs_dic[cell]+=targets
                break

for cell in drugbank_gs_dic.keys():
    drugbank_gs_dic[cell]=list(set([upt2enz_mapping[uniprot] for uniprot in list(set(drugbank_gs_dic[cell])) if uniprot in upt2enz_mapping.index]))
    cell, len(drugbank_gs_dic[cell])
with open("./data_preproc/gs_therTarget_drugbank.pickle", 'wb') as file:
    pickle.dump(drugbank_gs_dic,file)
    

('A549', 165)

('A375', 52)

('BICR6', 11)

('HT29', 91)

('PC3', 129)

('U251MG', 62)

('MCF7', 139)