In [1]:
import io
import gzip

import pandas
import requests
import networkx

import eutility
import cooccurrence

In [2]:
# Read mappings for DO Slim terms
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs-slim.tsv'
disease_df = pandas.read_table(url)
# Keep only the MeSH ("MSH") cross-references. After filtering, `resource` is
# constant, so drop it. `axis` is passed by keyword: the positional form
# `.drop('resource', 1)` is rejected by pandas >= 2.0.
disease_df = disease_df.query('resource == "MSH"').drop('resource', axis=1)
disease_df = disease_df.rename(columns={'resource_id': 'mesh_id'})

# Read MeSH terms to MeSH names
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
mesh_df = pandas.read_table(url)
# No `on=` is given, so pandas joins on the columns the two frames share
# (presumably just `mesh_id` -- verify against the columns of terms.tsv).
disease_df = disease_df.merge(mesh_df)

# Manually remove problematic xrefs
# https://github.com/obophenotype/human-disease-ontology/issues/45
disease_df = disease_df.query("mesh_id != 'D003327' and mesh_id != 'D017202'")
disease_df.head()

Unnamed: 0,doid_code,doid_name,mesh_id,mesh_name
0,DOID:2531,hematologic cancer,D019337,Hematologic Neoplasms
1,DOID:1319,brain cancer,D001932,Brain Neoplasms
2,DOID:1324,lung cancer,D008175,Lung Neoplasms
3,DOID:263,kidney cancer,D007680,Kidney Neoplasms
4,DOID:1793,pancreatic cancer,D010190,Pancreatic Neoplasms


## Query PubMed

In [3]:
# For every disease, run an esearch against the `pubmed` database using its
# lower-cased MeSH name, then record the query string, the number of PMIDs
# returned, and the PMIDs themselves joined with '|'.
rows_out = []

for row_label, disease_row in disease_df.iterrows():
    mesh_name = disease_row.mesh_name
    term_query = '{disease}[MeSH Terms]'.format(disease=mesh_name.lower())
    pmids = eutility.esearch_query({'db': 'pubmed', 'term': term_query}, retmax=10000)
    n_pmids = len(pmids)

    # `iterrows` yields a fresh Series per row, so annotating it here does not
    # write back into `disease_df`.
    disease_row['term_query'] = term_query
    disease_row['n_articles'] = n_pmids
    disease_row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(disease_row)

    print('{} articles for {}'.format(n_pmids, mesh_name))

disease_pmids_df = pandas.DataFrame(rows_out)

10320 articles for Hematologic Neoplasms
122727 articles for Brain Neoplasms
180844 articles for Lung Neoplasms
60494 articles for Kidney Neoplasms
57863 articles for Pancreatic Neoplasms
100038 articles for Skin Neoplasms
104535 articles for Bone Neoplasms
27302 articles for Pharyngeal Neoplasms
65991 articles for Ovarian Neoplasms
226835 articles for Breast Neoplasms
63189 articles for Glioma
107447 articles for Uterine Neoplasms
24447 articles for Adrenal Gland Neoplasms
40010 articles for Esophageal Neoplasms
14552 articles for Salivary Gland Neoplasms
97203 articles for Prostatic Neoplasms
77286 articles for Stomach Neoplasms
45208 articles for Urinary Bladder Neoplasms
18495 articles for Peripheral Nervous System Neoplasms
40519 articles for Thyroid Neoplasms
130963 articles for Liver Neoplasms
60840 articles for Uterine Cervical Neoplasms
4780 articles for Vaginal Neoplasms
249626 articles for Head and Neck Neoplasms
38987 articles for Rectal Neoplasms
34076 articles for Eye Neo

In [4]:
# Persist the per-disease PubMed results as a gzip-compressed, tab-separated
# table (row index omitted).
pmids_tsv_path = 'data/disease-pmids-topic.tsv.gz'
with gzip.open(pmids_tsv_path, mode='wt') as gz_handle:
    disease_pmids_df.to_csv(gz_handle, index=False, sep='\t')

## Analyze data

In [5]:
# Reload the saved table, keyed by DOID code. NOTE: this rebinds `disease_df`
# to whatever frame `read_pmids_tsv` returns.
saved_pmids_path = 'data/disease-pmids-topic.tsv.gz'
disease_df, disease_to_pmids = cooccurrence.read_pmids_tsv(saved_pmids_path, key='doid_code')

In [6]:
# Score disease--disease co-occurrence: the same disease -> PMIDs mapping is
# passed for both sides, with distinct column labels for each side.
cooc_df = cooccurrence.score_pmid_cooccurrence(
    disease_to_pmids,
    disease_to_pmids,
    'doid_code_0',
    'doid_code_1',
)

Total articles containing a doid_code_0: 4161769
Total articles containing a doid_code_1: 4161769
Total articles containing both a doid_code_0 and doid_code_1: 4161769

After removing terms without any cooccurences:
+ 133 doid_code_0s remain
+ 133 doid_code_1s remain

Cooccurrence scores calculated for 17689 doid_code_0 -- doid_code_1 pairs


In [7]:
# Preview the first five rows of the reloaded disease table.
disease_df.head(5)

Unnamed: 0,doid_code,doid_name,mesh_id,mesh_name,term_query,n_articles
0,DOID:2531,hematologic cancer,D019337,Hematologic Neoplasms,hematologic neoplasms[MeSH Terms],10320
1,DOID:1319,brain cancer,D001932,Brain Neoplasms,brain neoplasms[MeSH Terms],122727
2,DOID:1324,lung cancer,D008175,Lung Neoplasms,lung neoplasms[MeSH Terms],180844
3,DOID:263,kidney cancer,D007680,Kidney Neoplasms,kidney neoplasms[MeSH Terms],60494
4,DOID:1793,pancreatic cancer,D010190,Pancreatic Neoplasms,pancreatic neoplasms[MeSH Terms],57863


In [8]:
# Preview the first five scored pairs (self-pairs are still present here).
cooc_df.head(5)

Unnamed: 0,doid_code_0,doid_code_1,cooccurrence,expected,enrichment,odds_ratio,p_fisher
0,DOID:11615,DOID:11615,4612,5.110938,902.378361,inf,0.0
1,DOID:11615,DOID:8577,1,31.349378,0.031899,0.031654,1.0
2,DOID:11615,DOID:5612,2,10.436864,0.191628,0.191106,0.999669
3,DOID:11615,DOID:14330,0,54.687703,0.0,0.0,1.0
4,DOID:11615,DOID:0050425,0,3.040853,0.0,0.0,1.0


In [9]:
# Drop self-pairs (a disease paired with itself).
cooc_df = cooc_df[cooc_df['doid_code_0'] != cooc_df['doid_code_1']]

# Attach human-readable DO names to both members of each pair. The renamed
# lookup table goes on the left, so each code/name pair leads the merged
# columns. No `on=` is given, so each merge joins on the columns the two
# frames share.
doid_name_df = disease_df[['doid_code', 'doid_name']].drop_duplicates()
cooc_df = doid_name_df.rename(columns={'doid_code': 'doid_code_1', 'doid_name': 'doid_name_1'}).merge(cooc_df)
cooc_df = doid_name_df.rename(columns={'doid_code': 'doid_code_0', 'doid_name': 'doid_name_0'}).merge(cooc_df)

# Order by first disease name, then by ascending Fisher p-value.
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` is the
# drop-in replacement with the same defaults.
cooc_df = cooc_df.sort_values(['doid_name_0', 'p_fisher'])

In [10]:
# Preview the first five rows of the named, sorted co-occurrence table.
cooc_df.head(5)

Unnamed: 0,doid_code_0,doid_name_0,doid_code_1,doid_name_1,cooccurrence,expected,enrichment,odds_ratio,p_fisher
9444,DOID:10652,Alzheimer's disease,DOID:14330,Parkinson's disease,2760,827.098152,3.336968,3.577398,0.0
9465,DOID:10652,Alzheimer's disease,DOID:11949,Creutzfeldt-Jakob disease,332,96.723002,3.432482,3.593306,3.377672e-80
9456,DOID:10652,Alzheimer's disease,DOID:332,amyotrophic lateral sclerosis,451,227.754094,1.980206,2.020452,5.524978e-40
9496,DOID:10652,Alzheimer's disease,DOID:11555,Fuchs' endothelial dystrophy,1,13.106461,0.076298,0.075102,0.9999982
9490,DOID:10652,Alzheimer's disease,DOID:1595,endogenous depression,1221,1399.827043,0.872251,0.868045,0.9999997


In [11]:
# Number of disease--disease pairs remaining.
cooc_df.shape[0]

17556

In [12]:
# Count the pairs whose Fisher p-value is at or below 0.005.
int((cooc_df['p_fisher'] <= 0.005).sum())

1086

In [13]:
# Write the final co-occurrence table as an uncompressed TSV without the index.
cooccurrence_tsv_path = 'data/disease-disease-cooccurrence.tsv'
cooc_df.to_csv(cooccurrence_tsv_path, sep='\t', index=False)