In [16]:
import io
import gzip

import pandas
import requests
import networkx

import eutility
import cooccurrence

In [18]:
# Load the MeSH terms table. The URL is pinned to a specific commit of the
# dhimmel/mesh repository, so the downloaded file does not change between runs.
url = 'https://raw.githubusercontent.com/dhimmel/mesh/e561301360e6de2140dedeaa7c7e17ce4714eb7f/data/terms.tsv'
mesh_df = pandas.read_csv(url, sep='\t')

# Load the DO Slim -> MeSH mapping (tab-separated) and preview the first rows.
disease_df = pandas.read_csv('data/DO-slim-to-mesh.tsv', sep='\t')
disease_df.head()

Unnamed: 0,doid_code,doid_name,mesh_id,mesh_name
0,DOID:2531,hematologic cancer,D019337,Hematologic Neoplasms
1,DOID:1319,brain cancer,D001932,Brain Neoplasms
2,DOID:1324,lung cancer,D008175,Lung Neoplasms
3,DOID:263,kidney cancer,D007680,Kidney Neoplasms
4,DOID:1793,pancreatic cancer,D010190,Pancreatic Neoplasms


## Query PubMed

In [19]:
# For every disease row, search PubMed by its lower-cased MeSH name and record
# the query string, the number of returned IDs, and the IDs themselves
# (pipe-joined) on the row. The augmented rows become `disease_pmids_df`.
rows_out = []

for index, row in disease_df.iterrows():
    mesh_name = row.mesh_name
    term_query = '{disease}[MeSH Terms]'.format(disease=mesh_name.lower())
    payload = dict(db='pubmed', term=term_query)
    # NOTE(review): the recorded output reports counts well above 10000, so
    # `retmax` is presumably a per-request page size inside
    # eutility.esearch_query rather than a cap on the total -- confirm.
    pmids = eutility.esearch_query(payload, retmax=10000)
    n_articles = len(pmids)

    # `row` is the Series yielded by iterrows(); the new fields are set on it
    # directly and that same object is what gets collected.
    row['term_query'] = term_query
    row['n_articles'] = n_articles
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)

    print('{} articles for {}'.format(n_articles, mesh_name))

# Build the frame once from the collected rows instead of growing it in the loop.
disease_pmids_df = pandas.DataFrame(rows_out)

10263 articles for Hematologic Neoplasms
122381 articles for Brain Neoplasms
180289 articles for Lung Neoplasms
60370 articles for Kidney Neoplasms
57659 articles for Pancreatic Neoplasms
99737 articles for Skin Neoplasms
104309 articles for Bone Neoplasms
27228 articles for Pharyngeal Neoplasms
65838 articles for Ovarian Neoplasms
225989 articles for Breast Neoplasms
62947 articles for Glioma
107228 articles for Uterine Neoplasms
24402 articles for Adrenal Gland Neoplasms
39874 articles for Esophageal Neoplasms
14509 articles for Salivary Gland Neoplasms
96847 articles for Prostatic Neoplasms
77074 articles for Stomach Neoplasms
45123 articles for Urinary Bladder Neoplasms
18460 articles for Peripheral Nervous System Neoplasms
40411 articles for Thyroid Neoplasms
130560 articles for Liver Neoplasms
60707 articles for Uterine Cervical Neoplasms
4779 articles for Vaginal Neoplasms
248941 articles for Head and Neck Neoplasms
38905 articles for Rectal Neoplasms
34018 articles for Eye Neop

In [20]:
# Persist the per-disease query results as a gzip-compressed, tab-separated
# file (text mode, no index column). The analysis section reads this file back.
with gzip.open('data/disease-pmids-topic.tsv.gz', mode='wt') as gz_handle:
    disease_pmids_df.to_csv(gz_handle, index=False, sep='\t')

## Analyze data

In [21]:
# Reload the gzip TSV written above through the project helper, which returns
# two objects. NOTE: this rebinds `disease_df`, replacing the frame that was
# loaded earlier from data/DO-slim-to-mesh.tsv.
# Presumably `disease_to_pmids` maps each `doid_code` value to its PubMed IDs
# (the `key` argument selects that column) -- confirm in cooccurrence.read_pmids_tsv.
disease_df, disease_to_pmids = cooccurrence.read_pmids_tsv('data/disease-pmids-topic.tsv.gz', key='doid_code')

In [22]:
# Score disease-disease cooccurrence by passing the same disease -> PMIDs
# mapping as both inputs; the two string arguments are the labels used for the
# two sides (they appear as the `doid_code_0` / `doid_code_1` columns of the result).
# Because both sides are the same mapping, the result contains self-pairs
# (visible in the recorded preview); a later cell filters them out.
cooc_df = cooccurrence.score_pmid_cooccurrence(disease_to_pmids, disease_to_pmids, 'doid_code_0', 'doid_code_1')

Total articles containing a doid_code_0: 4437902
Total articles containing a doid_code_1: 4437902
Total articles containing both a doid_code_0 and doid_code_1: 4437902

After removing terms without any cooccurences:
+ 133 doid_code_0s remain
+ 133 doid_code_1s remain

Cooccurrence scores calculated for 17689 doid_code_0 -- doid_code_1 pairs


In [23]:
# Preview the disease table as returned by cooccurrence.read_pmids_tsv
# (not the frame originally loaded from data/DO-slim-to-mesh.tsv).
disease_df.head()

Unnamed: 0,doid_code,doid_name,mesh_id,mesh_name,term_query,n_articles
0,DOID:2531,hematologic cancer,D019337,Hematologic Neoplasms,hematologic neoplasms[MeSH Terms],10263
1,DOID:1319,brain cancer,D001932,Brain Neoplasms,brain neoplasms[MeSH Terms],122381
2,DOID:1324,lung cancer,D008175,Lung Neoplasms,lung neoplasms[MeSH Terms],180289
3,DOID:263,kidney cancer,D007680,Kidney Neoplasms,kidney neoplasms[MeSH Terms],60370
4,DOID:1793,pancreatic cancer,D010190,Pancreatic Neoplasms,pancreatic neoplasms[MeSH Terms],57659


In [24]:
# Preview the raw cooccurrence scores, before self-pairs are removed and
# disease names are attached.
cooc_df.head()

Unnamed: 0,doid_code_0,doid_code_1,cooccurrence,expected,enrichment,odds_ratio,p_fisher
0,DOID:6364,DOID:6364,22160,110.652646,200.266336,inf,0
1,DOID:6364,DOID:332,7,67.515092,0.103681,0.102933,1
2,DOID:6364,DOID:5612,3,46.967454,0.063874,0.063449,1
3,DOID:6364,DOID:824,1,120.594407,0.008292,0.008206,1
4,DOID:6364,DOID:9970,122,739.515203,0.164973,0.159658,1


In [25]:
# Drop self-pairs: a disease compared against itself carries no information.
cooc_df = cooc_df[cooc_df['doid_code_0'] != cooc_df['doid_code_1']]

# One row per (doid_code, doid_name), used as a lookup to attach readable names.
doid_name_df = disease_df[['doid_code', 'doid_name']].drop_duplicates()

# Attach the name for each side of the pair. The lookup is the left frame so
# that its code/name columns lead the result; merge() joins on the column(s)
# the two frames share (the renamed code column). Side 1 is merged first so
# that the side-0 columns end up leftmost.
cooc_df = doid_name_df.rename(columns={'doid_code': 'doid_code_1', 'doid_name': 'doid_name_1'}).merge(cooc_df)
cooc_df = doid_name_df.rename(columns={'doid_code': 'doid_code_0', 'doid_name': 'doid_name_0'}).merge(cooc_df)

# Order by source disease name, then by ascending p_fisher within each disease.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is its replacement and produces the same ordering.
cooc_df = cooc_df.sort_values(['doid_name_0', 'p_fisher'])

In [26]:
# Preview the annotated table: self-pairs removed, names attached, and rows
# sorted by `doid_name_0` then `p_fisher`.
cooc_df.head()

Unnamed: 0,doid_code_0,doid_name_0,doid_code_1,doid_name_1,cooccurrence,expected,enrichment,odds_ratio,p_fisher
9444,DOID:10652,Alzheimer's disease,DOID:14330,Parkinson's disease,2747,768.944245,3.572431,3.836951,0.0
9465,DOID:10652,Alzheimer's disease,DOID:11949,Creutzfeldt-Jakob disease,330,90.115297,3.661975,3.83721,1.666666e-86
9456,DOID:10652,Alzheimer's disease,DOID:332,amyotrophic lateral sclerosis,447,211.499555,2.11348,2.159013,2.938743e-46
9490,DOID:10652,Alzheimer's disease,DOID:1595,endogenous depression,1221,1303.238329,0.936897,0.934812,0.9906655
9496,DOID:10652,Alzheimer's disease,DOID:11555,Fuchs' endothelial dystrophy,1,12.185353,0.082066,0.080873,0.9999954


In [27]:
# Number of disease pairs remaining in the cooccurrence table.
cooc_df.shape[0]

17556

In [30]:
# Number of pairs whose `p_fisher` value is at or below 0.005.
is_below_cutoff = cooc_df['p_fisher'] <= 0.005
int(is_below_cutoff.sum())

1160

In [31]:
# Export the annotated, sorted cooccurrence table as a tab-separated file
# without the index column.
cooc_df.to_csv(
    'data/disease-disease-cooccurrence.tsv',
    sep='\t',
    index=False,
)