In [1]:
import io
import gzip

import pandas
import requests
import networkx

import eutility
import cooccurrence

# Tissues

In [2]:
# Load the UBERON <-> MeSH anatomical-structure mapping (the "hetio-slim" table).
# The URL is pinned to a specific commit of dhimmel/uberon so that re-running
# the notebook always reads the same input.
hetio_slim_url = 'https://raw.githubusercontent.com/dhimmel/uberon/86a9b754871e5ce7d91d2ef15bcc8f6a0ef6cda1/data/hetio-slim.tsv'
uberon_df = pandas.read_table(hetio_slim_url)
# Preview the first rows: one row per UBERON term with its mapped MeSH term.
uberon_df.head()

Unnamed: 0,uberon_id,uberon_name,mesh_id,mesh_name
0,UBERON:0001716,secondary palate,D010159,Palate
1,UBERON:0001908,optic tract,D014795,Visual Pathways
2,UBERON:0002286,third ventricle,D020542,Third Ventricle
3,UBERON:0002349,myocardium,D009206,Myocardium
4,UBERON:0000978,leg,D035002,Lower Extremity


In [3]:
# For every tissue, search PubMed for articles indexed under its MeSH term and
# record the query, the number of hits and the '|'-joined article ids.
tissue_rows = []

for _, tissue in uberon_df.iterrows():
    mesh_name = tissue['mesh_name']
    # The MeSH name is lower-cased and tagged with `[MeSH Terms:noexp]`.
    query = '{tissue}[MeSH Terms:noexp]'.format(tissue=mesh_name.lower())
    # NOTE(review): retmax looks like a per-request page size rather than a cap
    # on the total (the log shows counts far above 5000), and sleep looks like a
    # delay between requests -- confirm against eutility.esearch_query.
    article_ids = eutility.esearch_query(
        {'db': 'pubmed', 'term': query},
        retmax=5000,
        sleep=2,
    )
    article_count = len(article_ids)

    # `tissue` is the Series yielded by iterrows; the three result fields are
    # added to it and the extended Series becomes one row of the output table.
    tissue['term_query'] = query
    tissue['n_articles'] = article_count
    tissue['pubmed_ids'] = '|'.join(article_ids)
    tissue_rows.append(tissue)

    print('{} articles for {}'.format(article_count, mesh_name))

# Assemble all per-tissue rows into a single table in one step.
uberon_pmids_df = pandas.DataFrame(tissue_rows)

9284 articles for Palate
15786 articles for Visual Pathways
1359 articles for Third Ventricle
139687 articles for Myocardium
8596 articles for Lower Extremity
39066 articles for Cerebellum
2471 articles for Arachnoid
382125 articles for Liver
3960 articles for Dermis
3295 articles for Sweat
16125 articles for Optic Nerve
13305 articles for Gallbladder
11676 articles for Parotid Gland
265 articles for Manubrium
6438 articles for Vena Cava, Superior
47991 articles for Arteries
26790 articles for Arm
28944 articles for Aorta, Thoracic
60440 articles for Pancreas
16194 articles for Mesencephalon
9469 articles for Common Bile Duct
4456 articles for Choroid Plexus
5769 articles for Nails
13016 articles for Joints
399 articles for Bulbourethral Glands
158666 articles for Skin
530 articles for Incus
14579 articles for Forearm
8129 articles for Trigeminal Nerve
1167 articles for Axillary Vein
3943 articles for Peroneal Nerve
465 articles for Stapedius
20418 articles for Vagus Nerve
24984 articl

In [4]:
# Save the tissue -> PubMed id table as a gzip-compressed TSV.
#
# The archive is opened directly in text mode ('wt'), so the object managed by
# the `with` statement is the text layer itself and is flushed and closed on
# exit. Previously the binary gzip handle was wrapped in io.TextIOWrapper by
# hand and the `with` target was rebound to that wrapper: the context manager
# then closed only the underlying gzip file, leaving any text still buffered in
# the wrapper unwritten, which could truncate the output.
with gzip.open('data/uberon-pmids.tsv.gz', 'wt') as write_file:
    uberon_pmids_df.to_csv(write_file, sep='\t', index=False)

# Preview the table that was just written.
uberon_pmids_df.head()

Unnamed: 0,uberon_id,uberon_name,mesh_id,mesh_name,term_query,n_articles,pubmed_ids
0,UBERON:0001716,secondary palate,D010159,Palate,palate[MeSH Terms:noexp],9284,26023113|25975064|25895319|25872295|25869559|2...
1,UBERON:0001908,optic tract,D014795,Visual Pathways,visual pathways[MeSH Terms:noexp],15786,26113723|26089513|26080589|26080584|25972183|2...
2,UBERON:0002286,third ventricle,D020542,Third Ventricle,third ventricle[MeSH Terms:noexp],1359,26120619|26023696|25723723|25723303|25723298|2...
3,UBERON:0002349,myocardium,D009206,Myocardium,myocardium[MeSH Terms:noexp],139687,26072537|26062198|26040042|26040041|26039915|2...
4,UBERON:0000978,leg,D035002,Lower Extremity,lower extremity[MeSH Terms:noexp],8596,26118216|26072540|26062181|26047150|26047149|2...


# Tissue-Disease Cooccurrence

In [9]:
# Load the per-term PubMed id tables for diseases and for tissues.
# Each call yields two values; presumably a table plus a mapping from the
# `key` column to that term's PubMed ids -- confirm in cooccurrence.read_pmids_tsv.
# NOTE(review): 'data/disease-pmids.tsv.gz' is not written by any cell visible
# in this notebook; it must already exist on disk.
disease_df, disease_to_pmids = cooccurrence.read_pmids_tsv(
    'data/disease-pmids.tsv.gz', key='doid_code')
# NOTE(review): this rebinds `uberon_df`, replacing the table loaded from the
# hetio-slim URL earlier in the notebook.
uberon_df, uberon_to_pmids = cooccurrence.read_pmids_tsv(
    'data/uberon-pmids.tsv.gz', key='uberon_id')

In [10]:
# Score literature co-occurrence for disease -- tissue pairs from the two
# PubMed id lookups. The call prints a summary of article and term counts; the
# resulting table carries the cooccurrence, expected, enrichment, odds_ratio
# and p_fisher columns shown further down.
cooc_df = cooccurrence.score_pmid_cooccurrence(
    disease_to_pmids,
    uberon_to_pmids,
    'doid_code',
    'uberon_id',
)

Total articles containing a doid_code: 3686312
Total articles containing a uberon_id: 4697277
Total articles containing both a doid_code and uberon_id: 696252

After removing terms without any cooccurences:
+ 133 doid_codes remain
+ 401 uberon_ids remain

Cooccurrence scores calculated for 53333 doid_code -- uberon_id pairs


In [11]:
# Attach human-readable names to the scored pairs. Each merge joins on the
# columns the two frames share (pandas' default), and drop_duplicates keeps one
# id -> name row per term so the merge cannot multiply score rows.
cooc_df = uberon_df[['uberon_id', 'uberon_name']].drop_duplicates().merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].drop_duplicates().merge(cooc_df)
# Order by disease, then by ascending Fisher p-value, so the most significant
# tissues come first within each disease.
# DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20;
# sort_values is its direct replacement and sorts by the same columns.
cooc_df = cooc_df.sort_values(['doid_name', 'p_fisher'])
cooc_df.head()

Unnamed: 0,doid_code,doid_name,uberon_id,uberon_name,cooccurrence,expected,enrichment,odds_ratio,p_fisher
28748,DOID:10652,Alzheimer's disease,UBERON:0000955,brain,11209,1182.634069,9.477995,74.210761,0.0
28553,DOID:10652,Alzheimer's disease,UBERON:0001890,forebrain,114,7.32635,15.560272,21.733764,5.971023e-99
28476,DOID:10652,Alzheimer's disease,UBERON:0002037,cerebellum,303,86.548368,3.500933,3.740149,3.504584e-76
28541,DOID:10652,Alzheimer's disease,UBERON:0002148,locus ceruleus,97,8.450598,11.478477,14.4497,1.183699e-70
28708,DOID:10652,Alzheimer's disease,UBERON:0000011,parasympathetic nervous system,103,14.72765,6.993648,7.952412,3.9852110000000002e-53


In [12]:
# Write the annotated, sorted co-occurrence table as a tab-separated file,
# leaving out the DataFrame index.
cooc_df.to_csv(
    'data/disease-uberon-cooccurrence.tsv',
    sep='\t',
    index=False,
)