In [1]:
import io
import gzip

import pandas
import requests
import networkx

import eutility
import cooccurrence

# Tissues

In [3]:
# Load the MeSH <-> UBERON anatomical-structure mapping (tab-separated) from a
# URL pinned to a specific commit, then preview the first rows.
url = 'https://raw.githubusercontent.com/dhimmel/uberon/7de0ed6238c26ea82accae6fc57accd1b845111d/data/mesh-map.tsv'
uberon_df = pandas.read_csv(url, sep='\t')
uberon_df.head()

Unnamed: 0,uberon_id,uberon_name,mesh_tree_number,mesh_id,mesh_name
0,UBERON:0001577,facial muscle,A02.633.567.400,D005152,Facial Muscles
1,UBERON:0001681,nasal bone,A02.835.232.781.324.665,D009295,Nasal Bone
2,UBERON:0001264,pancreas,A03.734,D010179,Pancreas
3,UBERON:0002356,perineum,A01.719,D010502,Perineum
4,UBERON:0001688,incus bone,A09.246.397.247.362,D007188,Incus


In [5]:
# For every row of the MeSH/UBERON map, search PubMed for the row's MeSH term
# and record the query, the number of matching articles and the '|'-joined
# article IDs. The augmented rows are collected into `uberon_pmids_df`.
rows_out = list()

# term_query -> PMIDs already retrieved. The map lists some MeSH terms on more
# than one row, so identical queries are answered from this cache instead of
# issuing another (throttled) request.
pmids_by_query = dict()

for i, row in uberon_df.iterrows():
    # NOTE(review): the ':noexp' suffix presumably stops PubMed from expanding
    # the term to its MeSH descendants -- confirm against the PubMed field tags.
    term_query = '{tissue}[MeSH Terms:noexp]'.format(tissue = row.mesh_name.lower())
    if term_query not in pmids_by_query:
        payload = {'db': 'pubmed', 'term': term_query}
        # NOTE(review): retmax looks like a per-request page size rather than a
        # cap on the total (results above 5000 are returned) -- confirm in eutility.
        pmids_by_query[term_query] = eutility.esearch_query(payload, retmax = 5000, sleep=2)
    pmids = pmids_by_query[term_query]
    # `row` is the per-row Series produced by iterrows; the new labels are
    # added to it and become extra columns once the rows are reassembled.
    row['term_query'] = term_query
    row['n_articles'] = len(pmids)
    row['pubmed_ids'] = '|'.join(pmids)
    rows_out.append(row)
    print('{} articles for {}'.format(len(pmids), row.mesh_name))

uberon_pmids_df = pandas.DataFrame(rows_out)

6400 articles for Facial Muscles
2401 articles for Nasal Bone
60200 articles for Pancreas
7214 articles for Perineum
527 articles for Incus
3083 articles for Chromaffin System
7501 articles for Pressoreceptors
4298 articles for Vestibulocochlear Nerve
8645 articles for Kidney Pelvis
841 articles for Epigastric Arteries
46609 articles for Muscle, Smooth, Vascular
2461 articles for Vibrissae
11188 articles for Pineal Gland
26993 articles for Veins
764 articles for Meningeal Arteries
180254 articles for Lung
328 articles for Comb and Wattles
9237 articles for Palate
2849 articles for Mandibular Nerve
1324 articles for Brachiocephalic Veins
4018 articles for Vulva
2186 articles for Splenic Vein
23061 articles for Aortic Valve
7307 articles for Femoral Vein
595 articles for Beak
32153 articles for Trachea
9315 articles for Exudates and Transudates
9315 articles for Exudates and Transudates
6947 articles for Ligaments
2008 articles for Cisterna Magna
1592 articles for Circle of Willis
19116 

In [6]:
# Save the table as a gzip-compressed TSV, then preview it.
# The file is opened in text mode ('wt') so that gzip builds the text layer
# itself and the with-block flushes and closes it on exit. Wrapping a binary
# handle in a separate TextIOWrapper left that wrapper unflushed when the
# with-block closed the underlying gzip file, which can truncate the output.
with gzip.open('data/uberon-pmids.tsv.gz', 'wt') as write_file:
    uberon_pmids_df.to_csv(write_file, sep='\t', index=False)

uberon_pmids_df.head()

Unnamed: 0,uberon_id,uberon_name,mesh_tree_number,mesh_id,mesh_name,term_query,n_articles,pubmed_ids
0,UBERON:0001577,facial muscle,A02.633.567.400,D005152,Facial Muscles,facial muscles[MeSH Terms:noexp],6400,25876340|25785922|25675766|25626821|25626793|2...
1,UBERON:0001681,nasal bone,A02.835.232.781.324.665,D009295,Nasal Bone,nasal bone[MeSH Terms:noexp],2401,25881382|25636552|25626787|25544298|25533074|2...
2,UBERON:0001264,pancreas,A03.734,D010179,Pancreas,pancreas[MeSH Terms:noexp],60200,25876615|25864351|25842673|25796751|25775893|2...
3,UBERON:0002356,perineum,A01.719,D010502,Perineum,perineum[MeSH Terms:noexp],7214,25898473|25807837|25767947|25751805|25730811|2...
4,UBERON:0001688,incus bone,A09.246.397.247.362,D007188,Incus,incus[MeSH Terms:noexp],527,25182449|25085697|25045723|24969066|24834457|2...


# Tissue-Disease Cooccurrence

In [3]:
# Read the tissue and disease PubMed ID tables from disk. Each call returns a
# pair: the table and a second object that is presumably a lookup from the
# `key` column to its PubMed IDs (TODO confirm in cooccurrence.read_pmids_tsv).
# The disease file is not written by the cells shown here and must already exist.
uberon_df, uberon_to_pmids = cooccurrence.read_pmids_tsv(
    'data/uberon-pmids.tsv.gz',
    key='uberon_id',
)
disease_df, disease_to_pmids = cooccurrence.read_pmids_tsv(
    'data/disease-pmids.tsv.gz',
    key='doid_code',
)

In [7]:
# Score disease/tissue co-occurrence from the two PubMed ID lookups. The two
# strings name the identifier columns of the result (diseases first, then
# tissues) -- presumably one row per disease-tissue pair; confirm in
# cooccurrence.score_pmid_cooccurrence.
cooc_df = cooccurrence.score_pmid_cooccurrence(
    disease_to_pmids,
    uberon_to_pmids,
    'doid_code',
    'uberon_id',
)

Total articles containing a doid_code: 3686312
Total articles containing a uberon_id: 5387135
Total articles containing both a doid_code and uberon_id: 820072

After removing terms without any cooccurences:
+ 133 doid_codes remain
+ 639 uberon_ids remain

Cooccurrence scores calculated for 84987 doid_code -- uberon_id pairs


In [8]:
# Attach human-readable tissue and disease names to the score table. Each merge
# joins on the columns the two frames have in common; drop_duplicates keeps one
# row per (id, name) pair so the join does not multiply score rows.
cooc_df = uberon_df[['uberon_id', 'uberon_name']].drop_duplicates().merge(cooc_df)
cooc_df = disease_df[['doid_code', 'doid_name']].drop_duplicates().merge(cooc_df)
# Order by disease name, then by ascending Fisher p-value within each disease.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
cooc_df = cooc_df.sort_values(['doid_name', 'p_fisher'])
cooc_df.head()

Unnamed: 0,doid_code,doid_name,uberon_id,uberon_name,cooccurrence,expected,enrichment,odds_ratio,p_fisher
45934,DOID:10652,Alzheimer's disease,UBERON:0000955,brain,11209,1037.397404,10.804924,71.79288,0.0
45989,DOID:10652,Alzheimer's disease,UBERON:0001890,forebrain,114,6.426617,17.738725,24.829124,3.728344e-105
45651,DOID:10652,Alzheimer's disease,UBERON:0002037,cerebellum,303,75.919555,3.991067,4.274664,4.204287e-89
45929,DOID:10652,Alzheimer's disease,UBERON:0002148,locus ceruleus,97,7.412799,13.085475,16.508638,8.148977e-76
45576,DOID:10652,Alzheimer's disease,UBERON:0000011,parasympathetic nervous system,103,12.91898,7.972765,9.086085,2.664472e-58


In [9]:
# Export the named, sorted co-occurrence table as a tab-separated file,
# leaving out the DataFrame index.
cooc_df.to_csv(
    'data/disease-uberon-cooccurrence.tsv',
    sep='\t',
    index=False,
)