In [2]:
# PubTator API documentation: https://www.ncbi.nlm.nih.gov/research/pubtator/api.html

In [10]:
import requests
import json

def extract_annotations(annotation):
    """Pull the entity type and span out of a single annotation dict.

    Reads ``annotation['infons']['type']`` and the first element of
    ``annotation['locations']``; any further locations are ignored.

    Returns a tuple ``(entity_type, [start, end])`` where ``start`` is the
    location's ``offset`` and ``end`` is ``offset + length``.
    """
    label = annotation['infons']['type']
    first_location = annotation['locations'][0]
    begin = first_location['offset']
    return label, [begin, begin + first_location['length']]

In [11]:
# Comma-separated PMIDs to request in a single call.
pmids = '28483578,28483579'
# An optional "&concepts=gene,protein" suffix is intentionally left off here
# (presumably it narrows the returned annotation types - confirm in the API docs).
url = f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids={pmids}'

# Upper bound on how long to wait for the server, so a stalled connection
# cannot hang the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
# Fail here on an HTTP error rather than later, when json.loads() would be
# handed an error page and raise a misleading decode error.
response.raise_for_status()
# The body is consumed downstream as one JSON document per line.
responses = response.text.split('\n')

In [12]:
# can also do full text I think, but we need the PMCIDs.
# We can get that data later.

def _index_annotations(annotations):
    """Index one passage's annotations in both directions.

    Returns a pair ``(entity_type_to_indices, indices_to_entity_type)``:
    the first maps an entity type to the list of ``[start, end]`` spans
    carrying that type, the second maps each ``(start, end)`` tuple back to
    its entity type.
    """
    entity_type_to_indices = dict()
    indices_to_entity_type = dict()
    for annotation in annotations:
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_indices.setdefault(entity_type, list()).append(indices)
        # Lists are unhashable, so the span is stored as a tuple key.
        indices_to_entity_type[tuple(indices)] = entity_type
    return entity_type_to_indices, indices_to_entity_type


# Stand-in for a passage the record does not contain, so a record with fewer
# than two passages yields empty mappings instead of raising IndexError.
_MISSING_PASSAGE = {'text': '', 'annotations': []}

pmid_to_entity_type_to_indices = dict()
pmid_to_indices_to_entity_type = dict()

for entry_txt in responses:
    # Skip blank or whitespace-only lines (e.g. the empty string left after
    # the final newline of the response body).
    if not entry_txt.strip():
        continue
    entry = json.loads(entry_txt)

    # PMID
    pmid = str(entry['id'])

    # Passages are read by position: the first is treated as the title and
    # the second as the abstract - TODO confirm against the export format.
    passages = entry['passages']

    # Title
    title_section = passages[0] if len(passages) > 0 else _MISSING_PASSAGE
    title = title_section['text']  # not used yet
    title_annotations = title_section['annotations']

    # Abstract
    abstract_section = passages[1] if len(passages) > 1 else _MISSING_PASSAGE
    abstract = abstract_section['text']  # not used yet
    abstract_annotations = abstract_section['annotations']

    # Title / Abstract Annotations
    entity_type_to_title_indices, title_indices_to_entity_type = (
        _index_annotations(title_annotations)
    )
    entity_type_to_abstract_indices, abstract_indices_to_entity_type = (
        _index_annotations(abstract_annotations)
    )

    # PMID->Entities->Indices
    pmid_to_entity_type_to_indices[pmid] = {
        'title': entity_type_to_title_indices,
        'abstract': entity_type_to_abstract_indices,
    }

    # PMID->Indices->Entity Type
    pmid_to_indices_to_entity_type[pmid] = {
        'title': title_indices_to_entity_type,
        'abstract': abstract_indices_to_entity_type,
    }

    # PMID->Indices->Entity Type->Entity
    # NOTE(review): in the displayed output the abstract spans start beyond
    # the title spans, so offsets look document-relative rather than
    # passage-relative - confirm before slicing `abstract` with them.
    #...
    #...

In [13]:
# Show the PMID -> section ('title' / 'abstract') -> (start, end) -> entity type mapping.
pmid_to_indices_to_entity_type

{'28483579': {'title': {(83, 97): 'Disease', (101, 120): 'Disease'},
  'abstract': {(212, 226): 'Disease',
   (238, 246): 'Species',
   (252, 287): 'Disease',
   (334, 342): 'Species',
   (440, 448): 'Species',
   (604, 619): 'Disease',
   (715, 723): 'Species',
   (1091, 1105): 'Disease',
   (1210, 1224): 'Disease'}},
 '28483578': {'title': {(0, 6): 'Gene',
   (17, 23): 'Gene',
   (69, 77): 'Disease',
   (81, 85): 'Species',
   (89, 98): 'Disease',
   (104, 125): 'Species'},
  'abstract': {(202, 211): 'Chemical',
   (227, 241): 'Disease',
   (534, 538): 'Species',
   (544, 565): 'Species',
   (567, 579): 'Species',
   (581, 590): 'Disease',
   (659, 667): 'Disease',
   (681, 700): 'Disease',
   (772, 778): 'Gene',
   (809, 828): 'Disease',
   (859, 865): 'Gene',
   (876, 882): 'Gene',
   (992, 998): 'Gene',
   (1110, 1116): 'Gene',
   (1202, 1214): 'Chemical',
   (1274, 1280): 'Gene',
   (1330, 1348): 'Disease',
   (1349, 1353): 'Species',
   (1378, 1384): 'Gene',
   (1451, 1459): 'Di