In [2]:
# https://www.ncbi.nlm.nih.gov/research/pubtator/api.html

In [1]:
import requests
import json

def extract_annotations(annotation):
    entity_type = annotation['infons']['type']
    start_index = annotation['locations'][0]['offset']
    end_index = start_index+annotation['locations'][0]['length']
    
    return entity_type, [start_index, end_index]

In [2]:
pmids = '28483578,28483579'
url = f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids={pmids}'#&concepts=gene,protein'

response = requests.get(url)
responses = response.text.split('\n')

In [59]:
# can also do full text I think, but we need the PMCIDs. 
# We can get that data later. 
pmid_to_entity_type_to_indices = dict()
pmid_to_indices_to_entity_type = dict()
abstracts = []

for entry_txt in responses:
    if entry_txt == '':
        continue
    entry = json.loads(entry_txt)
    
    # PMID
    pmid = str(entry['id'])
    
    # Title
    title_section = entry['passages'][0]
    title = title_section['text']
    title_annotations = title_section['annotations']
    # print(title)
    # Abstract
    abstract_section = entry['passages'][1]
    abstract = abstract_section['text']
    abstract_annotations = abstract_section['annotations']
    abstracts += [abstract]
    print(len(abstract_section))
    print(len(abstract))
    print(len(abstract_annotations))

    # Title Annotations
    entity_type_to_title_indices = dict()
    title_indices_to_entity_type = dict()
    for annotation in title_annotations:    
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_title_indices.setdefault(entity_type,list()).append(indices)
        title_indices_to_entity_type[tuple(indices)] = entity_type
        
    # Abstract Annotations
    entity_type_to_abstract_indices = dict() 
    abstract_indices_to_entity_type = dict()
    for annotation in abstract_annotations:
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_abstract_indices.setdefault(entity_type, list()).append(indices)
        abstract_indices_to_entity_type[tuple(indices)] = entity_type
        
    # PMID->Entities->Indices
    pmid_to_entity_type_to_indices[pmid] = dict()
    pmid_to_entity_type_to_indices[pmid]['title'] = entity_type_to_title_indices
    pmid_to_entity_type_to_indices[pmid]['abstract'] = entity_type_to_abstract_indices
    
    # PMID->Indices->Entity Type
    pmid_to_indices_to_entity_type[pmid] = dict()
    pmid_to_indices_to_entity_type[pmid]['title'] = title_indices_to_entity_type
    pmid_to_indices_to_entity_type[pmid]['abstract'] = abstract_indices_to_entity_type
    
    # PMID->Indices->Entity Type->Entity
    #...
    #...

6
1185
9
6
1419
22


In [60]:
abstract_annotations

[{'id': '34',
  'infons': {'identifier': 'MESH:D014801', 'type': 'Chemical'},
  'text': 'vitamin A',
  'locations': [{'offset': 202, 'length': 9}]},
 {'id': '35',
  'infons': {'identifier': 'MESH:D008103', 'type': 'Disease'},
  'text': 'liver fibrosis',
  'locations': [{'offset': 227, 'length': 14}]},
 {'id': '36',
  'infons': {'identifier': '10090', 'type': 'Species'},
  'text': 'mice',
  'locations': [{'offset': 534, 'length': 4}]},
 {'id': '37',
  'infons': {'identifier': '6182', 'type': 'Species'},
  'text': 'Schistosoma japonicum',
  'locations': [{'offset': 544, 'length': 21}]},
 {'id': '38',
  'infons': {'identifier': '6182', 'type': 'Species'},
  'text': 'S. japonicum',
  'locations': [{'offset': 567, 'length': 12}]},
 {'id': '39',
  'infons': {'identifier': 'MESH:D007239', 'type': 'Disease'},
  'text': 'infection',
  'locations': [{'offset': 581, 'length': 9}]},
 {'id': '40',
  'infons': {'identifier': 'MESH:D005355', 'type': 'Disease'},
  'text': 'fibrosis',
  'locations': [{

In [61]:
title_annotations

[{'id': '6',
  'infons': {'identifier': '111364', 'type': 'Gene'},
  'text': 'MHC II',
  'locations': [{'offset': 0, 'length': 6}]},
 {'id': '7',
  'infons': {'identifier': '111364', 'type': 'Gene'},
  'text': 'MHC II',
  'locations': [{'offset': 17, 'length': 6}]},
 {'id': '8',
  'infons': {'identifier': 'MESH:D005355', 'type': 'Disease'},
  'text': 'fibrosis',
  'locations': [{'offset': 69, 'length': 8}]},
 {'id': '9',
  'infons': {'identifier': '10090', 'type': 'Species'},
  'text': 'mice',
  'locations': [{'offset': 81, 'length': 4}]},
 {'id': '10',
  'infons': {'identifier': 'MESH:D007239', 'type': 'Disease'},
  'text': 'infection',
  'locations': [{'offset': 89, 'length': 9}]},
 {'id': '11',
  'infons': {'identifier': '6182', 'type': 'Species'},
  'text': 'Schistosoma japonicum',
  'locations': [{'offset': 104, 'length': 21}]}]

In [4]:
pmid_to_indices_to_entity_type

{'28483579': {'title': {(83, 97): 'Disease', (101, 120): 'Disease'},
  'abstract': {(212, 226): 'Disease',
   (238, 246): 'Species',
   (252, 287): 'Disease',
   (334, 342): 'Species',
   (440, 448): 'Species',
   (604, 619): 'Disease',
   (715, 723): 'Species',
   (1091, 1105): 'Disease',
   (1210, 1224): 'Disease'}},
 '28483578': {'title': {(0, 6): 'Gene',
   (17, 23): 'Gene',
   (69, 77): 'Disease',
   (81, 85): 'Species',
   (89, 98): 'Disease',
   (104, 125): 'Species'},
  'abstract': {(202, 211): 'Chemical',
   (227, 241): 'Disease',
   (534, 538): 'Species',
   (544, 565): 'Species',
   (567, 579): 'Species',
   (581, 590): 'Disease',
   (659, 667): 'Disease',
   (681, 700): 'Disease',
   (772, 778): 'Gene',
   (809, 828): 'Disease',
   (859, 865): 'Gene',
   (876, 882): 'Gene',
   (992, 998): 'Gene',
   (1110, 1116): 'Gene',
   (1202, 1214): 'Chemical',
   (1274, 1280): 'Gene',
   (1330, 1348): 'Disease',
   (1349, 1353): 'Species',
   (1378, 1384): 'Gene',
   (1451, 1459): 'Di

In [34]:
len(pmid_to_indices_to_entity_type)

2

In [36]:
temp_indices_to_entity_type = list(pmid_to_indices_to_entity_type.values())[0]
print(temp_indices_to_entity_type)

{'title': {(83, 97): 'Disease', (101, 120): 'Disease'}, 'abstract': {(212, 226): 'Disease', (238, 246): 'Species', (252, 287): 'Disease', (334, 342): 'Species', (440, 448): 'Species', (604, 619): 'Disease', (715, 723): 'Species', (1091, 1105): 'Disease', (1210, 1224): 'Disease'}}


In [39]:
temp_indices_to_entity_type.keys()
# len(temp_indices_to_entity_type)

dict_keys(['title', 'abstract'])

In [40]:
temp_titles = temp_indices_to_entity_type['title']
temp_titles

{(83, 97): 'Disease', (101, 120): 'Disease'}

In [43]:
temp_title_string = "Comparison of 2-D Shear Wave Elastography and Transient Elastography for Assessing Liver Fibrosis in Chronic Hepatitis B."
temp_title_string[83:97]

'Liver Fibrosis'

In [44]:
temp_title_string[101:120]

'Chronic Hepatitis B'

In [45]:
temp_title = list(pmid_to_indices_to_entity_type.values())[1]['title']
temp_title


{(0, 6): 'Gene',
 (17, 23): 'Gene',
 (69, 77): 'Disease',
 (81, 85): 'Species',
 (89, 98): 'Disease',
 (104, 125): 'Species'}

In [48]:
temp_title_string = "MHC II-, but not MHC II+, hepatic Stellate cells contribute to liver fibrosis of mice in infection with Schistosoma japonicum."
for idx, etype in temp_title.items():
    print(idx,etype,temp_title_string[idx[0]:idx[1]])

(0, 6) Gene MHC II
(17, 23) Gene MHC II
(69, 77) Disease fibrosis
(81, 85) Species mice
(89, 98) Disease infection
(104, 125) Species Schistosoma japonicum


In [56]:
idx_etype = list(pmid_to_indices_to_entity_type.values())[1]['abstract']
temp_abstract = abstracts[1]
print(len(temp_abstract))
for idx, etype in idx_etype.items():
    print(idx,etype,temp_abstract[idx[0]:idx[1]])

1419
(202, 211) Chemical have plas
(227, 241) Disease ogeneity, whic
(534, 538) Species bros
(544, 565) Species sed by S. japonicum i
(567, 579) Species ection. Resu
(581, 590) Disease s reveale
(659, 667) Disease rogenic 
(681, 700) Disease  japonicum infectio
(772, 778) Gene oth tw
(809, 828) Disease he proliferation of
(859, 865) Gene  only 
(876, 882) Gene s disp
(992, 998) Gene d CIIT
(1110, 1116) Gene e expr
(1202, 1214) Chemical  japonicum-i
(1274, 1280) Gene  were 
(1330, 1348) Disease is and could be co
(1349, 1353) Species side
(1378, 1384) Gene  of pr
(1451, 1459) Disease 
(1509, 1521) Chemical 
(1527, 1535) Disease 


In [51]:
temp_abstract

'e'

In [57]:
for a in abstracts:
    print(len(a))

1185
1419


In [3]:
import requests
url = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmid=20085714&concepts=chemical"

r = requests.get(url)
print(r.text)

["Could not export data : [ErrorDetail(string='Please submit a list of pmids', code='parameter_exception')]"]


# Scratch for 28483578
https://pubmed.ncbi.nlm.nih.gov/28483578/

In [5]:
import requests 

In [23]:
# set url
pmids = '29484645'
url = f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids={pmids}&concept=chemical'#&concepts=gene,protein'

In [24]:
# get request
r = requests.get(url)
print(r.text)

{"_id": "29484645|None", "id": "29484645", "infons": {}, "passages": [{"infons": {"journal": "J. Anat.; 2018 Feb 27. doi:10.1111/joa.12798", "year": "2018", "type": "title", "authors": "Klimek-Piotrowska W, Krawczyk-O\u017c\u00f3g A, Suski M, Kapusta P, Wo\u0142kow PP, Ho\u0142da MK", "section": "Title"}, "offset": 0, "text": "Comparative iTRAQ analysis of protein abundance in the human sinoatrial node and working cardiomyocytes.", "sentences": [], "annotations": [{"id": "1", "infons": {"identifier": "9606", "type": "Species"}, "text": "human", "locations": [{"offset": 55, "length": 5}]}], "relations": []}, {"infons": {"type": "abstract", "section": "Abstract"}, "offset": 105, "text": "Our objective was to assess the changes in protein abundance in the human sinoatrial node (SAN) compared with working cardiomyocytes to identify SAN-specific protein signatures. Four pairs of samples (the SAN and working cardiomyocytes) were obtained postmortem from four human donors with no evidence of 

In [25]:
# find keys
response = r.json()
response.keys()

dict_keys(['_id', 'id', 'infons', 'passages', 'relations', 'pmid', 'pmcid', 'created', 'accessions', 'journal', 'year', 'authors'])

In [29]:
for key in response.keys():
    print(key)
    print(response[key])
    print()

_id
29484645|None

id
29484645

infons
{}

passages
[{'infons': {'journal': 'J. Anat.; 2018 Feb 27. doi:10.1111/joa.12798', 'year': '2018', 'type': 'title', 'authors': 'Klimek-Piotrowska W, Krawczyk-Ożóg A, Suski M, Kapusta P, Wołkow PP, Hołda MK', 'section': 'Title'}, 'offset': 0, 'text': 'Comparative iTRAQ analysis of protein abundance in the human sinoatrial node and working cardiomyocytes.', 'sentences': [], 'annotations': [{'id': '1', 'infons': {'identifier': '9606', 'type': 'Species'}, 'text': 'human', 'locations': [{'offset': 55, 'length': 5}]}], 'relations': []}, {'infons': {'type': 'abstract', 'section': 'Abstract'}, 'offset': 105, 'text': 'Our objective was to assess the changes in protein abundance in the human sinoatrial node (SAN) compared with working cardiomyocytes to identify SAN-specific protein signatures. Four pairs of samples (the SAN and working cardiomyocytes) were obtained postmortem from four human donors with no evidence of cardiovascular disease. We performed 

In [32]:
for i, text in enumerate(response["passages"]):
    print(f"Index: {i}")
    print(text)
    print()

Index: 0
{'infons': {'journal': 'J. Anat.; 2018 Feb 27. doi:10.1111/joa.12798', 'year': '2018', 'type': 'title', 'authors': 'Klimek-Piotrowska W, Krawczyk-Ożóg A, Suski M, Kapusta P, Wołkow PP, Hołda MK', 'section': 'Title'}, 'offset': 0, 'text': 'Comparative iTRAQ analysis of protein abundance in the human sinoatrial node and working cardiomyocytes.', 'sentences': [], 'annotations': [{'id': '1', 'infons': {'identifier': '9606', 'type': 'Species'}, 'text': 'human', 'locations': [{'offset': 55, 'length': 5}]}], 'relations': []}

Index: 1
{'infons': {'type': 'abstract', 'section': 'Abstract'}, 'offset': 105, 'text': 'Our objective was to assess the changes in protein abundance in the human sinoatrial node (SAN) compared with working cardiomyocytes to identify SAN-specific protein signatures. Four pairs of samples (the SAN and working cardiomyocytes) were obtained postmortem from four human donors with no evidence of cardiovascular disease. We performed protein identification and quantita