# Initial scratch

In [223]:
# https://www.ncbi.nlm.nih.gov/research/pubtator/api.html

In [224]:
import requests
import json

def extract_annotations(annotation):
    entity_type = annotation['infons']['type']
    start_index = annotation['locations'][0]['offset']
    end_index = start_index+annotation['locations'][0]['length']
    
    return entity_type, [start_index, end_index]

In [225]:
pmids = '28483578,28483579'
url = f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids={pmids}'#&concepts=gene,protein'

response = requests.get(url)
responses = response.text.split('\n')

In [226]:
# can also do full text I think, but we need the PMCIDs. 
# We can get that data later. 
pmid_to_entity_type_to_indices = dict()
pmid_to_indices_to_entity_type = dict()
abstracts = []

for entry_txt in responses:
    if entry_txt == '':
        continue
    entry = json.loads(entry_txt)
    
    # PMID
    pmid = str(entry['id'])
    
    # Title
    title_section = entry['passages'][0]
    title = title_section['text']
    title_annotations = title_section['annotations']
    # print(title)
    # Abstract
    abstract_section = entry['passages'][1]
    abstract = abstract_section['text']
    abstract_annotations = abstract_section['annotations']
    abstracts += [abstract]
    print(len(abstract_section))
    print(len(abstract))
    print(len(abstract_annotations))

    # Title Annotations
    entity_type_to_title_indices = dict()
    title_indices_to_entity_type = dict()
    for annotation in title_annotations:    
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_title_indices.setdefault(entity_type,list()).append(indices)
        title_indices_to_entity_type[tuple(indices)] = entity_type
        
    # Abstract Annotations
    entity_type_to_abstract_indices = dict() 
    abstract_indices_to_entity_type = dict()
    for annotation in abstract_annotations:
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_abstract_indices.setdefault(entity_type, list()).append(indices)
        abstract_indices_to_entity_type[tuple(indices)] = entity_type
        
    # PMID->Entities->Indices
    pmid_to_entity_type_to_indices[pmid] = dict()
    pmid_to_entity_type_to_indices[pmid]['title'] = entity_type_to_title_indices
    pmid_to_entity_type_to_indices[pmid]['abstract'] = entity_type_to_abstract_indices
    
    # PMID->Indices->Entity Type
    pmid_to_indices_to_entity_type[pmid] = dict()
    pmid_to_indices_to_entity_type[pmid]['title'] = title_indices_to_entity_type
    pmid_to_indices_to_entity_type[pmid]['abstract'] = abstract_indices_to_entity_type
    
    # PMID->Indices->Entity Type->Entity
    #...
    #...

6
1185
9
6
1419
22


In [227]:
abstract_annotations

[{'id': '34',
  'infons': {'identifier': 'MESH:D014801', 'type': 'Chemical'},
  'text': 'vitamin A',
  'locations': [{'offset': 202, 'length': 9}]},
 {'id': '35',
  'infons': {'identifier': 'MESH:D008103', 'type': 'Disease'},
  'text': 'liver fibrosis',
  'locations': [{'offset': 227, 'length': 14}]},
 {'id': '36',
  'infons': {'identifier': '10090', 'type': 'Species'},
  'text': 'mice',
  'locations': [{'offset': 534, 'length': 4}]},
 {'id': '37',
  'infons': {'identifier': '6182', 'type': 'Species'},
  'text': 'Schistosoma japonicum',
  'locations': [{'offset': 544, 'length': 21}]},
 {'id': '38',
  'infons': {'identifier': '6182', 'type': 'Species'},
  'text': 'S. japonicum',
  'locations': [{'offset': 567, 'length': 12}]},
 {'id': '39',
  'infons': {'identifier': 'MESH:D007239', 'type': 'Disease'},
  'text': 'infection',
  'locations': [{'offset': 581, 'length': 9}]},
 {'id': '40',
  'infons': {'identifier': 'MESH:D005355', 'type': 'Disease'},
  'text': 'fibrosis',
  'locations': [{

In [228]:
title_annotations

[{'id': '6',
  'infons': {'identifier': '111364', 'type': 'Gene'},
  'text': 'MHC II',
  'locations': [{'offset': 0, 'length': 6}]},
 {'id': '7',
  'infons': {'identifier': '111364', 'type': 'Gene'},
  'text': 'MHC II',
  'locations': [{'offset': 17, 'length': 6}]},
 {'id': '8',
  'infons': {'identifier': 'MESH:D005355', 'type': 'Disease'},
  'text': 'fibrosis',
  'locations': [{'offset': 69, 'length': 8}]},
 {'id': '9',
  'infons': {'identifier': '10090', 'type': 'Species'},
  'text': 'mice',
  'locations': [{'offset': 81, 'length': 4}]},
 {'id': '10',
  'infons': {'identifier': 'MESH:D007239', 'type': 'Disease'},
  'text': 'infection',
  'locations': [{'offset': 89, 'length': 9}]},
 {'id': '11',
  'infons': {'identifier': '6182', 'type': 'Species'},
  'text': 'Schistosoma japonicum',
  'locations': [{'offset': 104, 'length': 21}]}]

In [182]:
pmid_to_indices_to_entity_type

{'28483579': {'title': {(83, 97): 'Disease', (101, 120): 'Disease'},
  'abstract': {(212, 226): 'Disease',
   (238, 246): 'Species',
   (252, 287): 'Disease',
   (334, 342): 'Species',
   (440, 448): 'Species',
   (604, 619): 'Disease',
   (715, 723): 'Species',
   (1091, 1105): 'Disease',
   (1210, 1224): 'Disease'}},
 '28483578': {'title': {(0, 6): 'Gene',
   (17, 23): 'Gene',
   (69, 77): 'Disease',
   (81, 85): 'Species',
   (89, 98): 'Disease',
   (104, 125): 'Species'},
  'abstract': {(202, 211): 'Chemical',
   (227, 241): 'Disease',
   (534, 538): 'Species',
   (544, 565): 'Species',
   (567, 579): 'Species',
   (581, 590): 'Disease',
   (659, 667): 'Disease',
   (681, 700): 'Disease',
   (772, 778): 'Gene',
   (809, 828): 'Disease',
   (859, 865): 'Gene',
   (876, 882): 'Gene',
   (992, 998): 'Gene',
   (1110, 1116): 'Gene',
   (1202, 1214): 'Chemical',
   (1274, 1280): 'Gene',
   (1330, 1348): 'Disease',
   (1349, 1353): 'Species',
   (1378, 1384): 'Gene',
   (1451, 1459): 'Di

In [183]:
len(pmid_to_indices_to_entity_type)

2

In [184]:
temp_indices_to_entity_type = list(pmid_to_indices_to_entity_type.values())[0]
print(temp_indices_to_entity_type)

{'title': {(83, 97): 'Disease', (101, 120): 'Disease'}, 'abstract': {(212, 226): 'Disease', (238, 246): 'Species', (252, 287): 'Disease', (334, 342): 'Species', (440, 448): 'Species', (604, 619): 'Disease', (715, 723): 'Species', (1091, 1105): 'Disease', (1210, 1224): 'Disease'}}


In [185]:
temp_indices_to_entity_type.keys()
# len(temp_indices_to_entity_type)

dict_keys(['title', 'abstract'])

In [186]:
temp_titles = temp_indices_to_entity_type['title']
temp_titles

{(83, 97): 'Disease', (101, 120): 'Disease'}

In [187]:
temp_title_string = "Comparison of 2-D Shear Wave Elastography and Transient Elastography for Assessing Liver Fibrosis in Chronic Hepatitis B."
temp_title_string[83:97]

'Liver Fibrosis'

In [188]:
temp_title_string[101:120]

'Chronic Hepatitis B'

In [189]:
temp_title = list(pmid_to_indices_to_entity_type.values())[1]['title']
temp_title


{(0, 6): 'Gene',
 (17, 23): 'Gene',
 (69, 77): 'Disease',
 (81, 85): 'Species',
 (89, 98): 'Disease',
 (104, 125): 'Species'}

In [190]:
temp_title_string = "MHC II-, but not MHC II+, hepatic Stellate cells contribute to liver fibrosis of mice in infection with Schistosoma japonicum."
for idx, etype in temp_title.items():
    print(idx,etype,temp_title_string[idx[0]:idx[1]])

(0, 6) Gene MHC II
(17, 23) Gene MHC II
(69, 77) Disease fibrosis
(81, 85) Species mice
(89, 98) Disease infection
(104, 125) Species Schistosoma japonicum


In [191]:
idx_etype = list(pmid_to_indices_to_entity_type.values())[1]['abstract']
temp_abstract = abstracts[1]
print(len(temp_abstract))
for idx, etype in idx_etype.items():
    print(idx,etype,temp_abstract[idx[0]:idx[1]])

1419
(202, 211) Chemical have plas
(227, 241) Disease ogeneity, whic
(534, 538) Species bros
(544, 565) Species sed by S. japonicum i
(567, 579) Species ection. Resu
(581, 590) Disease s reveale
(659, 667) Disease rogenic 
(681, 700) Disease  japonicum infectio
(772, 778) Gene oth tw
(809, 828) Disease he proliferation of
(859, 865) Gene  only 
(876, 882) Gene s disp
(992, 998) Gene d CIIT
(1110, 1116) Gene e expr
(1202, 1214) Chemical  japonicum-i
(1274, 1280) Gene  were 
(1330, 1348) Disease is and could be co
(1349, 1353) Species side
(1378, 1384) Gene  of pr
(1451, 1459) Disease 
(1509, 1521) Chemical 
(1527, 1535) Disease 


In [192]:
temp_abstract

'Hepatic stellate cells (HSCs) are considered as the main effector cells in vitamin A metabolism and liver fibrosis, as well as in hepatic immune regulation. Recently, researches have revealed that HSCs have plasticity and heterogeneity, which depend on their lobular location and whether liver is normal or injured. This research aimed to explore the biological characteristics and heterogeneity of HSCs in mice with Schistosoma japonicum (S. japonicum) infection, and determine the subpopulation of HSCs in pathogenesis of hepatic fibrosis caused by S. japonicum infection. Results revealed that HSCs significantly increased the expressions of MHC II and fibrogenic genes after S. japonicum infection, and could be classified into MHC II+ HSCs and MHC II- HSCs subsets. Both two HSCs populations suppressed the proliferation of activated CD4+T cells, whereas only MHC II- HSCs displayed a myofibroblast-like phenotype. In response to IFN-gamma, HSCs up-regulated the expressions of MHC II and CIITA

In [193]:
for a in abstracts:
    print(len(a))

1185
1419


In [194]:
import requests
url = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmid=20085714&concepts=chemical"

r = requests.get(url)
print(r.text)

["Could not export data : [ErrorDetail(string='Please submit a list of pmids', code='parameter_exception')]"]


# Scratch for 28483578
https://pubmed.ncbi.nlm.nih.gov/28483578/

In [195]:
import requests 

In [196]:
# set url
pmids = '29484645'
url = f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids={pmids}&concept=chemical'#&concepts=gene,protein'

In [197]:
# get request
r = requests.get(url)
print(r.text)

{"_id": "29484645|None", "id": "29484645", "infons": {}, "passages": [{"infons": {"journal": "J. Anat.; 2018 Feb 27. doi:10.1111/joa.12798", "year": "2018", "type": "title", "authors": "Klimek-Piotrowska W, Krawczyk-O\u017c\u00f3g A, Suski M, Kapusta P, Wo\u0142kow PP, Ho\u0142da MK", "section": "Title"}, "offset": 0, "text": "Comparative iTRAQ analysis of protein abundance in the human sinoatrial node and working cardiomyocytes.", "sentences": [], "annotations": [{"id": "1", "infons": {"identifier": "9606", "type": "Species"}, "text": "human", "locations": [{"offset": 55, "length": 5}]}], "relations": []}, {"infons": {"type": "abstract", "section": "Abstract"}, "offset": 105, "text": "Our objective was to assess the changes in protein abundance in the human sinoatrial node (SAN) compared with working cardiomyocytes to identify SAN-specific protein signatures. Four pairs of samples (the SAN and working cardiomyocytes) were obtained postmortem from four human donors with no evidence of 

In [198]:
# find keys
response = r.json()
response.keys()

dict_keys(['_id', 'id', 'infons', 'passages', 'relations', 'pmid', 'pmcid', 'created', 'accessions', 'journal', 'year', 'authors'])

In [199]:
for key in response.keys():
    print(key)
    print(response[key])
    print()

_id
29484645|None

id
29484645

infons
{}

passages
[{'infons': {'journal': 'J. Anat.; 2018 Feb 27. doi:10.1111/joa.12798', 'year': '2018', 'type': 'title', 'authors': 'Klimek-Piotrowska W, Krawczyk-Ożóg A, Suski M, Kapusta P, Wołkow PP, Hołda MK', 'section': 'Title'}, 'offset': 0, 'text': 'Comparative iTRAQ analysis of protein abundance in the human sinoatrial node and working cardiomyocytes.', 'sentences': [], 'annotations': [{'id': '1', 'infons': {'identifier': '9606', 'type': 'Species'}, 'text': 'human', 'locations': [{'offset': 55, 'length': 5}]}], 'relations': []}, {'infons': {'type': 'abstract', 'section': 'Abstract'}, 'offset': 105, 'text': 'Our objective was to assess the changes in protein abundance in the human sinoatrial node (SAN) compared with working cardiomyocytes to identify SAN-specific protein signatures. Four pairs of samples (the SAN and working cardiomyocytes) were obtained postmortem from four human donors with no evidence of cardiovascular disease. We performed 

In [200]:
for i, text in enumerate(response["passages"]):
    print(f"Index: {i}")
    print(text)
    print()

Index: 0
{'infons': {'journal': 'J. Anat.; 2018 Feb 27. doi:10.1111/joa.12798', 'year': '2018', 'type': 'title', 'authors': 'Klimek-Piotrowska W, Krawczyk-Ożóg A, Suski M, Kapusta P, Wołkow PP, Hołda MK', 'section': 'Title'}, 'offset': 0, 'text': 'Comparative iTRAQ analysis of protein abundance in the human sinoatrial node and working cardiomyocytes.', 'sentences': [], 'annotations': [{'id': '1', 'infons': {'identifier': '9606', 'type': 'Species'}, 'text': 'human', 'locations': [{'offset': 55, 'length': 5}]}], 'relations': []}

Index: 1
{'infons': {'type': 'abstract', 'section': 'Abstract'}, 'offset': 105, 'text': 'Our objective was to assess the changes in protein abundance in the human sinoatrial node (SAN) compared with working cardiomyocytes to identify SAN-specific protein signatures. Four pairs of samples (the SAN and working cardiomyocytes) were obtained postmortem from four human donors with no evidence of cardiovascular disease. We performed protein identification and quantita

# Pipeline

Collect report for one PMID

In [201]:
import requests
import json
import pandas as pd

PXD: PXD024135\
PMID: 34806902\
PMCID: PMC8687617

In [202]:
# Load in one PMID 
pmids = '34806902'
url = f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids={pmids}'#&concepts=gene,protein'
response = requests.get(url)
responses = response.text.split('\n')

In [203]:
# Using Dylan's code (this block), we can extract most of the keywords
def extract_annotations(annotation):
    entity_type = annotation['infons']['type']
    start_index = annotation['locations'][0]['offset']
    end_index = start_index+annotation['locations'][0]['length']
    
    return entity_type, [start_index, end_index]

pmid_to_entity_type_to_indices = dict()
pmid_to_indices_to_entity_type = dict()
abstracts = []

for entry_txt in responses:
    if entry_txt == '':
        continue
    entry = json.loads(entry_txt)
    
    # PMID
    pmid = str(entry['id'])
    
    # Title
    title_section = entry['passages'][0]
    title = title_section['text']
    title_annotations = title_section['annotations']
    # print(title)
    # Abstract
    abstract_section = entry['passages'][1]
    abstract = abstract_section['text']
    abstract_annotations = abstract_section['annotations']
    abstracts += [abstract]
    print(len(abstract_section))
    print(len(abstract))
    print(len(abstract_annotations))

    # Title Annotations
    entity_type_to_title_indices = dict()
    title_indices_to_entity_type = dict()
    for annotation in title_annotations:    
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_title_indices.setdefault(entity_type,list()).append(indices)
        title_indices_to_entity_type[tuple(indices)] = entity_type
        
    # Abstract Annotations
    entity_type_to_abstract_indices = dict() 
    abstract_indices_to_entity_type = dict()
    for annotation in abstract_annotations:
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_abstract_indices.setdefault(entity_type, list()).append(indices)
        abstract_indices_to_entity_type[tuple(indices)] = entity_type
        
    # PMID->Entities->Indices
    pmid_to_entity_type_to_indices[pmid] = dict()
    pmid_to_entity_type_to_indices[pmid]['title'] = entity_type_to_title_indices
    pmid_to_entity_type_to_indices[pmid]['abstract'] = entity_type_to_abstract_indices
    
    # PMID->Indices->Entity Type
    pmid_to_indices_to_entity_type[pmid] = dict()
    pmid_to_indices_to_entity_type[pmid]['title'] = title_indices_to_entity_type
    pmid_to_indices_to_entity_type[pmid]['abstract'] = abstract_indices_to_entity_type
    
    # PMID->Indices->Entity Type->Entity
    #...
    #...

6
2302
33


In [204]:
# Look at entity types 
temp_indices_to_entity_type = list(pmid_to_indices_to_entity_type.values())[0]
print(temp_indices_to_entity_type)

{'title': {(55, 62): 'Gene'}, 'abstract': {(155, 180): 'Disease', (229, 235): 'Species', (265, 272): 'Gene', (274, 338): 'Gene', (379, 386): 'Gene', (395, 415): 'Chemical', (476, 483): 'Gene', (575, 579): 'Species', (612, 619): 'Gene', (621, 628): 'Gene', (719, 727): 'Species', (926, 934): 'Species', (936, 945): 'Chemical', (986, 1002): 'Disease', (1029, 1035): 'Species', (1065, 1071): 'Gene', (1086, 1093): 'Gene', (1098, 1102): 'Species', (1104, 1110): 'Gene', (1168, 1183): 'Chemical', (1233, 1248): 'Gene', (1250, 1259): 'Gene', (1264, 1275): 'Gene', (1309, 1316): 'Gene', (1321, 1325): 'Species', (1411, 1417): 'Gene', (1428, 1447): 'Disease', (1600, 1608): 'Species', (1809, 1834): 'Gene', (1923, 1931): 'Species', (2041, 2049): 'Species', (2182, 2207): 'Disease', (2276, 2284): 'Species'}}


In [205]:
# Examine key types
temp_indices_to_entity_type.keys()

dict_keys(['title', 'abstract'])

In [206]:
# Examine keys in title
title_keys = temp_indices_to_entity_type["title"]
print(title_keys)
# Examine keys in abstract
abstract_keys = temp_indices_to_entity_type["abstract"]
print(abstract_keys)

{(55, 62): 'Gene'}
{(155, 180): 'Disease', (229, 235): 'Species', (265, 272): 'Gene', (274, 338): 'Gene', (379, 386): 'Gene', (395, 415): 'Chemical', (476, 483): 'Gene', (575, 579): 'Species', (612, 619): 'Gene', (621, 628): 'Gene', (719, 727): 'Species', (926, 934): 'Species', (936, 945): 'Chemical', (986, 1002): 'Disease', (1029, 1035): 'Species', (1065, 1071): 'Gene', (1086, 1093): 'Gene', (1098, 1102): 'Species', (1104, 1110): 'Gene', (1168, 1183): 'Chemical', (1233, 1248): 'Gene', (1250, 1259): 'Gene', (1264, 1275): 'Gene', (1309, 1316): 'Gene', (1321, 1325): 'Species', (1411, 1417): 'Gene', (1428, 1447): 'Disease', (1600, 1608): 'Species', (1809, 1834): 'Gene', (1923, 1931): 'Species', (2041, 2049): 'Species', (2182, 2207): 'Disease', (2276, 2284): 'Species'}


In [207]:
# Look into title keys
title_dict = {}
for key in title_keys:
    title_attrib_dict = {}
    start, end = key[0], key[1]
    entity = title_keys[key]
    entity_text = title[start:end]
    title_attrib_dict["start"] = int(start)
    title_attrib_dict["end"] = int(end)
    title_attrib_dict["entity-type"] = entity
    title_dict[entity_text] = title_attrib_dict

title_df = pd.DataFrame(title_dict)
title_df = title_df.transpose()
title_df = title_df.reset_index()
title_df = title_df.rename(columns={"index": "entity"})
title_df

Unnamed: 0,entity,end,entity-type,start
0,Adamts5,62,Gene,55


In [208]:
# Look into abstract keys
abstract_dict = {}
for key in abstract_keys:
    abstract_attrib_dict = {}
    start, end = key[0], key[1]
    entity = abstract_keys[key]
    entity_text = abstract[start:end]
    abstract_attrib_dict["start"] = int(start)
    abstract_attrib_dict["end"] = int(end)
    abstract_attrib_dict["entity-type"] = entity
    abstract_dict[entity_text] = abstract_attrib_dict

abstract_df = pd.DataFrame(abstract_dict)
abstract_df = abstract_df.transpose()
abstract_df = abstract_df.reset_index()
abstract_df = abstract_df.rename(columns={"index": "entity"})
abstract_df

Unnamed: 0,entity,start,end,entity-type
0,roblasts returned ADAMTS5,155,180,Disease
1,ospond,229,235,Species
2,t abund,265,272,Gene
3,t proteases. ADAMTS5 cleaves chondroitin sulph...,274,338,Gene
4,n of AD,379,386,Gene
5,its substrate versi,395,415,Chemical
6,sed in,476,483,Gene
7,ECM,575,579,Species
8,amples,612,619,Gene
9,om HF p,621,628,Gene


# Compare to sci-spaCy

In [209]:
import scispacy
import spacy
import en_ner_bionlp13cg_md

In [210]:
nlp = spacy.load("en_ner_bionlp13cg_md")

In [211]:
#examine title
text = title
text_doc = nlp(title)
title_ents = list(text_doc.ents)
title_entity_labels = []
for idx, ent in enumerate(title_ents):
    title_entity_labels.append(ent.label_)

title_df = pd.DataFrame()
title_df["entity"] = title_ents
title_df["labels"] = title_entity_labels
title_df

Unnamed: 0,entity,labels
0,"(Extracellular, Matrix)",CELLULAR_COMPONENT
1,(Heart),ORGAN
2,(Adamts5),GENE_OR_GENE_PRODUCT
3,(Proteoglycan),GENE_OR_GENE_PRODUCT


In [212]:
# examine abstracts
text = abstract
abstract_doc = nlp(text)
abstract_ents = list(abstract_doc.ents)
abstract_ents_labels = []
for idx, ent in enumerate(abstract_ents):
    abstract_ents_labels.append(ent.label_)


abstract_df = pd.DataFrame()
abstract_df["entity"] = abstract_ents
abstract_df["labels"] = abstract_ents_labels
abstract_df

Unnamed: 0,entity,labels
0,"(extracellular, matrix)",CELLULAR_COMPONENT
1,(ECM),CELLULAR_COMPONENT
2,(heart),ORGAN
3,"(murine, cardiac, fibroblasts)",ORGANISM
4,(ADAMTS5),GENE_OR_GENE_PRODUCT
5,(disintegrin),GENE_OR_GENE_PRODUCT
6,"(thrombospondin, motifs, 5)",GENE_OR_GENE_PRODUCT
7,(ADAMTS5),GENE_OR_GENE_PRODUCT
8,"(chondroitin, sulphate, proteoglycans)",GENE_OR_GENE_PRODUCT
9,(CSPGs),GENE_OR_GENE_PRODUCT


# Collect all entitites using PubTator and spaCy

In [238]:
import requests
import json
import pandas as pd
import scispacy
import spacy
import en_ner_bionlp13cg_md

In [239]:
# load nlp
nlp = spacy.load("en_ner_bionlp13cg_md")

In [256]:
# Load in PMIDs 
pmids = '34806902,33998164,34098726,31670476,29484645,29133944'
PMIDS = ['34806902','33998164','34098726','31670476','29484645','29133944']
url = f'https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids={pmids}'#&concepts=gene,protein'
response = requests.get(url)
responses = response.text.split('\n')

In [257]:
# Using Dylan's code (this block), we can extract most of the keywords
def extract_annotations(annotation):
    entity_type = annotation['infons']['type']
    start_index = annotation['locations'][0]['offset']
    end_index = start_index+annotation['locations'][0]['length']
    
    return entity_type, [start_index, end_index]

pmid_to_entity_type_to_indices = dict()
pmid_to_indices_to_entity_type = dict()
abstracts = []
titles = []

for entry_txt in responses:
    if entry_txt == '':
        continue
    entry = json.loads(entry_txt)
    
    # PMID
    pmid = str(entry['id'])
    
    # Title
    title_section = entry['passages'][0]
    title = title_section['text']
    titles += [title]
    title_annotations = title_section['annotations']
    # print(title)
    # Abstract
    abstract_section = entry['passages'][1]
    abstract = abstract_section['text']
    abstract_annotations = abstract_section['annotations']
    abstracts += [abstract]

    # Title Annotations
    entity_type_to_title_indices = dict()
    title_indices_to_entity_type = dict()
    for annotation in title_annotations:    
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_title_indices.setdefault(entity_type,list()).append(indices)
        title_indices_to_entity_type[tuple(indices)] = entity_type
        
    # Abstract Annotations
    entity_type_to_abstract_indices = dict() 
    abstract_indices_to_entity_type = dict()
    for annotation in abstract_annotations:
        entity_type, indices = extract_annotations(annotation)
        entity_type_to_abstract_indices.setdefault(entity_type, list()).append(indices)
        abstract_indices_to_entity_type[tuple(indices)] = entity_type
        
    # PMID->Entities->Indices
    pmid_to_entity_type_to_indices[pmid] = dict()
    pmid_to_entity_type_to_indices[pmid]['title'] = entity_type_to_title_indices
    pmid_to_entity_type_to_indices[pmid]['abstract'] = entity_type_to_abstract_indices
    
    # PMID->Indices->Entity Type
    pmid_to_indices_to_entity_type[pmid] = dict()
    pmid_to_indices_to_entity_type[pmid]['title'] = title_indices_to_entity_type
    pmid_to_indices_to_entity_type[pmid]['abstract'] = abstract_indices_to_entity_type
    
    # PMID->Indices->Entity Type->Entity

In [258]:
title_dict = {}
title_df_spacy = pd.DataFrame()
title_ents = []
title_entity_labels = []
title_pmid = []

abstract_dict = {}
abstract_df_spacy = pd.DataFrame()
abstract_ents = []
abstract_ents_labels = []
abstract_pmid = []


NUMBER_OF_PMIDS = len(pmid_to_indices_to_entity_type)

for i in range(NUMBER_OF_PMIDS):
    temp_indices_to_entity_type = list(pmid_to_indices_to_entity_type.values())[i]
    # Save title
    title_keys = temp_indices_to_entity_type["title"]
    # Save abstract
    abstract_keys = temp_indices_to_entity_type["abstract"]

    # Save title keys using PubTator
    for key in title_keys:
        title_attrib_dict = {}
        start, end = key[0], key[1]
        entity = title_keys[key]
        entity_text = titles[i][start:end]
        title_attrib_dict["start"] = int(start)
        title_attrib_dict["end"] = int(end)
        title_attrib_dict["entity-type"] = entity
        title_attrib_dict["PMID"] = PMIDS[i]
        title_dict[entity_text] = title_attrib_dict

    # Title keys using spacy
    text = titles[i]
    text_doc = nlp(text)
    title_ents += [*text_doc.ents]
    for idx, ent in enumerate(text_doc.ents):
        title_entity_labels.append(ent.label_)
        title_pmid.append(PMIDS[i])
    

    # Save abstract keys
    for key in abstract_keys:
        abstract_attrib_dict = {}
        start, end = key[0], key[1]
        entity = abstract_keys[key]
        entity_text = abstracts[i][start:end]
        abstract_attrib_dict["start"] = int(start)
        abstract_attrib_dict["end"] = int(end)
        abstract_attrib_dict["entity-type"] = entity
        abstract_attrib_dict["PMID"] = PMIDS[i]
        abstract_dict[entity_text] = abstract_attrib_dict

    # Title keys using spacy
    text = titles[i]
    text_doc = nlp(text)
    title_ents += [*text_doc.ents]
    for idx, ent in enumerate(text_doc.ents):
        title_entity_labels.append(ent.label_)
        title_pmid.append(PMIDS[i])
    
    # examine abstracts
    text = abstracts[i]
    abstract_doc = nlp(text)
    abstract_ents += [*abstract_doc.ents]
    for idx, ent in enumerate(abstract_doc.ents):
        abstract_ents_labels.append(ent.label_)
        abstract_pmid.append(PMIDS[i])
    

In [259]:
title_df_spacy["entity"] = title_ents
title_df_spacy["entity-labels"] = title_entity_labels
title_df_spacy["pmid"] = title_pmid
title_df_spacy.to_csv("results/scispacy_title_entities.csv")

In [262]:
import numpy as np
print(np.unqiue(title_pmid))
print(np.unqiue(abstract_pmid))

AttributeError: module 'numpy' has no attribute 'unqiue'

In [247]:
abstract_df_spacy["entity"] = abstract_ents
abstract_df_spacy["entity-labels"] = abstract_ents_labels
abstract_df_spacy["pmid"] = abstract_pmid
abstract_df_spacy.to_csv("results/scispacy_abstract_entities.csv")

In [250]:
title_df = pd.DataFrame(title_dict)
title_df = title_df.transpose()
title_df = title_df.reset_index()
title_df = title_df.rename(columns={"index": "entity"})
title_df.to_csv("results/pubtator_title_entities.csv")

In [249]:
abstract_df = pd.DataFrame(abstract_dict)
abstract_df = abstract_df.transpose()
abstract_df = abstract_df.reset_index()
abstract_df = abstract_df.rename(columns={"index": "entity"})
abstract_df.to_csv("results/pubtator_abstract_entities.csv")

In [251]:
# define 3 highlight colors: 1) pubtator, 2) spacy, 3) both 
print("\033[44;33mHello World!\033[m")

[44;33mHello World![m


In [252]:
abstract_temp = "Background: Remodelling of the extracellular matrix (ECM) is a hallmark of heart failure (HF). Our previous analysis of the secretome of murine cardiac fibroblasts returned ADAMTS5 (a disintegrin and metalloproteinase with thrombospondin motifs 5) as one of the most abundant proteases. ADAMTS5 cleaves chondroitin sulphate proteoglycans (CSPGs) such as versican. The contribution of ADAMTS5 and its substrate versican to HF is unknown. Methods: Versican remodelling was assessed in mice lacking the catalytic domain of ADAMTS5 (Adamts5 Cat). Proteomics was applied to study ECM remodelling in left ventricular samples from HF patients, with a particular focus on the effects of common medications used for the treatment of HF. Results: Versican and versikine, an ADAMTS-specific versican cleavage product, accumulated in ischemic HF patients. Versikine was also elevated in a porcine model of cardiac ischemia/reperfusion injury and in murine hearts after angiotensin II (Ang II) infusion. In Adamts5 Cat mice, Ang II infusion resulted in an aggravated versican build-up and hyaluronic acid disarrangement, accompanied by reduced levels of integrin beta 1, filamin A and connexin 43. Echocardiographic assessment of Adamts5 Cat mice revealed a reduced ejection fraction and an impaired global longitudinal strain upon Ang II infusion. Cardiac hypertrophy and collagen deposition, however, were similar to littermate controls. In a proteomics analysis of a larger cohort of cardiac explants from ischemic HF patients (n=65), the use of beta-blockers was associated with a reduction in ECM deposition, with versican being among the most pronounced changes. Subsequent experiments in cardiac fibroblasts confirmed that beta1-adrenergic receptor stimulation increased versican expression. Despite similar clinical characteristics, HF patients treated with beta-blockers had a distinct cardiac ECM profile. Conclusions: Our results in animal models and patients suggest that ADAMTS proteases are critical for versican degradation in the heart, and that versican accumulation is associated with impaired cardiac function. A comprehensive characterisation of the cardiac ECM in ischemic HF patients revealed that beta-blockers may have a previously unrecognized beneficial effect on the cardiac CSPG content."
abstract_temp[155:180]

'roblasts returned ADAMTS5'

In [253]:
temp = "Background: Remodelling of the extracellular matrix (ECM) is a hallmark of heart failure (HF)"
len(temp)

93

In [254]:
abstract_temp[93-(190-155):93]

'is a hallmark of heart failure (HF)'

In [None]:
# total number of proteins for all 6 
# num prots found in pub 1 vs the other
# important thing is we are just trying it oyt for now
# we have not done the full text