In [2]:
from docx import Document
from docx.shared import RGBColor

def get_highlight_color(run):
    """Get the highlight color of a run."""
    if run.font.highlight_color:
        return run.font.highlight_color  # Word highlight color index
    elif run.font.color and run.font.color.rgb:
        return run.font.color.rgb  # RGB color
    return None

def extract_peco_highlights_from_tables(doc_path):
    """Extract highlighted text from 'PECO statement' rows in tables."""
    doc = Document(doc_path)
    tables_data = []

    annotation_map = {
        'PINK (5)': 'Population',
        'BRIGHT_GREEN (4)': 'Exposure',
        'TURQUOISE (3)': 'Comparator',
        'YELLOW (7)': 'Outcome'
    }

    for table in doc.tables:
        table_dict = {}
        for row in table.rows:
            # doc title
            if "Title of manuscript" in row.cells[0].text.strip():
                table_dict["doc_title"] = row.cells[-1].text.strip()
            
            # doc name
            if "Last name of first author" in row.cells[0].text.strip():
                table_dict["doc_lastauthorname"] = row.cells[-1].text.strip()
            
            # doc year
            if "Year of publication" in row.cells[0].text.strip():
                table_dict["doc_year"] = row.cells[-1].text.strip()
            
            # url
            if "URL of HTML manuscript" in row.cells[0].text.strip():    
                table_dict["doc_url"] = row.cells[-1].text.strip()

            # section
            if "Section PECO statement is in" in row.cells[0].text.strip():
                table_dict["doc_peco_section"] = row.cells[-1].text.strip()

            # PECO statement
            if "PECO statement" in row.cells[0].text.strip():  # Check for PECO row
                peco_cell = row.cells[-1]  # Assume PECO text is in the 2nd column
                color_text_map = {}

                for para in peco_cell.paragraphs:
                    for run in para.runs:
                        color = get_highlight_color(run)
                        text = run.text.strip()
                        if color and text:  # Only include highlighted text
                            color_key = str(color)
                            if color_key in annotation_map:
                                color_key = annotation_map[color_key]
                                if color_key not in color_text_map:
                                    color_text_map[color_key] = []
                                color_text_map[color_key].append(text)

                table_dict["peco_elements"] = color_text_map
        if table_dict:
            tables_data.append(table_dict)

    return tables_data

doc_path = "data/PECO-examples_merged-example.docx"  
result = extract_peco_highlights_from_tables(doc_path)

# Print the structured output
for i, table in enumerate(result, 1):
    print(f"Table {i}:")
    for color, texts in table["peco_elements"].items():
        print(f"  Color: {color}, Text: {', '.join(texts)}")
    print("-" * 50)


In [3]:
result

In [1]:
from keybert import KeyBERT

kw_model = KeyBERT()

def extract_keyphrases(text):
    # Extract keyphrases with n-grams up to 2 words
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words="english")
    return [kw[0] for kw in keywords]

# Example
text = "compared to relatively lower or higher temperatures"
keyphrases = extract_keyphrases(text)
print(keyphrases)  # Output: ['higher temperatures', 'lower temperatures']


In [None]:
from keybert import KeyBERT

The following script is from https://github.com/IHTSDO/SNOMED-in-5-minutes/blob/master/python3-examples/examples.py

True to form, the starter script failed with an IP ban error code when I attempted to run it without modifying `user_agent` first.

In [1]:
# This script uses the Snowstorm SNOMED-CT API. A standardised FHIR API is also available.

# Note that we recommend running your own Snowstorm instance for heavy script use.
# See https://github.com/IHTSDO/snowstorm

from urllib.request import urlopen, Request
from urllib.parse import quote
import json

baseUrl = 'https://browser.ihtsdotools.org/snowstorm/snomed-ct'
edition = 'MAIN'
version = '2019-07-31'

# IMPORTANT! You must update this user agent to avoid having your IP banned for 24 hours.
# Replace with a contact email so that we can contact you if your script causes excessive load on the public server
# For example: user_agent = 'example@example.com'
user_agent = 'joelchan@umd.edu'

def urlopen_with_header(url):
    # adds User-Agent header otherwise urlopen on its own gets an IP blocked response
    req = Request(url)
    req.add_header('User-Agent', user_agent)
    return urlopen(req)

#Prints fsn of a concept
def getConceptById(id):
    url = baseUrl + '/browser/' + edition + '/' + version + '/concepts/' + id
    response = urlopen_with_header(url).read()
    data = json.loads(response.decode('utf-8'))

    print (data['fsn']['term'])

#Prints description by id
def getDescriptionById(id):
    url = baseUrl + '/' + edition + '/' + version + '/descriptions/' + id
    response = urlopen_with_header(url).read()
    data = json.loads(response.decode('utf-8'))

    print (data['term'])

#Prints number of concepts with descriptions containing the search term
def getConceptsByString(searchTerm):
    url = baseUrl + '/browser/' + edition + '/' + version + '/concepts?term=' + quote(searchTerm) + '&activeFilter=true&offset=0&limit=50'
    response = urlopen_with_header(url).read()
    data = json.loads(response.decode('utf-8'))

    print (data['total'])

#Prints number of descriptions containing the search term with a specific semantic tag
def getDescriptionsByStringFromProcedure(searchTerm, semanticTag):
    url = baseUrl + '/browser/' + edition + '/' + version + '/descriptions?term=' + quote(searchTerm) + '&conceptActive=true&semanticTag=' + quote(semanticTag) + '&groupByConcept=false&searchMode=STANDARD&offset=0&limit=50'
    response = urlopen_with_header(url).read()
    data = json.loads(response.decode('utf-8'))

    print (data['totalElements'])
    
 #Prints snomed code for searched disease or symptom
def getSnomedCodeSimilar(searchTerm):
    url = baseUrl + '/browser/' + edition + '/' + version + '/descriptions?term=' + quote(searchTerm) + '&conceptActive=true&groupByConcept=false&searchMode=STANDARD&offset=0&limit=50'
    response = urlopen_with_header(url).read()
    data = json.loads(response.decode('utf-8'))

    for term in data['items']:
      if searchTerm in term['term']:
        print("{} : {}".format(term['term'], term['concept']['conceptId']))
 
def getSnomedCode(searchTerm):
    url = baseUrl + '/browser/' + edition + '/' + version + '/descriptions?term=' + quote(searchTerm) + '&conceptActive=true&groupByConcept=false&searchMode=STANDARD&offset=0&limit=50'
    response = urlopen_with_header(url).read()
    data = json.loads(response.decode('utf-8'))

    for term in data['items']:
      if searchTerm == term['term']:
        print("{} : {}".format(term['term'], term['concept']['conceptId']))

In [3]:
getConceptById('109152007')
getDescriptionById('679406011')
getConceptsByString('heart attack')
getDescriptionsByStringFromProcedure('heart', 'procedure')
getSnomedCodeSimilar('Headache')
getSnomedCode('Bleeding from nose')

Bilirubin test kit (physical object)
Methylphenyltetrahydropyridine (substance)
471023
864
Headache : 25064002
Headache, NOS : 25064002
HA - Headache : 25064002
Headache site : 162297001
Headache stick : 462677001
Headache clinic : 702864009
Headache disorder : 230461009
Headache (finding) : 25064002
Headache character : 162306000
Bleeding from nose : 249366005


Failed code from Claude

In [12]:
import requests
import pandas as pd

class OntServerConnector:
    def __init__(self, base_url='https://ontoserver.csiro.au/fhir'):
        """
        Initialize connection to Ontoserver's public SNOMED CT FHIR endpoint
        
        :param base_url: Base URL for Ontoserver FHIR endpoint
        """
        self.base_url = base_url
        self.headers = {
            'Accept': 'application/fhir+json'
        }
    
    def search_snomed_concepts(self, search_term, limit=10):
        """
        Search SNOMED CT concepts using Ontoserver
        
        :param search_term: Term to search for
        :param limit: Maximum number of results to return
        :return: DataFrame of SNOMED concepts
        """
        endpoint = f"{self.base_url}/CodeSystem/$lookup"
        
        params = {
            'system': 'http://snomed.info/sct',
            'property': 'designation',
            'filter': search_term,
            '_count': limit
        }
        
        try:
            response = requests.get(endpoint, 
                                    headers=self.headers, 
                                    params=params)
            response.raise_for_status()
            
            results = response.json()
            
            # Parse FHIR search results
            concepts = []
            for entry in results.get('parameter', []):
                if entry.get('name') == 'result':
                    part = entry.get('part', [])
                    concept = {
                        'code': next((p['valueCode'] for p in part if p['name'] == 'code'), None),
                        'display': next((p['valueString'] for p in part if p['name'] == 'display'), None)
                    }
                    concepts.append(concept)
            
            return pd.DataFrame(concepts)
        
        except requests.RequestException as e:
            print(f"Error searching SNOMED concepts: {e}")
            return pd.DataFrame()
    
    def get_concept_details(self, concept_id):
        """
        Retrieve detailed information for a specific SNOMED concept
        
        :param concept_id: SNOMED CT Concept ID
        :return: Dictionary of concept details
        """
        endpoint = f"{self.base_url}/CodeSystem/$lookup"
        
        params = {
            'system': 'http://snomed.info/sct',
            'code': concept_id
        }
        
        try:
            response = requests.get(endpoint, 
                                    headers=self.headers, 
                                    params=params)
            response.raise_for_status()
            
            return response.json()
        
        except requests.RequestException as e:
            print(f"Error retrieving concept details: {e}")
            return {} 

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


I think the public fhir endpoint is no longer up.

In [13]:
 # Initialize Ontoserver connector
ontoserver = OntServerConnector()

# Search for concepts related to "diabetes"
diabetes_concepts = ontoserver.search_snomed_concepts("diabetes")
print("Diabetes Concepts:")
print(diabetes_concepts)

# If concepts found, get details of first concept
if not diabetes_concepts.empty:
    concept_id = diabetes_concepts.iloc[0]['code']
    concept_details = ontoserver.get_concept_details(concept_id)
    print("\nConcept Details:")
    print(concept_details)

Error searching SNOMED concepts: 404 Client Error: Not Found for url: https://ontoserver.csiro.au/fhir/CodeSystem/$lookup?system=http%3A%2F%2Fsnomed.info%2Fsct&property=designation&filter=diabetes&_count=10
Diabetes Concepts:
Empty DataFrame
Columns: []
Index: []


Failed simpler code from Claude with prompt:
> give me a much simpler and smaller starting point just so i can test connecting to the ols api

base url was wrong.

In [21]:
import requests

def search_snomed_term(term):
    """
    Simple function to search SNOMED terms using OLS API
    
    :param term: Search term for SNOMED concepts
    :return: List of matching concepts
    """
    base_url = 'https://www.ebi.ac.uk/ols4/api/ontologies/snomed/terms'
    
    # Parameters for the search
    params = {
        'q': term,
        'rows': 10  # Limit to 5 results
    }
    
    try:
        # Make the API request
        response = requests.get(base_url, params=params)
        
        # Raise an exception for bad responses
        response.raise_for_status()
        
        # Parse and return the JSON results
        results = response.json()

        print(results)
        
        # Extract labels from the results
        concepts = [
            {
                'label': term_info.get('label'),
                'iri': term_info.get('iri')
            } 
            for term_info in results.get('_embedded', {}).get('terms', [])
        ]
        
        return concepts
    
    except requests.RequestException as e:
        print(f"Error connecting to OLS API: {e}")
        return []


search_term = "neurological disorders"
results = search_snomed_term(search_term)

print(f"Search results for '{search_term}':")
for concept in results:
    print(f"- {concept['label']} (IRI: {concept['iri']})")


{'_embedded': {'terms': [{'iri': 'http://snomed.info/id/125413009', 'lang': 'en', 'description': [], 'synonyms': [], 'annotation': {'alternative label': ['Cytoplasmic matrix alteration'], 'has_dbxref': ['CTV3:XU6Tc'], 'preferred label': ['Cytoplasmic alteration']}, 'label': 'Cytoplasmic alteration', 'ontology_name': 'snomed', 'ontology_prefix': 'SNOMED', 'ontology_iri': 'http://snomed.info/sct/900000000000207008', 'is_obsolete': False, 'term_replaced_by': None, 'is_defining_ontology': True, 'has_children': False, 'is_root': False, 'short_form': 'SNOMED_125413009', 'obo_id': 'SNOMED:125413009', 'in_subset': None, 'obo_definition_citation': None, 'obo_xref': [{'database': 'CTV3', 'id': 'XU6Tc', 'description': None, 'url': None}], 'obo_synonym': None, 'is_preferred_root': False, '_links': {'self': {'href': 'https://www.ebi.ac.uk/ols4/api/ontologies/snomed/terms/http%253A%252F%252Fsnomed.info%252Fid%252F125413009?lang=en'}, 'parents': {'href': 'https://www.ebi.ac.uk/ols4/api/ontologies/sno

First working minimal version.

In [74]:
import requests
import json

base_url = 'https://www.ebi.ac.uk/ols4/api/search'
term = "alzheimer's"

params = {
    'q': term,
    'ontology': 'snomed',
    'rows': 20,  # Increased rows
    
}

try:
    # Make the API request
    response = requests.get(base_url, params=params)
    
    # Raise an exception for bad responses
    response.raise_for_status()
    
    # Parse the JSON results with full details
    results = response.json()
    
    # Print raw API response for diagnostics
    # print("Raw API Response:")
    # print(json.dumps(results, indent=2))
    
    # Extract detailed concept information
    concepts = []
    for term_info in results.get('response', {}).get('docs', []):
        # print(term_info)
        concept = {
            'label': term_info.get('label', 'No Label'),
            'iri': term_info.get('iri', 'No IRI'),
            'description': term_info.get('description', 'No Description'),
            'ontology_name': term_info.get('ontology_name', 'No Ontology Name'),
            'short_form': term_info.get('short_form', 'No Short Form')
        }
        concepts.append(concept)
    
    for idx, concept in enumerate(concepts, 1):
        print(f"\nResult {idx}:")
        print(f"Label: {concept['label']}")
        print(f"IRI: {concept['iri']}")
        print(f"Ontology Name: {concept['ontology_name']}")
        print(f"Description: {concept['description']}")
        print(f"Short Form: {concept['short_form']}")

except requests.RequestException as e:
    print(f"Error connecting to OLS API: {e}")


Result 1:
Label: Alzheimer's disease
IRI: http://snomed.info/id/26929004
Ontology Name: snomed
Description: []
Short Form: SNOMED_26929004

Result 2:
Label: Focal Alzheimer's disease
IRI: http://snomed.info/id/230269008
Ontology Name: snomed
Description: []
Short Form: SNOMED_230269008

Result 3:
Label: FH: Alzheimer's disease
IRI: http://snomed.info/id/394877006
Ontology Name: snomed
Description: []
Short Form: SNOMED_394877006

Result 4:
Label: Non-Alzheimer's progressive dysphasia
IRI: http://snomed.info/id/230279005
Ontology Name: snomed
Description: []
Short Form: SNOMED_230279005

Result 5:
Label: Delusions in Alzheimer's disease
IRI: http://snomed.info/id/141991000119109
Ontology Name: snomed
Description: []
Short Form: SNOMED_141991000119109

Result 6:
Label: Alzheimer's disease society member
IRI: http://snomed.info/id/161108005
Ontology Name: snomed
Description: []
Short Form: SNOMED_161108005

Result 7:
Label: Alzheimer's Disease Assessment Scale
IRI: http://snomed.info/id/

In [73]:
print(json.dumps(results, indent=2))

{
  "response": {
    "docs": [
      {
        "iri": "http://snomed.info/id/276587008",
        "ontology_name": "snomed",
        "ontology_prefix": "SNOMED",
        "short_form": "SNOMED_276587008",
        "description": [],
        "label": "Perinatal neurological disorder",
        "obo_id": "SNOMED:276587008",
        "type": "class"
      },
      {
        "iri": "http://snomed.info/id/247384001",
        "ontology_name": "snomed",
        "ontology_prefix": "SNOMED",
        "short_form": "SNOMED_247384001",
        "description": [],
        "label": "Neurological pain disorder",
        "obo_id": "SNOMED:247384001",
        "type": "class"
      },
      {
        "iri": "http://snomed.info/id/397929001",
        "ontology_name": "snomed",
        "ontology_prefix": "SNOMED",
        "short_form": "SNOMED_397929001",
        "description": [],
        "label": "Neurological morbidity",
        "obo_id": "SNOMED:397929001",
        "type": "class"
      },
      {
        

First attempt to consolidate into working subfunctions.

In [None]:
def retrieve_ontology_matches(term, numresults=50, ontology=None):
    """
    Retrieve ontology matches for a given term using OLS API

    params:
    - term (str) - a search term
    - ontologies (list) - a list of ontology names to search.
                            If not provided, all ontologies will be searched.   
    """
    base_url = 'https://www.ebi.ac.uk/ols4/api/search'
    # term = "alzheimer's"

    params = {
        'q': term,
        'rows': numresults,  
    }
    if ontology is not None:
        params['ontology'] = ontology

    try:
        # Make the API request
        response = requests.get(base_url, params=params)
        
        # Raise an exception for bad responses
        response.raise_for_status()
        
        # Parse the JSON results with full details
        results = response.json()

        jsonresults = json.dumps(results, indent=2)
        
        # Extract detailed concept information
        ontology_matches = []
        for term_info in results.get('response', {}).get('docs', []):
            # print(term_info)
            concept = {
                'label': term_info.get('label', 'No Label'),
                'iri': term_info.get('iri', 'No IRI'),
                'description': term_info.get('description', 'No Description'),
                'ontology_name': term_info.get('ontology_name', 'No Ontology Name'),
                'short_form': term_info.get('short_form', 'No Short Form')
            }
            ontology_matches.append(concept)

        return jsonresults, ontology_matches

    except requests.RequestException as e:
        print(f"Error connecting to OLS API: {e}")

def get_term_ancestors(ontology, iri):
    """
    Get ontology term properties using OLS API

    params:
    - ontology (str) - the ontology name
    - iri (str) - the IRI of the term

    returns:
    - ancestors (list) - list of ancestors, each with a dictionary of term properties

    Here is an example of a valid request:
    https://www.ebi.ac.uk/ols4/api/ontologies/duo/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FDUO_0000017/ancestors?lang=en
    """
    
    
    # double quote here if we're passing it directly into the URL
    # single quote if we're passing as param
    iri_encoded = quote(quote(iri, safe=''))
    
    # print(f"Attempting to get ancestors for {iri_encoded}")

    url = f"https://www.ebi.ac.uk/ols4/api/ontologies/{ontology}/terms/{iri_encoded}/ancestors"
    params = {
        'lang': 'en'
    }

    try:
        # Make the API request
        response = requests.get(url, params=params)
        
        # Raise an exception for bad responses
        response.raise_for_status()
        
        # Parse the JSON results with full details
        result = response.json()

        # get the ancestors
        ancestors = [i for i in result.get('_embedded', {}).get('terms', [])]
        
        return ancestors

    except requests.RequestException as e:
        
        print(f"Error connecting to OLS API: {e}")

        return e

def rank_ontology_matches(ontology_matches):
    """
    Rank ontology matches

    Criteria for ranking:
    - relevance
    - abstraction (highest level that makes sense)
        - likely depends on get_ontology_term_properties to get the hierarchy

    params:
    - ontology_matches (list) - a list of ontology matches

    returns:
    - ranked_matches (list) - a list of ranked ontology matches
    """
    return ranked_matches

def get_ontology_term_properties(ontology, iri):
    """
    Get ontology term properties using OLS API

    params:
    - ontology (str) - the ontology name
    - iri (str) - the IRI of the term

    returns:
    - term_properties (dict) - a dictionary of term properties

    This is still broken. Fails with 500 Error Code with message "Expected at least 1 result for solr getFirst"
    Can confirm the IRI is properly encoded.
    """
    
    # double quote here if we're passing it directly into the URL
    # single quote if we're passing as param
    iri = quote(quote(iri, safe=''))

    url = f"https://www.ebi.ac.uk/ols4/api/ontologies/{ontology}/properties/{iri}"
    params = {
        # 'rows': 20,  # Increased rows
        # 'iri': iri,
        'lang': 'en'
    }

    try:
        # Make the API request
        response = requests.get(url, params=params)
        
        # Raise an exception for bad responses
        response.raise_for_status()
        
        # Parse the JSON results with full details
        term_properties = response.json()
        
        return term_properties

    except requests.RequestException as e:
        print(f"Error connecting to OLS API: {e}")



In [138]:
import time

# get initial set of ontology matches
term = "alzheimer's"
jsonresults, matches = retrieve_ontology_matches(term, numresults=50)

# then get the distance from root for each one
# which is just the length of the ancestors list
matches_with_metadata = []
for idx, match in enumerate(matches):
    
    # get ancestors
    iri = match['iri']
    ontology = match['ontology_name']
    ancestors = get_term_ancestors(ontology, iri)
    print(f"Retrieved {len(ancestors)} ancestors for {iri}")

    # update match with ancestor info
    match['ancestors'] = ancestors
    match['distance_from_root'] = len(ancestors)

    matches_with_metadata.append(match)

    # pause for a bit
    # so we don't get rate limited
    time.sleep(1)


Attempting to get ancestors for http%253A%252F%252Fsnomed.info%252Fid%252F26929004
Retrieved 20 ancestors for http://snomed.info/id/26929004
Attempting to get ancestors for http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FDOID_10652
Retrieved 6 ancestors for http://purl.obolibrary.org/obo/DOID_10652
Attempting to get ancestors for http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FNCIT_C2866
Retrieved 15 ancestors for http://purl.obolibrary.org/obo/NCIT_C2866
Attempting to get ancestors for http%253A%252F%252Fsnomed.info%252Fid%252F230269008
Retrieved 20 ancestors for http://snomed.info/id/230269008
Attempting to get ancestors for http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FDOID_0080348
Retrieved 11 ancestors for http://purl.obolibrary.org/obo/DOID_0080348
Attempting to get ancestors for http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FDOID_0110035
Retrieved 11 ancestors for http://purl.obolibrary.org/obo/DOID_0110035
Attempting to get ancestors for http%253A%252F%252Fpurl.o

In [139]:
# sort by distance from root
matches_with_metadata.sort(key=lambda x: x['distance_from_root'])
# and print out
for idx, match in enumerate(matches_with_metadata, 1):
    print(f"\nResult {idx}:")
    print(f"Label: {match['label']}")
    print(f"IRI: {match['iri']}")
    print(f"Ontology Name: {match['ontology_name']}")
    print(f"Description: {match['description']}")
    print(f"Short Form: {match['short_form']}")
    print(f"Distance from Root: {match['distance_from_root']}")


Result 1:
Label: Alzheimer's Disease Pathway KEGG
IRI: http://purl.obolibrary.org/obo/NCIT_C38778
Ontology Name: ncit
Description: []
Short Form: NCIT_C38778
Distance from Root: 3

Result 2:
Label: Alzheimer's Disease Pathway BioCarta
IRI: http://purl.obolibrary.org/obo/NCIT_C39177
Ontology Name: ncit
Description: []
Short Form: NCIT_C39177
Distance from Root: 3

Result 3:
Label: Alzheimer's Disease Assessment Scale
IRI: http://snomed.info/id/273269001
Ontology Name: snomed
Description: []
Short Form: SNOMED_273269001
Distance from Root: 3

Result 4:
Label: Other Alzheimer's Disease
IRI: http://purl.obolibrary.org/obo/NCIT_C195947
Ontology Name: ncit
Description: ["Evidence of other Alzheimer's disease not specified elsewhere."]
Short Form: NCIT_C195947
Distance from Root: 4

Result 5:
Label: Alzheimer's disease pathway
IRI: http://purl.obolibrary.org/obo/PW_0000015
Ontology Name: pw
Description: ['A mostly sporadic, late-onset condition affecting the central nervous system, that is t

In [None]:
def get_term_ancestors(ontology, iri):
    """
    Get ontology term properties using OLS API

    params:
    - ontology (str) - the ontology name
    - iri (str) - the IRI of the term

    returns:
    - term_properties (dict) - a dictionary of term properties

    Here is an example of a valid request:
    https://www.ebi.ac.uk/ols4/api/ontologies/duo/terms/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FDUO_0000017/ancestors?lang=en
    """
    
    
    # make sure the IRI is double encoded
    iri_encoded = quote(quote(iri, safe=''))
    
    print(f"Attempting to get ancestors for {iri_encoded}")

    url = f"https://www.ebi.ac.uk/ols4/api/ontologies/{ontology}/terms/{iri_encoded}/ancestors"
    params = {
        'lang': 'en'
    }

    try:
        # Make the API request
        response = requests.get(url, params=params)
        
        # Raise an exception for bad responses
        response.raise_for_status()
        
        # Parse the JSON results with full details
        term_properties = response.json()
        
        return term_properties

    except requests.RequestException as e:
        
        print(f"Error connecting to OLS API: {e}")

        return e

iri = "http://purl.obolibrary.org/obo/DOID_10652"
ontology = "doid"
# iri = "http://purl.obolibrary.org/obo/DUO_0000017"
# ontology = "duo"
result = get_term_ancestors(ontology, iri)
print(json.dumps(result, indent=2))



http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FDOID_10652
Attempting to get ancestors for http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FDOID_10652
{
  "_embedded": {
    "terms": [
      {
        "iri": "http://purl.obolibrary.org/obo/DOID_680",
        "lang": "en",
        "description": [
          "A neurodegenerative disease that has_material_basis_in the pathological aggregation of tau protein in so-called neurofibrillary tangles (NFT) in the human brain."
        ],
        "synonyms": [],
        "annotation": {
          "has_dbxref": [
            "MESH:D024801",
            "UMLS_CUI:C0949664"
          ],
          "has_obo_namespace": [
            "disease_ontology"
          ],
          "id": [
            "DOID:680"
          ]
        },
        "label": "tauopathy",
        "ontology_name": "doid",
        "ontology_prefix": "DOID",
        "ontology_iri": "http://purl.obolibrary.org/obo/doid.owl",
        "is_obsolete": false,
        "term_replaced_by": n

In [115]:
ancestors = [i for i in result.get('_embedded', {}).get('terms', [])]
len(ancestors)

6

In [120]:
for a in ancestors:
    print(a['label'])
    print(f'is_root = {a["is_root"]}')

tauopathy
is_root = False
neurodegenerative disease
is_root = False
central nervous system disease
is_root = False
nervous system disease
is_root = False
disease of anatomical entity
is_root = False
disease
is_root = True


## Parse out stuff from docx

In [3]:
result

[{'doc_title': '',
  'doc_lastauthorname': '',
  'doc_year': '',
  'doc_url': '',
  'doc_peco_section': '',
  'peco_elements': {'Population': ['patients with suspected malaria'],
   'Exposure': ['Plasmodium', 'infection'],
   'Comparator': ['Plasmodium', '-uninfected individuals'],
   'Outcome': ['blood cortisol levels']}},
 {'doc_title': '',
  'doc_lastauthorname': '',
  'doc_year': '',
  'doc_url': '',
  'doc_peco_section': '',
  'peco_elements': {'Population': ['older population'],
   'Exposure': ['ambient temperatures'],
   'Comparator': ['compared to relatively lower or higher temperatures'],
   'Outcome': ['mortality',
    'morbidity',
    'mental and neurological disorders',
    'extreme temperature events']}}]

In [None]:
from keybert import KeyBERT