In [24]:
!wget -q -O data/efo_otar_slim.owl https://github.com/EBISPOT/efo/releases/download/v3.41.0/efo_otar_slim.owl

In [None]:
import logging
from typing import Optional

import pandas as pd
from pronto import Ontology
from txtai.embeddings import Embeddings

## Prepare index from EFO dump

Index consists of a list of documents. Each document:
- ID: will be the one from EFO
- Text: 2 approaches:
  - Only use the EFO labels
  - Use the concatenation of the EFO labels and the exact synonyms
- Tags: will be None (can be metadata as the therapeutic area)

In [None]:
efo_dump = 'data/efo_otar_slim.owl'

efo_terms = Ontology(efo_dump).terms()

In [7]:
def normalise_ontology_identifier(identifier: str) -> Optional[str]:
    """
    Normalise ontology identifier representation in order to make direct string-to-string comparison possible.
    Ex:
    'http://www.orpha.net/ORDO/Orphanet_178506' --> 'Orphanet_178506'
    'BTO:0000305' --> 'BTO_0000305'
    """

    return identifier.split('/')[-1].replace(':', '_')

all_therapeutic_areas = [
        'MONDO_0045024',
        'EFO_0005741',
        'OTAR_0000014',
        'EFO_0005932',
        'MONDO_0024458',
        'EFO_0000319',
        'EFO_0009605',
        'EFO_0010282',
        'OTAR_0000017',
        'EFO_0010285',
        'EFO_0001379',
        'OTAR_0000010',
        'EFO_0009690',
        'OTAR_0000006',
        'MONDO_0021205',
        'EFO_0000540',
        'EFO_0005803',
        'EFO_0000618',
        'MONDO_0002025',
        'MONDO_0024297',
        'OTAR_0000018',
        'OTAR_0000009',
        'EFO_0000651',
        'EFO_0001444',
        'GO_0008150',
    ]

In [13]:
data = []

for term in efo_terms:

    try:
        # Remove obsoletes
        if 'obsolete' in term.name:
            continue

        id: str = normalise_ontology_identifier(term.id)
        label: str = term.name
        exact_synonyms: list[str] = [synonym.description for synonym in term.synonyms if synonym.scope == 'EXACT']
        # I am going to merge the synonyms together with the labels in a way that they are equally equivalent to the id
        text: str = ";".join(list(set(exact_synonyms + [label])))
        ancestors : list[str] = [normalise_ontology_identifier(ancestor.id) for ancestor in term.superclasses()]
        therapeutic_areas: list[str] = [ancestor for ancestor in ancestors if ancestor in all_therapeutic_areas]

        document = {
            'id': id,
            'label': label,
            'exact_synonyms': exact_synonyms,
            'text': text,
            'therapeutic_areas': therapeutic_areas
        }
        data.append(document)
    except TypeError as e:
        logging.warning(f'{term.id} is problematic: {e}')



In [14]:
data[:3]

[{'id': 'DOID_0050890',
  'label': 'synucleinopathy',
  'exact_synonyms': ['alpha Synucleinopathies', 'synucleinopathy'],
  'text': 'alpha Synucleinopathies;synucleinopathy',
  'therapeutic_areas': ['OTAR_0000018', 'EFO_0000618']},
 {'id': 'DOID_10113',
  'label': 'trypanosomiasis',
  'exact_synonyms': ['trypanosomiasis',
   'Trypanosoma disease or disorder',
   'Trypanosoma caused disease or disorder',
   'Trypanosoma infectious disease'],
  'text': 'Trypanosoma caused disease or disorder;trypanosomiasis;Trypanosoma infectious disease;Trypanosoma disease or disorder',
  'therapeutic_areas': ['EFO_0005741']},
 {'id': 'DOID_10718',
  'label': 'giardiasis',
  'exact_synonyms': ['beaver fever',
   'beaver feaver',
   'giardiasis',
   'Giardia infection',
   'infection by Giardia lamblia'],
  'text': 'beaver fever;giardiasis;Giardia infection;beaver feaver;infection by Giardia lamblia',
  'therapeutic_areas': ['EFO_0005741', 'EFO_0010282']}]

In [15]:
df = pd.DataFrame(data)

df

Unnamed: 0,id,label,exact_synonyms,text,therapeutic_areas
0,DOID_0050890,synucleinopathy,"[alpha Synucleinopathies, synucleinopathy]",alpha Synucleinopathies;synucleinopathy,"[OTAR_0000018, EFO_0000618]"
1,DOID_10113,trypanosomiasis,"[trypanosomiasis, Trypanosoma disease or disor...",Trypanosoma caused disease or disorder;trypano...,[EFO_0005741]
2,DOID_10718,giardiasis,"[beaver fever, beaver feaver, giardiasis, Giar...",beaver fever;giardiasis;Giardia infection;beav...,"[EFO_0005741, EFO_0010282]"
3,DOID_13406,pulmonary sarcoidosis,"[Sarcoidosis, Pulmonary, lung Sarcoidosis, pul...",sarcoidosis of lung;lung sarcoidosis;lung Sarc...,"[OTAR_0000010, OTAR_0000006]"
4,DOID_1947,trichomoniasis,"[Trichomonas infection, Trichomonas Infections...",trichomoniasis;Trichomonas infection;Trichomon...,[EFO_0005741]
...,...,...,...,...,...
22991,Orphanet_99946,Autosomal dominant Charcot-Marie-Tooth disease...,[CMT2A1],CMT2A1;Autosomal dominant Charcot-Marie-Tooth ...,"[EFO_0000618, OTAR_0000018]"
22992,Orphanet_99947,Autosomal dominant Charcot-Marie-Tooth disease...,"[Charcot-Marie-Tooth neuropathy type 2A2, char...",Charcot-Marie-Tooth disease neuronal type 2A2;...,"[EFO_0000618, OTAR_0000018]"
22993,Orphanet_99960,Benign recurrent intrahepatic cholestasis type 1,"[BRIC1, BRIC type 1]",BRIC1;Benign recurrent intrahepatic cholestasi...,"[OTAR_0000018, EFO_0001379, EFO_0010282]"
22994,Orphanet_99961,Benign recurrent intrahepatic cholestasis type 2,"[BRIC2, BRIC type 2]",BRIC2;Benign recurrent intrahepatic cholestasi...,"[OTAR_0000018, EFO_0001379, EFO_0010282]"


In [16]:
df.to_json('data/documents.json', orient='records')

## Ingest features to index

In [64]:
embeddings_labels = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})

In [66]:
features_labels = df[['id', 'label']].drop_duplicates().to_records(index=False).tolist()

embeddings_labels.index([(uid, text, None) for uid, text in features_labels])

Error: Canceled future for execute_request message before replies were done

In [1]:
embeddings_labels.count()

NameError: name 'embeddings_labels' is not defined

In [None]:
embeddings_synonyms = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2"})
features_synonyms = df[['id', 'text']].drop_duplicates().to_records(index=False).tolist()

embeddings_synonyms.index([(uid, text, None) for uid, text in features_labels])


## Query diseases to get the most similar EFO



In [11]:
embeddings = Embeddings()
embeddings.load('embeddings/embeddings_labels.tar.gz')

In [14]:
embeddings.search('acondroplasya', 1)

Error: Canceled future for execute_request message before replies were done