# download data

In [26]:
import subprocess

# Ask git for the top-level directory of the enclosing repository; every data
# path used in this notebook is built from it.
REPO_ROOT = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    check=True,
    text=True,
    capture_output=True,
).stdout.strip()

In [27]:
import requests
import os

# Ontology name -> download URL of its OWL file.
urls = {
    "DOID": "http://purl.obolibrary.org/obo/doid.owl",
    "SYMP": "http://purl.obolibrary.org/obo/symp.owl",
    "FMA": "http://purl.obolibrary.org/obo/fma.owl",
}
os.makedirs(f"{REPO_ROOT}/data/owl", exist_ok=True)

for name, url in urls.items():
    data_path = REPO_ROOT + f"/data/owl/{name}.owl"
    if os.path.exists(data_path):
        # Anything other than "y" (including an empty answer) keeps the
        # existing file, so the prompt advertises "N" as the default.
        answer = input(f"{name}.owl already exists. Redownload? [y/N]: ")
        if answer.strip().lower() != "y":
            print("Skipping.")
            continue

    print("Downloading", f"{name}.owl ... ", end="")
    # The timeout keeps a stalled connection from hanging the cell forever.
    req = requests.get(url, timeout=60)
    # Fail before touching the disk: otherwise an HTTP error page would be
    # saved as <name>.owl and later be treated as "already exists".
    req.raise_for_status()
    with open(data_path, "wb") as fw:
        fw.write(req.content)
    print("done!")

DOID.owl already exists. Redownload? [Y/n]: Skipping.
SYMP.owl already exists. Redownload? [Y/n]: Skipping.
FMA.owl already exists. Redownload? [Y/n]: Skipping.


# read and filter data

In [28]:
from deeponto.onto import Ontology

# Load the three ontologies from the OWL files under data/owl.
doid_onto = Ontology(f"{REPO_ROOT}/data/owl/DOID.owl")
symp_onto = Ontology(f"{REPO_ROOT}/data/owl/SYMP.owl")
fma_onto = Ontology(f"{REPO_ROOT}/data/owl/FMA.owl")


def _own_class_iris(onto, namespace):
    """Return the class IRIs of `onto` that start with the OBO prefix of `namespace`."""
    wanted_prefix = f"http://purl.obolibrary.org/obo/{namespace}_"
    return [class_iri for class_iri in onto.owl_classes.keys() if class_iri.startswith(wanted_prefix)]


# Keep only the classes whose IRI carries the ontology's own prefix.
doid_list = _own_class_iris(doid_onto, "DOID")
symp_list = _own_class_iris(symp_onto, "SYMP")
fma_list = _own_class_iris(fma_onto, "FMA")

print(f"len(doid_list) = {len(doid_list)}")
print(f"len(symp_list) = {len(symp_list)}")
print(f"len(fma_list) = {len(fma_list)}")

len(doid_list) = 14452
len(symp_list) = 1019
len(fma_list) = 78977


# save master data

In [29]:
from org.semanticweb.owlapi.vocab import OWLRDFVocabulary

# IRIs of the annotation properties that are read for every class
RDFS_LABEL = str(OWLRDFVocabulary.RDFS_LABEL.getIRI())  # rdfs:label = property holding the name of a class
OBO_ID = "http://www.geneontology.org/formats/oboInOwl#id" # OBO ID = property holding the short ID of a class
IAO_DEF = "http://purl.obolibrary.org/obo/IAO_0000115"  # IAO_0000115 = property holding the definition of a class

def get_ontology_info(onto: Ontology, iri: str):
    """
    Look up the short ID, name and definition annotated on one ontology class.

    Arguments:
        onto: Ontology instance loaded from an OWL file
        iri: full class IRI such as "http://purl.obolibrary.org/obo/DOID_0002116"

    Returns:
        (iri, short_id, name, definition). `iri` is the argument passed in
        unchanged; each of the other three is None when the class has no
        annotation for the corresponding property (OBO_ID, RDFS_LABEL, IAO_DEF).
    """
    obj = onto.get_owl_object(iri)

    def pick_annotation(property_iri):
        values = onto.get_annotations(obj, annotation_property_iri=property_iri)
        # When a class carries several values for one property, take the
        # smallest one so repeated runs produce the same master file instead
        # of depending on the iteration order of the returned collection.
        return min(values, default=None)

    short_id = pick_annotation(OBO_ID)
    name = pick_annotation(RDFS_LABEL)
    definition = pick_annotation(IAO_DEF)

    return iri, short_id, name, definition


import csv

# One (iri, short_id, name, definition) tuple per class of each ontology.
doid_master = [get_ontology_info(doid_onto, doid_iri) for doid_iri in doid_list]
symp_master = [get_ontology_info(symp_onto, symp_iri) for symp_iri in symp_list]
fma_master  = [get_ontology_info(fma_onto,  fma_iri)  for fma_iri  in fma_list]

# Output: one tab-separated master file per ontology.
master_dir = f"{REPO_ROOT}/data/master"
# Only data/owl is created by the download step, so make sure this one exists.
os.makedirs(master_dir, exist_ok=True)

for onto_name, master in [("DOID", doid_master), ("SYMP", symp_master), ("FMA", fma_master)]:
    # newline="" lets the csv module control line endings; UTF-8 is set
    # explicitly so the result does not depend on the platform default.
    with open(f"{master_dir}/{onto_name}.tsv", "w", encoding="utf-8", newline="") as fw:
        # csv.writer quotes fields containing tabs, quotes or line breaks and
        # writes None as an empty field rather than the text "None".
        writer = csv.writer(fw, delimiter="\t", lineterminator="\n")
        writer.writerow(["iri", "id", "name", "definition"])
        for iri, short_id, name, definition in master:
            writer.writerow([iri, short_id, name, definition])
    # Report the file name that was actually written.
    print(f"{onto_name}.tsv saved.")

DOID_master.tsv saved.
SYMP_master.tsv saved.
FMA_master.tsv saved.


In [36]:
# pandas is imported here because this is the first cell that uses `pd`;
# without it the cell fails with a NameError on a fresh kernel.
import pandas as pd

# Symptom master table (columns: iri, id, name, definition), every column read as str.
df_symp = pd.read_table(f"{REPO_ROOT}/data/master/SYMP.tsv", sep="\t", header=0, dtype=str)
df_symp

Unnamed: 0,iri,id,name,definition
0,http://purl.obolibrary.org/obo/SYMP_0000000,SYMP:0000000,cellulitis,Cellulitis is a musculoskeletal system symptom...
1,http://purl.obolibrary.org/obo/SYMP_0000001,SYMP:0000001,abdominal cramp,
2,http://purl.obolibrary.org/obo/SYMP_0000002,SYMP:0000002,abdominal distention,
3,http://purl.obolibrary.org/obo/SYMP_0000003,SYMP:0000003,obsolete acute enteritis in newborns,
4,http://purl.obolibrary.org/obo/SYMP_0000004,SYMP:0000004,obsolete arrested moulting,
...,...,...,...,...
1014,http://purl.obolibrary.org/obo/SYMP_0020060,SYMP:0020060,low white blood cell count,
1015,http://purl.obolibrary.org/obo/SYMP_0020061,SYMP:0020061,outlet dysfunction constipation,A constipation that is characterized by diffic...
1016,http://purl.obolibrary.org/obo/SYMP_0020062,SYMP:0020062,soft tissue necrosis,A musculoskeletal system symptom that is chara...
1017,http://purl.obolibrary.org/obo/SYMP_0020063,SYMP:0020063,blood pressure,


# disease and symptoms

In [56]:
import tqdm
from org.semanticweb.owlapi.model import AxiomType

# Property identifiers used when linking diseases to symptoms
# RO_SYMPTOM is a short ID rather than a full IRI; it is compared as a substring of str(property).
RO_SYMPTOM  = "RO_0002452"  # this means "has_symptom" https://www.ebi.ac.uk/ols4/ontologies/ro/properties/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FRO_0002452
# oboInOwl annotation properties holding the exact and related synonyms of a class
SYN_EXACT   = "http://www.geneontology.org/formats/oboInOwl#hasExactSynonym"
SYN_RELATED = "http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym"
# rdfs:label IRI (same expression as the RDFS_LABEL assigned earlier in this notebook)
RDFS_LABEL  = str(OWLRDFVocabulary.RDFS_LABEL.getIRI())

def get_symptoms_for_doid(doid_onto: Ontology, symp_onto: Ontology, doid_iri: str):
    """
    Find the symptoms associated with one disease class.

    Two strategies are applied in order:
      1. "restriction": SubClassOf axioms of the disease whose superclass is a
         restriction on a property containing RO_SYMPTOM; the filler of the
         restriction is taken as the symptom.
      2. "synonym": a synonym of the disease (hasExactSynonym or
         hasRelatedSynonym) occurs, case-insensitively, as a substring of a
         symptom name in the module-level `df_symp` table.

    Arguments:
        doid_onto: Ontology instance for DOID.owl
        symp_onto: Ontology instance for SYMP.owl
        doid_iri:  Full IRI, e.g. "http://purl.obolibrary.org/obo/DOID_0002116"
    Returns:
        Dict mapping symptom_iri -> method, where method is "restriction" or
        "synonym". A symptom found by both strategies keeps "restriction".
        The dict is empty when nothing matches.
    """
    results = {}
    doid_obj = doid_onto.get_owl_object(doid_iri)

    # 1) Restriction-based matching through RO_0002452.
    # NOTE(review): every call walks the whole list returned by
    # get_subsumption_axioms(), so the cost grows with the size of the ontology.
    for ax in doid_onto.get_subsumption_axioms():
        if not (ax.getAxiomType().equals(AxiomType.SUBCLASS_OF) and doid_obj.equals(ax.getSubClass())):
            continue
        sup = ax.getSuperClass()
        try:
            prop = sup.getProperty()
            if RO_SYMPTOM in str(prop):
                filler = sup.getFiller()
                symp_iri = symp_onto.get_iri(filler)
                results[symp_iri] = "restriction"
        except AttributeError:
            # Superclasses without getProperty()/getFiller() are skipped.
            continue

    # 2) Synonym-based matching (hasExactSynonym, hasRelatedSynonym).
    synonyms = []
    for prop_iri in (SYN_EXACT, SYN_RELATED):
        synonyms.extend(doid_onto.get_annotations(doid_obj, annotation_property_iri=prop_iri))

    # Blank synonyms are dropped: an empty string is a substring of every label
    # and would mark every symptom as a match.
    needles = {syn.lower() for syn in synonyms if syn and syn.strip()}
    if needles:
        # Keep each name paired with its own IRI so a hit is recorded under
        # the symptom that actually matched.
        labelled = df_symp[["iri", "name"]].dropna()
        for symp_iri, label in zip(labelled["iri"], labelled["name"]):
            # Lowercase the label too, so the comparison is case-insensitive on both sides.
            label_lc = label.lower()
            if any(needle in label_lc for needle in needles):
                # setdefault keeps an earlier "restriction" entry untouched.
                results.setdefault(symp_iri, "synonym")

    return results

In [57]:
# Example for a single disease:
#   doid_iri = "http://purl.obolibrary.org/obo/DOID_946"
#   res = get_symptoms_for_doid(doid_onto, symp_onto, doid_iri)

# Run the lookup for every DOID class, keeping only diseases with at least one symptom.
doid_symptoms_dict = dict()
for doid_iri in tqdm.tqdm(doid_list):
    res = get_symptoms_for_doid(doid_onto=doid_onto, symp_onto=symp_onto, doid_iri=doid_iri)
    if not res:
        continue
    doid_symptoms_dict[doid_iri] = res

100%|██████████| 14452/14452 [1:23:29<00:00,  2.89it/s]    


In [62]:
# Write one headerless line per (disease, symptom) pair: doid_iri, symp_iri, method.
relationship_dir = f"{REPO_ROOT}/data/relationship"
# The directory is not created anywhere earlier, so make sure it exists before opening the file.
os.makedirs(relationship_dir, exist_ok=True)
with open(f"{relationship_dir}/doid_symp.tsv", "w", encoding="utf-8") as fw:
    for doid_iri, symptoms in doid_symptoms_dict.items():
        for symp_iri, method in symptoms.items():
            print(f"{doid_iri}\t{symp_iri}\t{method}", file=fw)

# symptom and location

In [63]:
import pandas as pd
# FMA master table and its iri -> name lookup (all columns read as str).
df_fma = pd.read_table(
    f"{REPO_ROOT}/data/master/FMA.tsv",
    sep="\t",
    header=0,
    dtype=str,
)
fma_labels = dict(zip(df_fma["iri"], df_fma["name"]))

# SYMP master table and its iri -> name lookup (all columns read as str).
df_symp = pd.read_table(
    f"{REPO_ROOT}/data/master/SYMP.tsv",
    sep="\t",
    header=0,
    dtype=str,
)
symp_labels = dict(zip(df_symp["iri"], df_symp["name"]))

In [64]:
# Match each symptom to a body part by string similarity between texts.
from rapidfuzz.process import cdist

# Only symptoms that have a definition are matched; the query text is "name: definition".
df_symp_valid = df_symp[~df_symp.definition.isna()].copy()
queries = df_symp_valid.name + ": " + df_symp_valid.definition
choices = df_fma.name

# One row per query and one column per choice.
score = cdist(queries, choices, workers=-1)
# Take the best-scoring column of each row directly from the score matrix;
# wrapping it in a DataFrame labelled by every FMA name only to call argmax
# adds cost without changing the result.
best_matched_body = df_fma.iloc[score.argmax(axis=1)].iri

relationship_dir = f"{REPO_ROOT}/data/relationship"
# Make sure the output directory exists before opening the file.
os.makedirs(relationship_dir, exist_ok=True)
with open(f"{relationship_dir}/symp_fma.tsv", "w", encoding="utf-8") as fw:
    print("symptom_iri\tbody_part_iri\tmethod", file=fw)
    for symp_iri, body_iri in zip(df_symp_valid.iri, best_matched_body):
        print(f"{symp_iri}\t{body_iri}\tkeyword", file=fw)