# download data

In [49]:
import subprocess

# Absolute path of the enclosing git work tree. Every data path below is built
# from it, so the notebook does not depend on the kernel's working directory.
# check=True makes this raise CalledProcessError when git exits non-zero.
REPO_ROOT = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()

In [None]:
import requests
import os

# Ontology name -> canonical OBO PURL of its OWL release.
urls = {
    "DOID": "http://purl.obolibrary.org/obo/doid.owl",
    "SYMP": "http://purl.obolibrary.org/obo/symp.owl",
    "FMA": "http://purl.obolibrary.org/obo/fma.owl",
}
os.makedirs(f"{REPO_ROOT}/data/owl", exist_ok=True)

for name, url in urls.items():
    data_path = REPO_ROOT + f"/data/owl/{name}.owl"
    if os.path.exists(data_path):
        # Anything other than an explicit "y" keeps the existing file, so the
        # prompt advertises "N" as the default.
        answer = input(f"{name}.owl already exists. Redownload? [y/N]: ")
        if answer.strip().lower() != "y":
            print("Skipping.")
            continue

    print("Downloading", f"{name}.owl ... ", end="", flush=True)
    # Download into a sibling ".part" file and rename only after the transfer
    # finished, so an interrupted download never leaves a truncated .owl file.
    part_path = data_path + ".part"
    # (connect timeout, read timeout) in seconds; without a timeout a stalled
    # server would hang the cell forever.
    with requests.get(url, stream=True, timeout=(10, 300)) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the ontology.
        response.raise_for_status()
        with open(part_path, "wb") as fw:
            # Stream in 1 MiB chunks rather than holding the whole file in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                fw.write(chunk)
    os.replace(part_path, data_path)
    print("done!")

DOID.owl already exists. Redownload? [Y/n]: 

# read and filter data

In [2]:
from deeponto.onto import Ontology

# Load the three ontologies from the downloaded OWL files.
doid_onto = Ontology(f"{REPO_ROOT}/data/owl/DOID.owl")
symp_onto = Ontology(f"{REPO_ROOT}/data/owl/SYMP.owl")
fma_onto = Ontology(f"{REPO_ROOT}/data/owl/FMA.owl")

# Keep only the class IRIs that live in each ontology's own ID namespace.
doid_list = [
    class_iri
    for class_iri in doid_onto.owl_classes
    if class_iri.startswith("http://purl.obolibrary.org/obo/DOID_")
]
symp_list = [
    class_iri
    for class_iri in symp_onto.owl_classes
    if class_iri.startswith("http://purl.obolibrary.org/obo/SYMP_")
]
fma_list = [
    class_iri
    for class_iri in fma_onto.owl_classes
    if class_iri.startswith("http://purl.obolibrary.org/obo/FMA_")
]
print(f"len(doid_list) = {len(doid_list)}")
print(f"len(symp_list) = {len(symp_list)}")
print(f"len(fma_list) = {len(fma_list)}")

Please enter the maximum memory located to JVM [8g]:



INFO:deeponto:8g maximum memory allocated to JVM.
INFO:deeponto:JVM started successfully.


len(doid_list) = 14452
len(symp_list) = 1019
len(fma_list) = 78977


# save master data

In [107]:
from org.semanticweb.owlapi.vocab import OWLRDFVocabulary

# アノテーションプロパティの IRI
class OwlIri:
    """Identifiers of the annotation / object properties this notebook reads."""

    # --- identity -----------------------------------------------------------
    # Short OBO-style identifier of an entity.
    OBO_ID = "http://www.geneontology.org/formats/oboInOwl#id"
    # Human-readable name (label) of an entity.
    RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"

    # --- descriptive text ---------------------------------------------------
    # Textual definition of an entity.
    IAO_DEF = "http://purl.obolibrary.org/obo/IAO_0000115"
    # Exact synonym.
    SYN_EXACT = "http://www.geneontology.org/formats/oboInOwl#hasExactSynonym"
    # Related (non-exact) synonym.
    SYN_RELATED = "http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym"

    # --- relations ----------------------------------------------------------
    # `doid has_symptom symp`. Unlike the values above this is only the ID
    # fragment, not a full IRI.
    # https://www.ebi.ac.uk/ols4/ontologies/ro/properties/http%253A%252F%252Fpurl.obolibrary.org%252Fobo%252FRO_0002452
    RO_SYMPTOM = "RO_0002452"


def get_ontology_info(onto: Ontology, iri: str):
    """
    Look up the short ID, name, synonyms and definition of one entity.

    For each annotation property the first value yielded by
    `onto.get_annotations` is used; any further values are ignored. A field is
    None when the ontology does not declare that annotation property or the
    entity carries no value for it.

    Arguments:
        onto: Ontology instance loaded from an owl file
        iri: full IRI such as "http://purl.obolibrary.org/obo/DOID_0002116"

    Returns:
        (iri, short_id, name, synonym_exact, synonym_related, definition)
    """
    entity = onto.get_owl_object(iri)
    declared_properties = onto.owl_annotation_properties

    def _first_annotation(property_iri):
        # Skip the lookup entirely when the ontology does not declare the property.
        if property_iri not in declared_properties:
            return None
        values = onto.get_annotations(entity, annotation_property_iri=property_iri)
        return next(iter(values), None)

    # The order here fixes the order of the returned tuple after `iri`.
    field_properties = (
        OwlIri.OBO_ID,
        OwlIri.RDFS_LABEL,
        OwlIri.SYN_EXACT,
        OwlIri.SYN_RELATED,
        OwlIri.IAO_DEF,
    )
    return (iri, *[_first_annotation(property_iri) for property_iri in field_properties])


import os

doid_master = [get_ontology_info(doid_onto, doid_iri) for doid_iri in doid_list]
symp_master = [get_ontology_info(symp_onto, symp_iri) for symp_iri in symp_list]
fma_master  = [get_ontology_info(fma_onto,  fma_iri)  for fma_iri  in fma_list]

# Characters that would split a value across TSV cells or rows.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\n": " ", "\r": " "})


def _tsv_cell(value):
    """Render one TSV cell: empty for a missing value, separators replaced by spaces."""
    # An empty cell is read back as missing by pandas; the literal text "None"
    # is only treated as missing by some pandas versions.
    if value is None:
        return ""
    return str(value).translate(_TSV_UNSAFE)


# Write one master table per ontology.
master_dir = f"{REPO_ROOT}/data/master"
os.makedirs(master_dir, exist_ok=True)  # the directory does not exist on a fresh checkout

for onto_name, master in [("DOID", doid_master), ("SYMP", symp_master), ("FMA", fma_master)]:
    master_path = f"{master_dir}/{onto_name}.tsv"
    # Explicit UTF-8: labels and definitions are not guaranteed to be ASCII and
    # the platform default encoding varies.
    with open(master_path, "w", encoding="utf-8") as fw:
        fw.write("iri\tid\tname\texact_synonym\trelated_synonym\tdefinition\n")
        for record in master:
            fw.write("\t".join(_tsv_cell(field) for field in record) + "\n")

    print(f"{master_path} saved.")

DOID_master.tsv saved.
SYMP_master.tsv saved.
FMA_master.tsv saved.


In [221]:
import pandas as pd
# Reload the master tables written above; every column is kept as a string.
df_symp = pd.read_csv(f"{REPO_ROOT}/data/master/SYMP.tsv", sep="\t", header=0, dtype=str)
df_doid = pd.read_csv(f"{REPO_ROOT}/data/master/DOID.tsv", sep="\t", header=0, dtype=str)

# disease and symptoms

In [166]:
from pandas import DataFrame
import numpy as np
import tqdm
from rapidfuzz.process import cdist
from org.semanticweb.owlapi.model import AxiomType, OWLObjectSomeValuesFrom

def doid2symp_by_axiom(doid_onto: Ontology):
    """
    Axiom-based matching using RO_0002452 (has_symptom).

    The ontology only contains axioms of the form `doid has_symptom symp`, so
    symptoms are looked up starting from each disease.
    e.g.
        Malaria ⊑ Disease ⊓ has_symptom some Fever

    Iterates over the module-level `doid_list` of DOID class IRIs.

    Arguments:
        doid_onto:  Ontology instance for DOID.owl
    Returns:
        DataFrame with columns ["doid_iri", "symptom_iri", "method", "score"],
        one row per (disease, symptom) axiom, where method is "axiom" and
        score is 1.0
    """
    result = []
    for doid_iri in tqdm.tqdm(doid_list):
        # Resolve the class for *this* IRI. Previously `doid_obj` was never
        # assigned here, so the loop either raised NameError or silently reused
        # a stale kernel global for every disease.
        doid_obj = doid_onto.get_owl_object(doid_iri)
        for sup in doid_onto.get_asserted_parents(doid_obj, named_only=False):
            # Only existential restrictions (`property some filler`) are relevant.
            if not isinstance(sup, OWLObjectSomeValuesFrom):
                continue
            # RO_SYMPTOM is an ID fragment, so match it as a substring of the
            # property's string form.
            if OwlIri.RO_SYMPTOM not in str(sup.getProperty()):
                continue
            filler = sup.getFiller()
            # A complex (anonymous) class expression has no IRI to report.
            if filler.isAnonymous():
                continue
            result.append([doid_iri, str(filler.getIRI()), "axiom", 1.0])

    return pd.DataFrame(result, columns=["doid_iri", "symptom_iri", "method", "score"])

def symp2doid_by_keyword(df_symp: DataFrame, df_doid: DataFrame, topk=5):
    """
    Keyword-based matching on the name, definition and synonyms of symptoms and diseases.
    e.g.
        DOID:0050117  Malaria  hasExactSynonym: "malaria"
        SYMP:0000185 Fever   name: "fever", hasExactSynonym: "pyrexia"

    Each row is flattened into one lower-cased keyword string, every symptom
    string is scored against every disease string with rapidfuzz `cdist`
    (default scorer), and the `topk` best-scoring diseases per symptom are kept.

    Arguments:
        df_symp: DataFrame for SYMP.tsv
        df_doid: DataFrame for DOID.tsv
        topk: Number of best-matching diseases to return for each symptom.
            Capped at the number of diseases.
    Returns:
        DataFrame with columns ["doid_iri", "symptom_iri", "method", "score"]
        where method is "keyword" and score is the cdist similarity. Empty
        when either input frame has no rows.
    Raises:
        ValueError: if topk is smaller than 1.
    """
    # A slice of `[-0:]` would select *every* column, so reject non-positive k.
    if topk < 1:
        raise ValueError(f"topk must be >= 1, got {topk}")

    columns = ["doid_iri", "symptom_iri", "method", "score"]
    keyword_fields = ['name', 'exact_synonym', 'related_synonym', 'definition']

    def _keywords(df):
        # Missing values become empty strings so they add no keyword text.
        filled = df[keyword_fields].fillna("")
        return [
            f"name={name}:exact_synonym={exact}:related_synonym={related}:definition={definition}".lower()
            for name, exact, related, definition in filled.itertuples(index=False, name=None)
        ]

    queries = _keywords(df_symp)
    choices = _keywords(df_doid)
    if not queries or not choices:
        return pd.DataFrame([], columns=columns)

    symp_iris = df_symp['iri'].fillna("").tolist()
    doid_iris = df_doid['iri'].fillna("").tolist()

    scores = cdist(queries, choices, workers=-1)  # shape: (num_symptoms, num_diseases)
    k = min(topk, len(choices))
    # argsort is ascending: take the last k columns and reverse for best-first order.
    topk_indices = np.argsort(scores, axis=1)[:, -k:][:, ::-1]
    rows = np.arange(scores.shape[0])[:, None]
    topk_confidences = scores[rows, topk_indices].tolist()  # shape: (num_symptoms, k)

    results = []
    for symp_iri, doid_positions, confidences in zip(symp_iris, topk_indices, topk_confidences):
        for doid_position, confidence in zip(doid_positions, confidences):
            results.append([doid_iris[doid_position], symp_iri, "keyword", confidence])

    return pd.DataFrame(results, columns=columns)

In [233]:
import os

# Axiom-based disease -> symptom links, enriched with readable names and saved as TSV.
df_results_axiom = doid2symp_by_axiom(doid_onto)

# The output directory does not exist on a fresh checkout; to_csv would fail without it.
os.makedirs(f"{REPO_ROOT}/data/relationship", exist_ok=True)
(
    df_results_axiom
    # Attach the disease name via the disease IRI.
    .set_index("doid_iri")
    .join(
        df_doid.set_index("iri")["name"].rename("doid_name")
    )
    .reset_index()
    # Attach the symptom name and definition via the symptom IRI.
    .set_index("symptom_iri")
    .join(
        df_symp.set_index("iri")[["name", "definition"]].rename({"name": "symptom_name", "definition": "symptom_definition"}, axis=1)
    )
    # The index is already named "symptom_iri", so reset_index restores the
    # column under that name without an extra rename.
    .reset_index()
).to_csv(f"{REPO_ROOT}/data/relationship/symp2doid_by_axiom.tsv", sep="\t", index=False)

100%|██████████| 14452/14452 [00:00<00:00, 22516.37it/s]


In [None]:
import os

# Keyword-based symptom -> disease links, enriched with readable names and saved as TSV.
df_results_keyword = symp2doid_by_keyword(df_symp, df_doid, topk=5)

# The output directory does not exist on a fresh checkout; to_csv would fail without it.
os.makedirs(f"{REPO_ROOT}/data/relationship", exist_ok=True)
(
    df_results_keyword
    # Attach the disease name via the disease IRI.
    .set_index("doid_iri")
    .join(
        df_doid.set_index("iri")["name"].rename("doid_name")
    )
    .reset_index()
    # Attach the symptom name and definition via the symptom IRI.
    .set_index("symptom_iri")
    .join(
        df_symp.set_index("iri")[["name", "definition"]].rename({"name": "symptom_name", "definition": "symptom_definition"}, axis=1)
    )
    # The index is already named "symptom_iri", so reset_index restores the
    # column under that name without an extra rename.
    .reset_index()
).to_csv(f"{REPO_ROOT}/data/relationship/symp2doid_by_keyword.tsv", sep="\t", index=False)

# symptom and location

In [63]:
import pandas as pd
# FMA master table and an IRI -> name lookup.
df_fma = pd.read_table(f"{REPO_ROOT}/data/master/FMA.tsv", sep="\t", header=0, dtype=str)
# dict(zip(...)) builds the same mapping as a row-by-row iterrows() loop
# (later duplicates still win) without creating one Series per row.
fma_labels = dict(zip(df_fma["iri"], df_fma["name"]))

# SYMP master table and an IRI -> name lookup.
df_symp = pd.read_table(f"{REPO_ROOT}/data/master/SYMP.tsv", sep="\t", header=0, dtype=str)
symp_labels = dict(zip(df_symp["iri"], df_symp["name"]))

In [73]:
# Keyword-match each symptom to the body part (FMA class) with the most similar name.
import os

from rapidfuzz.process import cdist

# Keep symptoms that have at least a name or a definition.
df_symp_valid = df_symp[~df_symp.definition.isna() | ~df_symp.name.isna()].copy()
# Fill the missing half with "" before concatenating. Otherwise `str + NaN`
# turns the whole query into NaN and the symptom cannot be matched on the text
# it does have.
queries = df_symp_valid["name"].fillna("") + ": " + df_symp_valid["definition"].fillna("")

# Only FMA classes that actually have a name are meaningful choices.
df_fma_named = df_fma[df_fma["name"].notna()]
choices = df_fma_named["name"]

score = cdist(queries, choices, workers=-1)  # shape: (num_symptoms, num_named_fma_classes)
# argmax directly on the score matrix; positions line up with df_fma_named rows.
best_matched_body = df_fma_named.iloc[score.argmax(axis=1)]["iri"]

# The output directory does not exist on a fresh checkout.
os.makedirs(f"{REPO_ROOT}/data/relationship", exist_ok=True)
with open(f"{REPO_ROOT}/data/relationship/symp_fma.tsv", "w", encoding="utf-8") as fw:
    print("symptom_iri\tbody_part_iri\tmethod", file=fw)
    for symp_iri, body_iri in zip(df_symp_valid.iri, best_matched_body):
        print(f"{symp_iri}\t{body_iri}\tkeyword", file=fw)