In [1]:
import sys
import os

sys.path.insert(0, os.path.abspath(".."))
sys.path.append(os.path.abspath("../backend"))
sys.path.append(os.path.abspath(""))

In [2]:
from backend.ontology import *
base_path = "../data"
onto_path = base_path + "/hero-ontology/hereditary_clinical.ttl"
brainteaser_graph = Graph().parse(onto_path, format="turtle")
brainteaser_graph.bind("bto", "http://www.semanticweb.org/ontologies/2020/3/bto#")
oman = OntologyManager(OntologyConfig(), brainteaser_graph)

-- Loading and merging datasets
0 datasets loaded


In [3]:
classes=oman.q_to_df("""
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?s
WHERE {
    ?s rdf:type owl:Class.
}
""")[0].to_list()
classes

[rdflib.term.BNode('n77c7c2203b4a4474bc936853811f4bdfb1'),
 rdflib.term.URIRef('http://ontology.eil.utoronto.ca/GCI/Environment/Pollution.owl#Air_pollution_concentration'),
 rdflib.term.URIRef('http://purl.oclc.org/NET/ssnx/ssn#SensingDevice'),
 rdflib.term.URIRef('http://www.w3.org/2000/10/swap/pim/contact#Person'),
 rdflib.term.URIRef('http://www.w3.org/2006/time#Instant'),
 rdflib.term.URIRef('http://www.wurvoc.org/vocabularies/om-1.8/Unit_of_measure'),
 rdflib.term.URIRef('https://w3id.org/brainteaser/ontology/schema/ALSFRS'),
 rdflib.term.URIRef('https://w3id.org/brainteaser/ontology/schema/Activity'),
 rdflib.term.URIRef('https://w3id.org/brainteaser/ontology/schema/Administration'),
 rdflib.term.URIRef('https://w3id.org/brainteaser/ontology/schema/AdverseDrugReaction'),
 rdflib.term.URIRef('https://w3id.org/brainteaser/ontology/schema/AnatomicalSite'),
 rdflib.term.URIRef('https://w3id.org/brainteaser/ontology/schema/BeforeOnset'),
 rdflib.term.URIRef('https://w3id.org/brainteas

In [4]:
classes_enriched = [oman.enrich_subject(c, load_properties=True) for  c in classes]

In [24]:
classes_enriched[3]

Subject(subject_id='<http://www.w3.org/2000/10/swap/pim/contact#Person>', label='Person', spos={'rdf:type': ['owl:Class'], 'rdfs:comment': ['A human being. [Definition Source: NCI]'], 'rdfs:label': ['Person']}, subject_type='class', refcount=0, descendants={}, total_descendants=0, properties={'ObjectProperty': [Subject(subject_id='<https://w3id.org/brainteaser/ontology/schema/birthplace>', label='birthplace', spos={'rdf:type': ['owl:ObjectProperty'], 'rdfs:domain': ['<http://www.w3.org/2000/10/swap/pim/contact#Person>'], 'rdfs:range': ['<https://w3id.org/brainteaser/ontology/schema/Place>'], 'rdfs:comment': ['It defines the relationship between a person and his or her birthplace.'], 'rdfs:label': ['birthplace']}, subject_type='individual', refcount=0, descendants={}, total_descendants=0, properties={}), Subject(subject_id='<https://w3id.org/brainteaser/ontology/schema/enrolledIn>', label='enrolledIn', spos={'rdf:type': ['owl:ObjectProperty'], 'rdfs:domain': ['<http://www.w3.org/2000/10

In [76]:
documents = []
import regex as re


def to_readable(s: str):
    return re.sub(r"([a-z])([A-Z])", r"\1 \2", s).replace("_", " ")


for c in classes_enriched[1:]:
    if c.label.startswith("<"):
        print("ignoring", c.label)
        continue
    ne_doc= f"{c.subject_id} {c.label}\n"
    nis=oman.get_named_individuals(c.subject_id)
    for ne in nis:
        ne_doc += f"{ne.label} is a {c.label}\n"
    if len(nis)>0:
        documents.append(ne_doc)
    
    document_cls = f"{c.label}"
    if "rdfs:subClassOf" in c.spos:
        subcls = oman.enrich_subject(c.spos["rdfs:subClassOf"][0])
        if subcls.label.startswith("<"):
            continue
        document_cls += f" is subclass of  {subcls.label}\n"
    
    for pt, p in c.properties.items():
        for prop in p:
            if prop.label.startswith("<"):
                continue
            if pt=="DatatypeProperty":
                document_cls += f"{c.label} is defined by {to_readable(prop.label)} "
            else:
                document_cls += f"{c.label} {to_readable(prop.label)} "
            if "rdfs:range" in prop.spos:
                range = oman.enrich_subject(prop.spos["rdfs:range"][0])
                if range.label.startswith("<"):
                    continue
                document_cls += f" of type {range.label}"
            if "rdfs:subPropertyOf" in prop.spos:
                subprop = oman.enrich_subject(prop.spos["rdfs:subPropertyOf"][0])
                if subprop.label.startswith("<"):
                    continue
                document_cls += f" is subproperty of {subprop.label}"
            document_cls += "\n"
    documents.append(document_cls)

ignoring <http://ontology.eil.utoronto.ca/GCI/Environment/Pollution.owl#Air_pollution_concentration>
ignoring <http://purl.oclc.org/NET/ssnx/ssn#SensingDevice>
ignoring <http://www.wurvoc.org/vocabularies/om-1.8/Unit_of_measure>
ignoring <https://w3id.org/brainteaser/ontology/schema/C6H6_concentration>


In [77]:
print(documents[10])

Before Onset is subclass of  Event
Before Onset is defined by onset Elapsed Time  of type Neck



In [87]:
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sklearn.datasets import fetch_20newsgroups

representation_model = MaximalMarginalRelevance(diversity=0.1)
topic_model = BERTopic(embedding_model="paraphrase-MiniLM-L6-v2",verbose=True
                )
topic_model.fit(documents) 

2024-11-19 08:40:37,211 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

2024-11-19 08:40:38,736 - BERTopic - Embedding - Completed ✓
2024-11-19 08:40:38,737 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-19 08:40:38,843 - BERTopic - Dimensionality - Completed ✓
2024-11-19 08:40:38,843 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-19 08:40:38,846 - BERTopic - Cluster - Completed ✓
2024-11-19 08:40:38,847 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-19 08:40:38,853 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x39d555ff0>

In [88]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,29,-1_pharmacologicsubstance_is_and_occupation,"[pharmacologicsubstance, is, and, occupation, ...",[<https://w3id.org/brainteaser/ontology/schema...
1,0,28,0_disorder_disease_or_finding,"[disorder, disease, or, finding, is, of, by, d...",[Patient is subclass of Person\nPatient has E...
2,1,12,1_procedure_surgical_type_therapeutic,"[procedure, surgical, type, therapeutic, is, o...",[Surgical Procedure is subclass of Interventi...


In [81]:
hierarchical_topics = topic_model.hierarchical_topics(documents)
hierarchical_topics

100%|██████████| 1/1 [00:00<00:00, 365.61it/s]


Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
0,2,type_of_procedure_by_defined,"[0, 1]",0,of_by_type_defined_relapse,1,procedure_surgical_type_anatomic_site,0.76809


In [82]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)