In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
sys.path.insert(0,os.path.abspath('..'))
sys.path.append(os.path.abspath('../backend'))
sys.path.append(os.path.abspath(''))

In [4]:

from owlready2 import *
from rdflib import Graph, URIRef, Literal
from rdflib.plugins.sparql import prepareQuery
import networkx as nx

In [5]:
from backend.ontology import *

In [6]:
base_path='../data'
onto_path=base_path+'/brainteaser-ontology/bto.ttl'

In [7]:

brainteaser_graph=Graph().parse(onto_path, format='turtle')
brainteaser_graph.bind('bto', 'http://www.semanticweb.org/ontologies/2020/3/bto#')

In [8]:
oman=OntologyManager(OntologyConfig(), brainteaser_graph)

In [9]:
root_classes=oman.load_full_graph()

In [10]:
root_classes[1]

Subject(subject_id='bto:Activity', label='Activity', spos={'rdf:type': ['owl:Class'], 'dcterms:conformsTo': ['https://uts.nlm.nih.gov/uts/umls/concept/C0441655'], 'rdfs:comment': ['An active process; excludes processes and mechanisms which fulfill biological functions. [Definition Source: NCI]'], 'rdfs:isDefinedBy': ['http://purl.obolibrary.org/obo/NCIT_C43431'], 'rdfs:label': ['Activity'], 'skos:note': ['In our data we refer to it as "Behaviour".']}, subject_type='class', refcount=0, descendants={'subClass': [Subject(subject_id='bto:Lifestyle', label='Lifestyle', spos={'rdf:type': ['owl:Class'], 'rdfs:subClassOf': ['bto:Activity'], 'dcterms:conformsTo': ['https://uts.nlm.nih.gov/uts/umls/concept/C0023676'], 'rdfs:comment': ["A manner of living that reflects the person's values and attitudes. [Definition Source: NCI]"], 'rdfs:isDefinedBy': ['http://purl.obolibrary.org/obo/NCIT_C16795'], 'rdfs:label': ['Lifestyle']}, subject_type='class', refcount=0, descendants={'subClass': [], 'namedI

In [11]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from glob import glob
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [12]:
onto_relations = []


def get_relations(subj: Subject, path: str = ""):
    for prop_typ, props in subj.properties.items():
        for prop in props:
            onto_relations.append(f"{path}.{subj.label}.{prop.label}")
    
    for desc_typ, descs in subj.descendants.items():
        for desc in descs:
            if desc.subject_type != subj.subject_type:
                onto_relations.append(f"{path}.{subj.label}.{desc.label}")
            else:
                get_relations(desc, path=f"{path}.{subj.label}")


[get_relations(root) for root in root_classes]
all_relations = pd.DataFrame(onto_relations, columns=["relation"])
all_relations=all_relations.map(lambda x: x.replace(".", " "))
all_relations.to_csv("all_relations.csv", index=False)

In [13]:
encoding = model.encode(all_relations["relation"].values)
all_relations["embeddings"] = [encoding[e, :] for e in range(encoding.shape[0])]

In [14]:
all_relations

Unnamed: 0,relation,embeddings
0,Person birthplace,"[-0.018654888, 0.07895222, -0.04690001, 0.0107..."
1,Person enrolledIn,"[-0.035732675, -0.0038437918, -0.060788415, -0..."
2,Person hasDisease,"[-0.07380283, 0.06160015, 0.0008665606, 0.0184..."
3,Person residence,"[0.088017456, 0.0051908987, 0.03949787, -0.021..."
4,Person birthYear,"[-0.039230842, 0.089079805, -0.013136638, -0.0..."
...,...,...
888,Therapeutic Treatment administration,"[-0.008879084, 0.00890982, -0.007270886, 0.054..."
889,Therapeutic Treatment pharmacologicSubstance,"[-0.013776337, -0.019011427, -0.014824607, -0...."
890,Therapeutic Treatment therapyType,"[-0.013458574, -0.017691838, 0.009163428, 0.04..."
891,Trauma traumaDate,"[-0.012793325, 0.056337398, 0.049347922, 0.087..."


In [15]:
os.getcwd()

'/Users/benedikt/tugraz/cgv/hereditary/ontology_provenance/exploration'

In [16]:
all_csvs = glob("../data/**/*.csv", recursive=True)
all_csvs


['../data/ALS/U/datasetA/test/datasetA_test-outcome.csv',
 '../data/ALS/U/datasetA/test/datasetA_test-static-vars.csv',
 '../data/ALS/U/datasetA/test/datasetA_test-visits.csv',
 '../data/ALS/U/datasetA/test/datasetA_test-environmental.csv',
 '../data/ALS/U/datasetA/train/datasetA_train-visits.csv',
 '../data/ALS/U/datasetA/train/datasetA_train-outcome.csv',
 '../data/ALS/U/datasetA/train/datasetA_train-environmental.csv',
 '../data/ALS/U/datasetA/train/datasetA_train-static-vars.csv',
 '../data/ALS/U/datasetC/test/datasetC_test-static-vars.csv',
 '../data/ALS/U/datasetC/test/datasetC_test-visits.csv',
 '../data/ALS/U/datasetC/test/datasetC_test-outcome.csv',
 '../data/ALS/U/datasetC/test/datasetC_test-environmental.csv',
 '../data/ALS/U/datasetC/train/datasetC_train-outcome.csv',
 '../data/ALS/U/datasetC/train/datasetC_train-environmental.csv',
 '../data/ALS/U/datasetC/train/datasetC_train-visits.csv',
 '../data/ALS/U/datasetC/train/datasetC_train-static-vars.csv',
 '../data/ALS/U/data

In [33]:
import regex as re
def make_readable(txt:str, split_chars=["_", "-", "/", ":", "."]):
    txt = re.sub(r"([a-z])([A-Z])", r"\1 \2", txt)
    txt = re.sub(r"([A-Z])([A-Z][a-z])", r"\1 \2", txt)
    txt = re.sub(r"([a-z])([0-9])", r"\1 \2", txt)
    txt = re.sub(r"([0-9])([a-z])", r"\1 \2", txt)
    for split_char in split_chars:
        txt = " ".join(txt.split(split_char))
    return txt
    

In [42]:
class DataSetRepresentation:
    def __init__(self, path: str, index_col: str | int = 0):
        self.paths = [path]
        self.df = pd.read_csv(path, index_col=index_col)
        self.columns = self.df.columns.tolist()
        self.columns_embedding: list[np.ndarray] = None

    def expand_categorical(self):
        self.unique_counts = self.df.nunique()
        self.categorical_columns = self.df.columns[
            (self.unique_counts / len(self.df) < 0.1)
            & (self.unique_counts > 1)
            & (self.unique_counts < 512)
        ].to_list()
        self.df = pd.get_dummies(self.df, columns=self.categorical_columns)

    def merge(self, other: "DataSetRepresentation"):
        self.df = pd.concat([self.df, other.df])
        self.columns = self.df.columns
        self.paths.append(other.paths)

    def col_id(self):
        return ",".join(self.df.columns.tolist())

    def cols_readable(self):
        col_names_flipped = [
            f"{make_readable(col)} {make_readable(self.df.index.name)} {make_readable(self.paths[0])}"
            for col in self.df.columns
        ]
        return col_names_flipped

    def embed_cols(self):
        columns_embedding = model.encode(self.cols_readable())
        self.columns_embedding = [columns_embedding[e, :] for e in range(columns_embedding.shape[0])]
        return self.columns_embedding


representations = [DataSetRepresentation(csv) for csv in all_csvs]

In [43]:
mergeables: dict[str, list[DataSetRepresentation]] = {}
for rep in representations:
    if rep.col_id() not in mergeables:
        mergeables[rep.col_id()] = []
    mergeables[rep.col_id()].append(rep)
for k,mergeable in mergeables.items():
    print(k,len(mergeable))

Occured,Type,Time,centre 18
onsetDate,diagnosisDate,alive,sex,ethnicity,height,weight_before_onset,weight,moreThan10PercentWeightloss,ALS_familiar_history,age_onset,prevalentLMN,prevalentUMN,mixedMN,onset_bulbar,onset_axial,onset_generalized,onset_limbs,onset_limb_type,occupation,retired_at_diagnosis,smoking,smoking_startYear,smoking_endYear,dailyCigarettes,packYear,C9orf72,SOD1 mutation,TARDBP mutation,FUS mutation,CK_level,CK_lower_range,CK_upper_range,CK_unit,Albumin_level,Albumin_lower_range,Albumin_upper_range,Albumin_unit,Creatinine_level,Creatinine_lower_range,Creatinine_upper_range,Creatinine_unit,Total_Cholesterol_level,Total_Cholesterol_lower_range,Total_Cholesterol_upper_range,Total_Cholesterol_unit,HDL_Cholesterol_level,HDL_Cholesterol_lower_range,HDL_Cholesterol_upper_range,HDL_Cholesterol_unit,LDL_Cholesterol_level,LDL_Cholesterol_lower_range,LDL_Cholesterol_upper_range,LDL_Cholesterol_unit,Triglycerides_level,Triglycerides_lower_range,Triglycerides_upper_range,Triglyceri

In [44]:
for col_id, reps in mergeables.items():
    if len(reps) > 1:
        for rep in reps[1:]:
            reps[0].merge(rep)
        mergeables[col_id] = reps[:1]
representations_merged = [rep for reps in mergeables.values() for rep in reps]

  self.df = pd.concat([self.df, other.df])


In [45]:
for rep in representations_merged:
    print(rep.df.shape)

(6081, 4)
(6081, 98)
(20508, 20)
(11661330, 5)


In [46]:
for rep in representations_merged:
    rep.expand_categorical()
    print(rep.df.shape)

(6081, 10)
(6081, 1493)
(20508, 715)
(11661330, 21)


In [47]:
for rep in representations_merged:
    rep.embed_cols()    

In [49]:
representations_merged[0].columns_embedding

[array([-4.52009849e-02,  1.00217871e-01, -3.81474346e-02,  1.82596035e-02,
        -3.49404886e-02,  2.67207306e-02,  5.97217353e-03,  2.69929785e-03,
        -5.67455776e-04, -6.84852526e-02,  6.30307794e-02, -4.93182950e-02,
        -3.17105949e-02, -7.14172423e-02, -5.48195504e-02, -6.92298487e-02,
        -3.50465178e-02, -3.34395431e-02, -1.74243953e-02, -2.35541351e-02,
        -4.72949073e-02,  4.05702181e-02, -1.78538039e-02,  1.42876478e-02,
        -6.86009079e-02, -8.60406086e-03,  5.50530739e-02, -7.40963151e-04,
         1.56803802e-02,  6.30996330e-03,  6.49528801e-02,  1.10121323e-02,
         7.45233148e-02,  6.03338145e-02, -7.02053234e-02, -2.98296986e-03,
         1.62166879e-02,  6.72340989e-02, -9.42308828e-02,  5.07299900e-02,
         2.63147671e-02, -5.32235391e-02,  7.04404712e-02,  5.59285246e-02,
         4.23427783e-02, -2.45572366e-02, -3.22340615e-02, -3.19382995e-02,
        -3.00835799e-02,  1.15879625e-01, -5.11642136e-02,  3.07009816e-02,
         3.5