In [None]:
# Installation block: Installs the required libraries.
# Run this in environments like Jupyter Notebook or Colab for proper setup.
# - rdflib: For working with RDF data and ontologies.
# - torch: PyTorch library for building and training neural networks.
# - networkx: For creating, analyzing, and manipulating graph structures.
# - matplotlib: For data visualization and plotting.
# - sentence-transformers: For generating sentence embeddings (e.g., BERT-based models).
# - pandas: For data manipulation and analysis.
# - lxml: For parsing and processing XML and HTML.
# - beautifulsoup4: For web scraping and parsing HTML/XML documents.
!pip install rdflib torch networkx matplotlib sentence-transformers pandas lxml beautifulsoup4

Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.1.1


In [None]:
# Importing all necessary libraries:

# Core libraries
import numpy as np  # For numerical computations and array operations.
import json  # For handling JSON data.

# Libraries for ontology and graph processing
from rdflib import Graph, Namespace  # For working with RDF data and creating namespaces.
from bs4 import BeautifulSoup  # For parsing HTML and XML documents.
from lxml import etree  # For efficient XML and HTML parsing.

# Libraries for data manipulation and machine learning
import pandas as pd  # For data manipulation and analysis.
import torch  # For building and training machine learning models.

# Libraries for file handling and serialization
import pickle  # For serializing and deserializing Python objects.
import os  # For interacting with the file system.

# Graph processing and visualization
import networkx as nx  # For creating and analyzing graph structures.
import matplotlib.pyplot as plt  # For visualizing data and graphs.

# Importing PyTorch for tensor operations and deep learning workflows.
import torch

# Importing classes from Hugging Face's transformers library:
# - AutoTokenizer: Automatically loads the appropriate tokenizer for a given pre-trained model.
# - AutoModel: Automatically loads the appropriate pre-trained transformer model.
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [None]:
# Colab-only step: import the 'drive' helper from the google.colab package.
from google.colab import drive

# Mount the user's Google Drive at '/content/gdrive'.
# All project paths defined in the following cells live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Ontology-matching task configuration.
# src_ent / tgt_ent: short names of the source and target ontologies
#                    (they are used to build the .owl file names).
# task:              name of the matching task (used to build the dataset and output paths).
src_ent, tgt_ent = "ncit", "doid"
task = "ncit2doid"

In [None]:
# Project root on the mounted Google Drive.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept because
# get_file_synonym_properties reads this global. The value already ends with "/", so the
# f-strings below contain a doubled "//" separator.
dir = "/content/gdrive/My Drive/BioGITOM-VLDB/"

# Directory of the dataset containing the source and target ontologies for this task
dataset_dir = f"{dir}/Datasets/{task}"

# Prefix for the generated files (class dictionary, embeddings, adjacency CSVs).
# NOTE(review): there is no trailing "/", and save() / get_json_name_of_dict() append the
# OWL file name directly to this string, so outputs are named "<...>/<task>/Data<file>_*"
# rather than being placed inside a "Data" directory -- confirm this is intended.
prepath = f"{dir}/{task}/Data"

In [None]:
# Path of the source ontology's .owl file.
# Nothing is loaded here: this is only the path string; the file is parsed later
# (e.g. by extract_nodes and extract_links).
src_onto = f"{dataset_dir}/{src_ent}.owl"

# Path of the target ontology's .owl file.
# As above, this only builds the path string.
tgt_onto = f"{dataset_dir}/{tgt_ent}.owl"

In [None]:
def get_exact_matches(graph, class_iri, label, namespace, annotation):
    """
    Collect the label of a class together with the values of one annotation property.

    Args:
        graph: RDF graph exposing ``triples((subject, predicate, object))``.
        class_iri (str): IRI of the class whose annotation values are wanted.
        label (str): Label of the class; it is always part of the result.
        namespace: Namespace-like object; ``namespace[annotation]`` is the predicate that is looked up.
        annotation (str): Name of the annotation property inside ``namespace`` (e.g. 'exactMatch').

    Returns:
        list: The label plus every annotation value found for ``class_iri``, de-duplicated
        with ``np.unique`` (which also returns the values in sorted order).
    """
    predicate = namespace[annotation]

    # Every triple carrying the annotation predicate is scanned; only those whose
    # subject equals class_iri (compared as strings) contribute a value.
    matched_values = [
        str(value)
        for subject, _, value in graph.triples((None, predicate, None))
        if str(subject) == class_iri
    ]

    # The label comes first in the input, but np.unique sorts, so its final position may differ.
    return list(np.unique([label] + matched_values))

In [None]:
def get_file_synonym_properties(file_path):
    """
    Look up the synonym properties registered for an ontology file.

    The lookup table is the JSON file ``dictionary.json`` located under the notebook-level
    ``dir`` path; it is keyed by ontology file name.

    Args:
        file_path (str): Path of the ontology file. Only the part after the last "/" is
            used as the lookup key.

    Returns:
        The entry stored for that file name, or an empty list when the file name is not
        present in the dictionary. (extract_nodes unpacks the entry as a
        ``(namespace, annotation)`` pair.)
    """
    # A context manager guarantees the handle is closed even if JSON decoding raises.
    with open(f"{dir}/dictionary.json", "r") as f:
        dic = json.load(f)

    # Extract the file name from the provided file path
    file_name = file_path.split("/")[-1]

    # Return the synonym properties corresponding to the file name
    return dic.get(file_name, [])

In [None]:
def extract_nodes(file_path):
    """
    Extract nodes from an OWL file and map IRIs to their label and synonym values.

    Args:
        file_path (str): The file path to the .owl file (parsed as RDF/XML).

    Returns:
        dict: Maps the IRI (str) of every subject that starts with 'http' and has an
        rdfs:label to the list returned by get_exact_matches (label plus the values of
        the synonym annotation configured for this file).

    Raises:
        ValueError: If no ``(namespace, annotation)`` pair is configured for this file
            in the synonym dictionary.
    """
    class_dict = {}

    # Initialize an RDF graph and parse the OWL file
    g = Graph()
    g.parse(file_path, format='xml')

    # Namespace used to look up rdfs:label
    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

    # Retrieve the synonym namespace and annotation name configured for this file.
    # Fail with an explicit message instead of an opaque unpacking error.
    synonym_properties = get_file_synonym_properties(file_path)
    try:
        SKOS, annotation = synonym_properties
    except (TypeError, ValueError) as exc:
        file_name = file_path.split("/")[-1]
        raise ValueError(
            f"Expected a (namespace, annotation) pair for '{file_name}' in dictionary.json, "
            f"got {synonym_properties!r}"
        ) from exc
    synonym_namespace = Namespace(SKOS)

    # g.subjects() yields a subject once per triple it appears in, so the same class
    # would otherwise be processed many times. dict.fromkeys removes the duplicates
    # while keeping first-seen order, so the key order of class_dict is unchanged.
    subjects = list(dict.fromkeys(g.subjects(predicate=None)))
    total = len(subjects)

    for i, subj in enumerate(subjects, start=1):
        iri = str(subj)
        # Keep only subjects that are IRIs (not blank nodes) and carry a label
        if iri.startswith('http'):
            label = g.value(subject=subj, predicate=RDFS.label)
            if label is not None:
                class_dict[iri] = get_exact_matches(g, iri, str(label), synonym_namespace, annotation)
        # Print progress on a single, continuously overwritten line
        print(f"{i}/{total} : {i/total*100:.2f}%\t\t\t", end="\r")

    return class_dict

In [None]:
def oht(class_dict, offset=0):
    """
    Assign a consecutive integer code to every IRI (the notebook calls these "one-hot" codes).

    Args:
        class_dict (dict): A dictionary whose keys are IRIs.
        offset (int, optional): Code given to the first key. Defaults to 0.

    Returns:
        dict: Maps each key of ``class_dict`` to ``offset + position``, where position is
        the key's index in the dictionary's iteration order.
    """
    return {iri: position + offset for position, iri in enumerate(class_dict.keys())}

In [None]:
def extract_links(file_path, class_dict):
    """
    Extract binary links (subClassOf) from an OWL file and store them in an array.

    Args:
        file_path (str): The file path to the .owl file (parsed as RDF/XML).
        class_dict (dict): A dictionary whose keys are the IRIs (classes) of interest.

    Returns:
        ndarray: A NumPy string array of shape (n, 2) where each row is a link
        [subject IRI, object IRI]. The shape is (0, 2) when no link is found.
    """
    # Initialize an RDF graph and parse the OWL file
    g = Graph()
    g.parse(file_path, format='xml')

    links = []

    for subj, pred, obj in g:
        # The predicate is matched on its local name (text after the last '#' or '/')
        if str(pred).split("#")[-1].split("/")[-1] != "subClassOf":
            continue

        subj_iri, obj_iri = str(subj), str(obj)
        # Both ends must be IRIs (not blank nodes or literals) ...
        if not (subj_iri.startswith('http') and obj_iri.startswith('http')):
            continue
        # ... and both must be classes we are interested in
        if subj_iri in class_dict and obj_iri in class_dict:
            links.append([subj_iri, obj_iri])

    # reshape keeps the documented (n, 2) shape even when the list is empty
    # (np.array([]) alone would be one-dimensional).
    return np.array(links, dtype=str).reshape(-1, 2)

In [None]:
# Load the SapBERT model and tokenizer.
# Both are read as globals by gen_embeddings (weights via state_dict(), tokenizer as-is).
sapbert_model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")
sapbert_tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")

# Load a SentenceTransformer BERT model.
# NOTE(review): gen_embeddings builds its own local `sentence_model`, so this global
# instance is not what produces the embeddings -- confirm whether it is still needed.
sentence_model = SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens")

In [None]:
def gen_embeddings(sentences, batch_size=8, max_length=128, use_gpu=True, mask_padding=False):
    """
    Generate sentence embeddings with a SentenceTransformer BERT encoder that has been
    loaded with the SapBERT weights.

    Relies on the notebook-level globals ``sapbert_model`` and ``sapbert_tokenizer``.

    Args:
        sentences (list of str): A list of sentences to encode.
        batch_size (int): Number of sentences to process at a time.
        max_length (int): Maximum sequence length for tokenization (longer inputs are truncated).
        use_gpu (bool): Whether to use the GPU when one is available. Defaults to True.
        mask_padding (bool): How token vectors are averaged into one sentence vector.
            False (default, original behavior): plain mean over every token position of the
            padded batch, so padding positions contribute and the result for a sentence can
            depend on which other sentences share its batch.
            True: mean over real tokens only, weighted by the attention mask.

    Returns:
        np.ndarray: One row per input sentence. An empty input yields an array of
        shape (0, hidden_size).
    """
    # Nothing to encode: return early instead of failing in np.vstack([]),
    # and without paying for a model load.
    if len(sentences) == 0:
        return np.empty((0, sapbert_model.config.hidden_size), dtype=np.float32)

    # Determine the device to use: GPU (if available and requested) or CPU
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

    # Load the SentenceTransformer model and overwrite its BERT weights with SapBERT's.
    # strict=False tolerates missing/unexpected keys between the two checkpoints.
    sentence_model = SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens").to(device)
    sentence_transformer_model = sentence_model._first_module().auto_model  # underlying BERT model
    sentence_transformer_model.load_state_dict(sapbert_model.state_dict(), strict=False)
    # Inference only: make sure dropout is disabled.
    sentence_transformer_model.eval()

    # The SapBERT tokenizer is reused so token ids match the loaded weights
    tokenizer = sapbert_tokenizer

    all_embeddings = []

    # No gradients are needed for inference
    with torch.no_grad():
        for i in range(0, len(sentences), batch_size):
            batch_sentences = sentences[i:i + batch_size]

            # Tokenize the batch, padding to the longest sentence and truncating to max_length
            encoded_input = tokenizer(
                batch_sentences,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            ).to(device)

            hidden_states = sentence_transformer_model(**encoded_input).last_hidden_state

            if mask_padding:
                # Average only over real tokens (attention_mask == 1)
                mask = encoded_input["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
                pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            else:
                # Original pooling: average over all positions, padding included
                pooled = hidden_states.mean(dim=1)

            all_embeddings.append(pooled.cpu().numpy())

            # Release cached GPU memory between batches (only meaningful on CUDA)
            if device.type == "cuda":
                torch.cuda.empty_cache()

    # Concatenate all batch embeddings into a single array
    return np.vstack(all_embeddings)

In [None]:
def save(class_dict, embeddings, adjacence, file_path):
    """
    Save class dictionary, embeddings, and adjacency data to files.

    The output names are built from the notebook-level ``prepath`` followed directly by
    the OWL file name and a suffix: ``_classes.json``, ``_emb.csv`` and ``_adjacence.csv``.

    Args:
        class_dict (dict): Dictionary of classes (IRIs and their labels/synonyms).
        embeddings (np.ndarray): Array of embeddings for the classes.
        adjacence (np.ndarray): Array describing the links between classes.
        file_path (str): Path of the original OWL file; only its file name is used.
    """
    # Construct the base path from prepath and the file name of the OWL file
    base_path = f"{prepath}{file_path.split('/')[-1]}"

    # Make sure the destination directory exists, so the write does not fail
    # after the (expensive) embeddings have already been computed.
    parent_dir = os.path.dirname(base_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the class dictionary as a JSON file
    with open(base_path + "_classes.json", "w") as file:
        json.dump(class_dict, file, indent=3)

    # Save the embeddings as a CSV file
    pd.DataFrame(embeddings).to_csv(base_path + "_emb.csv")

    # Save the adjacency data as a CSV file
    pd.DataFrame(adjacence).to_csv(base_path + "_adjacence.csv")

In [None]:
def depth_search(graph, node, depths: dict, depth: int, pred, nodes):
    """
    Perform a depth-first search on the graph, recording the depth of each visited node.

    Starting from ``node``, follows ``pred`` edges (``graph.objects(subject, predicate)``)
    and only descends into objects whose string form is in ``nodes``. The traversal is
    iterative, so deep hierarchies cannot hit Python's recursion limit, and a node that is
    already on the current path is not entered again, so cycles terminate.

    Args:
        graph (rdflib.Graph): The RDF graph to search.
        node (rdflib.URIRef or str): The starting node; it is not itself recorded in ``depths``.
        depths (dict): Updated in place. Keys are node URIs (as strings), values are depths.
            A node reachable through several paths keeps the depth of the path visited last.
        depth (int): Depth assigned to the direct children of ``node``.
        pred (rdflib.URIRef or str): The predicate used to navigate the graph.
        nodes (set or list): Node URIs (as strings) that are of interest.
    """
    exhausted = object()  # sentinel: distinguishes "no more children" from any real value

    # Each stack entry is (string key of a node, iterator over its children).
    # The child of the node at stack position k gets depth `depth + k`.
    stack = [(str(node), iter(graph.objects(subject=node, predicate=pred)))]
    on_path = {str(node)}

    while stack:
        current_key, children = stack[-1]
        child = next(children, exhausted)

        if child is exhausted:
            # All children handled: leave this node
            stack.pop()
            on_path.discard(current_key)
            continue

        child_key = str(child)
        # Skip nodes outside the set of interest, and nodes already on the current
        # path (descending into them again would loop forever on a cycle).
        if child_key not in nodes or child_key in on_path:
            continue

        depths[child_key] = depth + len(stack) - 1
        on_path.add(child_key)
        stack.append((child_key, iter(graph.objects(subject=child, predicate=pred))))

In [None]:
def extract_triplets_binary_relations(owl_file_path, nodes=None):
    """
    Read an OWL file and return its triples, optionally restricted to a set of nodes.

    Args:
        owl_file_path (str): The file path to the OWL file (parsed as RDF/XML).
        nodes (set or list, optional): Node IRIs (as strings). When given, a triple is kept
            only if both its subject and its object (as strings) are in this collection.
            When None, every triple of the graph is returned.

    Returns:
        list: A list of (subject, predicate, object) tuples.
    """
    ontology_graph = Graph()
    ontology_graph.parse(owl_file_path, format='xml')

    # No filter requested: hand back every triple
    if nodes is None:
        return [(s, p, o) for s, p, o in ontology_graph]

    # A set makes the two membership tests per triple cheap
    wanted = set(nodes)
    return [
        (s, p, o)
        for s, p, o in ontology_graph
        if str(s) in wanted and str(o) in wanted
    ]

In [None]:
def get_json_name_of_dict(owl_file_path):
    """
    Build the path of the class-dictionary JSON file that belongs to an OWL file.

    The result is the notebook-level ``prepath`` followed directly by the OWL file name
    and the suffix ``_classes.json`` (the same name that save() writes).

    Args:
        owl_file_path (str): The file path to the OWL file.

    Returns:
        str: The file path for the corresponding JSON file.
    """
    # Everything after the last "/" is the file name
    owl_file_name = owl_file_path.rsplit("/", 1)[-1]
    return f"{prepath}{owl_file_name}_classes.json"

In [None]:
def get_json_file(json_file_path):
    """
    Load a JSON file and return the decoded content.

    Args:
        json_file_path (str): The file path to the JSON file.

    Returns:
        The decoded JSON content (a dict for the class-dictionary files), or None when the
        file does not exist or does not contain valid JSON. In both failure cases an error
        message is printed instead of raising.
    """
    contents = None
    try:
        with open(json_file_path, "r") as handle:
            contents = json.load(handle)
    except FileNotFoundError:
        print(f"Error: File {json_file_path} not found.")
    except json.JSONDecodeError:
        print(f"Error: File {json_file_path} is not a valid JSON file.")
    return contents

In [None]:
# Parse the source ontology: map every class IRI to its label plus synonyms.
class_dict = extract_nodes(src_onto)

# Collect the subClassOf links whose two ends are both keys of `class_dict`.
links = extract_links(src_onto, class_dict)

# Assign an integer code to every IRI of `class_dict`.
oht_codes = oht(class_dict)

# Build one text per class by joining its label/synonym list with ", ".
# These texts are the inputs of the embedding model in the next cell.
concat_arr = [", ".join(synonyms) for synonyms in class_dict.values()]

In [None]:

# Generate one embedding per class from its concatenated label/synonym string.
emb = gen_embeddings(concat_arr)

# Helper that replaces a node IRI with its integer code.
def replace(val, dict=oht_codes):
    """Return the integer code of IRI ``val``.

    The ``dict`` parameter (which shadows the builtin inside this function) defaults to
    the ``oht_codes`` mapping as it exists when this cell is executed.
    """
    return dict[val]

# Integer codes of all class IRIs, in the key order of class_dict ("noeuds" is French for "nodes").
# This variable is not referenced again in this cell.
noeuds = np.vectorize(replace)(np.array(list(class_dict.keys())))

# Translate every IRI of the links array into its integer code, element by element.
# NOTE(review): np.vectorize without `otypes` cannot be called on an empty array, so this
# line fails if no link was extracted -- confirm that `links` is never empty.
adjacence = np.vectorize(replace)(links)

# Save the processed data: class dictionary, embeddings, and encoded links.
# The output file names are derived from the OWL file name (see save()).
save(class_dict, emb, adjacence, src_onto)

In [None]:

# Generate and save one adjacency list (CSV with columns Src / Trg) per binary predicate
# of the source ontology.

# Reload the class dictionary written by save() and rebuild the IRI -> integer code mapping.
class_dict = get_json_file(get_json_name_of_dict(src_onto))
if class_dict is None:
    # get_json_file has already printed the reason (missing file or invalid JSON).
    raise RuntimeError(f"Could not load the class dictionary for {src_onto}.")
oht_encoding = oht(class_dict)

# Extract the triples (subject, predicate, object) whose subject AND object are known
# classes. Filtering here, through the `nodes` argument, keeps literals (labels,
# definitions, ...) out of the array: NumPy stores strings with a fixed width equal to
# the longest element, so a single long literal would inflate every cell. It also
# replaces the former per-triple "Skipping triplet" messages.
triplets = np.array(
    extract_triplets_binary_relations(src_onto, nodes=oht_encoding.keys()),
    dtype=str,
).reshape(-1, 3)  # keeps a (n, 3) shape even when nothing matched

# Process each unique binary predicate (np.unique returns them sorted)
unique_predicates = np.unique(triplets[:, 1])

for j, binary_predicate in enumerate(unique_predicates, start=1):
    print(f"\tpredicate {j}/{len(unique_predicates)}")

    # Filter triplets matching the current predicate
    predicate_triplets = triplets[triplets[:, 1] == binary_predicate]

    # Convert subject and object IRIs to their integer codes.
    # Every IRI is guaranteed to be a key of oht_encoding by the filter above.
    subjects = [oht_encoding[subj] for subj in predicate_triplets[:, 0]]
    objects = [oht_encoding[obj] for obj in predicate_triplets[:, 2]]

    # Extract the label of the predicate (e.g., "subClassOf") for the file name
    predicate_label = str(binary_predicate).split("#")[-1].split("/")[-1]

    # Create a DataFrame holding the (source, target) pairs of the current predicate
    predicate_adjacence_matrix = pd.DataFrame({"Src": subjects, "Trg": objects})

    # Define the path where the CSV file will be saved
    adjacence_file_path = f"{prepath}{src_onto.split('/')[-1]}_adjacence_{predicate_label}.csv"

    # Save the adjacency list
    predicate_adjacence_matrix.to_csv(adjacence_file_path)