In [None]:
# Install the Owlready2 library, which is used for ontology manipulation and reasoning.
# Upgrade it to the latest version if already installed.

# Install the RDFLib library for working with RDF (Resource Description Framework) data,
# commonly used in ontology and semantic web tasks.

# Install additional libraries:
# - `torch`: PyTorch, a machine learning framework for deep learning models.
# - `networkx`: For working with graph data structures, often used in ontology matching.
# - `matplotlib`: A library for creating visualizations like graphs and plots.
# - `sentence-transformers`: For sentence-level embedding generation using pre-trained models.
# - `pandas`: For data manipulation and analysis.
# - `lxml`: A library for parsing and handling XML and HTML data, useful for processing ontologies.
# - `beautifulsoup4`: A library for web scraping and parsing HTML documents.


Collecting owlready2
  Downloading owlready2-0.47.tar.gz (27.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: owlready2
  Building wheel for owlready2 (pyproject.toml) ... [?25l[?25hdone
  Created wheel for owlready2: filename=owlready2-0.47-cp310-cp310-linux_x86_64.whl size=24075204 sha256=bc58693620ed012

In [2]:
# Importing necessary libraries for ontology manipulation, data processing, machine learning, and graph analysis

# JSON library for working with JSON data, often used for saving and loading structured data like ontology metadata.
import json

# `owlready2` for working with OWL ontologies, including loading, querying, and reasoning.
from owlready2 import *

# `pandas` for data manipulation and analysis, especially useful for handling tabular data.
import pandas as pd

# `numpy` for numerical computations, used in operations like matrix manipulations and array processing.
import numpy as np

# Importing `json` again (already imported above, this is redundant and can be removed).
import json

# PyTorch library for building and training machine learning models.
import torch

# `os` module for interacting with the operating system, e.g., file and directory operations.
import os

# AutoTokenizer and AutoModel from Hugging Face Transformers library for tokenizing text and loading pre-trained transformer models.
from transformers import AutoTokenizer, AutoModel

# `Graph`, `Namespace`, and `URIRef` from RDFLib for working with RDF graphs and resources.
from rdflib import Graph, Namespace, URIRef

# `defaultdict` from collections module, useful for creating dictionaries with default values.
from collections import defaultdict

# Importing typing utilities for type hints (e.g., specifying function parameter types like `List`).
from typing import List

# `math` library for mathematical functions, such as calculating logarithms or trigonometric operations.
import math

# CSV library for reading and writing CSV (Comma-Separated Values) files.
import csv

# Importing `Graph` and `Namespace` again from RDFLib (redundant and can be removed).
from rdflib import Graph, Namespace

# `BeautifulSoup` from `bs4` for parsing HTML or XML documents, often used in web scraping or ontology enrichment tasks.
from bs4 import BeautifulSoup

# `etree` from lxml for advanced XML and HTML parsing and processing.
from lxml import etree

# `pickle` for serializing and deserializing Python objects, often used to save or load models, embeddings, or intermediate results.
import pickle

# `networkx` for creating and analyzing graph-based data structures, useful in ontology matching and graph neural networks.
import networkx as nx

In [3]:
# Class to process and structure ontology text data
class OntoText:
    def __init__(self, data):
        """
        Initialize the OntoText class with data and process it.
        :param data: Dictionary with concept IDs as keys and lists of labels/synonyms as values.
        """
        self.data = data  # JSON data with concept IDs and their lists of labels/synonyms
        self.texts = defaultdict(list)  # Stores processed labels/synonyms for each concept
        self.class2idx = {}  # Mapping from concept IDs to unique indices
        self.idx2class = {}  # Mapping from indices back to concept IDs
        self.extract_texts()  # Extract and preprocess texts
        self.create_class_idx_mappings()  # Create mappings for concept IDs and indices

    def extract_texts(self):
        """
        Extracts labels/synonyms from the input data and converts them to lowercase.
        """
        for concept_id, labels in self.data.items():
            # Add all entries in the list as lowercase labels/synonyms
            for label in labels:
                self.texts[concept_id].append(label.lower())

    def create_class_idx_mappings(self):
        """
        Creates mappings between concept IDs and unique numerical indices.
        """
        for idx, concept_id in enumerate(self.texts.keys()):
            self.class2idx[concept_id] = idx
            self.idx2class[idx] = concept_id


# Class to construct an inverted index for ontology text data
class OntoInvertedIndex:
    def __init__(self, ontotext: OntoText, tokenizer_path: str, cut: int = 0):
        """
        Initialize the OntoInvertedIndex class.
        :param ontotext: An OntoText object containing processed ontology data.
        :param tokenizer_path: Path to a pre-trained tokenizer (e.g., BioBERT tokenizer).
        :param cut: Minimum token length to include in the index.
        """
        self.ontotext = ontotext  # OntoText object for processed ontology data
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)  # Load tokenizer
        self.cut = cut  # Minimum token length to include in the index
        self.index = self.construct_index()  # Build the inverted index

    def tokenize(self, texts: List[str]) -> List[str]:
        """
        Tokenizes a list of texts using the pre-trained tokenizer.
        :param texts: List of strings to tokenize.
        :return: List of tokens.
        """
        return [token for text in texts for token in self.tokenizer.tokenize(text)]

    def construct_index(self):
        """
        Constructs an inverted index mapping tokens to concept indices.
        :return: A dictionary with tokens as keys and lists of concept indices as values.
        """
        index = defaultdict(list)  # Initialize an empty inverted index
        for concept_id, labels in self.ontotext.texts.items():
            tokens = self.tokenize(labels)  # Tokenize all labels/synonyms
            for token in tokens:
                if len(token) > self.cut:  # Include tokens that meet the length threshold
                    index[token].append(self.ontotext.class2idx[concept_id])  # Map token to concept index
        return index


# Class to manage source and target ontology data and generate mapping candidates
class OntoBox:
    def __init__(self, src_data, tgt_data, tokenizer_path="cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR", cut=0):
        """
        Initialize the OntoBox class with source and target ontologies.
        :param src_data: Source ontology data (concept IDs and labels/synonyms).
        :param tgt_data: Target ontology data (concept IDs and labels/synonyms).
        :param tokenizer_path: Path to a pre-trained tokenizer.
        :param cut: Minimum token length for the inverted index.
        """
        self.src_ontotext = OntoText(src_data)  # Process source ontology data
        self.tgt_ontotext = OntoText(tgt_data)  # Process target ontology data
        # Create inverted indices for both source and target ontologies
        self.src_onto_index = OntoInvertedIndex(self.src_ontotext, tokenizer_path, cut=cut)
        self.tgt_onto_index = OntoInvertedIndex(self.tgt_ontotext, tokenizer_path, cut=cut)

    def select_candidates(self, concept_texts: List[str], candidate_limit: int = 10):
        """
        Select candidate concepts from the target ontology based on token overlap and IDF scores.
        :param concept_texts: List of source concept labels/synonyms to find matches for.
        :param candidate_limit: Maximum number of candidates to return.
        :return: List of candidate concept IDs from the target ontology.
        """
        candidate_pool = defaultdict(lambda: 0)  # Pool of candidate scores
        tokens = self.tgt_onto_index.tokenize(concept_texts)  # Tokenize source concept texts
        D = len(self.tgt_ontotext.class2idx)  # Total number of target concepts

        for token in tokens:
            potential_candidates = self.tgt_onto_index.index.get(token, [])  # Get concepts containing the token
            if not potential_candidates:
                continue
            # Calculate Inverse Document Frequency (IDF) for the token
            idf = math.log10(D / len(potential_candidates))
            for class_id in potential_candidates:
                candidate_pool[class_id] += idf  # Accumulate IDF scores for candidates

        # Sort candidates by score (descending) and limit to the specified number
        sorted_candidates = sorted(candidate_pool.items(), key=lambda x: x[1], reverse=True)[:candidate_limit]
        # Retrieve the original concept IDs for the selected candidates
        selected_classes = [self.tgt_ontotext.idx2class[c[0]] for c in sorted_candidates]

        return selected_classes

    def generate_candidates(self, candidate_limit: int = 10):
        """
        Generate candidate mappings between source and target ontologies.
        :param candidate_limit: Maximum number of candidates per source concept.
        :return: List of candidate pairs (source concept ID, target concept ID).
        """
        candidate_pairs = []
        # Iterate through all source concepts
        for src_id, text_dict in self.src_ontotext.texts.items():
            src_texts = text_dict  # Use all source labels/synonyms
            candidates = self.select_candidates(src_texts, candidate_limit)  # Get target candidates
            # Pair each source concept with its selected target candidates
            for tgt_id in candidates:
                candidate_pairs.append((src_id, tgt_id))

        return candidate_pairs


In [4]:
def clean_json_by_alignment(file_path, json_path, output_path):
    """
    Cleans a JSON file by removing concepts marked as not used for alignment.

    Args:
        file_path (str): Path to the OWL file (ontology in RDF/XML format).
        json_path (str): Path to the JSON file to clean.
        output_path (str): Path to save the cleaned JSON file.
    """
    # Initialize the RDF graph
    g = Graph()
    # Parse the OWL file into the RDF graph
    g.parse(file_path, format='xml')

    # Define namespaces for the RDF graph
    ALIGNMENT_NS = Namespace("http://oaei.ontologymatching.org/bio-ml/ann/")
    # Define the specific property used to indicate alignment usability
    USE_IN_ALIGNMENT = URIRef("http://oaei.ontologymatching.org/bio-ml/ann/use_in_alignment")

    # Load the JSON file containing concepts and labels/synonyms
    with open(json_path, "r") as f:
        data = json.load(f)

    # Set to hold valid IRIs (concepts marked as usable for alignment)
    valid_iris = set()

    # Iterate over all subjects (classes) in the ontology
    for subj in g.subjects(predicate=None):
        # Check if the class has a `use_in_alignment` annotation
        alignment_value = g.value(subject=subj, predicate=USE_IN_ALIGNMENT)
        # Include the IRI if `use_in_alignment` is not present or is set to "true"
        if alignment_value is None or str(alignment_value).lower() == "true":
            valid_iris.add(str(subj))

    # Filter the JSON data to include only IRIs present in the valid set
    cleaned_data = {iri: labels for iri, labels in data.items() if iri in valid_iris}

    # Save the cleaned JSON data to the specified output file
    with open(output_path, "w") as f:
        json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

    # Print summary information about the cleaning process
    print(f"Cleaned JSON file saved to: {output_path}")
    print(f"Excluded concepts: {len(data) - len(cleaned_data)}")
    print(f"Remaining concepts: {len(cleaned_data)}")


In [5]:
def clean_json_using_tsv(json_file_path, tsv_file_path, output_json_path):
    """
    Cleans a JSON file by removing entries not present in the 'SrcEntity' column of a TSV file.

    Args:
    - json_file_path (str): Path to the JSON file to be cleaned.
    - tsv_file_path (str): Path to the TSV file containing valid 'SrcEntity' entries.
    - output_json_path (str): Path to save the cleaned JSON file.

    Returns:
    - None
    """
    # Load the JSON file
    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)

    # Load the TSV file and extract 'SrcEntity' column as a set
    tsv_data = pd.read_csv(tsv_file_path, sep="\t")
    valid_entities = set(tsv_data['SrcEntity'])

    # Filter JSON entries to retain only those keys present in 'SrcEntity'
    original_count = len(json_data)
    cleaned_data = {key: value for key, value in json_data.items() if key in valid_entities}
    cleaned_count = len(cleaned_data)

    # Calculate excluded entries
    excluded_count = original_count - cleaned_count

    # Save the cleaned JSON data
    with open(output_json_path, 'w') as output_file:
        json.dump(cleaned_data, output_file, indent=4)

    print(f"Cleaned JSON file saved to {output_json_path}")
    print(f"Original entries: {original_count}")
    print(f"Remaining entries: {cleaned_count}")
    print(f"Excluded entries: {excluded_count}")

In [6]:
# Function to build indexed dictionary
def build_indexed_dict(file_path):
    # Load the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Create a new dictionary with numeric indexes for each key (URI)
    indexed_dict = {key: index for index, key in enumerate(data.keys())}

    return indexed_dict

In [7]:
# Function to encode URIs using the provided indexed dictionaries
def encode_uris(row, source_dict, target_dict):
    """
    Encodes URIs from a DataFrame row using provided dictionaries.

    Args:
        row (pd.Series): A single row from a DataFrame containing `SrcEntity` and `TgtEntity` columns.
        source_dict (dict): Dictionary mapping source URIs to their integer encodings.
        target_dict (dict): Dictionary mapping target URIs to their integer encodings.

    Returns:
        pd.Series: A series containing the encoded source and target URIs as integers.
    """
    # Extract source and target URIs from the row
    uri_1, uri_2 = row['SrcEntity'], row['TgtEntity']

    # Encode the source URI using the source dictionary, defaulting to -1 if not found
    encoded_uri_1 = source_dict.get(uri_1, -1)  # -1 indicates a missing value
    # Encode the target URI using the target dictionary, defaulting to -1 if not found
    encoded_uri_2 = target_dict.get(uri_2, -1)  # -1 indicates a missing value

    # Ensure the encoded URIs are integers (in case they are stored as floats or other types)
    return pd.Series([int(encoded_uri_1), int(encoded_uri_2)])

In [9]:
# Define the source ontology name
src_ent = "snomed.body"

# Define the target ontology name
tgt_ent = "fma.body"

# Define the task name for this ontology matching process
task = "body"

In [None]:
dir = "../"

# Define the directory for the dataset containing source and target ontologies
dataset_dir = f"{dir}/Datasets/{task}"

# Define the data directory for storing embeddings, adjacency matrices, and related files
data_dir = f"{dir}/{task}/Data"

In [11]:
# Load the Source ontology (OMIM) using the Ontology class from DeepOnto
src_onto_path = f"{dataset_dir}/{src_ent}.owl"

# Load the Target ontology (ORDO) using the Ontology class from DeepOnto
tgt_onto_path = f"{dataset_dir}/{tgt_ent}.owl"

# Define the file path for the JSON file containing the Source ontology (OMIM) class labels
src_class_path = f"{data_dir}/{src_ent}_classes.json"

# Define the file path for the JSON file containing the Target ontology (ORDO) class labels
tgt_class_path = f"{data_dir}/{tgt_ent}_classes.json"

# Define the file path for the candidate mappings between Source to Target entities
Candidates = f"{data_dir}/{task}_candidates.csv"

# Define the file path for the cleaned JSON file containing source class labels used for testing
src_class_cleaned_test = f"{data_dir}/{src_ent}_cleaned_test_classes.json"  # Cleaned test classes for source

# Define the file path for the TSV file containing reference equivalences for testing
tsv_file_path = f"{dataset_dir}/refs_equiv/test.tsv"  # Reference file with equivalences for evaluation

# Define the file path for the cleaned and encoded JSON file containing source class labels
src_class_cleaned = f"{data_dir}/{src_ent}_cleaned_classes.json"  # Cleaned and encoded classes for source

# Define the file path for the cleaned JSON file containing target class labels
tgt_class_cleaned = f"{data_dir}/{tgt_ent}_cleaned_classes.json"  # Cleaned classes for target

# Define the file paths for candidate mappings
candidate_path = f"{data_dir}/candidates_cleaned.csv"  # CSV file with candidate mappings

candidate_encoded_path = f"{data_dir}/{task}_candidates_cleaned_encoded.csv"  # Encoded candidate mappings

candidates_cleaned_encoded_combined = f"{data_dir}/{task}_candidates_cleaned_combined_encoded.csv"

candidates_cleaned_encoded_enriched = f"{data_dir}/{task}_candidates_cleaned_enriched_encoded.csv"

# **Cleaning**

In [12]:
# Clean the JSON file for the source ontology
# This removes unnecessary concepts from the source ontology JSON file (`src_class_path`)
# based on alignment annotations in the source ontology OWL file (`src_onto_path`).
# The cleaned JSON is saved to `src_class_cleaned_encodage`.
clean_json_by_alignment(src_onto_path, src_class_path, src_class_cleaned)

# Clean the JSON file for the target ontology
# Similarly, this removes unnecessary concepts from the target ontology JSON file (`tgt_class_path`)
# based on alignment annotations in the target ontology OWL file (`tgt_onto_path`).
# The cleaned JSON is saved to `tgt_class_cleaned`.
clean_json_by_alignment(tgt_onto_path, tgt_class_path, tgt_class_cleaned)


Cleaned JSON file saved to: /content/gdrive/My Drive/BioGITOM-VLDB//body/Data/snomed.body_cleaned_classes.json
Excluded concepts: 10236
Remaining concepts: 24318
Cleaned JSON file saved to: /content/gdrive/My Drive/BioGITOM-VLDB//body/Data/fma.body_cleaned_classes.json
Excluded concepts: 24229
Remaining concepts: 64890


In [13]:
# Call the function to clean the source JSON file based on the TSV file
# The `clean_json_using_tsv` function uses the reference equivalence file (`tsv_file_path`)
# to further filter the cleaned source JSON file (`src_class_cleaned_encodage`).
# The resulting JSON file, containing only relevant concepts for testing, is saved to `src_class_cleaned_test`.
clean_json_using_tsv(src_class_cleaned, tsv_file_path, src_class_cleaned_test)

Cleaned JSON file saved to /content/gdrive/My Drive/BioGITOM-VLDB//body/Data/snomed.body_cleaned_test_classes.json
Original entries: 24318
Remaining entries: 4604
Excluded entries: 19714


# **Candidates Generation**

In [14]:
# Load JSON files for both source and target ontologies
with open(src_class_cleaned_test, 'r') as f:
    src_data = json.load(f)

# Assuming the target data JSON file path is correct
with open(tgt_class_cleaned, 'r') as f:
    tgt_data = json.load(f)

In [15]:
# Instantiate OntoBox and generate candidate pairs
ontobox = OntoBox(src_data, tgt_data)
candidates = ontobox.generate_candidates()

# Print candidates for verification
for src, tgt in candidates:
    print(f"SrcEntity: {src}, TgtEntity: {tgt}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma76785
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma45063
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma48264
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma45081
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma45084
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma45075
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma43794
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma45067
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma45066
SrcEntity: http://snomed.info/id/289075000, TgtEntity: http://purl.org/sig/ont/fma/fma45068
SrcEntity: http

In [16]:
# Convert candidates to a DataFrame
candidate_df = pd.DataFrame(candidates, columns=["SrcEntity", "TgtEntity"])

# Add an incremental "ID" column starting from 0
candidate_df.insert(0, "ID", range(len(candidate_df)))

# Save the DataFrame to a CSV file
candidate_df.to_csv(candidate_path, index=False)

print(f"Candidates saved successfully to {candidate_path}")

Candidates saved successfully to /content/gdrive/My Drive/BioGITOM-VLDB//body/Data/candidates_cleaned.csv


In [17]:
# Load Source and Target and ORDO class indexed dictionaries
indexed_dict_source = build_indexed_dict(src_class_path)
indexed_dict_target = build_indexed_dict(tgt_class_path)

In [18]:
# Create a DataFrame from the list of candidate mappings
# The `candidates` variable is expected to be a list of tuples containing source and target entities.
candidate_df = pd.DataFrame(candidates, columns=["SrcEntity", "TgtEntity"])

# Apply the `encode_uris` function to encode the source and target URIs into integers
# The `source_dict` and `target_dict` are dictionaries mapping URIs to their integer indices.
# This ensures the URIs are replaced with numeric representations in the DataFrame.
candidate_df[["SrcEntity", "TgtEntity"]] = candidate_df.apply(
    encode_uris, axis=1, source_dict=indexed_dict_source, target_dict=indexed_dict_target
)

# Add an ID column to the DataFrame
# The ID column is a unique identifier for each candidate pair, starting from 1.
candidate_df["ID"] = candidate_df.index + 1

# Reorder the columns for the final output: ID, SrcEntity, TgtEntity
# Save the DataFrame to a CSV file with the specified `candidate_encoded_path`.
# The `quoting=csv.QUOTE_NONNUMERIC` ensures that numeric values are not enclosed in quotes.
candidate_df[["ID", "SrcEntity", "TgtEntity"]].to_csv(candidate_encoded_path, index=False, quoting=csv.QUOTE_NONNUMERIC)

# Print confirmation message with the file path
print(f"Candidates saved to {candidate_encoded_path}")


Candidates saved to /content/gdrive/My Drive/BioGITOM-VLDB//body/Data/body_candidates_cleaned_encoded.csv


In [19]:
# Load the existing candidates file
candidates_df = pd.read_csv(candidate_encoded_path)

# Load the source file with additional candidates
additional_candidates_df = pd.read_csv(Candidates)

# Concatenate the two DataFrames
combined_df = pd.concat([candidates_df, additional_candidates_df])

# Remove duplicate rows based on SrcEntity and TgtEntity columns
combined_df = combined_df.drop_duplicates(subset=["SrcEntity", "TgtEntity"])

# Save the combined DataFrame back to a CSV file
combined_df.to_csv(candidates_cleaned_encoded_combined, index=False, quoting=csv.QUOTE_NONNUMERIC)

print(f"Combined candidates saved to {candidates_cleaned_encoded_combined}")


Combined candidates saved to /content/gdrive/My Drive/BioGITOM-VLDB//body/Data/body_candidates_cleaned_combined_encoded.csv


In [20]:
# Load the existing candidates file
candidates_df = pd.read_csv(candidate_encoded_path)

# Load the source file with additional candidates
additional_candidates_df = pd.read_csv(Candidates)

# Identify entries in the additional candidates file that are not in the existing candidates file
# Assuming 'SrcEntity' and 'TgtEntity' columns are used to uniquely identify rows
new_entries = additional_candidates_df[
    ~additional_candidates_df.set_index(["SrcEntity", "TgtEntity"]).index.isin(
        candidates_df.set_index(["SrcEntity", "TgtEntity"]).index
    )
]

# Append the new entries to the original candidates DataFrame
enriched_candidates_df = pd.concat([candidates_df, new_entries])

# Save the enriched DataFrame to a new file or overwrite the original file
enriched_candidates_df.to_csv(candidates_cleaned_encoded_enriched, index=False, quoting=csv.QUOTE_NONNUMERIC)

print(f"Enriched candidates saved to {candidates_cleaned_encoded_enriched}")

Enriched candidates saved to /content/gdrive/My Drive/BioGITOM-VLDB//body/Data/body_candidates_cleaned_enriched_encoded.csv
