In [None]:
# Import the pandas library for data manipulation and analysis
import pandas as pd

# Import the numpy library for numerical operations and handling arrays
import numpy as np

# Import the json library for working with JSON data, such as reading and writing JSON files
import json

In [None]:
# Google Colab helper for accessing Google Drive (the 'google.colab' package is supplied by the Colab runtime)
from google.colab import drive

# Mount Google Drive at '/content/gdrive'; the project paths used in this notebook are rooted under this mount point
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Source ontology name; it is interpolated into the source class-label and embedding file names
src_ent = "snomed.pharm"

# Target ontology name; it is interpolated into the target class-label and embedding file names
tgt_ent = "ncit.pharm"

# Task name; it is interpolated into the dataset/data directory paths and the generated training-file names
task = "pharm"

In [None]:
# Root folder of the project on the mounted Google Drive.
# Named 'base_dir' instead of 'dir' so the built-in dir() is not shadowed for the rest of the session.
base_dir = "/content/gdrive/My Drive/BioGITOM-VLDB"

# Define the directory for the dataset containing the reference alignments of this task
dataset_dir = f"{base_dir}/Datasets/{task}/refs_equiv"

# Define the data directory for storing embeddings, class-label files, and generated training sets
data_dir = f"{base_dir}/{task}/Data"

In [None]:
# Path of the training set (tab-separated file) inside the dataset directory
train_path = f"{dataset_dir}/train.tsv"  # Update with your path to the train set

# Path of the JSON file with the source ontology classes; its top-level keys are
# turned into integer indexes by build_indexed_dict().
# NOTE(review): the source files use a "2" suffix ("_classes2.json", "_emb2.csv") while the
# target files do not — confirm this asymmetry is intentional.
src_class = f"{data_dir}/{src_ent}_classes2.json"

# Path of the JSON file with the target ontology classes; its top-level keys are
# turned into integer indexes by build_indexed_dict().
tgt_class = f"{data_dir}/{tgt_ent}_classes.json"

# Output path of the encoded training set (CSV with integer-encoded entity pairs)
encoded_train_path = f"{data_dir}/{task}_train.encoded.csv"  # Update with your desired output path

# Paths of the CSV files holding the source and target entity embeddings.
# presumably row i of each file corresponds to the entity with index i in the matching
# class dictionary — TODO confirm against the code that produced the embeddings.
src_emb = f"{data_dir}/{src_ent}_emb2.csv"
tgt_emb = f"{data_dir}/{tgt_ent}_emb.csv"

In [None]:
def build_indexed_dict(file_path):
    """Map every top-level key of a JSON file to its position in the file.

    Args:
        file_path: Path to a JSON file whose top-level value is an object
            (for example ``{uri: labels, ...}``).

    Returns:
        dict: ``{key: index}`` where ``index`` starts at 0 and follows the
        order in which the keys appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; passing the encoding explicitly keeps the
    # result independent of the platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # json.load preserves key order, so enumerate() yields a stable index per key.
    return {key: index for index, key in enumerate(data)}

In [None]:
# Function to encode URIs using the indexed dictionaries
def encode_uris(row, src_dict, tgt_dict):
    """Translate one (source URI, target URI) row into a pair of integer indexes.

    Args:
        row: Mapping-like row exposing the 'SrcEntity' and 'TgtEntity' URIs.
        src_dict: URI -> index dictionary for the source ontology.
        tgt_dict: URI -> index dictionary for the target ontology.

    Returns:
        pd.Series: Two integers (source index, target index); a URI that is
        absent from its dictionary is encoded as -1.
    """
    # Pair each URI with the dictionary that must resolve it
    lookups = ((src_dict, row['SrcEntity']), (tgt_dict, row['TgtEntity']))

    # int() guarantees integer codes even if a dictionary stores float-like values
    codes = [int(mapping.get(uri, -1)) for mapping, uri in lookups]
    return pd.Series(codes)

In [None]:
def extract_negatives(f1, f2, df, n_negatives, seed=None):
    """Sample random negative (non-matching) target entities for every row of ``df``.

    For each row, up to ``n_negatives`` target indexes are drawn without
    replacement from all target indexes that are NOT paired with the row's
    source entity anywhere in ``df``.

    Args:
        f1: Path to the source embeddings CSV. Kept for interface
            compatibility; the file is not read because random sampling does
            not need the source embeddings.
        f2: Path to the target embeddings CSV (first column is the index).
            Only its number of rows is used, as the number of target entities.
        df: DataFrame with at least the columns 'SrcEntity' and 'TgtEntity'
            holding integer-encoded pairs. Extra columns (e.g. 'ID', 'Score')
            are ignored.
        n_negatives: Maximum number of negatives to draw per row.
        seed: Optional seed for reproducible sampling. When None (default),
            numpy's global random state is used, as before.

    Returns:
        pd.DataFrame: Columns "SrcEntity", "TgtEntity", "Score" with
        Score = 0 for every sampled negative pair.
    """
    # Candidate targets are the row positions 0..n_targets-1 of the target embedding file
    n_targets = pd.read_csv(f2, index_col=0).shape[0]
    all_targets = np.arange(n_targets)

    # Legacy global generator by default; a private seeded generator when requested
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Address the pair columns by NAME: positional access breaks as soon as the
    # frame carries a leading 'ID' column (column 0 would then be the ID, not the source).
    src_col = df['SrcEntity'].to_numpy()
    tgt_col = df['TgtEntity'].to_numpy()

    # Candidate set per distinct source entity, computed once and reused for repeated sources
    candidates_by_src = {}

    negative_samples = []
    total = df.shape[0]

    for i, src in enumerate(src_col):
        candidate_tgt = candidates_by_src.get(src)
        if candidate_tgt is None:
            # Targets already paired with this source must never be sampled as negatives
            already_positive = tgt_col[src_col == src].astype(int)
            candidate_tgt = np.setdiff1d(all_targets, already_positive)
            candidates_by_src[src] = candidate_tgt

        # Never request more samples than there are candidates
        current_n_negatives = min(n_negatives, len(candidate_tgt))

        kept = rng.choice(candidate_tgt, size=current_n_negatives, replace=False)
        negative_samples.extend([src, tgt, 0] for tgt in kept)

        # Print progress (percentage of processed rows) on a single, overwritten line
        print(f"{i + 1}/{total} : {(i + 1) / total * 100:.2f}%\t\t\t", end="\r")

    return pd.DataFrame(negative_samples, columns=["SrcEntity", "TgtEntity", "Score"])

In [None]:
# URI -> integer index lookup for the source ontology,
# built from the keys of the JSON file located at 'src_class'
indexed_dict_src = build_indexed_dict(file_path=src_class)

# URI -> integer index lookup for the target ontology,
# built from the keys of the JSON file located at 'tgt_class'
indexed_dict_tgt = build_indexed_dict(file_path=tgt_class)

In [None]:
# Load the training pairs from the tab-separated (TSV) file at 'train_path'
entity_pairs_df = pd.read_csv(train_path, sep='\t')

In [None]:
# Encode the training pairs row by row (axis=1): 'encode_uris' replaces each row's
# source/target URI by its integer index from 'indexed_dict_src' / 'indexed_dict_tgt'
# (-1 when a URI is missing from its dictionary).
encoded_entity_pairs_df = entity_pairs_df.apply(
    encode_uris, axis=1, src_dict=indexed_dict_src, tgt_dict=indexed_dict_tgt
)

# Label every pair loaded from the training file with Score = 1
encoded_entity_pairs_df['Score'] = 1

# Rename the DataFrame columns to match the desired structure:
# 'SrcEntity' (source entity), 'TgtEntity' (target entity), and 'Score'
encoded_entity_pairs_df.columns = ['SrcEntity', 'TgtEntity', 'Score']

# Make unmapped URIs visible instead of letting -1 slip silently into the training file:
# -1 is a valid "last element" position for array indexing, so a later lookup by
# index could pick the wrong entity rather than fail.
unmapped_rows = (encoded_entity_pairs_df[['SrcEntity', 'TgtEntity']] == -1).any(axis=1)
if unmapped_rows.any():
    print(
        f"WARNING: {int(unmapped_rows.sum())} of {len(encoded_entity_pairs_df)} pairs contain a URI "
        "missing from the class dictionaries and were encoded as -1"
    )

# Add an 'ID' column with incremental integer values starting from 0,
# which assigns a unique identifier to each row.
encoded_entity_pairs_df['ID'] = range(len(encoded_entity_pairs_df))

# Reorder the columns so that 'ID' comes first, followed by 'SrcEntity', 'TgtEntity', and 'Score'
encoded_entity_pairs_df = encoded_entity_pairs_df[['ID', 'SrcEntity', 'TgtEntity', 'Score']]

# Save the encoded pairs to 'encoded_train_path'; index=False keeps the row index out of the CSV
encoded_entity_pairs_df.to_csv(encoded_train_path, index=False)

# Print a confirmation message indicating the file has been saved with the new structure
print(f"Encoded entity pairs with incremental ID (starting from 0) saved to: {encoded_train_path}")

Encoded entity pairs with incremental ID (starting from 0) saved to: /content/gdrive/My Drive/BioGITOM-VLDB/pharm/Data/pharm_train.encoded.csv


In [None]:
# Load the encoded positive pairs written by the encoding step
df = pd.read_csv(encoded_train_path, sep=',')

# Build one training file per negative-sampling size
for nb_negs in [20, 50, 100, 200]:
    # Draw up to 'nb_negs' random non-matching targets for every positive pair.
    # 'df' itself is only read here, so no defensive copy is needed.
    df_negs = extract_negatives(src_emb, tgt_emb, df, n_negatives=nb_negs)

    # Stack positives (Score = 1) and sampled negatives (Score = 0) into one frame
    df_final = pd.concat([df, df_negs], axis=0).reset_index(drop=True)

    # The negatives carry no 'ID', which would leave NaN (and a float column) after the
    # concat; give every row of the combined set a contiguous integer ID instead.
    df_final['ID'] = range(len(df_final))

    # Save the combined set; the file name encodes the task and the number of negatives
    # requested per pair (e.g. pharm_train_2_20.csv).
    df_final.to_csv(f"{data_dir}/{task}_train_2_{nb_negs}.csv", index=False)

