In [None]:
!pip install transformers --upgrade sentence-transformers

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.0
    Uninstalling transformers-4.47.0:
      Successfully uninstalled transformers-4.47.0
Successfully installed transformers-4.47.1


In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

In [None]:
# Importing all necessary libraries
import json
import pandas as pd
import numpy as np
import torch
import os
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [None]:
# Colab-only: import the helper module used to expose Google Drive to this runtime
from google.colab import drive

# Mount Google Drive at '/content/gdrive'; the paths defined later in this notebook are
# rooted under this mount point. Colab may prompt for authorization when this runs.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Name of the ontology-matching task; it is interpolated into the Google Drive
# directory paths built in the next cell.
task = "body"

In [None]:
# Root of the project folder on the mounted Google Drive (single source of truth
# for every path below; the trailing slash is kept for backward compatibility)
dataset = "/content/gdrive/My Drive/BioGITOM-VLDB/"

# Task-specific working directory.
# NOTE(review): this name shadows the built-in `dir()`; it is kept unchanged because
# other cells may reference it.
dir = os.path.join(dataset, task)

# Directory for the dataset containing source and target ontologies.
# os.path.join avoids the doubled "//" that plain string interpolation produced,
# since `dataset` already ends with a slash.
dataset_dir = os.path.join(dataset, "Datasets", task)

# Data directory for storing embeddings, adjacency matrices, and related files
data_dir = os.path.join(dir, "Data")

In [None]:
# JSON file holding the source-ontology classes; it is parsed in a later cell to
# build the texts that get embedded.
# NOTE(review): the file name hard-codes "body" instead of interpolating `task`.
src_class = f"{data_dir}/snomed.body_classes.json"

# CSV file that the source embeddings are written to.
src_Emb = f"{data_dir}/snomed.body_BERT_Hybrid_emb.csv"

In [None]:

# JSON file holding the target-ontology classes; it is parsed in a later cell to
# build the texts that get embedded.
# NOTE(review): the file name hard-codes "body" instead of interpolating `task`.
tgt_class = f"{data_dir}/fma.body_classes.json"

# CSV file that the target embeddings are written to.
tgt_Emb = f"{data_dir}/fma.body_BERT_Hybrid_emb.csv"


In [None]:
# Hugging Face checkpoint that provides both the SapBERT weights and its tokenizer
sapbert_checkpoint = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"

# SapBERT encoder and the tokenizer that matches it
sapbert_model = AutoModel.from_pretrained(sapbert_checkpoint)
sapbert_tokenizer = AutoTokenizer.from_pretrained(sapbert_checkpoint)

# SentenceTransformer-compatible BERT model
sentence_model = SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens")

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def gen_embeddings(sentences, batch_size=8, max_length=128, use_gpu=True):
    """
    Generate sentence embeddings by mean-pooling SapBERT token representations.

    A SentenceTransformer BERT backbone is loaded, its weights are overwritten with
    those of the module-level ``sapbert_model``, and every sentence is embedded as the
    attention-mask-weighted mean of the final hidden states. Weighting by the mask
    keeps the padding tokens added for batching out of the average, so a sentence's
    embedding does not depend on which other sentences share its batch.

    Args:
        sentences (list of str): Sentences to encode. A single string is treated
            as a one-element list.
        batch_size (int): Number of sentences encoded per forward pass.
        max_length (int): Maximum token length; longer inputs are truncated.
        use_gpu (bool): Use CUDA when it is available. Defaults to True.

    Returns:
        np.ndarray: Array with one row per input sentence (in input order) and one
        column per hidden dimension. Empty input yields an array with zero rows.
    """
    # Normalise the input so slicing and the tokenizer always see a list of strings
    if isinstance(sentences, str):
        sentences = [sentences]
    sentences = list(sentences)

    # np.vstack fails on an empty list, so answer empty input before loading anything
    if not sentences:
        return np.empty((0, sapbert_model.config.hidden_size), dtype=np.float32)

    # Determine the device to use: GPU (if available and requested) or CPU
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

    # Load the SentenceTransformer backbone and overwrite its weights with SapBERT's.
    # A local name is used so the module-level `sentence_model` is not shadowed.
    st_wrapper = SentenceTransformer("sentence-transformers/bert-base-nli-mean-tokens").to(device)
    encoder = st_wrapper._first_module().auto_model  # Access underlying BERT model
    encoder.load_state_dict(sapbert_model.state_dict(), strict=False)
    encoder.eval()  # inference only: make sure dropout is disabled

    # The SapBERT tokenizer must be used so token ids match the SapBERT weights
    tokenizer = sapbert_tokenizer

    all_embeddings = []

    # Process the sentences in batches
    for start in range(0, len(sentences), batch_size):
        batch_sentences = sentences[start:start + batch_size]

        # Tokenize the batch with truncation to limit sequence length
        encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device)

        # Generate embeddings without computing gradients (for efficiency)
        with torch.no_grad():
            token_states = encoder(**encoded_input).last_hidden_state
            # Broadcast the attention mask over the hidden dimension so that padded
            # positions contribute zero to the sum and are not counted in the mean
            mask = encoded_input["attention_mask"].unsqueeze(-1).to(token_states.dtype)
            summed = (token_states * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
            batch_embeddings = (summed / token_counts).cpu().numpy()

        all_embeddings.append(batch_embeddings)

    # Release cached GPU memory once, and only when a GPU was actually used
    if device.type == "cuda":
        torch.cuda.empty_cache()

    # Concatenate all batch embeddings into a single array
    return np.vstack(all_embeddings)


In [None]:
# Load the source class JSON file. The encoding is given explicitly so the text is
# decoded as UTF-8 regardless of the platform's default encoding.
with open(src_class, "r", encoding="utf-8") as f:
    class_dict = json.load(f)

# Prepare the data for generating embeddings: one text per entry, made by joining
# the strings of each value with ", "
concat_arr = [", ".join(x) for x in class_dict.values()]

# Generate embeddings
emb = gen_embeddings(concat_arr)

In [None]:
# Tabulate the source embeddings and persist them as CSV; the DataFrame's default
# integer index is written as the first column.
df = pd.DataFrame(data=emb)
df.to_csv(path_or_buf=src_Emb)

In [None]:
# Load the target class JSON file. The encoding is given explicitly so the text is
# decoded as UTF-8 regardless of the platform's default encoding.
with open(tgt_class, "r", encoding="utf-8") as f:
    class_dict = json.load(f)

# Prepare the data for generating embeddings: one text per entry, made by joining
# the strings of each value with ", "
concat_arr = [", ".join(x) for x in class_dict.values()]

# Generate embeddings
embtgt = gen_embeddings(concat_arr)

In [None]:
# Tabulate the target embeddings and persist them as CSV; the DataFrame's default
# integer index is written as the first column.
df = pd.DataFrame(data=embtgt)
df.to_csv(path_or_buf=tgt_Emb)