**Uses the pre-trained BioBERT model to produce medical-domain sentence embeddings**

Supports GPU acceleration
Generates embeddings by:

**Tokenizing both sentences**
Generating embeddings using mean pooling
Concatenating the embeddings of both sentences along the first axis (each pair is stored as a 2 x 768 array)


**Saves embeddings and relationships to a .npz file**
Provides error handling and progress tracking

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


In [18]:
def create_biobert_embeddings(csv_path: str, output_path: str = "biobert_embeddings.npz"):
    """
    Load the sentence-pair CSV and the BioBERT tokenizer and model.

    Despite its name, this function does not compute or save embeddings; it
    validates the dataset and returns the objects needed to do so.

    :param csv_path: Path to the CSV file with columns: 'Sentence 1', 'Sentence 2', 'Relationship'.
    :param output_path: Not used by this function; kept so callers that pass it keep working.
    :return: Tuple ``(model, tokenizer, data)``: the BioBERT model switched to
        evaluation mode, its tokenizer, and the DataFrame read from ``csv_path``.
    :raises ValueError: If the CSV cannot be read or a required column is missing.
    """
    # Load the dataset
    try:
        data = pd.read_csv(csv_path)
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback
        raise ValueError(f"Error reading CSV file: {e}") from e

    # Check required columns
    required_columns = ['Sentence 1', 'Sentence 2', 'Relationship']
    missing_columns = [column for column in required_columns if column not in data.columns]
    if missing_columns:
        raise ValueError(
            f"CSV file must contain the following columns: {required_columns} "
            f"(missing: {missing_columns})"
        )

    # Load BioBERT tokenizer and model from the same checkpoint
    checkpoint = "dmis-lab/biobert-base-cased-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    model.eval()  # Set the model to evaluation mode

    return model, tokenizer, data

In [19]:
# Read the CSV and load the BioBERT model and tokenizer (returned in the order model, tokenizer, data)
model, tokenizer, data = create_biobert_embeddings("/content/Data_relationships.csv") # Replace with your actual CSV file path

In [20]:
  # Determine device: use CUDA when torch reports it as available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model onto the selected device; get_sentence_embedding sends its inputs to the same device
model.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [21]:
# Empty lists for collecting the combined embeddings and the relationship labels
sentence_embeddings, relationships = [], []

In [22]:
# Function to get sentence embedding
def get_sentence_embedding(sentence):
    """
    Embed ``sentence`` by mean-pooling the model's last hidden state.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    :param sentence: Text passed to the tokenizer (truncated to 512 tokens).
    :return: NumPy array holding the pooled embedding, copied back to the CPU.
        NOTE(review): for a single string this is presumably shape
        (1, hidden_size) - confirm against the tokenizer output.
    """
    # Tokenize and move every input tensor to the model's device
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients are needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Broadcast the attention mask over the hidden dimension so padded
    # positions contribute nothing to the average
    mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
    summed = (hidden_states * mask).sum(dim=1)
    # Clamp the token count to avoid dividing by zero on an all-zero mask
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    pooled = summed / token_counts

    return pooled.cpu().numpy()

In [23]:
# Process sentences and generate embeddings
def process_sentences_generate_embeddings(data, tokenizer, model, sentence_embeddings, relationships):
    """
    Embed both sentences of every row and append the results to the given lists.

    Rows that raise during processing are reported with ``print`` and skipped.

    :param data: DataFrame with columns 'Sentence 1', 'Sentence 2' and 'Relationship'.
    :param tokenizer: Not used here: get_sentence_embedding takes only the
        sentence and reads the tokenizer from module scope. Kept so existing
        call sites keep working.
    :param model: Not used here, for the same reason as ``tokenizer``.
    :param sentence_embeddings: List that receives one combined embedding per row (mutated in place).
    :param relationships: List that receives each row's 'Relationship' value (mutated in place).
    :return: The same ``(sentence_embeddings, relationships)`` lists that were passed in.
    """
    for _, row in tqdm(data.iterrows(), total=len(data), desc="Generating Embeddings"):
        try:
            # get_sentence_embedding accepts a single argument; passing extra
            # positional arguments raises TypeError, which the handler below
            # would swallow for every row.
            first_embedding = get_sentence_embedding(row['Sentence 1'])
            second_embedding = get_sentence_embedding(row['Sentence 2'])

            # Combine embeddings
            combined_embedding = np.concatenate([first_embedding, second_embedding])
            sentence_embeddings.append(combined_embedding)
            relationships.append(row['Relationship'])
        except Exception as e:
            # Best effort: report the failing row and carry on with the rest
            print(f"Error processing row: {e}")
            continue
    return sentence_embeddings, relationships

In [24]:
# Process sentences and generate embeddings
def process_sentences_generate_embeddings(data, tokenizer, model, sentence_embeddings, relationships):
    """
    Embed both sentences of every row and append the results to the given lists.

    Rows that raise during processing are reported with ``print`` and skipped.

    :param data: DataFrame with columns 'Sentence 1', 'Sentence 2' and 'Relationship'.
    :param tokenizer: Not used here: get_sentence_embedding takes only the
        sentence and reads the tokenizer from module scope. Kept so existing
        call sites keep working.
    :param model: Not used here, for the same reason as ``tokenizer``.
    :param sentence_embeddings: List that receives one combined embedding per row (mutated in place).
    :param relationships: List that receives each row's 'Relationship' value (mutated in place).
    :return: The same ``(sentence_embeddings, relationships)`` lists that were passed in.
    """
    for _, row in tqdm(data.iterrows(), total=len(data), desc="Generating Embeddings"):
        try:
            # get_sentence_embedding accepts a single argument; passing extra
            # positional arguments raises TypeError, which the handler below
            # would swallow for every row.
            first_embedding = get_sentence_embedding(row['Sentence 1'])
            second_embedding = get_sentence_embedding(row['Sentence 2'])

            # Combine embeddings
            combined_embedding = np.concatenate([first_embedding, second_embedding])
            sentence_embeddings.append(combined_embedding)
            relationships.append(row['Relationship'])
        except Exception as e:
            # Best effort: report the failing row and carry on with the rest
            print(f"Error processing row: {e}")
            continue
    return sentence_embeddings, relationships

In [25]:
# Process sentences and generate embeddings
def process_sentences_generate_embeddings(data, output_path: str = "biobert_embeddings.npz"):
    """
    Embed every sentence pair in ``data`` and save the results with ``np.savez``.

    Rows that raise during processing are reported with ``print`` and skipped.

    :param data: DataFrame with columns 'Sentence 1', 'Sentence 2' and 'Relationship'.
    :param output_path: Destination passed to ``np.savez``.
    :return: Tuple ``(sentence_embeddings, relationships)`` as NumPy arrays,
        the same arrays stored under the keys 'embeddings' and 'relationships'.
    """
    pair_vectors = []
    labels = []

    progress = tqdm(data.iterrows(), total=len(data), desc="Generating Embeddings")
    for _, record in progress:
        try:
            # Embed the two sentences in order, then join them along axis 0
            pair = [
                get_sentence_embedding(record['Sentence 1']),
                get_sentence_embedding(record['Sentence 2']),
            ]
            pair_vectors.append(np.concatenate(pair))
            labels.append(record['Relationship'])
        except Exception as e:
            # Best effort: report the failing row and move on to the next one
            print(f"Error processing row: {e}")

    # Convert the collected lists to arrays before saving
    sentence_embeddings = np.array(pair_vectors)
    relationships = np.array(labels)

    # Persist both arrays in a single archive
    np.savez(output_path, embeddings=sentence_embeddings, relationships=relationships)

    print(f"Embeddings saved to {output_path}")
    print(f"Embeddings shape: {sentence_embeddings.shape}")
    print(f"Relationships shape: {relationships.shape}")

    return sentence_embeddings, relationships



In [28]:
# Example usage
if __name__ == "__main__":
    # Replace with your actual CSV path
    csv_path = "/content/Data_relationships.csv"
    output_path = "biobert_embeddings.npz"

    # Read the dataset and load the BioBERT model and tokenizer
    model, tokenizer, data = create_biobert_embeddings(csv_path, output_path)

    # Embed every sentence pair and save the resulting arrays to output_path
    embeddings, rels = process_sentences_generate_embeddings(data, output_path)

Generating Embeddings: 100%|██████████| 250/250 [01:01<00:00,  4.05it/s]

Embeddings saved to biobert_embeddings.npz
Embeddings shape: (250, 2, 768)
Relationships shape: (250,)





**To Read Embedding Files**

In [30]:
import numpy as np

# Load the .npz file
# NOTE: this rebinds `data`, which earlier in the notebook held the CSV DataFrame
data = np.load('/content/biobert_embeddings.npz')

# List the names of all the arrays stored in the .npz file
print(data.files)



['embeddings', 'relationships']


In [31]:
# Access a specific array by its name; the embeddings are stored under the key 'embeddings'
embedding = data['embeddings']

# Print the embedding array (or do further processing)
print(embedding)

[[[-0.26180524 -0.00606371 -0.09929299 ... -0.03283149 -0.07074795
   -0.07370254]
  [-0.052624   -0.10554495 -0.23050874 ... -0.04975342 -0.03628622
   -0.23326284]]

 [[-0.06664198 -0.05004142 -0.12933666 ...  0.03189946  0.15915684
   -0.18128423]
  [-0.17897642 -0.10246386 -0.22736183 ...  0.10066219  0.22736196
   -0.26042587]]

 [[-0.07911068  0.05578955  0.03445188 ...  0.09288554  0.06558892
   -0.1920433 ]
  [-0.16268839  0.10344014 -0.1179441  ... -0.01996229  0.09928265
   -0.23303708]]

 ...

 [[-0.13020998  0.19109687  0.09628036 ... -0.06239978  0.2223844
   -0.19263579]
  [-0.15140349  0.1632779   0.07422288 ...  0.02373421  0.2683528
   -0.32095304]]

 [[-0.00179012  0.21672682 -0.15787332 ...  0.18023299  0.09780022
   -0.0065886 ]
  [ 0.10490052  0.0658789  -0.18372914 ...  0.17768392  0.06081073
   -0.24605867]]

 [[-0.18936013  0.21297717 -0.16273376 ... -0.00419721  0.02917262
   -0.21053913]
  [-0.16725007  0.29874015 -0.00400542 ... -0.04367919  0.04564876
   -0.

In [33]:
# Access the relationship labels, stored under the key 'relationships'
relationship = data['relationships']

# Print the relationship labels (or do further processing)
print(relationship)

['causality' 'causality' 'causality' 'causality' 'causality' 'causality'
 'causality' 'causality' 'causality' 'causality' 'condition' 'condition'
 'condition' 'condition' 'condition' 'condition' 'condition' 'condition'
 'condition' 'condition' 'sequence' 'sequence' 'sequence' 'sequence'
 'sequence' 'sequence' 'sequence' 'sequence' 'sequence' 'sequence'
 'comparison' 'comparison' 'comparison' 'comparison' 'comparison'
 'comparison' 'comparison' 'comparison' 'comparison' 'comparison'
 'sequence' 'sequence' 'sequence' 'sequence' 'sequence' 'condition'
 'condition' 'comparison' 'comparison' 'comparison' 'causality'
 'causality' 'causality' 'sequence' 'causality' 'causality' 'causality'
 'sequence' 'causality' 'causality' 'causality' 'causality' 'causality'
 'causality' 'causality' 'causality' 'condition' 'sequence' 'sequence'
 'sequence' 'comparison' 'comparison' 'condition' 'causality' 'causality'
 'causality' 'sequence' 'causality' 'sequence' 'causality' 'causality'
 'causality' 'causali