Embed PM abstracts

In [None]:
import json
import h5py
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Embed every article text with BGE-large (CLS pooling) and write one HDF5
# dataset per article, named by its article ID.

# Load BGE large model and tokenizer
model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.half()  # Convert model to fp16 (the stored embeddings are therefore float16)
model.to('cuda')  # Move model to GPU
model.eval()  # Set model to evaluation mode

# Load the JSON file with the data: a list of records, each read below via
# its "article_id" and "text" keys
input_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/NEW_PM_id_text.json"
with open(input_file, "r") as f:
    data = json.load(f)

# Open the HDF5 file to store the embeddings ("w" truncates any existing file)
output_file = "/n/data1/hsph/biostat/celehs/lab/jh537/Retrivial_task/DATA/CLS_NEW_PM_id_text_W_BGE_L.h5"
skipped_duplicates = 0
with h5py.File(output_file, "w") as h5f:
    for item in tqdm(data, desc="Processing articles"):
        article_id = item["article_id"]
        text = item["text"]

        # create_dataset raises if a dataset with this name already exists,
        # which would abort the whole run on a repeated ID. Keep the first
        # occurrence and skip (but count) any later duplicates.
        if article_id in h5f:
            skipped_duplicates += 1
            continue

        # Tokenize the input text (one text per call; over-long texts are truncated)
        encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to('cuda')

        # Generate embeddings without tracking gradients
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform CLS pooling
        # Extract the embedding for the [CLS] token (the first token in the sequence)
        cls_embedding = model_output.last_hidden_state[:, 0, :]  # [batch_size, hidden_size]

        # Drop the batch dimension of 1 and convert to numpy on the CPU
        cls_embedding = cls_embedding.squeeze().cpu().numpy()

        # Store the embedding in the HDF5 file under the article ID
        h5f.create_dataset(article_id, data=cls_embedding)

if skipped_duplicates:
    print(f"Skipped {skipped_duplicates} records with duplicate article IDs")
print("Embeddings have been successfully saved to", output_file)


Test complete pipeline 10-17

In [None]:
import json
import h5py
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import faiss
import time
from tqdm import tqdm

# Retrieval / rerank cut-offs, kept together so they are easy to change
top_k_initial = 30
top_k_rerank_10 = 10
top_k_rerank_1 = 1
First_N = 2000

# Wall-clock timer; started before any model or data loading
start_time = time.time()

# Query encoder: BGE large model and its tokenizer, placed on the GPU
model_name = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to('cuda')
model.eval()  # inference mode (disables dropout etc.)

# Read every stored embedding from the HDF5 file; article_ids[i] is the
# dataset name whose vector ends up in row i of `embeddings`
embedding_file = "/n/data1/hsph/biostat/celehs/lab/jh537/Retrivial_task/DATA/CLS_NEW_PM_id_text_W_BGE_L.h5"
with h5py.File(embedding_file, "r") as h5f:
    article_ids = list(h5f.keys())
    embeddings = [h5f[key][:] for key in article_ids]

# Stack into a single float32 matrix, as required by FAISS
embeddings = np.array(embeddings, dtype='float32')

# L2-normalise rows in place so that inner product equals cosine similarity
faiss.normalize_L2(embeddings)

# Exact (flat) inner-product index over the normalised vectors
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

# Article records (used later to look up the text of retrieved IDs)
input_file = "/home/jh537/Clinical_Trial_Embending/Clinical_Trial_data/Retrievial/v2_/NEW_PM_id_text.json"
with open(input_file, "r") as f:
    data = json.load(f)

# Query records; only the first First_N are evaluated further down
summary_file = "/n/data1/hsph/biostat/celehs/lab/jh537/Retrivial_task/DATA/LONG_CTG_id_text_refs_test.json"
with open(summary_file, "r") as f:
    summaries_data = json.load(f)

# Reranker: a sequence-classification model loaded from a local checkpoint
reranker_model_name = "/n/data1/hsph/biostat/celehs/lab/jh537/Models/NEW_cop_reranker_HN_10_17"
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)
reranker_model = AutoModelForSequenceClassification.from_pretrained(reranker_model_name)
reranker_model = reranker_model.to('cuda')
reranker_model.eval()  # inference mode

# Function to perform CLS pooling
def cls_pooling(model_output):
    """Return the hidden state of the first token of every sequence.

    ``model_output`` must expose ``last_hidden_state``; it is indexed as a
    3-D tensor (batch, sequence, hidden), and position 0 along the sequence
    axis (the [CLS] slot) is selected, giving a (batch, hidden) result.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0, :]

# Evaluate the retrieve-then-rerank pipeline: for each summary, retrieve
# top_k_initial candidates with FAISS, rerank them, and count how often the
# reference article lands in the reranked top 10 / top 1.
found_count_10 = 0
found_count_1 = 0

# Map article ID -> text once, so each candidate lookup is O(1) instead of a
# linear scan over `data` for every candidate of every query. setdefault keeps
# the first record for a repeated ID, matching a first-match scan.
article_text_by_id = {}
for item in data:
    article_text_by_id.setdefault(item["article_id"], item["text"])

# The summaries actually evaluated (at most First_N)
eval_summaries = summaries_data[:First_N]

# Iterate over the selected summaries with a progress bar
for summary_item in tqdm(eval_summaries, desc="Processing summaries"):
    query = summary_item['summary']
    reference_id = str(summary_item['ref'])  # article IDs from the HDF5 keys are strings

    # Tokenize the query
    encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt').to('cuda')  # Move input to GPU

    # Generate query embedding
    with torch.no_grad():
        model_output = model(**encoded_input)
    query_embedding = cls_pooling(model_output).squeeze().cpu().numpy().astype('float32')

    # FAISS wants a 2-D (n_queries, dim) array. Keep the reshaped view in a
    # named variable: normalize_L2 works in place, and the same normalised
    # array is what gets searched.
    query_matrix = query_embedding.reshape(1, -1)
    faiss.normalize_L2(query_matrix)

    # Search for the top_k_initial most similar embeddings
    _, indices = index.search(query_matrix, top_k_initial)

    # FAISS fills missing results with -1 when the index holds fewer than
    # top_k_initial vectors; drop those so -1 is not read as "last article".
    top_article_ids = [article_ids[idx] for idx in indices[0] if idx >= 0]

    # Text of each retrieved article, in retrieval order; IDs with no record
    # in `data` are left out
    article_texts = {
        article_id: article_text_by_id[article_id]
        for article_id in top_article_ids
        if article_id in article_text_by_id
    }

    # Score every (query, article) pair with the reranker
    rerank_scores = []
    for article_id, article_text in article_texts.items():
        # Tokenize the query and the article text as a sentence pair
        inputs = reranker_tokenizer(query, article_text, padding=True,
                                    truncation=True, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = reranker_model(**inputs)
        # logits[:, 0] is the first output of the classification head (not a
        # token embedding); the sigmoid maps it into the [0, 1] range.
        relevance_logit = outputs.logits[:, 0]
        score = torch.sigmoid(relevance_logit).squeeze().item()
        # Store the score along with the article ID
        rerank_scores.append((article_id, score))

    # Sort the rerank scores in descending order
    rerank_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the top_k_rerank=10 article IDs
    top_10_article_ids = [article_id for (article_id, _) in rerank_scores[:top_k_rerank_10]]

    # Get the top_k_rerank=1 article ID (the first in the reranked list, if available)
    top_1_article_ids = [article_id for (article_id, _) in rerank_scores[:top_k_rerank_1]]

    # Check if the reference ID is in the top 10
    if reference_id in top_10_article_ids:
        found_count_10 += 1

    # Check if the reference ID is in the top 1
    if reference_id in top_1_article_ids:
        found_count_1 += 1

# Calculate the percentage of summaries for which the reference was found
# (reported as 0 when there were no summaries, instead of dividing by zero)
num_evaluated = len(eval_summaries)
percentage_found_10 = (found_count_10 / num_evaluated) * 100 if num_evaluated else 0.0
percentage_found_1 = (found_count_1 / num_evaluated) * 100 if num_evaluated else 0.0

print(f"Percentage of summaries with the reference in the top {top_k_rerank_10}: {percentage_found_10:.2f}%")
print(f"Percentage of summaries with the reference in the top {top_k_rerank_1}: {percentage_found_1:.2f}%")

# Print the total running time
total_time = time.time() - start_time
print(f"Total running time: {total_time:.2f} seconds")
