In [1]:
import warnings

import pandas as pd

from src.nlp.EmbeddingCreator import EmbeddingCreator
from src.nlp.TextProcessor import TextProcessor

# Suppress FutureWarning messages so library deprecation notices do not clutter cell output.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Load the merged interim DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only read pickle files
# that were produced by a trusted step of this project.
df = pd.read_pickle("../data/03-interim/merged/merged_data_with_refs.pkl")

In [3]:
import os
import gc

import pandas as pd
import torch
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
from tqdm import tqdm


class PaperEmbeddingProcessor:
    """Compute SPECTER2 embeddings for the papers in a DataFrame and persist them.

    Every row is encoded as ``title + sep_token + abstract`` and the hidden
    state of the first token of the model output is used as the paper
    embedding. Rows are processed in chunks of ``chunk_size`` rows; each chunk
    is written to its own ``.pt`` file inside ``save_dir`` so that partial
    results survive an interrupted run.

    Note:
        The DataFrame is used by reference, not copied. Missing ``title`` and
        ``abstract`` values are replaced by empty strings in the caller's
        frame, and ``save_embeddings_with_data`` adds a
        ``specter2_embeddings`` column to it.
    """

    def __init__(
        self,
        df,
        model_name,
        adapter_name,
        save_dir,
        batch_size=32,
        chunk_size=100,  # Manageable chunk size for memory
        device=None,
    ):
        """Load tokenizer, base model and SPECTER2 adapter, and prepare the output directory.

        Parameters:
        df (DataFrame): Papers to embed; must have ``title`` and ``abstract`` columns.
        model_name (str): Identifier of the base model and tokenizer to load.
        adapter_name (str): Name under which the ``allenai/specter2`` adapter is loaded.
        save_dir (str): Directory that receives all output files (created if missing).
        batch_size (int): Number of papers per forward pass.
        chunk_size (int): Number of papers per chunk file.
        device (str | torch.device | None): Device to run the model on. ``None``
            selects CUDA when it is available and the CPU otherwise.

        Raises:
        ValueError: If ``batch_size`` or ``chunk_size`` is smaller than 1.
        """
        if batch_size < 1 or chunk_size < 1:
            raise ValueError("batch_size and chunk_size must be positive integers")

        self.df = df
        self.df["title"] = self.df["title"].fillna("")
        self.df["abstract"] = self.df["abstract"].fillna("")
        self.chunk_size = chunk_size
        self.batch_size = batch_size
        self.save_dir = save_dir

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoAdapterModel.from_pretrained(model_name)
        self.model.load_adapter(
            "allenai/specter2",
            source="hf",
            load_as=adapter_name,
            set_active=True,
        )
        # Move the model (including the loaded adapter weights) to the target
        # device and make inference mode explicit.
        self.model.to(self.device)
        self.model.eval()
        os.makedirs(save_dir, exist_ok=True)

    def process_batch(self, batch):
        """Embed one batch of papers.

        Parameters:
        batch (list[dict]): Records with a ``title`` key and an optional ``abstract`` key.

        Returns:
        Tensor: One row per record, taken from the first token position of
            ``last_hidden_state``, always returned on the CPU.
        """
        text_batch = [
            d["title"] + self.tokenizer.sep_token + (d.get("abstract") or "")
            for d in batch
        ]
        inputs = self.tokenizer(
            text_batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        # The tokenizer output lives on the CPU; the model may not.
        inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}
        with torch.no_grad():
            output = self.model(**inputs)
        embeddings = output.last_hidden_state[:, 0, :]
        return embeddings.cpu()

    def process_papers(self):
        """Embed every row of ``self.df`` and write the results to ``save_dir``.

        Writes one ``embeddings_chunk_<k>.pt`` file per chunk and a final
        ``total_embeddings.pt`` holding the concatenation of all chunks.

        Returns:
        Tensor: All embeddings in row order, or an empty tensor when the
            DataFrame has no rows.
        """
        all_embeddings = []
        total_records = len(self.df)
        # Only these two columns are read by process_batch, so avoid turning
        # every other column into Python dicts for each batch.
        text_columns = self.df[["title", "abstract"]]

        for start_idx in tqdm(range(0, total_records, self.chunk_size)):
            end_idx = min(start_idx + self.chunk_size, total_records)
            chunk_parts = []

            for batch_start in range(start_idx, end_idx, self.batch_size):
                batch_end = min(batch_start + self.batch_size, end_idx)
                batch = text_columns.iloc[batch_start:batch_end].to_dict(
                    orient="records"
                )

                if batch:  # Check if the batch is not empty
                    chunk_parts.append(self.process_batch(batch))

            # Save and accumulate the embeddings of this chunk
            if chunk_parts:
                chunk_embeddings = torch.cat(chunk_parts, dim=0)
                chunk_file = os.path.join(
                    self.save_dir, f"embeddings_chunk_{start_idx//self.chunk_size}.pt"
                )
                torch.save(chunk_embeddings, chunk_file)
                all_embeddings.append(chunk_embeddings)

            # One collection per chunk is enough; collecting after every batch
            # freed nothing because the batch tensors are still referenced.
            gc.collect()

        total_embeddings = (
            torch.cat(all_embeddings, dim=0) if all_embeddings else torch.tensor([])
        )
        torch.save(total_embeddings, os.path.join(self.save_dir, "total_embeddings.pt"))
        return total_embeddings

    def save_embeddings_with_data(
        self, embeddings, file_name="df_with_specter2_embeddings.pkl"
    ):
        """Attach the embeddings to ``self.df`` and pickle the frame into ``save_dir``.

        Parameters:
        embeddings (Tensor): CPU tensor with one row per DataFrame row.
        file_name (str): Name of the pickle file written inside ``save_dir``.
        """
        self.df["specter2_embeddings"] = list(embeddings.numpy())
        self.df.to_pickle(os.path.join(self.save_dir, file_name))

In [4]:
# Embed every row of `df` with the SPECTER2 adapter on top of `allenai/specter2_base`.
# The recorded run of this cell took roughly 7 hours for 15 chunks (see the
# progress bar in the cell output), so avoid re-running it casually.
processor = PaperEmbeddingProcessor(
    df=df,
    model_name="allenai/specter2_base",
    adapter_name="specter2",
    save_dir="../data/03-interim/specter-embeddings",
    batch_size=32,
    chunk_size=2500,
)
# Writes one .pt file per chunk plus total_embeddings.pt into save_dir.
total_embeddings = processor.process_papers()
# Adds the `specter2_embeddings` column to processor.df and pickles the frame into save_dir.
processor.save_embeddings_with_data(total_embeddings)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 15/15 [7:09:10<00:00, 1716.70s/it]  


In [None]:
#

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def compute_cosine_similarity_matrix(df, embeddings_column):
    """
    Computes the pairwise cosine similarity matrix for the embeddings in a DataFrame column.

    Parameters:
    df (DataFrame): The DataFrame containing the embeddings.
    embeddings_column (str): The name of the column containing embeddings. Every
        cell must hold a vector of the same length so the cells can be stacked.

    Returns:
    ndarray: A square matrix with one row and one column per DataFrame row; entry
        [i, j] is the cosine similarity between the embeddings of rows i and j.
        A row whose embedding is all zeros gets similarity 0 with every row,
        including itself.
    """

    # Convert the embeddings column to a 2D NumPy array
    embeddings = np.stack(df[embeddings_column].to_list())

    # Replace NaN with 0 and infinities with large finite values
    embeddings = np.nan_to_num(embeddings)

    # Normalize the embeddings to unit length. A zero vector has norm 0, and
    # dividing by it would put NaN into the result, so divide by 1 instead and
    # leave such rows as zeros.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1
    normalized_embeddings = embeddings / norms

    # Compute the cosine similarity matrix
    cosine_similarity_matrix = np.dot(normalized_embeddings, normalized_embeddings.T)

    return cosine_similarity_matrix


# Usage example:
# cosine_similarity_matrix = compute_cosine_similarity_matrix(df_connected, "embeddings_allmpnet")

In [None]:
# Pairwise cosine similarity between all stored SPECTER2 embeddings. The result
# is a dense square matrix with one row and one column per paper in processor.df.
cosine_similarity_matrix = compute_cosine_similarity_matrix(
    processor.df, "specter2_embeddings"
)

In [None]:
# Wrap the matrix in a DataFrame so the notebook renders it as a table.
pd.DataFrame(cosine_similarity_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.000000,0.831603,0.819804,0.913932,0.867501,0.840011,0.780502,0.829427,0.794649,0.850203,...,0.862365,0.880004,0.833291,0.872294,0.848742,0.845483,0.862239,0.875230,0.856609,0.887444
1,0.831603,1.000000,0.836459,0.849612,0.874288,0.854254,0.845415,0.865543,0.830590,0.845462,...,0.859995,0.851207,0.858259,0.851953,0.876773,0.902258,0.845239,0.893188,0.799374,0.834765
2,0.819804,0.836459,1.000000,0.823196,0.853579,0.843751,0.818995,0.850880,0.861743,0.829988,...,0.862416,0.855353,0.883752,0.842998,0.832910,0.865265,0.888510,0.877493,0.817226,0.815044
3,0.913932,0.849612,0.823196,1.000000,0.877501,0.859489,0.818500,0.821718,0.818351,0.852304,...,0.864038,0.886404,0.833654,0.877436,0.858027,0.849932,0.849155,0.878593,0.815456,0.909829
4,0.867501,0.874288,0.853579,0.877501,1.000000,0.867502,0.859598,0.857779,0.846112,0.879685,...,0.871438,0.902499,0.859350,0.896759,0.873450,0.891842,0.872624,0.898147,0.834942,0.880941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.845483,0.902258,0.865265,0.849932,0.891842,0.877556,0.860332,0.853012,0.847901,0.860254,...,0.862043,0.857571,0.870559,0.858161,0.880292,1.000000,0.886039,0.904251,0.845558,0.853190
996,0.862239,0.845239,0.888510,0.849155,0.872624,0.845260,0.814057,0.865789,0.823734,0.858040,...,0.901945,0.855924,0.879595,0.859307,0.916865,0.886039,1.000000,0.888929,0.827643,0.845197
997,0.875230,0.893188,0.877493,0.878593,0.898147,0.893017,0.845420,0.871651,0.849364,0.877527,...,0.883821,0.877932,0.881745,0.897596,0.885275,0.904251,0.888929,1.000000,0.855151,0.863556
998,0.856609,0.799374,0.817226,0.815456,0.834942,0.842685,0.770327,0.813183,0.788695,0.826106,...,0.811351,0.825791,0.828062,0.820177,0.795500,0.845558,0.827643,0.855151,1.000000,0.820599


In [None]:
def is_pair_already_printed(p1, p2, printed_pairs):
    """Return True when the pair is recorded in ``printed_pairs`` in either order."""
    forward = (p1, p2)
    if forward in printed_pairs:
        return True
    backward = (p2, p1)
    return backward in printed_pairs


def print_second_most_similar(df, cosine_similarity_matrix, threshold=0.96):
    """Print each paper together with its nearest other paper when they are very similar.

    For every row position ``i`` the most similar *other* paper is looked up in
    ``cosine_similarity_matrix``. The pair is printed when its similarity is
    strictly greater than ``threshold`` and the same pair (in either order) has
    not been printed before.

    Parameters:
    df (DataFrame): Papers, with a ``title`` column. Rows are matched to the
        matrix by position, so the frame's index labels do not matter.
    cosine_similarity_matrix (array-like): Square similarity matrix whose row
        and column ``i`` belong to the ``i``-th row of ``df``.
    threshold (float): Minimum similarity (exclusive) for a pair to be printed.

    Returns:
    None. The report is written to standard output.
    """
    similarity = np.asarray(cosine_similarity_matrix)
    n_papers = len(df)
    if n_papers < 2:
        # A single paper (or none) has no neighbour to compare against.
        return

    titles = df["title"].tolist()
    printed_pairs = set()

    for i in range(n_papers):
        # Mask the paper itself instead of assuming it sorts last: an exact or
        # near duplicate can tie with (or exceed, through rounding) the
        # self-similarity, which used to make the lookup skip that duplicate.
        candidates = np.array(similarity[i], dtype=float)
        candidates[i] = -np.inf
        second_most_similar = int(np.argmax(candidates))

        # Skip if the pair has already been printed
        if is_pair_already_printed(i, second_most_similar, printed_pairs):
            continue

        score = similarity[i][second_most_similar]

        # Check if the similarity is above the threshold
        if score > threshold:
            printed_pairs.add((i, second_most_similar))

            print(f"Paper {i}: {titles[i]}")
            print("Second most similar paper:")
            print(f"Paper {second_most_similar}: {titles[second_most_similar]}")
            print(f"Cosine similarity: {score}")
            print("----")


# Print each paper together with its nearest neighbour when their cosine
# similarity exceeds the function's 0.96 cutoff.
print_second_most_similar(processor.df, cosine_similarity_matrix)

Paper 16: Selective serotonin-reuptake inhibitor-induced movement disorders
Second most similar paper:
Paper 491: Movement disorders associated with the serotonin selective reuptake inhibitors
Cosine similarity: 0.9632427096366882
----
Paper 46: Central nervous system active medications and risk for fractures in older women
Second most similar paper:
Paper 784: Use of antidepressant medications and risk of fracture in older women
Cosine similarity: 0.9643115401268005
----
Paper 60: Contribution of post-secretory mechanisms to the observed pattern of histamine and 5-hydroxytryptamine secretion from peritoneal rat mast cells in response to compound 48/80
Second most similar paper:
Paper 529: Differential release of histamine and 5-hydroxytryptamine from rat mast cells: The contribution of amine uptake to the apparent pattern of secretion
Cosine similarity: 0.9715805053710938
----
Paper 65: Achieving remission from depression with venlafaxine and venlafaxine extended release: A literature