# Create Embeddings

Model: pritamdeka/S-PubMedBert-MS-MARCO


In [1]:
import warnings

import pandas as pd

from src.nlp.EmbeddingCreator import EmbeddingCreator
from src.nlp.TextProcessor import TextProcessor

warnings.simplefilter(action="ignore", category=FutureWarning)

## Load data


In [2]:
# Read the pickled DataFrame from the processed-data folder.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
merged_data_path = "../data/processed/merged_data_refsfetched.pkl"
df = pd.read_pickle(merged_data_path)

## Process text

Remove the NoTitle / NoAbstract placeholder strings, extra spaces, extra punctuation, and ending statements (copyright notices, etc.).


In [3]:
# Clean every "title_abstract" entry with the project's TextProcessor and
# write the cleaned text back into the same column.
text_preprocessor = TextProcessor()
clean_title_abstract = text_preprocessor.clean_text_and_remove_ending_statements
df["title_abstract"] = df["title_abstract"].apply(clean_title_abstract)

## Get embeddings


In [4]:
# Hugging Face checkpoint used for the embeddings.
# Alternative checkpoint kept for reference:
#   "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
model_path = "pritamdeka/S-PubMedBert-MS-MARCO"

# Build the EmbeddingCreator over the cleaned DataFrame.
embedding_creator = EmbeddingCreator(df, modelpath=model_path, batch_size=32)

# Embed the "title_abstract" column chunk by chunk; each finished chunk is
# pickled into save_directory and the combined frame is returned.
embedding_options = dict(
    text_column_name="title_abstract",
    embeddings_column_name="embeddings_biomedbert_sentence_transformer",
    save_directory="../data/processed/embeddings",
    return_df=True,
    start_chunk=0,
    chunk_size=2000,
    max_length=512,
)
df_embeddings = embedding_creator.create_embeddings(**embedding_options)

Using autotokenizer and automodel
Model: pritamdeka/S-PubMedBert-MS-MARCO
Df Shape: (40481, 20)
Dataframe sorted by year and split into 21 chunks of size 2000


100%|██████████| 21/21 [00:00<00:00, 137.16it/s]
Processing batches: 100%|██████████| 63/63 [14:17<00:00, 13.61s/it]


Saved chunk 0 to pickle: ../data/processed/embeddings/0_1983-1993.pkl


Processing batches: 100%|██████████| 63/63 [12:33<00:00, 11.96s/it]


Saved chunk 1 to pickle: ../data/processed/embeddings/1_1993-1996.pkl


Processing batches: 100%|██████████| 63/63 [11:28<00:00, 10.93s/it]


Saved chunk 2 to pickle: ../data/processed/embeddings/2_1996-1999.pkl


Processing batches: 100%|██████████| 63/63 [12:40<00:00, 12.07s/it]


Saved chunk 3 to pickle: ../data/processed/embeddings/3_1999-2001.pkl


Processing batches: 100%|██████████| 63/63 [11:45<00:00, 11.20s/it]


Saved chunk 4 to pickle: ../data/processed/embeddings/4_2001-2002.pkl


Processing batches: 100%|██████████| 63/63 [13:39<00:00, 13.00s/it]


Saved chunk 5 to pickle: ../data/processed/embeddings/5_2002-2004.pkl


Processing batches: 100%|██████████| 63/63 [13:21<00:00, 12.73s/it]


Saved chunk 6 to pickle: ../data/processed/embeddings/6_2004-2006.pkl


Processing batches: 100%|██████████| 63/63 [12:12<00:00, 11.63s/it]


Saved chunk 7 to pickle: ../data/processed/embeddings/7_2006-2007.pkl


Processing batches: 100%|██████████| 63/63 [11:09<00:00, 10.63s/it]


Saved chunk 8 to pickle: ../data/processed/embeddings/8_2007-2008.pkl


Processing batches: 100%|██████████| 63/63 [11:10<00:00, 10.64s/it]


Saved chunk 9 to pickle: ../data/processed/embeddings/9_2008-2010.pkl


Processing batches: 100%|██████████| 63/63 [11:10<00:00, 10.64s/it]


Saved chunk 10 to pickle: ../data/processed/embeddings/10_2010-2011.pkl


Processing batches: 100%|██████████| 63/63 [11:12<00:00, 10.67s/it]


Saved chunk 11 to pickle: ../data/processed/embeddings/11_2011-2012.pkl


Processing batches: 100%|██████████| 63/63 [11:12<00:00, 10.68s/it]


Saved chunk 12 to pickle: ../data/processed/embeddings/12_2012-2014.pkl


Processing batches: 100%|██████████| 63/63 [11:12<00:00, 10.67s/it]


Saved chunk 13 to pickle: ../data/processed/embeddings/13_2014-2015.pkl


Processing batches: 100%|██████████| 63/63 [11:10<00:00, 10.64s/it]


Saved chunk 14 to pickle: ../data/processed/embeddings/14_2015-2016.pkl


Processing batches: 100%|██████████| 63/63 [11:14<00:00, 10.70s/it]


Saved chunk 15 to pickle: ../data/processed/embeddings/15_2016-2017.pkl


Processing batches: 100%|██████████| 63/63 [11:18<00:00, 10.76s/it]


Saved chunk 16 to pickle: ../data/processed/embeddings/16_2017-2019.pkl


Processing batches: 100%|██████████| 63/63 [11:15<00:00, 10.72s/it]


Saved chunk 17 to pickle: ../data/processed/embeddings/17_2019-2020.pkl


Processing batches: 100%|██████████| 63/63 [11:14<00:00, 10.70s/it]


Saved chunk 18 to pickle: ../data/processed/embeddings/18_2020-2021.pkl


Processing batches: 100%|██████████| 63/63 [11:32<00:00, 11.00s/it]


Saved chunk 19 to pickle: ../data/processed/embeddings/19_2021-2022.pkl


Processing batches: 100%|██████████| 16/16 [02:49<00:00, 10.61s/it]


Saved chunk 20 to pickle: ../data/processed/embeddings/20_2022-9999.pkl


In [5]:
import os

# Directory the embedding chunks were saved to, relative to the notebook like
# every other path here (an absolute /Users/... path only works on one machine).
embeddings_dir = "../data/processed/embeddings"


def _chunk_index(file_name):
    """Return the leading integer of a '<index>_<years>.pkl' name, or None.

    Files without a numeric prefix (e.g. a saved clustering result living in
    the same folder) yield None so they can be skipped.
    """
    prefix = file_name.split("_", 1)[0]
    return int(prefix) if prefix.isdecimal() else None


# os.listdir returns names in arbitrary order, so sort by chunk index to get a
# reproducible row order in the merged frame.
chunk_files = sorted(
    (
        name
        for name in os.listdir(embeddings_dir)
        if name.endswith(".pkl") and _chunk_index(name) is not None
    ),
    key=_chunk_index,
)
if not chunk_files:
    raise FileNotFoundError(f"No embedding chunk .pkl files found in {embeddings_dir!r}")

# Read and merge all chunk pickles.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df_embeddings = pd.concat(
    [pd.read_pickle(os.path.join(embeddings_dir, name)) for name in chunk_files]
)

In [6]:
# cluster
from src.models.KMeansClustering import KMeansClustering

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
# Clustering configuration: which embedding column to cluster and how many
# clusters to ask for.
embeddings_column = "embeddings_biomedbert_sentence_transformer"
num_clusters = 14

clusterer = KMeansClustering(
    embeddings_column=embeddings_column,
    num_clusters=num_clusters,
    df=df_embeddings,
)

# Run k-means. To also persist the result, pass
#   save_path="../data/processed/embeddings/kmeans_clustered.pkl"
clusterer.kmeans_clustering()

Silhouette Score: 0.02


In [8]:
def _closest_to_centroid(cluster_rows):
    """Return the 15 rows of one cluster with the smallest distance_to_centroid."""
    return cluster_rows.nsmallest(15, "distance_to_centroid")


# Per cluster, keep the 15 rows nearest to the centroid.
# NOTE(review): newer pandas releases deprecate passing the grouping column
# into apply; confirm top_n still has a "cluster" column after upgrading.
top_n = clusterer.df.groupby("cluster").apply(_closest_to_centroid)

In [9]:
# Write one plain-text report per cluster listing the rows selected in top_n:
# title, abstract, paper id and distance to the centroid.
separator = " --------------------------------------------- \n"

for cluster in range(num_clusters):
    # Explicit UTF-8: titles and abstracts may contain non-ASCII characters,
    # and the platform default encoding is not UTF-8 everywhere.
    with open(
        f"../data/processed/embeddings/cluster_{cluster}.txt", "w", encoding="utf-8"
    ) as f:
        f.write("Cluster " + str(cluster) + "\n")
        f.write(separator)

        # Filter the DataFrame for the current cluster
        cluster_rows = top_n[top_n["cluster"] == cluster]

        # The row index is not needed, only the row values.
        for _, row in cluster_rows.iterrows():
            # str() because these fields are not guaranteed to be strings
            f.write(str(row["title"]) + "\n")
            f.write(str(row["abstract"]) + "\n")
            f.write(str(row["paper_id"]) + "\n")
            f.write(str(row["distance_to_centroid"]) + "\n")
            f.write("\n" + separator)