# Create Embeddings

Model: sentence-transformers/all-mpnet-base-v2


In [6]:
import glob
import os
import warnings

import pandas as pd

from src.nlp.EmbeddingCreator import EmbeddingCreator
from src.nlp.TextProcessor import TextProcessor

warnings.simplefilter(action="ignore", category=FutureWarning)

## load data


In [7]:
# Read the merged dataset from its pickle file
merged_data_path = "../data/interim/merged/merged_data_with_refs.pkl"
df = pd.read_pickle(merged_data_path)

## Process text

Remove `NoTitle` / `NoAbstract` placeholder strings, extra spaces, extra punctuation, and ending statements (copyright...).


In [8]:
# Run every title/abstract string through the project's text cleaner
text_preprocessor = TextProcessor()
clean_text = text_preprocessor.clean_text_and_remove_ending_statements
df["title_abstract"] = df["title_abstract"].apply(clean_text)

## get embeddings


In [5]:
# Alternative model (disabled):
# model_path = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
model_path = "sentence-transformers/all-mpnet-base-v2"

# Create an instance of EmbeddingCreator for the cleaned DataFrame
embedding_creator = EmbeddingCreator(df, modelpath=model_path, batch_size=32)

# Embed the text chunk by chunk; the run log shows each chunk being
# pickled into the save directory
embedding_options = dict(
    text_column_name="title_abstract",
    embeddings_column_name="embeddings_allmpnet",
    save_directory="../data/processed/allmpnet-embeddings",
    return_df=True,
    start_chunk=0,
    chunk_size=2000,
    max_length=512,
)
df_embeddings = embedding_creator.create_embeddings(**embedding_options)

Using autotokenizer and automodel
Model: sentence-transformers/all-mpnet-base-v2
Df Shape: (40481, 20)
Dataframe sorted by year and split into 21 chunks of size 2000


100%|██████████| 21/21 [00:00<00:00, 367.53it/s]
Processing batches: 100%|██████████| 63/63 [14:41<00:00, 13.99s/it]


Saved chunk 0 to pickle: ../data/processed/allmpnet-embeddings/0_1983-1993.pkl


Processing batches: 100%|██████████| 63/63 [13:10<00:00, 12.55s/it]


Saved chunk 1 to pickle: ../data/processed/allmpnet-embeddings/1_1993-1996.pkl


Processing batches: 100%|██████████| 63/63 [12:49<00:00, 12.21s/it]


Saved chunk 2 to pickle: ../data/processed/allmpnet-embeddings/2_1996-1999.pkl


Processing batches: 100%|██████████| 63/63 [12:03<00:00, 11.48s/it]


Saved chunk 3 to pickle: ../data/processed/allmpnet-embeddings/3_1999-2001.pkl


Processing batches: 100%|██████████| 63/63 [11:42<00:00, 11.15s/it]


Saved chunk 4 to pickle: ../data/processed/allmpnet-embeddings/4_2001-2002.pkl


Processing batches: 100%|██████████| 63/63 [11:44<00:00, 11.18s/it]


Saved chunk 5 to pickle: ../data/processed/allmpnet-embeddings/5_2002-2004.pkl


Processing batches: 100%|██████████| 63/63 [13:02<00:00, 12.42s/it]


Saved chunk 6 to pickle: ../data/processed/allmpnet-embeddings/6_2004-2006.pkl


Processing batches: 100%|██████████| 63/63 [12:38<00:00, 12.05s/it]


Saved chunk 7 to pickle: ../data/processed/allmpnet-embeddings/7_2006-2007.pkl


Processing batches: 100%|██████████| 63/63 [11:50<00:00, 11.27s/it]


Saved chunk 8 to pickle: ../data/processed/allmpnet-embeddings/8_2007-2008.pkl


Processing batches: 100%|██████████| 63/63 [11:52<00:00, 11.31s/it]


Saved chunk 9 to pickle: ../data/processed/allmpnet-embeddings/9_2008-2010.pkl


Processing batches: 100%|██████████| 63/63 [12:03<00:00, 11.49s/it]


Saved chunk 10 to pickle: ../data/processed/allmpnet-embeddings/10_2010-2011.pkl


Processing batches: 100%|██████████| 63/63 [11:59<00:00, 11.43s/it]


Saved chunk 11 to pickle: ../data/processed/allmpnet-embeddings/11_2011-2012.pkl


Processing batches: 100%|██████████| 63/63 [11:54<00:00, 11.34s/it]


Saved chunk 12 to pickle: ../data/processed/allmpnet-embeddings/12_2012-2014.pkl


Processing batches: 100%|██████████| 63/63 [11:51<00:00, 11.29s/it]


Saved chunk 13 to pickle: ../data/processed/allmpnet-embeddings/13_2014-2015.pkl


Processing batches: 100%|██████████| 63/63 [11:49<00:00, 11.26s/it]


Saved chunk 14 to pickle: ../data/processed/allmpnet-embeddings/14_2015-2016.pkl


Processing batches: 100%|██████████| 63/63 [11:52<00:00, 11.30s/it]


Saved chunk 15 to pickle: ../data/processed/allmpnet-embeddings/15_2016-2017.pkl


Processing batches: 100%|██████████| 63/63 [11:50<00:00, 11.28s/it]


Saved chunk 16 to pickle: ../data/processed/allmpnet-embeddings/16_2017-2019.pkl


Processing batches: 100%|██████████| 63/63 [11:50<00:00, 11.28s/it]


Saved chunk 17 to pickle: ../data/processed/allmpnet-embeddings/17_2019-2020.pkl


Processing batches: 100%|██████████| 63/63 [11:51<00:00, 11.29s/it]


Saved chunk 18 to pickle: ../data/processed/allmpnet-embeddings/18_2020-2021.pkl


Processing batches: 100%|██████████| 63/63 [12:06<00:00, 11.53s/it]


Saved chunk 19 to pickle: ../data/processed/allmpnet-embeddings/19_2021-2022.pkl


Processing batches: 100%|██████████| 16/16 [02:54<00:00, 10.93s/it]


Saved chunk 20 to pickle: ../data/processed/allmpnet-embeddings/20_2022-9999.pkl


In [5]:
# Merge the per-chunk embedding pickles into one DataFrame and save it.
#
# Chunks are read from the directory passed as `save_directory` to
# create_embeddings() earlier in this notebook. Globbing
# "../data/interim/allmpnet-embeddings" instead finds nothing on a fresh run,
# and pd.concat() then fails on the empty list.
CHUNK_DIR = "../data/processed/allmpnet-embeddings"
# NOTE(review): output location kept unchanged so existing consumers still
# find the combined file — confirm interim/ (not processed/) is intended.
COMBINED_PATH = "../data/interim/allmpnet-embeddings/allmpnet-embeddings.pkl"


def _chunk_sort_key(path):
    """Order '<index>_<years>.pkl' files by numeric index; other names go last."""
    prefix = os.path.basename(path).split("_", 1)[0]
    if prefix.isdecimal():
        return (0, int(prefix), path)
    return (1, 0, path)


# glob returns files in arbitrary order, so sort by chunk index to get a
# deterministic row order. Skip a previously written combined file so a re-run
# cannot read it back in and duplicate every row.
chunk_files = sorted(
    (
        file
        for file in glob.glob(os.path.join(CHUNK_DIR, "*.pkl"))
        if os.path.abspath(file) != os.path.abspath(COMBINED_PATH)
    ),
    key=_chunk_sort_key,
)
if not chunk_files:
    raise FileNotFoundError(f"No embedding chunks (*.pkl) found in {CHUNK_DIR!r}")

embeddings = [pd.read_pickle(file) for file in chunk_files]

# concatenate all chunks (original indices are preserved)
df_embeddings = pd.concat(embeddings)

# Save the combined embeddings; the per-chunk files are kept on disk.
os.makedirs(os.path.dirname(COMBINED_PATH), exist_ok=True)
df_embeddings.to_pickle(COMBINED_PATH)