<h3>Data Chunking and Embedding</h3>

In [30]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
import pickle
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import os

In [31]:
# Load environment variables via python-dotenv, then read the key that is
# later handed to the OpenAI client (None when `api_key` is not set).
load_dotenv()
api_key = os.environ.get('api_key')

In [32]:
def store_list(data, filename):
    """Serialize ``data`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        data: Any picklable object (the notebook passes a list of tuples).
        filename: Path of the file to write.
    """
    with open(filename, mode="wb") as sink:
        pickle.Pickler(sink).dump(data)

#### Split abstracts into chunks

In [33]:
# Name of the sentence-transformers model. It is passed to both the token
# splitter and the SentenceTransformer embedding model, so chunking and
# embedding are configured from the same model name.
model_name = "sentence-transformers/all-mpnet-base-v2"

In [34]:
# Token-based text splitter configured for `model_name`: 256 tokens per
# chunk with a 20-token overlap between consecutive chunks.
splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=20,
    model_name=model_name,
)

In [35]:
# Read the publication table, then convert it to one dict per CSV row
# (column name -> cell value). Despite its name, `medpub_dict` is a list
# of such row dicts.
df = pd.read_csv('all_med_data.csv')
medpub_dict = df.to_dict('records')

#### Define the embedding function (two options: OpenAI text-embedding-ada-002 or all-mpnet-base-v2)

1. OpenAI text-embedding-ada-002

In [None]:
# OpenAI API client built with the key read from the environment; it is the
# module-level client that get_embedding() sends its requests through.
client = OpenAI(api_key=api_key)

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for ``text``.

    Newlines in ``text`` are replaced with spaces before the request is
    sent through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to request.

    Returns:
        The ``embedding`` field of the first (and only) item in the
        response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

2. all-mpnet-base-v2

In [36]:
# Load the embedding model named by `model_name`; its encode() method
# produces the vectors stored alongside each chunk further down.
model = SentenceTransformer(model_name)

#### Build vector records (ID, embedding, metadata) for each chunk

In [37]:
# Build one (id, embedding, metadata) tuple per abstract chunk.
# The list is re-created here so re-running the cell does not append
# duplicates to a previous result.
vector_data = []

for doc in tqdm(medpub_dict):

    # NOTE(review): assumes every record has a string "Abstract" — confirm
    # against the CSV (pandas yields NaN for empty cells).
    chunks = splitter.split_text(text=doc["Abstract"])
    if not chunks:
        # Nothing to embed for this record.
        continue

    # Encode all chunks of this abstract in one batched call instead of one
    # model.encode() call per chunk. The result has one row per chunk, in
    # the same order as `chunks`; .tolist() turns it into plain lists of
    # floats. Values may differ from per-chunk encoding in the last float
    # digits because of batching.
    #
    # Alternative (OpenAI text-embedding-ada-002), one request per chunk:
    #   embeddings = [get_embedding(c, model="text-embedding-ada-002") for c in chunks]
    embeddings = model.encode(chunks).tolist()

    for j, (chunk, embedding) in enumerate(zip(chunks, embeddings)):

        # Document-level fields are repeated on every chunk so each record
        # is self-describing; "text_chunk_id" is the chunk's position
        # within its abstract.
        metadata = {
            "PMID": doc["PMID"],
            "title": doc["Title"],
            "authors": doc["Authors"],
            "first_author": doc["First Author"],
            "journal": doc["Journal/Book"],
            "publication_year": doc["Publication Year"],
            "text_chunk_id": j,
            "chunk": chunk,
            "abstract": doc["Abstract"]
        }

        # Record id: "<PMID>_<chunk index>".
        ids = f"{doc['PMID']}_{j}"

        vector_data.append((ids, embedding, metadata))


100%|██████████| 58140/58140 [2:42:03<00:00,  5.98it/s]   


#### Store data

In [38]:
# Persist the (id, embedding, metadata) tuples as a pickle file.
# Pickle files should only be loaded back from a trusted source.
file_path = "vector_data.pkl"
store_list(data=vector_data, filename=file_path)