In [42]:
from openai import OpenAI

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import os
from dotenv import load_dotenv



In [43]:

# Load environment variables from ../.env (path is relative to the working
# directory); later cells read OPENAI_API_KEY and PINECONE_API_KEY from os.environ.
load_dotenv("../.env")


True

In [28]:

def read_doc(directory):
    """Load the documents found in ``directory`` via ``PyPDFDirectoryLoader``.

    Args:
        directory: Path of the folder handed to the loader.

    Returns:
        Whatever the loader's ``load()`` call produces (a list of documents;
        the recorded run returned an empty list for ``'documents/'``).
    """
    return PyPDFDirectoryLoader(directory).load()

# Load whatever the PDF loader finds in the local `documents/` folder.
doc = read_doc('documents/') 

# Display the loaded documents (the recorded run shows an empty list).
doc 


[]

In [29]:
from datasets import load_dataset

# Dataset repository and configuration for the MedEmbed training triplets.
DATASET_REPO = "abhinand/MedEmbed-training-triplets-v1"
DATASET_CONFIG = "default"

ds = load_dataset(DATASET_REPO, DATASET_CONFIG)



In [30]:
# Peek at the first training row to see which fields each record carries
# (query, pos, neg and their ids).
print(ds["train"][0])

{'query': 'cystic mass on uterus symptoms', 'pos': 'The patient presented to the emergency room with severe abdominal pain in the left lower quadrant, which was diagnosed as a cystic mass on the left anterior wall of the uterus. Presenting Symptoms: Severe abdominal pain in the left lower quadrant', 'neg': 'Symptoms: fast-growing pelvic mass and increased serum levels of tumor markers', 'query_id': 'ajrsr7e4', 'pos_id': 'lmk55mdl', 'neg_id': 'cqbh1ckt'}


In [31]:
# Wrap the positive passage of every training row in a Document, tagging it
# with its domain, source dataset and the query it was paired with.
docs = [
    Document(
        page_content=record["pos"],
        metadata={
            "domain": "healthcare",
            "dataset": "MedEmbed",
            "query_hint": record["query"],
        },
    )
    for record in ds["train"]
]


In [32]:
# Number of Document objects built from the training split.
len(docs)


232684

In [38]:
def chunk_data(docs, chunk_Size=800, chunk_overlap=50):
    """Split documents into smaller chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_Size: Forwarded to the splitter as ``chunk_size`` (default 800).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        The result of the splitter's ``split_documents`` call on ``docs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_Size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

# Split every document using the default chunk size (800) and overlap (50).
chunked_data = chunk_data(docs)

In [34]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used for every chunk. The API key is read from the
# environment rather than being hard-coded in the notebook.
EMBEDDING_MODEL = "text-embedding-3-small"

embeddings = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    api_key=os.environ["OPENAI_API_KEY"],
)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x36f888f50>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x36f889810>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

The dataset is too large to embed in full, so only a small sample (4% of the chunks) is embedded below.

In [None]:
# texts = [doc.page_content for doc in chunked_data]

# vectors = embeddings.embed_documents(texts)

# print(len(vectors))          # number of chunks
# print(len(vectors[0]))       # embedding dimension (1536)

KeyboardInterrupt: 

In [55]:
# Fraction of the chunks to embed; embedding the full set was interrupted
# because of the amount of data.
SAMPLE_FRACTION = 0.04

sample_size = int(SAMPLE_FRACTION * len(chunked_data))

# Head slice, not a random sample: keeps the first `sample_size` chunks.
sampled_docs = chunked_data[:sample_size]

sample_size

9308

In [56]:

# Plain text of each sampled chunk, in the same order as `sampled_docs`.
texts = [chunk.page_content for chunk in sampled_docs]

# One embedding per text, produced by the configured OpenAI embeddings client.
vectors = embeddings.embed_documents(texts)

In [57]:
from pinecone import Pinecone as PineconeClient
# NOTE(review): the `Pinecone` name bound here (LangChain's PineconeVectorStore)
# is rebound to the Pinecone client class by a later `from pinecone import
# Pinecone` in this notebook — consider a less ambiguous alias.
from langchain_pinecone import PineconeVectorStore as Pinecone

# Pinecone client, authenticated with the API key from the environment.
pc = PineconeClient(api_key=os.environ['PINECONE_API_KEY'])
# Name of the index that will hold the embeddings.
index_name = 'healthcare'

In [60]:
# Check the embedding dimensionality; the Pinecone index is created with a
# matching `dimension` value.
print(len(vectors[0]))  

1536


In [65]:
from pinecone import Pinecone, ServerlessSpec
import os

# NOTE(review): this import rebinds `Pinecone`, which an earlier cell bound to
# langchain_pinecone's PineconeVectorStore; `pc` and `index_name` are also
# re-created here with the same values as in that earlier cell.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "healthcare"

# Create the index only when no index with this name exists yet.
existing_index_names = {entry["name"] for entry in pc.list_indexes()}
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1536,   # matches len(vectors[0]) for text-embedding-3-small
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by the following cells to upsert into and inspect the index.
index = pc.Index(index_name)

In [63]:
import uuid

# Fixed namespace for deriving vector IDs, so that the same (query hint,
# chunk text) pair maps to the same ID on every run of this notebook.
VECTOR_ID_NAMESPACE = uuid.uuid5(uuid.NAMESPACE_DNS, "healthcare.medembed")

# zip() would silently drop the tail if the two lists drifted apart (e.g.
# `sampled_docs` was re-sampled without re-running the embedding cell).
if len(sampled_docs) != len(vectors):
    raise ValueError(
        f"Got {len(sampled_docs)} documents but {len(vectors)} embeddings; "
        "re-run the embedding cell before building the Pinecone records."
    )

# Keyed by vector ID so that exact duplicates (same query hint and same chunk
# text) collapse into a single record instead of repeating an ID.
records_by_id = {}

for doc, vector in zip(sampled_docs, vectors):
    # Deterministic ID. With random uuid4 IDs, rebuilding this list and
    # upserting again added a second copy of every chunk to the index instead
    # of overwriting the existing vector.
    id_source = f"{doc.metadata.get('query_hint', '')}\n{doc.page_content}"
    vector_id = str(uuid.uuid5(VECTOR_ID_NAMESPACE, id_source))

    records_by_id[vector_id] = (
        vector_id,             # stable, content-derived ID
        vector,                # 1536-dim embedding
        doc.metadata | {
            "text": doc.page_content   # keep the raw text alongside the vector
        },
    )

# List of (id, values, metadata) tuples, as consumed by the upsert cell.
pinecone_vectors = list(records_by_id.values())


In [66]:
# Number of vectors sent per upsert request.
BATCH_SIZE = 100

# Send the records in consecutive slices of BATCH_SIZE rather than in a single
# request; the final slice may be shorter.
batch_starts = range(0, len(pinecone_vectors), BATCH_SIZE)
for start in batch_starts:
    index.upsert(vectors=pinecone_vectors[start : start + BATCH_SIZE])

In [67]:
# Fetch index statistics to check how many vectors the index now holds.
# NOTE(review): the recorded run reports 10308 vectors although only 9308
# chunks were sampled — possibly leftovers from an earlier partial upsert
# with different IDs; confirm.
stats = index.describe_index_stats()
print(stats)

{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '140',
                                    'content-type': 'application/json',
                                    'date': 'Tue, 24 Feb 2026 14:02:43 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '4',
                                    'x-pinecone-request-latency-ms': '3',
                                    'x-pinecone-response-duration-ms': '5'}},
 'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10308}},
 'total_vector_count': 10308,
 'vector_type': 'dense'}
