In [1]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("qiaojin/PubMedQA", "pqa_artificial")
df = pd.DataFrame(dataset)

df = pd.DataFrame(df["train"].tolist())



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

def extract_context(context_entry):
    """Extracts text from context JSON field"""
    if isinstance(context_entry, str):
        try:
            context_dict = json.loads(context_entry)
            return " ".join(context_dict.get("contexts", []))  # Join all context pieces
        except json.JSONDecodeError:
            return context_entry  # Return as-is if not JSON formatted
    return ""
# Apply preprocessing
df["context"] = df["context"].apply(extract_context)

# Select only needed columns
df = df[["pubid", "question", "context", "long_answer"]]

# Show sample
print(df.head())


      pubid                                           question context  \
0  25429730  Are group 2 innate lymphoid cells ( ILC2s ) in...           
1  25433161  Does vagus nerve contribute to the development...           
2  25445714  Does psammaplin A induce Sirtuin 1-dependent a...           
3  25431941  Is methylation of the FGFR2 gene associated wi...           
4  25432519  Do tumor-infiltrating immune cell profiles and...           

                                         long_answer  
0  As ILC2s are elevated in patients with CRSwNP,...  
1  Neuronal signals via the hepatic vagus nerve c...  
2  PsA significantly inhibited MCF-7/adr cells pr...  
3  We identified a novel biologically plausible c...  
4  Breast cancer immune cell subpopulation profil...  


In [2]:
import os
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()
# Set your OpenAI API Key
# os.environ["OPENAI_API_KEY"] = 

# Initialize Pinecone
pc = pinecone.Pinecone(os.environ["PINECONE_API_KEY"])
index_name = "pubmedqa-index"
spec = pinecone.ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )    
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        spec=spec,
    )
index = pc.Index(index_name)

In [5]:
from langchain.vectorstores import Pinecone
from tqdm import tqdm

# Initialize OpenAI embedding model
embedder = OpenAIEmbeddings()
df = df.head(1000)
# Convert DataFrame to list of dictionaries for processing
documents = df.to_dict(orient="records")
# Convert DataFrame to list of dictionaries for processing
# documents = df.to_dict(orient="records")

# Store batched embeddings and upsert to Pinecone
batch_size = 32  # Adjust as needed

for i in tqdm(range(0, len(documents), batch_size)):
    batch = documents[i:i+batch_size]
    
    # Extract context texts
    batch_texts = [doc["context"] for doc in batch]
    
    # Generate embeddings for batch
    batch_embeddings = embedder.embed_documents(batch_texts)
    
    # Prepare for Pinecone upsert
    to_upsert = [
        (str(doc["pubid"]), emb, {"question": doc["question"], "text": doc["long_answer"]})
        for doc, emb in zip(batch, batch_embeddings)
    ]
    
    # Upsert batch to Pinecone
    index.upsert(to_upsert)


  embedder = OpenAIEmbeddings()
100%|██████████| 32/32 [06:49<00:00, 12.79s/it]


In [3]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore

# Initialize OpenAI Chat Model
llm = ChatOpenAI(model="gpt-4o")

# Initialize Pinecone Vector Store
embeddings = OpenAIEmbeddings()
# vector_store = Pinecone(index, embedder, "context")
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Use a better retriever
question = "Does vagus nerve contribute to the development of the brain?"
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})  # ✅ Use "context"


  embeddings = OpenAIEmbeddings()


In [4]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

system_prompt = (
            "You are an expert assistant for biomedical question-answering tasks. "
            "You will be provided with context retrieved from medical literature, specifically PubMed Open Access Articles. "
            "Use the provided context to directly answer the question in the most accurate and concise manner possible. "
            "If the context does not provide sufficient information, state that the specific details are not available in the context."
            "Do not include statements about limitations of the context in your response. "
            "Your answer should sound authoritative and professional, tailored for a medical audience."
            "\n\n"
            "Context:\n{context}\n"
        )
prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}"),
        ])
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(vectorstore.as_retriever(), combine_docs_chain)
response = rag_chain.invoke({"input": "How does the hepatic vagus nerve influence steatohepatitis and obesity?"})


In [5]:
response

{'input': 'How does the hepatic vagus nerve influence steatohepatitis and obesity?',
 'context': [Document(id='25443731', metadata={'question': 'Do the effects of ifenprodil on the activity of antidepressant drugs in the forced swim test in mice?'}, page_content='The concomitant administration of certain commonly prescribed antidepressant drugs that affect the serotonergic neurotransmission (i.e., typical tricyclic antidepressants and selective serotonin reuptake inhibitors) with a negative modulator selectively binding to the GluN1/N2B subunits of the NMDA receptor complex (i.e., ifenprodil) may induce a more pronounced antidepressant-like effect than monotherapy. However, these findings still need to be confirmed in further experiments.'),
  Document(id='25449582', metadata={'question': 'Is gut microbiome composition associated with temperament during early childhood?'}, page_content='Differences in gut microbiome composition, including alpha diversity, beta diversity, and abundances

In [10]:
response = rag_chain.invoke({"input" :"Is it possible to detect myocarditis before left ventricular dysfunction using ultrasound imaging?"})

In [11]:
response['answer']

'Yes, contrast-enhanced ultrasound molecular imaging can detect myocarditis by identifying endothelial inflammation and leukocyte infiltration before there is a detectable decline in left ventricular performance by functional imaging.'