In [2]:
import os
import time
import pandas as pd
from tqdm.auto import tqdm
from pandas import DataFrame
from datasets import load_dataset
import random
import string


# Import OpenAI client and initialize with your API key.
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Import Pinecone client and related specifications.
from pinecone import Pinecone
from pinecone import ServerlessSpec

In [9]:
# Load the dataset (ensure you're logged in with huggingface-cli if needed)
# from datasets import load_dataset
import pandas as pd
# ds = load_dataset("/Users/phani/Downloads/medical_o1_sft.json", "en", split='train[:100]')
ds_dataframe = pd.read_json('./medical_o1_sft.json')

# Merge the Question and Response columns into a single string.
ds_dataframe['merged'] = ds_dataframe.apply(
    lambda row: f"Question: {row['Question']} Answer: {row['Response']}", axis=1
)
print("Example merged text:", ds_dataframe['merged'].iloc[0])

Example merged text: Question: Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings? Answer: The specific cardiac abnormality most likely to be found in this scenario is a patent foramen ovale (PFO). This condition could allow a blood clot from the venous system, such as one from a deep vein thrombosis in the leg, to bypass the lungs and pass directly into the arterial circulation. This can occur when the clot moves from the right atrium to the left atrium through the PFO. Once in the arterial system, the clot can travel to the brain, potentially causing an embolic stroke, which would explain the sudden weakness in the left arm and leg. The connection between the recent travel, which increases the risk of deep vein thrombosis, and the neurological symptoms suggests the presen

In [5]:
MODEL = "text-embedding-3-small"  # Replace with your production embedding model if needed
# Compute an embedding for the first document to obtain the embedding dimension.
sample_embedding_resp = client.embeddings.create(
  input=[ds_dataframe['merged'].iloc[0]],
  model=MODEL
)
print(sample_embedding_resp)
embed_dim = len(sample_embedding_resp.data[0].embedding)
print(f"Embedding dimension: {embed_dim}")

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.016561094671487808, 0.029997453093528748, 0.052735909819602966, 0.03651132434606552, -0.03429997339844704, 0.02302689105272293, -0.04482792690396309, -0.017065858468413353, -0.014710289426147938, 0.004386646673083305, 0.002980516292154789, 0.009800850413739681, -0.039612021297216415, -0.010954598896205425, 0.019361337646842003, -0.008466829545795918, 0.028122613206505775, -0.004578938242048025, 0.0018372839549556375, -0.005149803124368191, 0.011567527428269386, 0.03374713659286499, 0.013123885728418827, -0.012294629588723183, 0.003899909323081374, -0.025358423590660095, -0.007222944404929876, -0.02684868313372135, -0.018183551728725433, -0.01973390206694603, 0.049563098698854446, -0.008749257773160934, 0.002035584533587098, 0.004648042842745781, 0.005411199294030666, 0.04660661891102791, -0.0464143306016922, 0.012318666093051434, -0.03651132434606552, -0.0203107763081789, 0.02037086710333824, -0.06042756140232086, -0.039083220064640

In [None]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Initialize Pinecone using your API key.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Define the Pinecone serverless specification.
AWS_REGION = "us-east-1"
spec = ServerlessSpec(cloud="aws", region=AWS_REGION)

# Create a random index name with lower case alphanumeric characters and '-'
index_name = 'pinecone-index-' + ''.join(random.choices(string.ascii_lowercase + string.digits, k=10))


# Create the index if it doesn't already exist.
if index_name not in pc.list_indexes().names():
  pc.create_index(
    index_name,
    dimension=embed_dim,
    metric='dotproduct',
    spec=spec
  )


# Connect to the index.
index = pc.Index(index_name)
time.sleep(1)
print("Index stats:", index.describe_index_stats())    

Index stats: {'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [14]:
res = {"data": [{"embedding": [1, 2, 3]}]}

In [16]:
print([record["embedding"] for record in res["data"]])

[[1, 2, 3]]


In [17]:
list(zip([1, 2, 3], [[1, 2, 3]]))


[(1, [1, 2, 3])]

In [18]:
batch_size = 32
for i in tqdm(range(0, len(ds_dataframe['merged']), batch_size), desc="Upserting to Pinecone"):
    i_end = min(i + batch_size, len(ds_dataframe['merged']))
    lines_batch = ds_dataframe['merged'][i: i_end]
    ids_batch = [str(n) for n in range(i, i_end)]
    
    # Create embeddings for the current batch.
    res = client.embeddings.create(input=[line for line in lines_batch], model=MODEL)
    embeds = [record.embedding for record in res.data]
    
    # Prepare metadata by extracting original Question and Answer.
    meta = []
    for record in ds_dataframe.iloc[i:i_end].to_dict('records'):
        q_text = record['Question']
        a_text = record['Response']
        # Optionally update metadata for specific entries.
        meta.append({"Question": q_text, "Answer": a_text})
    
    # Upsert the batch into Pinecone.
    vectors = list(zip(ids_batch, embeds, meta))
    index.upsert(vectors=vectors)

Upserting to Pinecone:  28%|██▊       | 9/32 [00:37<01:35,  4.15s/it]


PermissionDeniedError: Error code: 403 - {'error': {'message': 'Project `proj_e7WDixGTJfswZg7ft8kaEzOX` does not have access to model `text-embedding-3-small`', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [19]:
def query_pinecone_index(client, index, model, query_text):
    # Generate an embedding for the query.
    query_embedding = client.embeddings.create(input=query_text, model=model).data[0].embedding

    # Query the index and return top 5 matches.
    res = index.query(vector=[query_embedding], top_k=5, include_metadata=True)
    print("Query Results:")
    for match in res['matches']:
        print(f"{match['score']:.2f}: {match['metadata'].get('Question', 'N/A')} - {match['metadata'].get('Answer', 'N/A')}")
    return res

In [20]:
query = (
    "A 45-year-old man with a history of alcohol use presents with symptoms including confusion, ataxia, and ophthalmoplegia. "
    "What is the most likely diagnosis and the recommended treatment?"
)
query_pinecone_index(client, index, MODEL, query)

Query Results:
0.70: A 45-year-old man with a history of alcohol use, who has been abstinent for the past 10 years, presents with sudden onset dysarthria, shuffling gait, and intention tremors. Given this clinical presentation and history, what is the most likely diagnosis? - Considering the clinical presentation of sudden onset dysarthria, shuffling gait, and intention tremors in a 45-year-old man with a history of alcohol use who has been abstinent for the past 10 years, the most likely diagnosis is acquired hepatocerebral degeneration.

This condition is associated with chronic liver disease, which can often be a consequence of long-term alcohol use. Despite the patient's abstinence from alcohol for a decade, previous alcohol use may have led to underlying liver dysfunction. This dysfunction, even if subclinical, can cause encephalopathy due to the accumulation of neurotoxic substances that affect the brain. The sudden onset of these neurological symptoms aligns with how acquired he

{'matches': [{'id': '3',
              'metadata': {'Answer': 'Considering the clinical presentation of '
                                     'sudden onset dysarthria, shuffling gait, '
                                     'and intention tremors in a 45-year-old '
                                     'man with a history of alcohol use who '
                                     'has been abstinent for the past 10 '
                                     'years, the most likely diagnosis is '
                                     'acquired hepatocerebral degeneration.\n'
                                     '\n'
                                     'This condition is associated with '
                                     'chronic liver disease, which can often '
                                     'be a consequence of long-term alcohol '
                                     "use. Despite the patient's abstinence "
                                     'from alcohol for a decade, previous '

In [25]:
print(res)

CreateEmbeddingResponse(data=[Embedding(embedding=[0.022964244708418846, -0.0069473544135689735, 0.03431233391165733, -0.00737737538293004, -0.02866062894463539, 0.07528162002563477, 0.030202003195881844, -0.01698862761259079, 0.025309814140200615, -0.02130000852048397, 0.049368660897016525, 0.021635089069604874, -0.03862371668219566, 0.011549138464033604, 0.03884710371494293, 0.052272699773311615, -0.051066406071186066, 0.03692597150802612, -0.06505046784877777, 0.0660780519247055, 0.036992985755205154, 0.02470666915178299, 0.015357897616922855, -0.0012307260185480118, -0.0014254921115934849, -0.030626440420746803, 0.026292720809578896, 0.027566030621528625, 0.005785739049315453, -0.022539809346199036, 0.015168017707765102, -0.007980521768331528, -0.033150721341371536, -0.031609345227479935, -0.029755229130387306, -0.028638290241360664, 0.02562255784869194, 0.03837798908352852, -0.029487162828445435, -0.022986583411693573, -0.013961724936962128, -0.03596540540456772, 0.025175781920552

In [29]:
matches = index.query(
    vector=[client.embeddings.create(input=query, model=MODEL).data[0].embedding],
    top_k=3,
    include_metadata=True
)['matches']

context = "\n\n".join(
    f"Question: {m['metadata'].get('Question', '')}\nAnswer: {m['metadata'].get('Answer', '')}"
    for m in matches
)
# Use the context to generate a final answer.
response = client.responses.create(
    model="gpt-5-nano",
    input=f"Provide the answer based on the context: {context} and the question: {query} as per the internal knowledge base",
)
print("\nFinal Answer:")
print(response.output_text)


Final Answer:
- Most likely diagnosis: Wernicke encephalopathy due to thiamine (vitamin B1) deficiency in a person with chronic alcohol use.

- Recommended treatment: Immediate parenteral thiamine before any glucose administration. Typical regimen: 100 mg IV or IM three times daily for 2–3 days, then switch to oral thiamine (e.g., 100 mg daily) for several weeks. Address nutrition and continue efforts to stop alcohol use. (In acute/severe cases some guidelines use higher IV doses, but 100 mg TID is commonly used.)


In [30]:
tools = [   
    {"type": "web_search_preview",
      "user_location": {
        "type": "approximate",
        "country": "US",
        "region": "California",
        "city": "SF"
      },
      "search_context_size": "medium"},
    {
        "type": "function",
        "name": "PineconeSearchDocuments",
        "description": "Search for relevant documents based on the medical question asked by the user that is stored within the vector database using a semantic query.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The natural language query to search the vector database."
                },
                "top_k": {
                    "type": "integer",
                    "description": "Number of top results to return.",
                    "default": 3
                }
            },
            "required": ["query"],
            "additionalProperties": False
        }
    }
]

In [31]:
queries = [
    {"query": "Who won the cricket world cup in 1983?"},
    {"query": "What is the most common cause of death in the United States according to the internet?"},
    {"query": ("A 7-year-old boy with sickle cell disease is experiencing knee and hip pain, "
               "has been admitted for pain crises in the past, and now walks with a limp. "
               "His exam shows a normal, cool hip with decreased range of motion and pain with ambulation. "
               "What is the most appropriate next step in management according to the internal knowledge base?")}
]

In [41]:
# Process each query dynamically.
for item in queries[2:]:
    input_messages = [{"role": "user", "content": item["query"]}]
    print("\n🌟--- Processing Query ---🌟")
    print(f"🔍 **User Query:** {item['query']}")
    
    # Call the Responses API with tools enabled and allow parallel tool calls.
    response = client.responses.create(
        model="gpt-5-nano",
        input=[
            {"role": "system", "content": "When prompted with a question, select the right tool to use based on the question."
            },
            {"role": "user", "content": item["query"]}
        ],
        tools=tools,
        parallel_tool_calls=True
    )
    
    print("\n✨ **Initial Response Output:**")
    print(response.output)
    
    # Determine if a tool call is needed and process accordingly.
    if response.output:
        tool_call = response.output[0]
        tool_call = next((call for call in response.output if call.type in ["web_search_preview", "function_call"]), None)
        if tool_call:
            tool_name = tool_call.name if tool_call.type == "function_call" else "web_search_preview"
            print(f"\n🔧 **Model triggered a tool call:** {tool_name}")
            
            if tool_name == "PineconeSearchDocuments":
                print("🔍 **Invoking PineconeSearchDocuments tool...**")
                res = query_pinecone_index(client, index, MODEL, item["query"])
                if res["matches"]:
                    best_match = res["matches"][0]["metadata"]
                    result = f"**Question:** {best_match.get('Question', 'N/A')}\n**Answer:** {best_match.get('Answer', 'N/A')}"
                else:
                    result = "**No matching documents found in the index.**"
                print("✅ **PineconeSearchDocuments tool invoked successfully.**")
            else:
                print("🔍 **Invoking simulated web search tool...**")
                result = "**Simulated web search result.**"
                print("✅ **Simulated web search tool invoked successfully.**")
            
            # Append the tool call and its output back into the conversation.
            # reasoning = response.output[0]
            input_messages.append(tool_call)
            input_messages.append({
                "type": "function_call_output",
                "call_id": tool_call.call_id,
                "output": str(result)
            })
            
            # Get the final answer incorporating the tool's result.
            final_response = client.responses.create(
                model="gpt-5-nano",
                input=input_messages,
                tools=tools,
                parallel_tool_calls=True
            )
            print("\n💡 **Final Answer:**")
            print(final_response.output_text)
        else:
            # If no tool call is triggered, print the response directly.
            print("💡 **Final Answer:**")
            print(response.output_text)


🌟--- Processing Query ---🌟
🔍 **User Query:** A 7-year-old boy with sickle cell disease is experiencing knee and hip pain, has been admitted for pain crises in the past, and now walks with a limp. His exam shows a normal, cool hip with decreased range of motion and pain with ambulation. What is the most appropriate next step in management according to the internal knowledge base?

✨ **Initial Response Output:**
[ResponseReasoningItem(id='rs_0999d90e88f10e4c0068e3e1df883081948a64dbff34c9ee35', summary=[], type='reasoning', content=None, encrypted_content=None, status=None), ResponseOutputMessage(id='msg_0999d90e88f10e4c0068e3e1e79a3481949ffcd367c3c3b8c8', content=[ResponseOutputText(annotations=[], text='Order an MRI of the hips (and pelvis as needed) to evaluate for avascular necrosis/osteonecrosis of the femoral head. In children with sickle cell disease who present with hip/knee pain and a limp, early AVN can occur with a normal-appearing, cool hip on exam, and MRI is the most sensit