In [1]:
import json
import time
import requests
import sys
import os
from dotenv import load_dotenv
from openai import OpenAI
# Load variables from a .env file into the process environment before any key lookup.
load_dotenv() 

# Build an OpenAI SDK client from the OPENAI_API_KEY environment variable.
# NOTE(review): the request helpers visible in this notebook call the REST endpoint
# through `requests` and never reference `client` — confirm whether it is still needed.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [2]:
# Location of the standardized relation-triples JSON file.
# The default is the original author's machine-specific absolute path; set the
# KG_TRIPLES_PATH environment variable (a .env entry works, since load_dotenv()
# runs in the first cell) to run the notebook elsewhere without editing this cell.
INPUT_JSON_PATH = os.environ.get(
    "KG_TRIPLES_PATH",
    "/Users/cj2837/Documents/Courses/Project/outputs/std_relation_triples.json",
)

# Chat-completions model used by both the baseline and the RAG queries.
MODEL_NAME = "gpt-4o"

In [3]:
# Read the triples file. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open(INPUT_JSON_PATH, "r", encoding="utf-8") as f:
    KG_DATA = json.load(f)

# Report only after the file handle has been released.
print(f"Successfully loaded {len(KG_DATA)} triplets from {INPUT_JSON_PATH}")

# Display a sample triplet
if KG_DATA:
    print("\nSample Triplet:")
    print(KG_DATA[0])

Successfully loaded 1052 triplets from /Users/cj2837/Documents/Courses/Project/outputs/std_relation_triples.json

Sample Triplet:
{'head': 'bonviva', 'head_cui': 'c0918018', 'head_semantic_source': 'msh', 'relation': 'treats', 'tail': 'osteoporosis, postmenopausal', 'tail_cui': 'c0029458', 'tail_semantic_source': 'mth', 'head_type': 'medication', 'tail_type': 'condition'}


## KG Class Definition

In [4]:
class KnowledgeGraph:
    """
    In-memory index over knowledge-graph triplets for 1-hop lookups.

    Every triplet is rendered as a human-readable fact string and stored under
    both its head and its tail concept, so a concept can be looked up no matter
    which side of the relation it appears on.

    Attributes:
        graph: dict mapping a concept name to the list of fact strings that
            mention it (forward-arrow form under the head, reverse-arrow form
            under the tail).
    """
    def __init__(self, triplets):
        """
        Build the index and print how many distinct concepts it holds.

        Args:
            triplets: iterable of dicts carrying the keys 'head', 'head_cui',
                'head_type', 'relation', 'tail', 'tail_cui' and 'tail_type'.

        Raises:
            KeyError: if a triplet lacks one of the keys listed above.
        """
        self.graph = {}
        self._load_triplets(triplets)
        print(f"Knowledge Graph initialized with {len(self.graph)} unique nodes/concepts.")

    def _load_triplets(self, triplets):
        """Converts the list of triplets into a dictionary-based graph structure."""
        for t in triplets:
            head = t['head']
            tail = t['tail']
            relation = t['relation']

            # Forward rendering, stored under the head concept.
            context_str = (
                f"{head} (CUI: {t['head_cui']}) --({relation})--> {tail} (CUI: {t['tail_cui']}). "
                f"Head Type: {t['head_type']}, Tail Type: {t['tail_type']}."
            )

            # Reverse rendering, stored under the tail concept so that a lookup
            # by the tail still yields a fact that starts with the queried node.
            inverse_context_str = (
                f"{tail} (CUI: {t['tail_cui']}) <--({relation})-- {head} (CUI: {t['head_cui']}). "
                f"Head Type: {t['head_type']}, Tail Type: {t['tail_type']}."
            )

            # Head first, then tail, so node insertion order follows the data.
            self.graph.setdefault(head, []).append(context_str)
            self.graph.setdefault(tail, []).append(inverse_context_str)

    def retrieve_context(self, search_term: str) -> str:
        """
        Retrieves all connected facts (1-hop neighbors) for a given search term.

        Matching is case-insensitive. A node whose name equals the search term
        is preferred; only when no such node exists does the lookup fall back
        to the first node (in insertion order) whose name contains the term.

        Args:
            search_term: concept name, or a fragment of one, to look up.

        Returns:
            The node's unique facts, sorted and joined with newlines, or an
            empty string when the term is blank or nothing matches.
        """
        search_term_lower = (search_term or "").strip().lower()
        if not search_term_lower:
            # A blank term is a substring of every key and would otherwise
            # silently select whichever node happens to come first.
            return ""

        matching_key = None
        for key in self.graph:
            key_lower = key.lower()
            if key_lower == search_term_lower:
                matching_key = key
                break
            if matching_key is None and search_term_lower in key_lower:
                # Remember the first partial hit, but keep scanning in case an
                # exact match appears later.
                matching_key = key

        if matching_key is None:
            return ""

        # Deduplicate and sort so the LLM context is stable across runs.
        unique_facts = sorted(set(self.graph[matching_key]))

        print(f"🔍 Retrieved Context for '{matching_key}': {len(unique_facts)} facts found.")
        return "\n".join(unique_facts)

# Build the notebook-wide graph instance from the loaded triples.
# run_soft_rag_comparison reads this global by name, so it must exist before that helper is called.
menopause_kg = KnowledgeGraph(KG_DATA)

Knowledge Graph initialized with 645 unique nodes/concepts.


## RAG Query Function

In [None]:
def api_call_with_backoff(payload, headers, max_retries=5, timeout=60):
    """
    POST a chat-completion request to OpenAI, retrying transient failures.

    Network errors and HTTP 429/500/502/503/504 responses are retried with
    exponential backoff (1s, 2s, 4s, ...). Any other HTTP status is treated as
    permanent and reported immediately.

    Args:
        payload: JSON-serializable request body for the chat-completions endpoint.
        headers: HTTP headers to send (content type and authorization).
        max_retries: total number of attempts before giving up.
        timeout: seconds to wait for the server, forwarded to ``requests.post``
            so a stalled connection cannot block the notebook forever.

    Returns:
        The assistant message text on success. On failure, a human-readable
        string starting with "Error:" — network and HTTP failures are reported
        this way rather than raised, which is what the callers print.
    """
    retryable_statuses = (429, 500, 502, 503, 504)

    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        wait_time = 2 ** attempt

        try:
            # We use the requests library for the raw HTTP POST request
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                data=json.dumps(payload),
                timeout=timeout,
            )
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                return f"Error: Request failed after multiple retries: {e}"
            print(f"Request failed: {e}. Retrying in {wait_time}s...")
            time.sleep(wait_time)
            continue

        if response.status_code == 200:
            try:
                return response.json()['choices'][0]['message']['content']
            except (ValueError, KeyError, IndexError, TypeError) as e:
                # A 200 with an unexpected body should follow the same
                # error-string contract instead of crashing the cell.
                return f"Error: Unexpected response format from LLM: {e!r}"

        if response.status_code in retryable_statuses and not is_last_attempt:
            print(f"API Error {response.status_code}. Retrying in {wait_time}s...")
            time.sleep(wait_time)
            continue

        return f"Error: Failed to fetch response from LLM (Status {response.status_code}): {response.text}"

    # Only reachable when max_retries <= 0 (the loop body never ran).
    return "Error: Maximum API retries reached."

def soft_grounding_rag_query(kg: KnowledgeGraph, search_term: str, user_question: str):
    """
    Answer a question with the "soft grounding" RAG policy.

    Facts for `search_term` are pulled from `kg`. When facts exist, the model is
    told to prioritise them and may supplement with general knowledge; when none
    exist, the model is told to answer from general knowledge alone. The answer
    is printed and returned.

    Args:
        kg: graph to retrieve facts from.
        search_term: concept name handed to ``kg.retrieve_context``.
        user_question: the question forwarded to the model.

    Returns:
        Whatever ``api_call_with_backoff`` returns for the built request.
    """
    print(f"Starting Soft Grounding RAG Query for concept: '{search_term}'")

    # Step 1 - retrieval.
    kg_facts = kg.retrieve_context(search_term)

    # Step 2 - choose the context block and system message for the policy branch.
    if kg_facts:
        # Facts available: ground the answer in them, allow supplementation.
        context_block = "\n".join([
            "--- KNOWLEDGE GRAPH FACTS ---",
            kg_facts,
            "-----------------------------",
        ])
        system_msg = (
            "You are a specialized, empathetic medical knowledge assistant providing personalized advice. "
            "Your response MUST be founded on the information in the 'KNOWLEDGE GRAPH FACTS' section. "
            "First, you MUST incorporate and prioritize the specific facts found in the KG. "
            "Second, if the KG facts are incomplete, you are authorized to use your general medical knowledge "
            "to supplement the answer and provide a comprehensive, personalized recommendation. "
            "Always clearly delineate information derived from the KNOWLEDGE GRAPH to demonstrate grounding."
        )
        print("Context retrieved. Generating answer with prioritization of KG facts.")
    else:
        # Nothing retrieved: fall back to general knowledge.
        context_block = "No specific facts retrieved from the knowledge graph."
        system_msg = (
            "You are a specialized, empathetic medical knowledge assistant. "
            "Since no specific knowledge graph facts were retrieved, please answer the user's question "
            "using your general medical knowledge to provide a comprehensive, personalized recommendation. "
            "Maintain a professional and caring tone."
        )
        print("Context not found. Falling back to comprehensive general knowledge (Soft Grounding policy).")

    # Step 3 - generation. The user turn carries the context followed by the question.
    user_msg = "\n\n".join([
        "Given the following facts and constraints, answer the user's question:",
        context_block,
        f"USER QUESTION: {user_question}",
    ])

    request_body = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        # Slightly higher temperature for personalization/synthesis.
        "temperature": 0.3,
    }
    auth_headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {os.getenv("OPENAI_API_KEY")}',
    }

    answer = api_call_with_backoff(request_body, auth_headers)

    print("\n\n--- RAG Response ---")
    print(answer)
    print("--------------------")
    return answer

def baseline_llm_query(user_question: str):
    """
    Ask the model the question directly, with no knowledge-graph context.

    This is the control arm of the comparison: a fixed system message plus the
    raw user question.

    Args:
        user_question: the question sent as the user turn.

    Returns:
        Whatever ``api_call_with_backoff`` returns for the built request.
    """
    print("Starting Baseline LLM Query.")

    # Plain system message: no retrieved facts are injected.
    system_msg = (
        "You are a general knowledge medical expert. Answer the user's question accurately "
        "and concisely using your general knowledge base. Do not use external tools."
    )

    request_body = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_question},
        ],
        # Kept low so the baseline answer stays factual.
        "temperature": 0.2,
    }
    auth_headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {os.getenv("OPENAI_API_KEY")}',
    }

    return api_call_with_backoff(request_body, auth_headers)

# --- Utility Functions for Manual Execution ---

def run_soft_rag_comparison(search_concept: str, user_query: str):
    """
    Run one Soft Grounding RAG query against the global graph and echo the answer.

    Relies on the module-level names 'soft_grounding_rag_query' and 'menopause_kg'.

    Args:
        search_concept: concept used for knowledge-graph retrieval.
        user_query: the question forwarded to the model.

    Returns:
        The response produced by ``soft_grounding_rag_query``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(f"[RUNNING SOFT-RAG QUERY | Search Concept: {search_concept}]")
    print(rule)

    # KG-grounded arm of the comparison.
    answer = soft_grounding_rag_query(menopause_kg, search_concept, user_query)

    print("\n--- SOFT-RAG FINAL RESPONSE ---", answer, "------------------------------\n", sep="\n")
    return answer



def run_baseline_comparison(user_query: str):
    """
    Run one baseline (no-context) query and echo the answer.

    Relies on the module-level function 'baseline_llm_query'.

    Args:
        user_query: the question forwarded to the model.

    Returns:
        The response produced by ``baseline_llm_query``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("[RUNNING BASELINE LLM QUERY]")
    print(rule)

    # Control arm of the comparison: no knowledge-graph facts are supplied.
    answer = baseline_llm_query(user_query)

    print("\n--- BASELINE FINAL RESPONSE ---", answer, "------------------------------\n", sep="\n")
    return answer



## Test

In [6]:
# Scenario: vasomotor symptoms combined with a family history of blood clots.
USER_CASE_QUERY = (
    "I am a 52-year-old woman experiencing severe hot flashes and mood swings. "
    "I have a family history of **blood clots**. Based on this personal risk factor "
    "and the available facts, what are my general treatment options for these symptoms, "
    "and what are their pros and cons, particularly considering the blood clot risk?"
)
# Concept handed to the knowledge-graph retrieval step.
CASE_SEARCH_TERM = "Hot flashes"


# --- Comparative Execution ---

print(f"\n{'#' * 80}")
print("DEMONSTRATION: PERSONALIZED RECOMMENDATION (SOFT GROUNDING)")
print('#' * 80)


# Control arm: baseline LLM, general advice only.
print(f"\n{'-' * 30} BASELINE TEST START {'-' * 30}")
baseline_result = run_baseline_comparison(USER_CASE_QUERY)
print(f"{'-' * 30} BASELINE TEST END {'-' * 30}\n")


# Treatment arm: soft-grounded RAG, personalized and KG-anchored advice.
print(f"\n{'-' * 30} SOFT RAG TEST START {'-' * 30}")
soft_rag_result = run_soft_rag_comparison(CASE_SEARCH_TERM, USER_CASE_QUERY)
print(f"{'-' * 30} SOFT RAG TEST END {'-' * 30}\n")


# Reminder of what to look for when comparing the two answers.
print(f"\n{'*' * 80}")
print("ANALYSIS POINT: The Soft RAG result should explicitly link the 'family history of blood clots' ")
print("to the KG fact 'HRT increases the risk of blood clots' with high priority.")
print('*' * 80)



################################################################################
DEMONSTRATION: PERSONALIZED RECOMMENDATION (SOFT GROUNDING)
################################################################################

------------------------------ BASELINE TEST START ------------------------------

[RUNNING BASELINE LLM QUERY]
➡️ Starting Baseline LLM Query.

--- BASELINE FINAL RESPONSE ---
For a 52-year-old woman experiencing severe hot flashes and mood swings, particularly with a family history of blood clots, it's important to consider treatment options that minimize the risk of thrombosis. Here are some general treatment options:

1. **Lifestyle Modifications:**
   - **Pros:** No risk of blood clots, can improve overall health.
   - **Cons:** May not be sufficient for severe symptoms.
   - **Examples:** Regular exercise, maintaining a healthy weight, avoiding triggers like hot drinks and spicy foods, and practicing stress-reduction techniques such as yoga or meditation.


In [7]:
# Scenario: long-postmenopausal patient with low BMI asking about osteoporosis risk.
USER_CASE_QUERY = (
    "I am a **65-year-old woman** who went through menopause 15 years ago. I am very thin (low BMI), "
    "which worries me about bone health. Based on the facts available, what are the recommended "
    "pharmacological and non-pharmacological interventions for severe osteoporosis risk, "
    "and what long-term monitoring advice can you provide?"
)
# Concept handed to the knowledge-graph retrieval step (the long-term risk itself).
CASE_SEARCH_TERM = "Osteoporosis"


# --- Comparative Execution ---

print(f"\n{'#' * 80}")
print("DEMONSTRATION: PERSONALIZED RECOMMENDATION (SOFT GROUNDING)")
print('#' * 80)


# Control arm: baseline LLM, general advice only.
print(f"\n{'-' * 30} BASELINE TEST START {'-' * 30}")
baseline_result = run_baseline_comparison(USER_CASE_QUERY)
print(f"{'-' * 30} BASELINE TEST END {'-' * 30}\n")


# Treatment arm: soft-grounded RAG, personalized and KG-anchored advice.
print(f"\n{'-' * 30} SOFT RAG TEST START {'-' * 30}")
soft_rag_result = run_soft_rag_comparison(CASE_SEARCH_TERM, USER_CASE_QUERY)
print(f"{'-' * 30} SOFT RAG TEST END {'-' * 30}\n")


# Reminder of what to look for when comparing the two answers.
print(f"\n{'*' * 80}")
print("ANALYSIS POINT: The Soft RAG result must use the KG to anchor the discussion on specific treatment options (e.g., Bisphosphonates, Hormone Therapy) and then synthesize the user's high-risk factors (age, low BMI) into an urgent monitoring recommendation.")
print('*' * 80)


################################################################################
DEMONSTRATION: PERSONALIZED RECOMMENDATION (SOFT GROUNDING)
################################################################################

------------------------------ BASELINE TEST START ------------------------------

[RUNNING BASELINE LLM QUERY]
➡️ Starting Baseline LLM Query.

--- BASELINE FINAL RESPONSE ---
For a 65-year-old woman with a low BMI and a concern about bone health post-menopause, addressing the risk of osteoporosis is important. Here are recommended interventions:

### Non-Pharmacological Interventions:
1. **Dietary Measures:**
   - **Calcium Intake:** Ensure adequate calcium intake through diet or supplements if necessary. The recommended daily intake for women over 50 is about 1,200 mg.
   - **Vitamin D:** Adequate vitamin D is crucial for calcium absorption. Aim for 800-1,000 IU daily, either through sunlight exposure, diet, or supplements.

2. **Exercise:**
   - Engage in we