In [1]:
import json
import time
import requests
import sys
import os
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv() 

# Load API key
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [2]:
INPUT_JSON_PATH = "/Users/cj2837/Documents/Courses/Project/outputs/std_relation_triples.json"
MODEL_NAME = "gpt-4o"

In [3]:
with open(INPUT_JSON_PATH, 'r') as f:
    KG_DATA = json.load(f)
    print(f"Successfully loaded {len(KG_DATA)} triplets from {INPUT_JSON_PATH}")

# Display a sample triplet
if KG_DATA:
    print("\nSample Triplet:")
    print(KG_DATA[0])

Successfully loaded 1052 triplets from /Users/cj2837/Documents/Courses/Project/outputs/std_relation_triples.json

Sample Triplet:
{'head': 'bonviva', 'head_cui': 'c0918018', 'head_semantic_source': 'msh', 'relation': 'treats', 'tail': 'osteoporosis, postmenopausal', 'tail_cui': 'c0029458', 'tail_semantic_source': 'mth', 'head_type': 'medication', 'tail_type': 'condition'}


## KG Class Definition

In [4]:
class KnowledgeGraph:
    """
    Simulates an in-memory graph database for fast retrieval.
    It indexes the triplets by both head and tail concepts for bidirectional lookup.
    """
    def __init__(self, triplets):
        self.graph = {}
        self._load_triplets(triplets)
        print(f"Knowledge Graph initialized with {len(self.graph)} unique nodes/concepts.")

    def _load_triplets(self, triplets):
        """Converts the list of triplets into a dictionary-based graph structure."""
        for t in triplets:
            head = t['head']
            tail = t['tail']
            relation = t['relation']
            
            # Formulate the triplet into a clear, readable string for the LLM context
            context_str = (
                f"{head} (CUI: {t['head_cui']}) --({relation})--> {tail} (CUI: {t['tail_cui']}). "
                f"Head Type: {t['head_type']}, Tail Type: {t['tail_type']}."
            )

            # 1. Index by Head Concept
            if head not in self.graph:
                self.graph[head] = []
            self.graph[head].append(context_str)

            # 2. Index by Tail Concept (for reverse lookup)
            if tail not in self.graph:
                self.graph[tail] = []
            
            # Store the relationship from tail, reversing the direction helps LLM synthesis
            inverse_context_str = (
                f"{tail} (CUI: {t['tail_cui']}) <--({relation})-- {head} (CUI: {t['head_cui']}). "
                f"Head Type: {t['head_type']}, Tail Type: {t['tail_type']}."
            )
            self.graph[tail].append(inverse_context_str)

    def retrieve_context(self, search_term: str) -> str:
        """
        Retrieves all connected facts (1-hop neighbors) for a given search term.
        Performs a case-insensitive, partial match on the node names.
        """
        search_term_lower = search_term.lower()
        
        # Try to find a node key that contains the search term
        matching_key = next((key for key in self.graph if search_term_lower in key.lower()), None)

        if not matching_key:
            return ""

        context_lines = self.graph.get(matching_key, [])
        
        # Deduplicate and format the context for the LLM
        formatted_context = "\n".join(sorted(list(set(context_lines))))
        
        print(f"üîç Retrieved Context for '{matching_key}': {len(context_lines)} facts found.")
        return formatted_context

# Initialize the KG instance
menopause_kg = KnowledgeGraph(KG_DATA)

Knowledge Graph initialized with 645 unique nodes/concepts.


## RAG Query Function

In [None]:
def api_call_with_backoff(payload, headers, max_retries=5):
    for attempt in range(max_retries):
        try:
            # We use the requests library for the raw HTTP POST request
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                data=json.dumps(payload)
            )
            
            if response.status_code == 200:
                result = response.json()
                # Correct parsing for OpenAI's Chat Completion
                return result['choices'][0]['message']['content']
            
            # Handle API errors (e.g., rate limit 429)
            elif response.status_code in (429, 500, 503) and attempt < max_retries - 1:
                wait_time = 2 ** attempt
                print(f"API Error {response.status_code}. Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                return f"Error: Failed to fetch response from LLM (Status {response.status_code}): {response.text}"

        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt
                print(f"Request failed: {e}. Retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                return f"Error: Request failed after multiple retries: {e}"
    
    return "Error: Maximum API retries reached."

def hard_grounding_rag_query(kg: KnowledgeGraph, search_term: str, user_question: str):
    """
    Executes the Hard Grounding RAG process: Retrieve context, build strict prompt, call LLM.
    
    This function forces the LLM to STRICTLY use the retrieved KG facts or state failure.
    """
    print(f"Starting Hard Grounding RAG Query for concept: '{search_term}'")
    
    # 1. RETRIEVAL: Get context from the Knowledge Graph
    retrieved_context = kg.retrieve_context(search_term)

    # 2. PROMPT CONSTRUCTION (Strict Grounding Policy)
    if not retrieved_context:
        # **STRICT FAILURE MESSAGE FOR HARD GROUNDING**
        final_context_block = "No specific facts retrieved from the knowledge graph."
        
        system_instruction = (
            "You are a specialized knowledge extraction tool under strict constraints. "
            "The knowledge graph facts provided below are insufficient to answer the user's query. "
            "You MUST output the following exact phrase: 'The current knowledge graph does not contain enough information to answer this specific query.' "
            "DO NOT use general knowledge or provide any other answer."
        )
        print("Context not found. LLM will state that information is insufficient (Hard Grounding policy).")
    else:
        # **STRICT GROUNDING INSTRUCTION**
        final_context_block = (
            "--- KNOWLEDGE GRAPH FACTS ---\n"
            f"{retrieved_context}\n"
            "-----------------------------"
        )
        
        system_instruction = (
            "You are a specialized knowledge extraction tool. Your sole purpose is to "
            "answer the user's query by strictly synthesizing the information "
            "provided in the 'KNOWLEDGE GRAPH FACTS' section. "
            "Crucially, DO NOT use external knowledge. If the provided facts are insufficient "
            "to answer the question, state: 'The current knowledge graph does not contain "
            "enough information to answer this specific query.'"
        )
    
    # 3. GENERATION: Build Payload and Call API
    
    # The user prompt contains the context and the final question
    user_prompt = f"Given the following facts, answer the user's question:\n\n{final_context_block}\n\nUSER QUESTION: {user_question}"
    
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.0 # Lowest temperature for factual, non-creative answers
    }
    
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {os.getenv("OPENAI_API_KEY")}' 
    }
    
    llm_response = api_call_with_backoff(payload, headers)
    
    print("\n\n--- RAG Response ---")
    print(llm_response)
    print("--------------------")
    return llm_response

def baseline_llm_query(user_question: str):
    """
    Queries the baseline LLM (GPT-4o) directly without any external context.
    This serves as the control group for comparison.
    """
    print(f"Starting Baseline LLM Query.")
    
    # 1. PROMPT CONSTRUCTION (Simple, direct query)
    system_instruction = (
        "You are a general knowledge medical expert. Answer the user's question accurately "
        "and concisely using your general knowledge base. Do not use external tools."
    )
    
    # We use a slightly higher temperature for a more natural baseline answer
    # but keep it low enough for factual accuracy.
    
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": user_question}
        ],
        "temperature": 0.2
    }
    
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {os.getenv("OPENAI_API_KEY")}' 
    }
    
    # 2. GENERATION: Call the OpenAI API
    llm_response = api_call_with_backoff(payload, headers)
    
    return llm_response

# --- Utility Functions for Manual Execution ---

def run_rag_comparison(search_concept: str, user_query: str):
    """
    Runs the Hard Grounding RAG query and prints the result for analysis.
    Requires 'hard_grounding_rag_query' and 'menopause_kg' to be defined.
    """
    print("\n" + "="*70)
    print(f"[RUNNING KG-RAG QUERY | Search Concept: {search_concept}]")
    print("="*70)
    
    # A. KG-RAG Query (Should be highly accurate, citing specific CUI/relationship)
    rag_response = hard_grounding_rag_query(menopause_kg, search_concept, user_query)
    
    print("\n--- KG-RAG FINAL RESPONSE ---")
    print(rag_response)
    print("------------------------------\n")
    return rag_response


def run_baseline_comparison(user_query: str):
    """
    Runs the Baseline LLM query and prints the result for analysis.
    Requires 'baseline_llm_query' to be defined.
    """
    print("\n" + "="*70)
    print("[RUNNING BASELINE LLM QUERY]")
    print("="*70)

    # B. Baseline Query (Should be general, likely no CUI numbers)
    baseline_response = baseline_llm_query(user_query)
    
    print("\n--- BASELINE FINAL RESPONSE ---")
    print(baseline_response)
    print("------------------------------\n")
    return baseline_response



## Test

In [None]:
# --- Variables for Testing ---
# These variables are defined here so you can easily copy/paste them into your notebook cells

TEST_QUERY_1 = {
    "concept": "Hormone Replacement Therapy",
    "query": "What are the therapeutic benefits and risks of Hormone Replacement Therapy (HRT)?"
}

TEST_QUERY_2 = {
    "concept": "Yoga", 
    "query": "What facts are available regarding the use of yoga for managing menopause symptoms?"
}

#### Query 1

In [7]:
print("--- TEST 1: BASELINE LLM (GENERAL KNOWLEDGE) ---")
baseline_result_1 = run_baseline_comparison(
    TEST_QUERY_1["query"]
)

--- TEST 1: BASELINE LLM (GENERAL KNOWLEDGE) ---

[RUNNING BASELINE LLM QUERY]
‚û°Ô∏è Starting Baseline LLM Query.

--- BASELINE FINAL RESPONSE ---
Hormone Replacement Therapy (HRT) is commonly used to alleviate symptoms of menopause by replenishing estrogen and, in some cases, progesterone levels. Here are the therapeutic benefits and risks associated with HRT:

**Benefits:**
1. **Relief from Menopausal Symptoms:** HRT is effective in reducing hot flashes, night sweats, vaginal dryness, and mood swings.
2. **Bone Health:** It helps in preventing bone loss and reducing the risk of osteoporosis and fractures in postmenopausal women.
3. **Cardiovascular Health:** Some studies suggest that HRT may reduce the risk of heart disease when started early in the postmenopausal period.
4. **Improved Quality of Life:** Many women experience improved sleep, mood, and overall quality of life.

**Risks:**
1. **Breast Cancer:** Long-term use of combined estrogen-progesterone therapy has been associate

In [11]:
# Run the KG-RAG to get the precise, grounded, and auditable answer.
print("--- TEST 1: HARD GROUNDING RAG (KG KNOWLEDGE ONLY) ---")
rag_result_1 = run_rag_comparison(
    TEST_QUERY_1["concept"], 
    TEST_QUERY_1["query"]
)

--- TEST 1: HARD GROUNDING RAG (KG KNOWLEDGE ONLY) ---

[RUNNING KG-RAG QUERY | Search Concept: Hormone Replacement Therapy]
‚û°Ô∏è Starting Hard Grounding RAG Query for concept: 'Hormone Replacement Therapy'
üîç Retrieved Context for 'hormone replacement therapy': 38 facts found.


--- RAG Response ---
The therapeutic benefits of Hormone Replacement Therapy (HRT) include:

- Helps with vaginal dryness severity.
- Helps prevent osteoporosis.
- Maintains bone density.
- Reduces difficulty sleeping.
- Reduces hot flushes.
- Reduces insulin resistance.
- Reduces menopausal symptoms.
- Reduces night sweats.
- Treats symptoms such as flushing, sleeplessness, headache, lack of concentration, associated with premature menopause.
- Treats vaginal dryness.

The risks associated with Hormone Replacement Therapy (HRT) include:

- May increase the risk of blood clots.
- May increase the risk of cerebrovascular accidents.
- May increase the risk of heart diseases.
- May increase the risk of gallbl

#### Query 2

In [9]:
# Run the Baseline LLM control group first to get a general, ungrounded answer.
print("--- TEST 2: BASELINE LLM (GENERAL KNOWLEDGE) ---")
baseline_result_2 = run_baseline_comparison(
    TEST_QUERY_2["query"]
)

--- TEST 2: BASELINE LLM (GENERAL KNOWLEDGE) ---

[RUNNING BASELINE LLM QUERY]
‚û°Ô∏è Starting Baseline LLM Query.

--- BASELINE FINAL RESPONSE ---
Yoga is often recommended as a complementary approach to managing menopause symptoms due to its potential benefits for both physical and mental health. Here are some key points:

1. **Stress Reduction**: Yoga can help reduce stress and anxiety, which are common during menopause. The practice encourages relaxation and mindfulness, which can alleviate mood swings and improve emotional well-being.

2. **Improved Sleep**: Many women experience sleep disturbances during menopause. Yoga, particularly restorative and gentle forms, can promote better sleep by calming the nervous system and reducing insomnia.

3. **Hot Flashes**: Some studies suggest that regular yoga practice may help reduce the frequency and intensity of hot flashes, although results can vary among individuals.

4. **Bone Health**: Weight-bearing yoga poses can help maintain bone 

In [12]:
# Run the KG-RAG to get the precise, grounded, and auditable answer.
print("--- TEST 2: HARD GROUNDING RAG (KG KNOWLEDGE ONLY) ---")
rag_result_2 = run_rag_comparison(
    TEST_QUERY_2["concept"],
    TEST_QUERY_2["query"]
)

--- TEST 2: HARD GROUNDING RAG (KG KNOWLEDGE ONLY) ---

[RUNNING KG-RAG QUERY | Search Concept: Yoga]
‚û°Ô∏è Starting Hard Grounding RAG Query for concept: 'Yoga'
üîç Retrieved Context for 'yoga': 11 facts found.


--- RAG Response ---
The current knowledge graph provides the following facts regarding the use of yoga for managing menopause symptoms:

1. Yoga helps with symptoms such as flushing, sleeplessness, headache, and lack of concentration, which are associated with premature menopause.
2. Yoga reduces hot flushes.
--------------------

--- KG-RAG FINAL RESPONSE ---
The current knowledge graph provides the following facts regarding the use of yoga for managing menopause symptoms:

1. Yoga helps with symptoms such as flushing, sleeplessness, headache, and lack of concentration, which are associated with premature menopause.
2. Yoga reduces hot flushes.
------------------------------

