In [1]:
import json
import os
import re

from dotenv import load_dotenv
from openai import OpenAI
load_dotenv() 

# Build the shared OpenAI client used by the extraction functions below.
# The key is read from the OPENAI_API_KEY environment variable; load_dotenv()
# is called first, presumably so a local .env file can supply it — TODO confirm.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


## Load Data

In [60]:
from langchain_core.documents import Document

def load_studies(path):
    """Read a text file of studies and return one ``Document`` per study.

    A new study begins at every line that starts with ``"Study:"``. All lines
    up to (but not including) the next such line belong to the same study.
    Non-blank text that appears before the first ``"Study:"`` line is kept as
    its own document, as before.

    Args:
        path: Path of the text file to read. The file is decoded as UTF-8
            (assumed encoding of the export — TODO confirm).

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the stripped
        study text and whose ``metadata["source"]`` is ``"study_<n>"``, with
        ``n`` counting the emitted documents from 0.
    """
    docs = []
    current = []

    def flush():
        # Emit the buffered lines as one document. Buffers that are empty or
        # whitespace-only (e.g. blank lines before the first "Study:" header)
        # are dropped so they neither create an empty Document nor consume a
        # study number.
        content = "\n".join(current).strip()
        if content:
            docs.append(
                Document(
                    page_content=content,
                    metadata={"source": f"study_{len(docs)}"},
                )
            )
        current.clear()

    # Explicit encoding: do not depend on the platform's default locale.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")

            # A "Study:" header closes the previous study (if any).
            if line.startswith("Study:"):
                flush()

            current.append(line)

    # The last study has no following header, so flush it at end of file.
    flush()

    print(f"Loaded {len(docs)} studies.")
    return docs

# Location of the structured study export that load_studies() parses.
# NOTE(review): this is a machine-specific absolute path.
STUDIES_PATH = "/Users/cj2837/Documents/Courses/Project/menopause_knowledge_graph_by_LLM/data/structured_for_llm.txt"
documents = load_studies(STUDIES_PATH)


Loaded 77 studies.


In [61]:
# Preview only the first few studies; displaying the whole list writes every
# study's full text into the notebook output.
documents[:3]

[Document(metadata={'source': 'study_0'}, page_content='Study: MOBILE Study - A Study of Bonviva (Ibandronate) Regimens in Women With Post-Menopausal Osteoporosis\n\nSummary: This study will compare the efficacy and safety of different treatment regimens of oral Bonviva tablets in women with post-menopausal osteoporosis. Patients will also receive daily supplementation with vitamin D and calcium. The anticipated time of study treatment is 2+ years, and the target sample size is 500+ individuals.\n\nResults: Outcome: Relative Change From Baseline at One Year (12 Months) in Mean Lumbar Spine (L2 - L4) Bone Mineral Density.Parameters: Mean Difference (Final Values) = 0.615 (p = 0.045, CI 0.013-1.216); Mean Difference (Final Values) = 1 (p = 0.001, CI 0.395-1.605). Non-inferiority type: NON_INFERIORITY_OR_EQUIVALENCE.'),
 Document(metadata={'source': 'study_1'}, page_content='Study: DIVA Study - A Study of Different Regimens of Intravenous Administration of Bonviva (Ibandronate) in Women W

## Build the Extraction Prompt and LLM Calls

In [None]:
# Prompt template sent as the user message for every study. The literal
# "{text}" placeholder is filled in with str.replace() by the extraction
# functions; the template also contains literal braces in its JSON schema
# line, so it is not meant to be passed through str.format().
# NOTE(review): extraction step 2 and output rule 2 ask for negative or
# conditional predicates (e.g. 'does_not_increase_risk'), while output rule 3
# says not to include negative or uncertain findings — confirm which is intended.
TRIPLE_PROMPT = """
You are an expert medical knowledge graph constructor specializing in menopause and women's health. 
Your primary task is to meticulously analyze the provided CLINICAL TRIAL SUMMARIES and extract ALL relevant concepts and relationships 
into a structured list of knowledge triples.
---
**GOAL**
Identify and normalize key biomedical concepts and relationships relevant to menopause, including but not limited to Symptom, Condition, Biomarker, Treatment, Intervention, Medication, Lifestyle Factor, Outcome, Risk, Population, Supplement, Food.

---
TEXT BLOCK:
{text}

---
**EXTRACTION STEPS (Follow This Sequence)**
1.  **Terminology Recognition & Typing:** Scan the TEXT BLOCK and identify all concepts belonging to the required categories (Symptom, Treatment, Outcome, etc.).
    - *Action:* Immediately map any synonyms found (e.g., "HRT," "Hormone Therapy") to their single, authoritative concept name (e.g., **'Hormone Replacement Therapy (HRT)'**).
    - *Action:* For terms with synonyms, use just one of the synonyms. Make it consistent.
    - *Action:* Assign a clear type (e.g., 'Symptom', 'Intervention') to each concept.

2.  **Relationship Identification:** Identify explicit connections between the recognized concepts (e.g., 'causes', 'treats', 'mitigates', 'increases_risk_of', 'is_a_type_of', 'affects', 'is_a_symptom_of', etc.). Pay close attention to negative and conditional language.
    - Relationships MUST be concise. 
    - Use specific predicates for negative or conditional relationships (e.g., 'does_not_increase_risk', 'may_mitigate').

3.  **Triple Construction:** Formulate the [Head, Relation, Tail] structure based *only* on the statements found in the text. Ensure both the Head and Tail use the **normalized name and consistent capitalization**.

4.  **Internal Verification (Self-Correction):** Before generating the output, check every constructed triple: Does the relationship and connection exist *verbatim* or *explicitly implied* by the current text? **Discard any triple that cannot be directly supported by the TEXT BLOCK.**
---
**OUTPUT RULES & SCHEMA**

1. REQUIRED CONCEPTS (Subjects/Objects) & TYPES:
* You MUST classify each entity into a single type.

2. REQUIRED RELATIONSHIPS:
* Relationships MUST be concise verb phrases (e.g., 'causes', 'treats').
* NEGATIVE/UNCERTAINTY RULE: Use specific predicates for negative or conditional relationships (e.g., 'does_not_increase_risk', 'may_mitigate').

3. FOCUS ON CLINICAL TRIAL SUMMARIES:
* Ignore metadata such as study names, research title, sample sizes, recruitment details, and funding information.
* Do NOT include results stated as non-significant, negative, or uncertain findings.

4. STRICT JSON OUTPUT:
You MUST output ONLY a list of JSON objects. Do NOT include any introductory or explanatory prose, or quotes. 
If NO triples are found after verification, output an empty list: **[]**.

**JSON SCHEMA:**
[{"head": "","head_type": "","relation": "","tail": "","tail_type": ""}]
"""


In [34]:
def extract_triples(text):
    """Ask the chat model for knowledge triples describing ``text``.

    The study text is substituted for the ``{text}`` placeholder of
    ``TRIPLE_PROMPT`` and sent in a single chat-completion request (no retry).
    The model name comes from the ``OPENAI_MODEL_NAME`` environment variable,
    defaulting to ``"gpt-4o"``.

    Args:
        text: Study text to analyse.

    Returns:
        The value decoded from the model's JSON reply (the prompt asks for a
        list of triple objects). Returns ``[]`` when the reply is empty or is
        not valid JSON; in that case the error and the first 200 characters of
        the raw reply are printed for debugging.
    """
    response = client.chat.completions.create(
        model=os.getenv("OPENAI_MODEL_NAME", "gpt-4o"),
        temperature=0,
        messages=[
            {"role": "system", "content": "You output ONLY clean JSON, never code fences."},
            {"role": "user", "content": TRIPLE_PROMPT.replace("{text}", text)}
        ]
    )

    # The message content can be None; treat that as an empty reply so it is
    # reported as a parse failure instead of raising AttributeError on .strip().
    raw = response.choices[0].message.content or ""
    cleaned = raw.strip()

    # Strip a wrapping Markdown code fence (with any language tag, any case).
    # Only the leading and trailing fence are removed, so backticks inside the
    # JSON payload itself are left alone.
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", cleaned).strip()

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError as e:
        # Best effort: report the problem and continue with no triples.
        print("\n JSON PARSE ERROR:", e)
        print(" RAW OUTPUT START:", raw[:200], "...\n")
        return []


In [None]:
import time
import json
import re
from openai import RateLimitError

def extract_triples_with_retry(text, max_retries=5, initial_wait=3, backoff=1.5):
    """Extract knowledge triples for one study, retrying on rate limits.

    Sends ``TRIPLE_PROMPT`` (with ``{text}`` replaced by ``text``) to the chat
    model named by the ``OPENAI_MODEL_NAME`` environment variable (default
    ``"gpt-4o"``). When the request raises ``RateLimitError`` the call is
    retried after a growing pause; any other failure is reported and skipped.

    Args:
        text: Study text to analyse.
        max_retries: Number of retries allowed after rate-limit errors before
            the study is skipped.
        initial_wait: Seconds to sleep before the first retry.
        backoff: Factor by which the wait grows after every retry.

    Returns:
        The value decoded from the model's JSON reply (the prompt asks for a
        list of triple objects), or ``[]`` when the retries are exhausted, the
        reply is not valid JSON, or any other exception occurs.
    """
    retries = 0
    wait_time = initial_wait

    while True:
        raw = ""
        try:
            response = client.chat.completions.create(
                model=os.getenv("OPENAI_MODEL_NAME", "gpt-4o"),
                temperature=0,
                messages=[
                    {"role": "system", "content": "You output ONLY clean JSON, never code fences."},
                    {"role": "user", "content": TRIPLE_PROMPT.replace("{text}", text)}
                ]
            )

            # The message content can be None; treat it as an empty reply so
            # it is reported as a JSON parse failure with a clear message.
            raw = response.choices[0].message.content or ""

            cleaned = raw.strip()
            # Strip a wrapping Markdown code fence (any language tag, any
            # case) without touching backticks inside the payload.
            if cleaned.startswith("```"):
                cleaned = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", cleaned).strip()

            return json.loads(cleaned)

        except RateLimitError:
            retries += 1
            if retries > max_retries:
                print("Too many rate limit retries — skipping this study.")
                return []

            print(f"Rate limit hit, retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            wait_time *= backoff  # exponential backoff

        except json.JSONDecodeError as e:
            # Show the start of the reply so malformed output can be diagnosed.
            print("JSON parse error:", e)
            print(" RAW OUTPUT START:", raw[:200], "...\n")
            return []

        except Exception as e:
            # Deliberate best effort: one failing study must not stop the run.
            print("Unexpected error:", e)
            return []


## Running in batches

In [35]:
# NOTE(review): neither BATCH_SIZE nor `output` is referenced by any other
# cell visible in this notebook — the run below partitions the documents with
# split_into_chunks() and collects results in `outputs`. Confirm and remove if unused.
BATCH_SIZE = 10
output = {}

In [67]:
import math

def split_into_chunks(lst, k=5):
    """Split ``lst`` into at most ``k`` consecutive chunks.

    Every chunk holds ``ceil(len(lst) / k)`` items except possibly the last,
    which holds the remainder. Fewer than ``k`` chunks are returned when the
    items run out early (e.g. 6 items with ``k=4`` give 3 chunks of 2).

    Args:
        lst: Sliceable sequence to split.
        k: Maximum number of chunks; must be at least 1.

    Returns:
        A list of slices of ``lst``; ``[]`` when ``lst`` is empty.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # An empty input would give chunk_size == 0, and range() rejects a step of 0.
    if len(lst) == 0:
        return []

    chunk_size = math.ceil(len(lst) / k)
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

# Partition the loaded studies into (at most) five parts and report how many
# studies each part contains.
docs_parts = split_into_chunks(documents, k=5)

for part_number, part in enumerate(docs_parts, start=1):
    print(f"Part {part_number}: {len(part)} studies")


Part 1: 16 studies
Part 2: 16 studies
Part 3: 16 studies
Part 4: 16 studies
Part 5: 13 studies


In [74]:
import time

def run_extraction_on_part(part_docs, label):
    """Extract triples for every document of one part.

    A progress line is printed before each document is processed.

    Args:
        part_docs: Sequence of documents exposing ``metadata["source"]`` and
            ``page_content``.
        label: Text shown in square brackets at the start of each progress line.

    Returns:
        A dict mapping each document's ``metadata["source"]`` to the value
        returned by ``extract_triples_with_retry`` for its ``page_content``.
    """
    total = len(part_docs)
    results = {}

    for position, doc in enumerate(part_docs, start=1):
        source = doc.metadata["source"]
        print(f"[{label}] doc {position}/{total}: {source}")
        results[source] = extract_triples_with_retry(doc.page_content)

    return results


In [75]:
# Process the parts one after another. Each part's result is appended as soon
# as it finishes, so completed parts stay in `outputs` if a later part fails.
outputs = []

for part_number, part_docs in enumerate(docs_parts, start=1):
    outputs.append(run_extraction_on_part(part_docs, label=f"PART {part_number}"))


[PART 1] doc 1/16: study_0
[PART 1] doc 2/16: study_1
[PART 1] doc 3/16: study_2
[PART 1] doc 4/16: study_3
[PART 1] doc 5/16: study_4
[PART 1] doc 6/16: study_5
[PART 1] doc 7/16: study_6
[PART 1] doc 8/16: study_7
[PART 1] doc 9/16: study_8
[PART 1] doc 10/16: study_9
[PART 1] doc 11/16: study_10
[PART 1] doc 12/16: study_11
[PART 1] doc 13/16: study_12
[PART 1] doc 14/16: study_13
[PART 1] doc 15/16: study_14
[PART 1] doc 16/16: study_15
[PART 2] doc 1/16: study_16
[PART 2] doc 2/16: study_17
[PART 2] doc 3/16: study_18
[PART 2] doc 4/16: study_19
[PART 2] doc 5/16: study_20
[PART 2] doc 6/16: study_21
[PART 2] doc 7/16: study_22
[PART 2] doc 8/16: study_23
[PART 2] doc 9/16: study_24
[PART 2] doc 10/16: study_25
[PART 2] doc 11/16: study_26
[PART 2] doc 12/16: study_27
[PART 2] doc 13/16: study_28
[PART 2] doc 14/16: study_29
[PART 2] doc 15/16: study_30
[PART 2] doc 16/16: study_31
[PART 3] doc 1/16: study_32
[PART 3] doc 2/16: study_33
[PART 3] doc 3/16: study_34
[PART 3] doc 4/1

In [76]:
# Flatten the per-part dictionaries into a single mapping of source -> triples.
# If a key occurs in more than one part, the later part's entry wins.
output_final = {
    source: triples
    for part_result in outputs
    for source, triples in part_result.items()
}

In [77]:
# Preview the triples of the first two studies instead of dumping the whole
# dictionary into the notebook output.
dict(list(output_final.items())[:2])

{'study_0': [{'head': 'Bonviva (Ibandronate)',
   'head_type': 'Medication',
   'relation': 'treats',
   'tail': 'Post-Menopausal Osteoporosis',
   'tail_type': 'Condition'},
  {'head': 'Vitamin D',
   'head_type': 'Supplement',
   'relation': 'supplementation_with',
   'tail': 'Post-Menopausal Osteoporosis',
   'tail_type': 'Condition'},
  {'head': 'Calcium',
   'head_type': 'Supplement',
   'relation': 'supplementation_with',
   'tail': 'Post-Menopausal Osteoporosis',
   'tail_type': 'Condition'},
  {'head': 'Bonviva (Ibandronate)',
   'head_type': 'Medication',
   'relation': 'improves',
   'tail': 'Bone Mineral Density',
   'tail_type': 'Biomarker'}],
 'study_1': [{'head': 'Bonviva (Ibandronate)',
   'head_type': 'Medication',
   'relation': 'treats',
   'tail': 'Post-Menopausal Osteoporosis',
   'tail_type': 'Condition'},
  {'head': 'Intravenous Administration',
   'head_type': 'Intervention',
   'relation': 'compared_to',
   'tail': 'Oral Daily Administration',
   'tail_type': 'I

In [None]:
# Save to disk
import json
# NOTE(review): machine-specific absolute path.
OUT_PATH = "/Users/cj2837/Documents/Courses/Project/extraction_outputs/structured_triples.json"

# Create the output directory if it does not exist yet; otherwise open()
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

with open(OUT_PATH, "w") as f:
    json.dump(output_final, f, indent=2)

print("Saved:", OUT_PATH)

✅ Saved: /Users/cj2837/Documents/Courses/Project/extraction_outputs/structured_triples.json
