In [1]:
import json
from collections import Counter, defaultdict
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# Build the OpenAI client from OPENAI_API_KEY; os.getenv returns None when the
# variable is unset, so a missing key is not detected on this line.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
# Input triples file.
# NOTE(review): absolute, machine-specific path — as written the notebook only
# runs where this exact directory exists.
IN_PATH = "/Users/cj2837/Documents/Courses/Project/outputs/std_relation_triples.json"

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which differs between operating systems.
with open(IN_PATH, "r", encoding="utf-8") as f:
    triples = json.load(f)

In [3]:
from collections import Counter

# Tally every non-empty head_type / tail_type label across all triples.
# Labels are fed in the same order as a triple-by-triple walk (head first,
# then tail), so equal counts keep the same relative order in most_common().
node_type_counter = Counter(
    triple[field]
    for triple in triples
    for field in ("head_type", "tail_type")
    if triple.get(field)
)

# Most frequent label first.
sorted_counts = node_type_counter.most_common()

print("=== Unique Node Types and Frequencies ===")
for label, freq in sorted_counts:
    print(f"{label}: {freq}")

print("\nTotal unique node types:", len(node_type_counter))


=== Unique Node Types and Frequencies ===
condition: 440
treatment: 397
symptom: 390
outcome: 293
biomarker: 144
intervention: 139
population: 63
medication: 60
lifestyle factor: 50
supplement: 37
organization: 23
risk: 22
study type: 13
study: 10
biological process: 5
research: 3
surgery: 2
process: 2
food: 2
timepoint: 1
risk factor: 1
research study: 1
time period: 1
diagnostic criterion: 1
diagnostic test: 1
measurement: 1
event: 1
finding: 1

Total unique node types: 28


In [4]:
# One bullet per node type (alphabetical), annotated with how often it occurs
# in the triples. The counts are included because the prompt asks the model to
# prefer the most frequent label as canonical, which it cannot do from bare
# names alone.
NODETYPE_LIST_STR = "\n".join(
    f"- {nt} (count: {node_type_counter[nt]})"
    for nt in sorted(node_type_counter.keys())
)

# Prompt asking the model to group redundant node-type labels.
# Two constraints were added to the original wording:
#   * every returned label must come verbatim from the supplied list, because
#     the illustrative example names ("symptoms", "clinical_outcome", ...) are
#     not real node types and were otherwise echoed back as synonyms;
#   * the list now carries frequencies, so rule 2 of the canonical selection
#     can actually be applied.
# Literal braces in the JSON sample are doubled because this is an f-string.
CLUSTER_NODETYPE_PROMPT = f"""
You are cleaning and consolidating concept-type labels from a biomedical knowledge graph.

Below is a list of node types (concept categories) extracted from triples generated from menopause-related clinical trials and articles.

Your task:

- Identify *highly similar* or *redundant* node types.
    - Examples: "symptom" vs "symptoms", "outcome" vs "clinical_outcome",
      "intervention" vs "treatment", "biomarker" vs "biological_marker".
- Cluster node types that differ only by:
    - pluralization
    - formatting (underscores, hyphens)
    - synonyms conveying the same conceptual type
    - overly narrow names that belong to a broader parent category
- **Do NOT cluster** types that are clearly different categories
  (e.g., "Population" vs "Outcome", or "Medication" vs "Biomarker").

For each cluster:
    - Select the **canonical node type**, preferably:
        1. the most general category (e.g., "symptom", "treatment")
        2. OR the type appearing most frequently in the list
    - List all other related terms under `"synonyms"`.

Only output clusters that have more than one member.
Every `"canonical"` value and every entry in `"synonyms"` MUST be copied verbatim
from the list below (without the count). The names used in the examples above and
in the format sample are illustrative only; never output a label that is not in the list.
Return ONLY valid JSON, no explanations.

Here are the node types (each followed by its frequency in the triples; the count is not part of the label):

{NODETYPE_LIST_STR}

Return JSON exactly in this format:

[
  {{
    "canonical": "symptom",
    "synonyms": ["symptoms", "menopausal_symptom"]
  }}
]
"""


In [5]:
# Send the clustering prompt to the chat model. The model name can be
# overridden through OPENAI_MODEL_NAME and falls back to "gpt-4o".
resp = client.chat.completions.create(
    model=os.getenv("OPENAI_MODEL_NAME", "gpt-4o"),
    temperature=0,
    messages=[
        {"role": "system", "content": "You output ONLY clean JSON, never code fences."},
        {"role": "user", "content": CLUSTER_NODETYPE_PROMPT}
    ]
)

# Normalise the reply before parsing: the message content may be None, and a
# model can still wrap its answer in a Markdown code fence despite the system
# instruction. Either case would make json.loads fail on the raw content.
raw_reply = (resp.choices[0].message.content or "").strip()
if raw_reply.startswith("```"):
    # Drop the fence backticks on both ends, then an optional "json" language tag.
    raw_reply = raw_reply.strip("`")
    if raw_reply[:4].lower() == "json":
        raw_reply = raw_reply[4:]

# An empty or malformed reply still raises json.JSONDecodeError here.
clusters = json.loads(raw_reply)

In [6]:
# Echo the parsed cluster list as the cell output for manual inspection.
clusters

[{'canonical': 'biomarker', 'synonyms': ['biological_marker']},
 {'canonical': 'condition', 'synonyms': ['finding']},
 {'canonical': 'diagnostic test', 'synonyms': ['diagnostic_criterion']},
 {'canonical': 'intervention', 'synonyms': ['treatment']},
 {'canonical': 'outcome', 'synonyms': ['clinical_outcome']},
 {'canonical': 'research study', 'synonyms': ['study']},
 {'canonical': 'risk factor', 'synonyms': ['risk']},
 {'canonical': 'symptom', 'synonyms': ['symptoms']},
 {'canonical': 'time period', 'synonyms': ['timepoint']}]

## Check and Replace

In [4]:
# Validate the model's clusters against the labels that really occur.
# Requires `node_type_counter` (frequency cell) and `clusters` (LLM cell) to be
# defined in the kernel; running this cell without them raises NameError.
node_types = set(node_type_counter.keys())

# Collect every canonical label and every synonym proposed by the model.
canonical_types = {cluster["canonical"] for cluster in clusters}
synonym_types = {syn for cluster in clusters for syn in cluster["synonyms"]}

# A canonical label absent from the data is a real problem; an absent synonym
# is an invented label that can never match anything, hence harmless.
canonical_missing = [label for label in canonical_types if label not in node_types]
synonym_missing = [label for label in synonym_types if label not in node_types]

n_canonical = len(canonical_types)
n_synonym = len(synonym_types)

print("\n=== COUNT SUMMARY ===")
print(f"Canonical types (total): {n_canonical}")
print(f"Synonym types (total):   {n_synonym}")
print(f"Total across clusters:   {n_canonical + n_synonym}")

print("\n=== CANONICAL CHECK ===")
print(f"Found in node types: {n_canonical - len(canonical_missing)}")
print(f"Missing canonical:   {len(canonical_missing)}")
if not canonical_missing:
    print("✔️ All canonical types exist in node_types.")
for label in canonical_missing:
    print("❌ Missing canonical:", label)

print("\n=== SYNONYM CHECK ===")
print(f"Found in node types: {n_synonym - len(synonym_missing)}")
print(f"Missing synonyms:    {len(synonym_missing)}")
if not synonym_missing:
    print("✔️ No hallucinated synonyms.")
for label in synonym_missing:
    print("⚠️ Hallucinated synonym:", label)


NameError: name 'clusters' is not defined

#### Replace node types using the final, manually curated synonym → canonical mapping

In [5]:
# Hand-written synonym → canonical pairs, listed in the order they are printed.
# NOTE(review): 'finding' → 'treatment' and 'research' → 'intervention' look
# surprising for these label names — confirm they are intended.
raw_map = dict([
    ("finding", "treatment"),
    ("research study", "study"),
    ("risk factor", "risk"),
    ("time period", "timepoint"),
    ("diagnostic criterion", "diagnostic test"),
    ("event", "condition"),
    ("research", "intervention"),
])

# Labels that actually occur in the triples.
node_types = set(node_type_counter)

# Keep a pair only when its synonym is a real node type; the canonical side is
# not checked here.
syn_to_canonical = dict(
    pair for pair in raw_map.items() if pair[0] in node_types
)

print("\n=== VALID SYNONYM → CANONICAL MAPPING ===")
print(syn_to_canonical)


=== VALID SYNONYM → CANONICAL MAPPING ===
{'finding': 'treatment', 'research study': 'study', 'risk factor': 'risk', 'time period': 'timepoint', 'diagnostic criterion': 'diagnostic test', 'event': 'condition', 'research': 'intervention'}


In [6]:
def map_type(t, mapping=None):
    """Return the canonical node type for ``t``.

    Args:
        t: A node-type label, or None when a triple carries no type.
        mapping: Optional synonym → canonical dict. When omitted, the
            notebook-level ``syn_to_canonical`` is used, which is the
            original single-argument behaviour.

    Returns:
        None if ``t`` is None; the mapped label if ``t`` is a key of the
        mapping; otherwise ``t`` unchanged.
    """
    if t is None:
        return None
    if mapping is None:
        # Fall back to the notebook global so existing one-argument calls work.
        mapping = syn_to_canonical
    return mapping.get(t, t)   # replace if synonym, else keep original

# Rewrite both type fields of every triple in place (head first, then tail).
# A triple lacking a field ends up with that key present and set to None.
for triple in triples:
    for field in ("head_type", "tail_type"):
        triple[field] = map_type(triple.get(field))

In [7]:
# Recount node types after the synonym replacement, skipping empty/None types.
new_counter = Counter()

for triple in triples:
    # Head label first, then tail label, each only when non-empty.
    present_types = [
        triple[field] for field in ("head_type", "tail_type") if triple.get(field)
    ]
    new_counter.update(present_types)

print("\n=== AFTER CLEANING: Node Types and Frequencies ===")
for label, freq in new_counter.most_common():
    print(f"{label}: {freq}")

print("\nTotal unique node types AFTER:", len(new_counter))


=== AFTER CLEANING: Node Types and Frequencies ===
condition: 441
treatment: 398
symptom: 390
outcome: 293
biomarker: 144
intervention: 142
population: 63
medication: 60
lifestyle factor: 50
supplement: 37
risk: 23
organization: 23
study type: 13
study: 11
biological process: 5
timepoint: 2
surgery: 2
process: 2
food: 2
diagnostic test: 2
measurement: 1

Total unique node types AFTER: 21


In [8]:
# Destination for the triples with consolidated node types.
# NOTE(review): absolute, machine-specific path — as written the notebook only
# runs where this exact directory exists.
OUT_PATH = "/Users/cj2837/Documents/Courses/Project/outputs/std_nodes_triples_cleaned.json"

# Serialise completely before opening the file: opening with "w" truncates it
# immediately, so a serialisation error raised while streaming into the handle
# would leave a truncated, invalid JSON file behind.
serialized = json.dumps(triples, indent=2)

with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(serialized)

print("Saved cleaned triples to:", OUT_PATH)


Saved cleaned triples to: /Users/cj2837/Documents/Courses/Project/outputs/std_nodes_triples_cleaned.json


In [9]:
import json

# Re-read the cleaned file so the summary reflects what was written to disk.
with open(OUT_PATH, "r", encoding="utf-8") as f:
    triples = json.load(f)

# --- Unique triples ---
# A triple's identity is its (head, relation, tail) tuple; the set deduplicates.
triple_tuples = {(t["head"], t["relation"], t["tail"]) for t in triples}
num_unique_triples = len(triple_tuples)

# --- Unique nodes ---
# Every head and every tail counts as a node.
nodes = {t[side] for t in triples for side in ("head", "tail")}
num_unique_nodes = len(nodes)

# --- Unique relations ---
relations = {t["relation"] for t in triples}
num_unique_relations = len(relations)

# --- Unique node types (from head_type + tail_type) ---
# Skip missing/empty types: the mapping step stores None for triples without a
# type, and the frequency cells ignore such values, so counting None here
# would make this total disagree with them.
node_types = {
    t[field]
    for t in triples
    for field in ("head_type", "tail_type")
    if t.get(field)
}
num_unique_node_types = len(node_types)

# Print results
print("=== Triple Summary ===")
print("Unique triples:", num_unique_triples)
print("Unique nodes:", num_unique_nodes)
print("Unique relations:", num_unique_relations)
print("Unique node types:", num_unique_node_types)

print("\nNode types:", node_types)
print("Relations:", relations)


=== Triple Summary ===
Unique triples: 1051
Unique nodes: 645
Unique relations: 189
Unique node types: 21

Node types: {'symptom', 'process', 'intervention', 'study', 'condition', 'supplement', 'treatment', 'medication', 'biological process', 'risk', 'diagnostic test', 'study type', 'lifestyle factor', 'surgery', 'timepoint', 'measurement', 'organization', 'population', 'biomarker', 'food', 'outcome'}
Relations: {'experiences', 'affects', 'signals', 'manages', 'modulated', 'characterized_by', 'is_safer_than', 'funds', 'treats', 'may_include', 'provides_guidance_on', 'may_reduce_risk_of', 'receives', 'impairs', 'weighed_against', 'has_low_rate_of', 'increases_severity_of', 'may_have', 'increases_risk_of', 'available_for', 'is_important_during', 'occurs_at_average_age', 'age_range', 'treated_with', 'face_disproportionate_burden_of', 'important', 'provides', 'study', 'has_condition', 'makes', 'diagnosed', 'protects', 'outcome_of', 'duration', 'developed_by', 'a_biomarker', 'used_with', 'd