In [None]:
import json
import os
from glob import glob

INPUT_DIR = "/Users/cj2837/Documents/Courses/Project/extraction_outputs/"
json_files = glob(os.path.join(INPUT_DIR, "*.json"))

raw_all_triples = []

def extract_triples_from_json(data):
    triples = []

    # Case A: dict of studies
    if isinstance(data, dict):
        for v in data.values():
            if isinstance(v, str):            # handle string-encoded JSON
                v = json.loads(v)
            if isinstance(v, list):
                triples.extend(v)

    # Case B: a single list
    elif isinstance(data, list):
        triples.extend(data)

    return triples


# Loop over files
for fpath in json_files:
    with open(fpath, "r") as f:
        data = json.load(f)
    triples = extract_triples_from_json(data)
    raw_all_triples.extend(triples)

print(f"Total triples collected: {len(raw_all_triples)}")


✅ Total triples collected: 1322


In [2]:
# covert all nodes to lowercase
all_triples = [
    {key: (value.lower() if isinstance(value, str) else value) for key, value in item.items()}
    for item in raw_all_triples
]

In [3]:
node_names = set()

for t in all_triples:
    node_names.add(t["head"])
    node_names.add(t["tail"])

print("Total unique node names:", len(node_names))

Total unique node names: 1069


In [11]:
edge_names = set()

for t in all_triples:
    edge_names.add(t["relation"])

print("Total unique edge names:", len(edge_names))

Total unique edge names: 303


In [12]:
nodetype_names = set()

for t in all_triples:
    nodetype_names.add(t["head_type"])
    nodetype_names.add(t["tail_type"])

print("Total unique node type names:", len(nodetype_names))

Total unique node type names: 29


## Cleaning step 1: remove parenthesis

Parenthesis contains synonyms or extra details that are not that useful but causes trouble for mapping

In [129]:
import re

nodes_with_paren = set()

for t in all_triples:
    for node in [t["head"], t["tail"]]:
        if node and re.search(r"\(.*\)", node):   # contains (...) with anything inside
            nodes_with_paren.add(node)

print(f"Found {len(nodes_with_paren)} nodes containing parentheses:\n")
for n in sorted(nodes_with_paren):
    print("-", n)

Found 96 nodes containing parentheses:

- acquired generalized hypoactive sexual desire disorder (hsdd)
- antidepressants (ssris and snris)
- antidepressants (ssris, snris)
- ba058 (abaloparatide)
- bay3427080 (nt-814)
- birth control pills, antidepressants (ssris and snris), gabapentin, fezolinetant, oxybutynin
- body mass index (bmi)
- bone mineral density (bmd)
- bone mineral density (bmd) of femoral neck
- bone mineral density (bmd) of lumbar spine
- bone mineral density (bmd) of lumbar spine (l2-l4)
- bone mineral density (bmd) of total hip
- bone-specific alkaline phosphatase (bsap)
- boniva confidence scale (bcs)
- bonviva (ibandronate)
- bonviva (ibandronate) treatment
- bonviva/boniva (ibandronate)
- change from baseline in follicle-stimulating hormone (fsh) level
- change in bone mineral density (bmd) of axial lumbar spine
- change in montgomery-asperg depression rating scale (madrs)
- change in plasma c-terminal telopeptide of collagen 1 (ctx-1)
- cumulative incidence of ven

In [4]:
# remove parenthesis
import re

def strip_parentheses(text):
    if not text:
        return text
    # Remove all (...) including content
    text = re.sub(r"\(.*?\)", "", text)
    # Remove extra spaces created by removal
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [5]:
cleaned_triples = []

for t in all_triples:
    cleaned_triples.append({
        **t,
        "head": strip_parentheses(t["head"]),
        "tail": strip_parentheses(t["tail"])
    })


## UMLS Mapping

In [6]:
node_names = set()

for t in cleaned_triples:
    node_names.add(t["head"])
    node_names.add(t["tail"])

print("Total unique node names:", len(node_names))

Total unique node names: 1047


In [None]:
import requests, time

api_key = ""
BASE_URL = "https://uts-ws.nlm.nih.gov/rest/search/current"

def umls_lookup(term):
    params = {
        "string": term,
        "apiKey": api_key,
        "searchType": "words",         # less strict than "exact"
        "approximateMatch": "true",    # allow fuzzy matching
        "returnIdType": "concept"      # return CUI
    }
    try:
        r = requests.get(BASE_URL, params=params)
        r.raise_for_status()
        data = r.json()
        results = data.get("result", {}).get("results", [])

        if results:
            top = results[0]

            return {
                "CUI": top.get("ui"),
                "preferred_name": top.get("name"),
                "semantic_source": top.get("rootSource")
            }

        return {"CUI": None, "preferred_name": None, "semantic_source": None}

    except Exception:
        return {"CUI": None, "preferred_name": None, "semantic_source": None}



In [8]:
umls_map = {}

for term in node_names:
    umls_map[term] = umls_lookup(term)


In [9]:
mapped = {k:v for k,v in umls_map.items() if v["CUI"]}
unmapped = {k:v for k,v in umls_map.items() if v["CUI"] is None}

print("Mapped:", len(mapped))
print("Unmapped:", len(unmapped))


Mapped: 574
Unmapped: 473


In [13]:
unmapped

{'having sex more often': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'vaginal wall degradative activity': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'sleep adequacy': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'perimenopausal and postmenopausal women': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'non-hormonal medicine': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'lifestyle changes, nonhormonal medications, and hormones': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'low-dose hormonal birth control': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'oral calcitonin': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'chronological aging': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},
 'bone turnover markers': {'CUI': None,
  'preferred_name': None,
  'semantic_source': None},


In [14]:
standardized = []

for t in cleaned_triples:
    head = t["head"]
    tail = t["tail"]

    head_map = umls_map.get(head, {})
    tail_map = umls_map.get(tail, {})

    standardized.append({
        "head": head_map.get("preferred_name") or head,
        "head_cui": head_map.get("CUI"),
        "head_semantic_source": head_map.get("semantic_source"),

        "relation": t["relation"],

        "tail": tail_map.get("preferred_name") or tail,
        "tail_cui": tail_map.get("CUI"),
        "tail_semantic_source": tail_map.get("semantic_source"),

        "head_type": t.get("head_type"),
        "tail_type": t.get("tail_type")
    })


In [15]:
standardized 

[{'head': 'Bonviva',
  'head_cui': 'C0918018',
  'head_semantic_source': 'MSH',
  'relation': 'treats',
  'tail': 'Osteoporosis, Postmenopausal',
  'tail_cui': 'C0029458',
  'tail_semantic_source': 'MTH',
  'head_type': 'medication',
  'tail_type': 'condition'},
 {'head': 'vitamin D',
  'head_cui': 'C0042866',
  'head_semantic_source': 'MTH',
  'relation': 'supplementation_with',
  'tail': 'Osteoporosis, Postmenopausal',
  'tail_cui': 'C0029458',
  'tail_semantic_source': 'MTH',
  'head_type': 'supplement',
  'tail_type': 'condition'},
 {'head': 'calcium',
  'head_cui': 'C0006675',
  'head_semantic_source': 'MTH',
  'relation': 'supplementation_with',
  'tail': 'Osteoporosis, Postmenopausal',
  'tail_cui': 'C0029458',
  'tail_semantic_source': 'MTH',
  'head_type': 'supplement',
  'tail_type': 'condition'},
 {'head': 'Bonviva',
  'head_cui': 'C0918018',
  'head_semantic_source': 'MSH',
  'relation': 'improves',
  'tail': 'Bone Mineral Density Test',
  'tail_cui': 'C0177804',
  'tail_se

## Cleaning step 2: keep 1 relation


In [16]:
from collections import Counter, defaultdict

# Compute overall relation frequency
relation_counts = Counter(t["relation"] for t in standardized)

# Build mapping: (head, tail) → list of relations
pair_relations = defaultdict(list)
for t in standardized:
    key = (t["head"], t["tail"])
    pair_relations[key].append(t["relation"])

# Identify conflict pairs
conflicts = {k: set(v) for k, v in pair_relations.items() if len(set(v)) > 1}

print(f"Found {len(conflicts)} conflicting head–tail pairs.\n")

Found 54 conflicting head–tail pairs.



In [17]:
conflicts

{('Bonviva', 'Increased bone mineral density of lumbar spine'): {'affects',
  'increases'},
 ('ibandronate', 'risedronate'): {'is_preferred_over', 'preferred_over'},
 ('denosumab', 'Increased bone mineral density of lumbar spine'): {'affects',
  'improves',
  'increases'},
 ('denosumab', 'total hip bone mineral density'): {'affects',
  'improves',
  'increases'},
 ('Hot flushes', 'Menopause'): {'associated_with',
  'lasts_after',
  'may_occur_during'},
 ('romosozumab', 'bone mineral density at the lumbar spine'): {'affects',
  'increases'},
 ('denosumab', 'bone mineral density at lumbar spine'): {'affects',
  'increases'},
 ('romosozumab', 'bone mineral density at the femoral neck'): {'affects',
  'increases'},
 ('romosozumab', 'bone mineral density at the total hip'): {'affects',
  'increases'},
 ('postmenopausal women with osteoporosis', 'romosozumab'): {'receives',
  'treated_with'},
 ('Premature Menopause',
  'Human Papillomavirus-40'): {'occurs_at_or_before_age', 'occurs_before_ag

In [18]:
def pick_best_relation(rel_set):
    rel_list = list(rel_set)

    # 1. If "affects" is present AND there are other more specific relations → remove it
    if "affects" in rel_set and len(rel_set) > 1:
        rel_list = [r for r in rel_list if r != "affects"]

    # 2. Rank remaining relations by global frequency (descending)
    rel_list.sort(key=lambda r: relation_counts[r], reverse=True)

    # 3. If tie, lexicographically break the tie
    # (stable sort ensures lexicographic only applies when frequencies are equal)
    return rel_list[0]



In [19]:
cleaned_triples = []
change_report = []

for (head, tail), relations in conflicts.items():
    best = pick_best_relation(relations)

    # Record the decision
    change_report.append({
        "head": head,
        "tail": tail,
        "relations": sorted(list(relations)),
        "kept": best
    })

In [20]:
for t in standardized:
    key = (t["head"], t["tail"])
    if key in conflicts:
        # keep only if relation == chosen best
        if t["relation"] == [r for r in change_report if r["head"] == key[0] and r["tail"] == key[1]][0]["kept"]:
            cleaned_triples.append(t)
    else:
        cleaned_triples.append(t)

print(f"Original triples: {len(standardized)}")
print(f"After resolving relation conflicts: {len(cleaned_triples)}")
print(f"Removed {len(standardized) - len(cleaned_triples)} redundant triples.")

Original triples: 1322
After resolving relation conflicts: 1243
Removed 79 redundant triples.


In [21]:
print("\n=== Relation Conflict Resolution Report ===\n")
for item in change_report:
    print(f"- {item['head']}  ↔  {item['tail']}")
    print(f"  Relations: {item['relations']}  →  kept **{item['kept']}**\n")



=== Relation Conflict Resolution Report ===

- Bonviva  ↔  Increased bone mineral density of lumbar spine
  Relations: ['affects', 'increases']  →  kept **increases**

- ibandronate  ↔  risedronate
  Relations: ['is_preferred_over', 'preferred_over']  →  kept **preferred_over**

- denosumab  ↔  Increased bone mineral density of lumbar spine
  Relations: ['affects', 'improves', 'increases']  →  kept **increases**

- denosumab  ↔  total hip bone mineral density
  Relations: ['affects', 'improves', 'increases']  →  kept **increases**

- Hot flushes  ↔  Menopause
  Relations: ['associated_with', 'lasts_after', 'may_occur_during']  →  kept **associated_with**

- romosozumab  ↔  bone mineral density at the lumbar spine
  Relations: ['affects', 'increases']  →  kept **increases**

- denosumab  ↔  bone mineral density at lumbar spine
  Relations: ['affects', 'increases']  →  kept **increases**

- romosozumab  ↔  bone mineral density at the femoral neck
  Relations: ['affects', 'increases']  →

## Cleaning step 3: Remove duplicate triples

In [22]:
# Remove duplicates based on (head, relation, tail)
unique_triples = []
seen = set()

for t in cleaned_triples:  
    key = (t["head"], t["relation"], t["tail"])
    if key not in seen:
        unique_triples.append(t)
        seen.add(key)

print(f"Before deduplication: {len(cleaned_triples)} triples")
print(f"After deduplication:  {len(unique_triples)} triples")
print(f"Removed {len(cleaned_triples) - len(unique_triples)} duplicates.")


Before deduplication: 1243 triples
After deduplication:  1197 triples
Removed 46 duplicates.


In [23]:
# to view duplicates
from collections import Counter

keys = [(t["head"], t["relation"], t["tail"]) for t in cleaned_triples]
counts = Counter(keys)

duplicates = [k for k, v in counts.items() if v > 1]

print("Duplicate triples:")
for k in duplicates:
    print(f"  {k}  → appears {counts[k]} times")


Duplicate triples:
  ('Bonviva', 'treats', 'Osteoporosis, Postmenopausal')  → appears 2 times
  ('vitamin D', 'supplementation_with', 'Osteoporosis, Postmenopausal')  → appears 2 times
  ('calcium', 'supplementation_with', 'Osteoporosis, Postmenopausal')  → appears 2 times
  ('Bonviva', 'increases', 'Increased bone mineral density of lumbar spine')  → appears 2 times
  ('Both ovaries', 'produce', 'Estrogen measurement')  → appears 2 times
  ('Both ovaries', 'produce', 'progesterone')  → appears 2 times
  ('Obtain multiple measurements of endometrial thickness from various angles for women presenting with postmenopausal bleeding', 'has_condition', 'Osteoporosis')  → appears 5 times
  ('Esmirtazapine', 'treats', 'Vasomotor menopausal symptoms')  → appears 2 times
  ('Vasomotor menopausal symptoms', 'associated_with', 'Menopause')  → appears 2 times
  ('mk-6913', 'reduces', 'Hot flushes')  → appears 2 times
  ('denosumab', 'increases', 'bone mineral density at lumbar spine')  → appears 2 

In [24]:
len(duplicates)

38

## saved

In [None]:
import json

OUT_PATH = "/Users/cj2837/Documents/Courses/Project/outputs/mapped_triples.json"

with open(OUT_PATH, "w") as f:
    json.dump(unique_triples, f, indent=2, ensure_ascii=False)

print("Saved standardized triples to:", OUT_PATH)
print("Total triples saved:", len(unique_triples))


✅ Saved standardized triples to: /Users/cj2837/Documents/Courses/Project/outputs/mapped_triples.json
Total triples saved: 1197


## Sanity Checks

#### Node types

In [26]:
# to check node types
from collections import defaultdict

# Counters
type_to_nodes = defaultdict(set)   # maps type → set of node names

for t in unique_triples: 
    h, ht = t.get("head"), t.get("head_type")
    ta, tat = t.get("tail"), t.get("tail_type")
    
    if ht:
        type_to_nodes[ht].add(h)
    if tat:
        type_to_nodes[tat].add(ta)

# ---- Print nicely ----
print("=== Node Types Summary ===\n")
total_types = len(type_to_nodes)

for node_type, nodes in sorted(type_to_nodes.items()):
    count = len(nodes)
    example = next(iter(nodes)) if nodes else "N/A"

    print(f"• {node_type}: {count} nodes")
    print(f"    example: {example}\n")

print(f"Total node types: {total_types}")


=== Node Types Summary ===

• biological process: 5 nodes
    example: calcium usage

• biomarker: 103 nodes
    example: Epithelial cells.parabasal | Vaginal | Hematology and Cell counts

• condition: 119 nodes
    example: inflammatory disorders such as rheumatoid arthritis and postmenopausal osteoporosis

• diagnostic criterion: 1 nodes
    example: Perimenopausal 6 to 12 Months Since LMP Question

• diagnostic test: 1 nodes
    example: blood hormone levels (lab test)

• event: 1 nodes
    example: final menstrual period

• finding: 1 nodes
    example: hormone replacement therapy did not prevent heart disease

• food: 2 nodes
    example: soy containing foods

• intervention: 79 nodes
    example: having sex more often

• lifestyle factor: 28 nodes
    example: Date quit tobacco smoking

• measurement: 1 nodes
    example: One to two times per year

• medication: 27 nodes
    example: fluoxetine

• organization: 9 nodes
    example: nccih-funded researchers

• outcome: 293 nodes
 

In [27]:
# to check Nodes with Multiple Assigned Types
from collections import defaultdict

# Map each node → set of assigned types
node_to_types = defaultdict(set)

for t in unique_triples:
    tail = t["tail"]
    head_type = t.get("head_type")
    tail_type = t.get("tail_type")

    if head_type:
        node_to_types[head].add(head_type)
    if tail_type:
        node_to_types[tail].add(tail_type)

# ---- Find conflicts ----
conflicts = {node: types for node, types in node_to_types.items() if len(types) > 1}

print("=== Nodes with Multiple Assigned Types ===\n")

if not conflicts:
    print("✅ No node appears with multiple types. (Good!)")
else:
    for node, types in conflicts.items():
        print(f"• {node}")
        print(f"   types: {sorted(types)}\n")

print(f"Total conflicted nodes: {len(conflicts)}")


=== Nodes with Multiple Assigned Types ===

• Hormone replacement therapy
   types: ['biomarker', 'condition', 'event', 'food', 'intervention', 'lifestyle factor', 'medication', 'organization', 'outcome', 'population', 'research study', 'risk', 'risk factor', 'study', 'study type', 'supplement', 'surgery', 'symptom', 'treatment']

• Bone Mineral Density Test
   types: ['biomarker', 'outcome']

• Increased bone mineral density of lumbar spine
   types: ['biomarker', 'outcome']

• Osteopenia
   types: ['condition', 'outcome']

• ibandronate
   types: ['medication', 'treatment']

• risedronate
   types: ['medication', 'treatment']

• Fracture
   types: ['condition', 'outcome']

• Menopausal symptom
   types: ['condition', 'outcome', 'symptom']

• cumulative amenorrhea
   types: ['outcome', 'symptom']

• Finding of hormone level
   types: ['biological process', 'biomarker']

• PRO-CTCAE V1.0 - Hot Flashes Severity
   types: ['outcome', 'symptom']

• loss of bone density
   types: ['outcome

#### Relationship

In [28]:
from collections import Counter

# Count frequency of each relation
relation_counts = Counter(t["relation"] for t in unique_triples)

# Sort by count (descending)
sorted_relations = relation_counts.most_common()

print("Relations sorted by frequency:")
for relation, count in sorted_relations:
    print(f"{relation}: {count}")

print("\nTotal unique relations:", len(relation_counts))


Relations sorted by frequency:
affects: 117
reduces: 90
increases: 62
treats: 59
causes: 51
improves: 45
increases_risk_of: 42
includes: 38
associated_with: 30
has_symptom: 26
is_a_type_of: 21
include: 19
may_mitigate: 18
may_help_with: 17
is_a_symptom_of: 16
symptom: 15
decreases: 12
may_increase_risk_of: 11
caused_by: 11
studies: 9
prevents: 8
measured_by: 8
reduces_risk_of: 8
may_cause: 8
receives: 7
influences: 7
includes_symptom: 7
may_increase: 7
studied_for: 7
experiences: 6
used_with: 6
may_treat: 6
linked_to: 6
treat: 6
administered_as: 6
treated_with: 5
recommended_for: 5
is_a_stage_of: 5
compared_to: 4
assesses: 4
generally_have: 4
does_not_recommend: 4
may_contribute_to: 4
is_characterized_by: 4
increase_risk_of: 4
helps_with: 4
can_lead_to: 4
may_help: 4
can_help_with: 4
may_indicate: 4
is_a_risk_factor_for: 4
focuses_on: 4
does_not_reduce_risk_of: 4
can cause: 4
secretes: 3
has_condition: 3
related_to: 3
measured_in: 3
prescribed_for: 3
used_for: 3
have_not_been_clearly_s

In [29]:
# same head and tail, but different relation

from collections import defaultdict

# Group relations by (head, tail)
pair_relations = defaultdict(set)

for t in unique_triples:
    key = (t["head"], t["tail"])
    pair_relations[key].add(t["relation"])

# Find cases where same head/tail appear with multiple relations
conflicts = {k: v for k, v in pair_relations.items() if len(v) > 1}

print(f"Found {len(conflicts)} head–tail pairs with conflicting relations:\n")

for (head, tail), relations in conflicts.items():
    print(f"- {head}  ↔  {tail}")
    print(f"  Relations: {sorted(list(relations))}\n")


Found 0 head–tail pairs with conflicting relations:

