In [2]:
# %% [markdown]
# # Add `symptom_code` to every symptom
#
# * Reads:  symptom_groups_semantic_90_adjusted.json
#           diseases_raw.json
# * Writes: diseases_with_symptom_codes.json
# -----------------------------------------------------------------

# %% 0. (One-time) install deps if missing
# !pip install -U sentence-transformers scikit-learn tqdm

# %% 1. Path configuration  (edit as needed)
from pathlib import Path
GROUP_FILE   = Path("symptom_groups_semantic_90.json")
DISEASE_FILE = Path("fixed_selected_normalized.json")
OUT_FILE     = Path("diseases_with_symptom_codes.json")

assert GROUP_FILE.exists(), f"Not found: {GROUP_FILE}"
assert DISEASE_FILE.exists(), f"Not found: {DISEASE_FILE}"

# %% 2. Minimal normalizer (ASCII-fold + lower + strip punctuation)
import unicodedata, string, re, json

_punct_tbl = str.maketrans('', '', string.punctuation)
def norm(txt: str) -> str:
    txt = unicodedata.normalize("NFKD", txt).encode("ascii", "ignore").decode("ascii")
    txt = txt.lower().translate(_punct_tbl)
    return re.sub(r"\s+", " ", txt).strip()

# %% 3. Build alias → code lookup from the group file
with GROUP_FILE.open(encoding="utf-8") as f:
    groups = json.load(f)

alias_to_code = {}
for i, g in enumerate(groups, start=1):
    code = f"S{i:04d}"                      # or `g["canonical"]`
    for variant in g["aliases"] + [g["canonical"]]:
        alias_to_code[norm(variant)] = code

print("Lookup size:", len(alias_to_code))

# %% 4. Tag every symptom in the disease file
with DISEASE_FILE.open(encoding="utf-8") as f:
    diseases = json.load(f)

for dis in diseases:
    for sym in dis.get("symptoms", []):
        key = norm(sym["symptom_name"])
        sym["symptom_code"] = alias_to_code.get(key)    # None if no match

# %% 5. Save the augmented file
with OUT_FILE.open("w", encoding="utf-8") as f:
    json.dump(diseases, f, ensure_ascii=False, indent=2)

print("✅ Finished →", OUT_FILE.resolve())

# %% 6. Quick preview (optional)
import itertools, pprint
sample = next(itertools.islice((d for d in diseases if d["symptoms"][0].get("symptom_code")), 1, None))
pprint.pp(sample, depth=3, compact=True)


Lookup size: 22724
✅ Finished → D:\Nam_4\Ki_2\Grab\data\data_en\decision tree\data\diseases_with_symptom_codes.json
{'disease_name': 'Aagenaes syndrome',
 'global_index': 1,
 'symptoms': [{'symptom_name': 'Lymphedema',
               'symptom_description': 'Swelling primarily in the lower '
                                      'extremities.',
               'source_sentence': 'Lymphedema, which can lead to swelling '
                                  'primarily in the lower extremities [4][14]',
               'confidence_score': 1.0,
               'symptom_code': 'S0008'},
              {'symptom_name': 'Abdominal pain and tenderness',
               'symptom_description': 'Patients often experience abdominal '
                                      'discomfort and tenderness.',
               'source_sentence': 'Abdominal pain and tenderness: Patients '
                                  'often experience abdominal discomfort and '
                                  'tenderness [3][15

In [4]:
unique_symptoms = set()
for entry in diseases:
    for sym in entry.get("symptoms", []):
        name = sym.get("symptom_code")
        if name:
            unique_symptoms.add(name)

# 3) In kết quả
print(f"Số triệu chứng phân biệt (unique): {len(unique_symptoms):,}")

Số triệu chứng phân biệt (unique): 16,068


In [9]:
import json

# Load files
with open('symptom_groups_semantic_90.json', 'r', encoding='utf-8') as f:
    symptom_groups = json.load(f)

with open('diseases_with_symptom_codes.json', 'r', encoding='utf-8') as f:
    diseases = json.load(f)

# Tạo mapping symptom_name -> symptom_code (kiểm tra an toàn)
symptom_to_code = {}
for disease in diseases:
    if 'symptoms' in disease:  # Kiểm tra rõ ràng
        for symptom in disease['symptoms']:
            name = symptom['symptom_name'].strip().lower()
            if name not in symptom_to_code:
                symptom_to_code[name] = symptom['symptom_code']
    else:
        print(f"⚠️ Disease entry missing 'symptoms': {disease['disease_name']}")

# Thêm mã symptom_code vào symptom_groups
for group in symptom_groups:
    canonical = group['canonical'].strip().lower()
    found = False

    # Kiểm tra canonical trước
    if canonical in symptom_to_code:
        group['symptom_code'] = symptom_to_code[canonical]
        found = True
    else:
        # Kiểm tra aliases nếu không tìm thấy canonical
        for alias in group['aliases']:
            alias_norm = alias.strip().lower()
            if alias_norm in symptom_to_code:
                group['symptom_code'] = symptom_to_code[alias_norm]
                found = True
                break

    # Không tìm thấy thì để None
    if not found:
        group['symptom_code'] = None

# Save file kết quả
with open('symptom_groups_semantic_90_updated.json', 'w', encoding='utf-8') as f:
    json.dump(symptom_groups, f, ensure_ascii=False, indent=2)

print("File updated successfully.")


⚠️ Disease entry missing 'symptoms': Charge syndrome
⚠️ Disease entry missing 'symptoms': Childhood-onset dystonia with optic atrophy and basal ganglia abnormalities
⚠️ Disease entry missing 'symptoms': Childhood renal cell carcinoma with mit translocations
⚠️ Disease entry missing 'symptoms': Chime syndrome
⚠️ Disease entry missing 'symptoms': Chlamydia pneumonia
⚠️ Disease entry missing 'symptoms': Cholesterol embolism
⚠️ Disease entry missing 'symptoms': Cholesterol ester storage disease
⚠️ Disease entry missing 'symptoms': Chondrodysplasia punctata
⚠️ Disease entry missing 'symptoms': Chromosome 14q11-q22 deletion syndrome
⚠️ Disease entry missing 'symptoms': Chromosome 1q41-q42 deletion syndrome
⚠️ Disease entry missing 'symptoms': Chromosome 2q31.2 deletion syndrome
⚠️ Disease entry missing 'symptoms': Chromosome 2q37 deletion syndrome
⚠️ Disease entry missing 'symptoms': Chronic granulomatous disease
⚠️ Disease entry missing 'symptoms': Chronic mucocutaneous candidiasis
⚠️ Disea

In [8]:
import json

with open('diseases_with_symptom_codes.json', 'r', encoding='utf-8') as f:
    diseases = json.load(f)

# In thử ra cấu trúc mẫu để kiểm tra
print(type(diseases))
print(diseases[:1])  # In thử record đầu tiên


<class 'list'>
[{'disease_name': 'A53 diffuse large b-cell lymphoma', 'global_index': 0, 'symptoms': [{'symptom_name': 'Unintentional weight loss', 'symptom_description': "Unexplained weight loss due to the cancer's impact on appetite and metabolism.", 'source_sentence': "Many people with DLBCL experience unexplained weight loss due to the cancer's impact on their appetite and metabolism [1].", 'confidence_score': 1.0, 'symptom_code': 'S0001'}, {'symptom_name': 'Fatigue', 'symptom_description': "Feeling extremely tired or weak, affecting the body's ability to produce red blood cells.", 'source_sentence': "Feeling extremely tired or weak is a common symptom of DLBCL, as the cancer can affect the body's ability to produce red blood cells [2].", 'confidence_score': 1.0, 'symptom_code': 'S0002'}, {'symptom_name': 'Pain', 'symptom_description': 'Pain in the lymph nodes, bones, or joints due to the growth of cancerous cells.', 'source_sentence': 'Pain in the lymph nodes, bones, or joints can