In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip -q install -U sentence-transformers scikit-learn tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# ---------- 0. Install libraries (one-time step) -----------------


# ---------- 1. Path and model configuration --------------------
INPUT_JSON  = "/kaggle/input/symptoms-name/symptom_names.json"  # <— change to match your dataset path
OUTPUT_JSON = "/kaggle/working/symptom_groups_semantic_90.json"

MODEL_NAME  = "sentence-transformers/all-MiniLM-L6-v2"   # lightweight model (weights download is ~91 MB)
BATCH_SIZE  = 128     # encoding batch size; lower it if RAM is limited
SIM_THRESH  = 0.90    # "≈90 % similar"; the clustering step uses eps = 1 - SIM_THRESH

# ---------- 2. Load the list of symptom strings ------------------
import json, numpy as np, os
from tqdm.auto import tqdm

with open(INPUT_JSON, encoding="utf-8") as f:
    symptoms = json.load(f)

# Fail fast with a clear message: the steps that follow (sentence encoding,
# picking the shortest variant, sorting aliases) all assume a non-empty,
# flat list of strings.
if not isinstance(symptoms, list) or not symptoms:
    raise ValueError(
        f"{INPUT_JSON} must contain a non-empty JSON array of symptom strings"
    )
non_string_entries = [item for item in symptoms if not isinstance(item, str)]
if non_string_entries:
    raise TypeError(
        f"{INPUT_JSON} contains {len(non_string_entries):,} non-string entries, "
        f"e.g. {non_string_entries[0]!r}"
    )

print(f"🔎 Loaded {len(symptoms):,} symptom strings")

# ---------- 3. Sentence embeddings -------------------------------
from sentence_transformers import SentenceTransformer

# Encoding options kept together in one place. Embeddings are normalised so
# that cosine similarity equals the dot product.
encode_options = dict(
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

model = SentenceTransformer(MODEL_NAME)
embeddings = model.encode(symptoms, **encode_options)

# ---------- 4. Cluster with DBSCAN -------------------------------
from sklearn.cluster import DBSCAN

# Cosine distance = 1 − cosine similarity, so the similarity threshold
# becomes the neighbourhood radius.
eps = 1 - SIM_THRESH

# min_samples=1 makes every sample a core point, so nothing is labelled as
# noise and each symptom ends up in some cluster (possibly alone).
# NOTE(review): clusters are built by chaining eps-neighbours, so two members
# of one group may be linked only through intermediates — confirm this
# transitive grouping is the intended meaning of "≈90 % similar".
clusterer = DBSCAN(metric="cosine", eps=eps, min_samples=1)
clusterer.fit(embeddings)
labels = clusterer.labels_

# ---------- 5. Build the group ↔ alias mapping -------------------
from collections import defaultdict

# Bucket every symptom string under the cluster label it was assigned.
groups = defaultdict(list)
for cluster_id, name in zip(labels, symptoms):
    bucket = groups[cluster_id]
    bucket.append(name)

def pick_canonical(variants):
    """Return the shortest string in ``variants`` as the canonical name.

    When several variants share the minimum length, the one that appears
    first in ``variants`` wins. Raises ``ValueError`` if ``variants`` is
    empty. Adjust the selection rule here if a different canonical form is
    preferred.
    """
    shortest = min(variants, key=len)
    return shortest

# One record per cluster: the canonical name plus the de-duplicated, sorted
# list of every variant in that cluster. Single-member clusters are kept;
# filter on len(members) > 1 here if they should be dropped.
result = [
    {"canonical": pick_canonical(members), "aliases": sorted(set(members))}
    for members in groups.values()
]

# `result` includes single-alias groups too, so report the total and the
# number that actually have several aliases instead of calling every group
# "multi-alias".
multi_alias_count = sum(len(entry["aliases"]) > 1 for entry in result)
print(
    f"✅ Formed {len(result):,} groups "
    f"({multi_alias_count:,} with more than one alias)"
)

# ---------- 6. Write the result as JSON --------------------------
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    f.write(json.dumps(result, ensure_ascii=False, indent=2))

print(f"📂 Saved → {OUTPUT_JSON}")


🔎 Loaded 22,765 symptom strings


2025-05-08 06:05:37.298962: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746684337.492135      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746684337.547047      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/178 [00:00<?, ?it/s]

✅ Formed 16,075 multi-alias groups
📂 Saved → /kaggle/working/symptom_groups_semantic_90.json
