In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ============================================================
# Gemini Flash 2.0 ✦ Paraphrase Disease Name Test‑set Builder
# ============================================================

# ------------ CONFIG ------------
from kaggle_secrets import UserSecretsClient

# API key fetched through kaggle_secrets' UserSecretsClient under the label "api".
API_KEY       = UserSecretsClient().get_secret("api")
JSON_PATH     = "/kaggle/input/diseases-json-final/fixed_selected.json"   # file containing a LIST of objects
OUTPUT_JSONL  = "/kaggle/working/testdata.jsonl"   # JSON-lines output, one record per processed row
MODEL_NAME    = "gemini-2.0-flash"   # model segment interpolated into the API URL
REQS_PER_MIN  = 2000   # request budget used to derive the per-request pause below
SLEEP_SEC     = 60.0 / REQS_PER_MIN   # seconds slept after each successful request
CHKP_EVERY    = 100   # flush the output file after this many newly written records
RESUME_RUN    = True   # when True, rows already present in OUTPUT_JSONL are skipped
MAX_RETRIES   = 3   # maximum attempts per API call before the error is re-raised
TEMPERATURE   = 0.7   # sampling temperature sent in generationConfig

# ------------ LIBS ------------
import json, time, random, requests, sys
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

# Full generateContent endpoint for MODEL_NAME.
# NOTE(review): the API key is embedded in the query string, so any exception
# message or log line that echoes the request URL will expose the key. Sending
# it in the `x-goog-api-key` request header instead avoids that.
API_URL = (
    "https://generativelanguage.googleapis.com/v1beta/"
    f"models/{MODEL_NAME}:generateContent?key={API_KEY}"
)

# ------------ UTILITY FUNCTIONS ------------
def random_symptoms(symptoms_list):
    """
    Pick a random, non-empty subset of symptom names.

    Args:
        symptoms_list: Non-empty list (or tuple) of dicts, each holding a
            "symptom_name" key.

    Returns:
        A list of between 1 and len(symptoms_list) symptom names, sampled
        without replacement (order is the sampling order, not the input order).

    Raises:
        ValueError: If `symptoms_list` is not a non-empty list/tuple, e.g. a
            float NaN or None standing in for a missing value.
        KeyError: If an entry has no "symptom_name" key.
    """
    # A missing value can arrive as float NaN, which is truthy and has no
    # len(); reject it here with a clear message instead of a TypeError.
    if not isinstance(symptoms_list, (list, tuple)) or not symptoms_list:
        raise ValueError(
            "symptoms_list must be a non-empty list, got "
            f"{type(symptoms_list).__name__}: {symptoms_list!r}"
        )

    names = [s["symptom_name"] for s in symptoms_list]
    k = random.randint(1, len(names))
    return random.sample(names, k)

def build_prompt(disease, symp_names):
    """
    Build the prompt asking the LLM to rewrite each selected symptom.

    The instructions tell the model not to add or drop symptoms and not to
    mention the disease name, and to answer on a single line with the
    rewritten symptoms separated by commas.

    Args:
        disease: Official disease name, quoted verbatim in the prompt.
        symp_names: Iterable of symptom-name strings; joined with "; ".

    Returns:
        The complete prompt string, ending with "Rewritten symptoms:".
    """
    symptom_line = "; ".join(symp_names)

    # Fixed instruction text, kept as separate segments for readability.
    instructions = "".join((
        "You are a medical paraphrasing assistant.\n\n",
        "Task: Given the official disease name and SELECTED symptom names, ",
        "rewrite EACH symptom name in clear, concise English using synonyms or ",
        "brief descriptive phrases.  ⚠️  Do NOT add new symptoms, do NOT remove ",
        "any, and do NOT refer to the disease name.  Preserve the original order.\n\n",
        "Return the rewritten symptoms **in a single line**, separated ONLY by commas.\n\n",
    ))

    # Per-row inputs followed by the cue the model completes.
    row_inputs = f"Disease name: \"{disease}\"\n" + f"Symptoms: {symptom_line}\n\n"
    return instructions + row_inputs + "Rewritten symptoms:"


def call_gemini(prompt, max_output_tokens=512):
    """
    Send `prompt` to the Gemini generateContent endpoint and return the reply.

    Args:
        prompt: Text sent as the single part of the request contents.
        max_output_tokens: Upper bound on generated tokens. The default leaves
            room for a full comma-separated list of rewritten symptoms; a very
            small cap (such as 32) cuts the answer off after a few symptoms.

    Returns:
        The stripped text of the first part of the first candidate.

    Raises:
        Exception: Re-raises whatever the final attempt raised (HTTP error,
            timeout, or KeyError/IndexError for a response without candidate
            text) once MAX_RETRIES attempts have failed.
    """
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {
            "temperature": TEMPERATURE,
            "topP": 0.95,
            "maxOutputTokens": max_output_tokens,
        },
    }

    # Send the key in a header instead of the query string: requests puts the
    # full URL into its exception messages, and those messages are printed
    # below and by callers, which would otherwise expose the key.
    url = API_URL.split("?", 1)[0]
    headers = {"x-goog-api-key": API_KEY}

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.post(url, json=payload, headers=headers, timeout=30)
            r.raise_for_status()
            return r.json()["candidates"][0]["content"]["parts"][0]["text"].strip()
        except Exception as e:
            if attempt == MAX_RETRIES:
                raise
            print(f"⚠️  Retry {attempt}/{MAX_RETRIES}: {e}")
            # Back off exponentially (1s, 2s, ...): the per-request pacing
            # interval alone is far too short for a 503 to clear.
            time.sleep(max(SLEEP_SEC * attempt, 2 ** (attempt - 1)))

def load_existing(path):
    """
    Collect the `global_index` values already recorded in a JSONL output file.

    Args:
        path: Location of the JSON-lines file written by a previous run.

    Returns:
        A set of the `global_index` values found. The set is empty when
        RESUME_RUN is false or the file does not exist.
    """
    done = set()
    if not (RESUME_RUN and Path(path).exists()):
        return done

    skipped = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # blank line, nothing to parse
            try:
                done.add(json.loads(line)["global_index"])
            except (ValueError, KeyError, TypeError):
                # Malformed or truncated record (e.g. an interrupted write):
                # leave it out so the row is generated again. Only the errors
                # a bad line can raise are caught; anything else propagates.
                skipped += 1

    if skipped:
        print(f"⚠️  Ignored {skipped} malformed line(s) in {path}")
    return done

# ------------ MAIN ------------
def main():
    """
    Generate one paraphrased-symptom record per input row and append it to
    OUTPUT_JSONL.

    Reads JSON_PATH into a DataFrame, skips rows whose `global_index` was
    already written (as reported by load_existing), and for every remaining
    row samples symptoms, asks Gemini to rewrite them and writes one JSON
    line. A failure on a single row is reported and the loop continues.

    Raises:
        SystemExit: If a required column is missing from the input data.
    """
    df = pd.read_json(JSON_PATH)

    required = {"disease_name", "global_index", "symptoms"}
    if not required.issubset(df.columns):
        sys.exit(f"❌ Thiếu cột: {required - set(df.columns)}")

    processed = load_existing(OUTPUT_JSONL)
    count_new = 0

    # The context manager closes (and flushes) the file even if the loop is
    # interrupted, so records already written are not lost.
    with open(OUTPUT_JSONL, "a" if processed else "w", encoding="utf-8") as fout:
        for row in tqdm(df.itertuples(index=False), total=len(df), desc="Generating"):
            if row.global_index in processed:
                continue

            try:
                # ----- Process one row -----
                # A missing value shows up as float NaN, which is truthy, so
                # check the type explicitly before testing for emptiness.
                if not isinstance(row.symptoms, (list, tuple)):
                    raise ValueError("Missing or invalid symptoms (expected a list)")
                if not row.symptoms:
                    raise ValueError("Empty symptoms list")

                chosen   = random_symptoms(row.symptoms)
                prompt   = build_prompt(row.disease_name, chosen)
                new_name = call_gemini(prompt)

                record = {
                    "global_index": int(row.global_index),
                    "original_name": row.disease_name,
                    "paraphrased_name": new_name,
                    "used_symptoms": chosen,
                }
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                count_new += 1
                processed.add(row.global_index)

                # Periodic checkpoint so a crash loses at most CHKP_EVERY rows.
                if count_new % CHKP_EVERY == 0:
                    fout.flush()

                time.sleep(SLEEP_SEC)

            except Exception as err:
                # ----- Report the error but keep the pipeline running -----
                tqdm.write(f"⚠️  Skipped index {row.global_index}: {err}")
                continue   # move on to the next record

    print(f"✅ Hoàn tất! File lưu tại: {OUTPUT_JSONL}")

# Run the pipeline only when this code is executed as the main module
# (a notebook cell or a script), not when it is imported.
if __name__ == "__main__":
    main()



Generating:   0%|          | 0/10143 [00:00<?, ?it/s]

⚠️  Skipped index 488: Empty symptoms list
⚠️  Skipped index 1456: Empty symptoms list
⚠️  Retry 1/3: 503 Server Error: Service Unavailable for url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=[REDACTED]
⚠️  Retry 1/3: 503 Server Error: Service Unavailable for url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=[REDACTED]
⚠️  Skipped index 2198: object of type 'float' has no len()
⚠️  Skipped index 2253: object of type 'float' has no len()
⚠️  Skipped index 2260: object of type 'float' has no len()
⚠️  Skipped index 2273: object of type 'float' has no len()
⚠️  Skipped index 2275: object of type 'float' has no len()
⚠️  Skipped index 2290: object of type 'float' has no len()
⚠️  Skipped index 2291: object of type 'float' has no len()
⚠️  Skipped index 2299: object of type 'float' has no len()
⚠️  Skipped index 2339: object of type 'flo