In [0]:
-- Builds a synthetic diabetes cohort of 100,000 patients.
--
-- All "randomness" is derived from HASH(patient_id, <salt>), so re-running the
-- statement reproduces exactly the same table. Each generated attribute uses
-- its own salted draw: sharing one draw between two attributes makes them
-- perfectly correlated (e.g. a draw that decides sex must not also decide
-- whether a cardiovascular event occurred).
CREATE OR REPLACE TABLE ndb_poc.default.diabetes_cohort AS

WITH
-- One row per synthetic patient (ids 1..100000).
patient_ids AS (
  SELECT EXPLODE(SEQUENCE(1, 100000)) AS patient_id
),

-- Per-patient pseudo-random draws in the half-open interval [0, 1).
-- HASH() yields a signed 32-bit integer; taking `% 2147483647` before ABS()
-- keeps the magnitude within 0..2147483646, which (a) avoids ABS() overflowing
-- on the minimum INT value and (b) guarantees the quotient is strictly below 1,
-- so expressions such as CAST(rand * 10 AS INT) can never hit the upper bound.
patients_base AS (
  SELECT
    patient_id,
    ABS(HASH(patient_id, 'salt1') % 2147483647) / 2147483647.0 AS rand1,  -- sex
    ABS(HASH(patient_id, 'salt2') % 2147483647) / 2147483647.0 AS rand2,  -- age band
    ABS(HASH(patient_id, 'salt3') % 2147483647) / 2147483647.0 AS rand3,  -- age within band
    ABS(HASH(patient_id, 'salt4') % 2147483647) / 2147483647.0 AS rand4,  -- diagnosis year
    ABS(HASH(patient_id, 'salt5') % 2147483647) / 2147483647.0 AS rand5,  -- prefecture
    ABS(HASH(patient_id, 'salt6') % 2147483647) / 2147483647.0 AS rand6,  -- day within diagnosis year
    ABS(HASH(patient_id, 'salt7') % 2147483647) / 2147483647.0 AS rand7,  -- hypertension
    ABS(HASH(patient_id, 'salt8') % 2147483647) / 2147483647.0 AS rand8,  -- dyslipidemia
    ABS(HASH(patient_id, 'salt9') % 2147483647) / 2147483647.0 AS rand9,  -- CKD
    ABS(HASH(patient_id, 'cv') % 2147483647) / 2147483647.0 AS rand_cv,
    ABS(HASH(patient_id, 'follow_up') % 2147483647) / 2147483647.0 AS rand_follow_up,
    ABS(HASH(patient_id, 'hba1c_a') % 2147483647) / 2147483647.0 AS rand_hba1c_a,
    ABS(HASH(patient_id, 'hba1c_b') % 2147483647) / 2147483647.0 AS rand_hba1c_b,
    ABS(HASH(patient_id, 'bmi_a') % 2147483647) / 2147483647.0 AS rand_bmi_a,
    ABS(HASH(patient_id, 'bmi_b') % 2147483647) / 2147483647.0 AS rand_bmi_b,
    ABS(HASH(patient_id, 'missing1') % 2147483647) / 2147483647.0 AS rand_missing1,  -- HbA1c missingness
    ABS(HASH(patient_id, 'missing2') % 2147483647) / 2147483647.0 AS rand_missing2,  -- BMI missingness
    ABS(HASH(patient_id, 'missing3') % 2147483647) / 2147483647.0 AS rand_missing3   -- prefecture missingness
  FROM patient_ids
),

-- Step 1: decide sex and age first.
patients_with_demographics AS (
  SELECT
    *,
    -- Sex: slightly more men than women (58% male).
    CASE WHEN rand1 < 0.58 THEN '男' ELSE '女' END AS sex,

    -- Age distribution, skewed toward older ages:
    -- under 40: 2%, 40-49: 6%, 50-59: 14%, 60-69: 26%, 70-79: 35%, 80+: 17%
    CASE
      WHEN rand2 < 0.02 THEN 30 + CAST(rand3 * 10 AS INT)  -- 30-39 (2%)
      WHEN rand2 < 0.08 THEN 40 + CAST(rand3 * 10 AS INT)  -- 40-49 (6%)
      WHEN rand2 < 0.22 THEN 50 + CAST(rand3 * 10 AS INT)  -- 50-59 (14%)
      WHEN rand2 < 0.48 THEN 60 + CAST(rand3 * 10 AS INT)  -- 60-69 (26%)
      WHEN rand2 < 0.83 THEN 70 + CAST(rand3 * 10 AS INT)  -- 70-79 (35%)
      ELSE 80 + CAST(rand3 * 15 AS INT)                     -- 80-94 (17%)
    END AS age
  FROM patients_base
),

-- Step 2: derive the age group label and the age-dependent comorbidity probabilities.
patients_with_age_group AS (
  SELECT
    *,
    CASE
      WHEN age < 40 THEN '40歳未満'
      WHEN age < 50 THEN '40-49歳'
      WHEN age < 60 THEN '50-59歳'
      WHEN age < 70 THEN '60-69歳'
      WHEN age < 80 THEN '70-79歳'
      ELSE '80歳以上'
    END AS age_group,

    -- Hypertension probability by age (increases with age).
    CASE
      WHEN age < 50 THEN 0.30
      WHEN age < 60 THEN 0.45
      WHEN age < 70 THEN 0.55
      WHEN age < 80 THEN 0.65
      ELSE 0.70
    END AS ht_prob,

    -- Dyslipidemia probability by age (peaks in the 50s).
    CASE
      WHEN age < 50 THEN 0.35
      WHEN age < 60 THEN 0.45
      WHEN age < 70 THEN 0.40
      WHEN age < 80 THEN 0.35
      ELSE 0.30
    END AS dl_prob,

    -- Chronic kidney disease (CKD) probability by age (increases with age).
    CASE
      WHEN age < 50 THEN 0.03
      WHEN age < 60 THEN 0.06
      WHEN age < 70 THEN 0.12
      WHEN age < 80 THEN 0.20
      ELSE 0.28
    END AS ckd_prob
  FROM patients_with_demographics
),

-- Step 3: diagnosis year distribution (2015-2019, gradually increasing).
patients_with_year AS (
  SELECT
    *,
    -- Diagnosis year: 2015: 12%, 2016: 16%, 2017: 20%, 2018: 24%, 2019: 28%
    CASE
      WHEN rand4 < 0.12 THEN 2015
      WHEN rand4 < 0.28 THEN 2016
      WHEN rand4 < 0.48 THEN 2017
      WHEN rand4 < 0.72 THEN 2018
      ELSE 2019
    END AS diagnosis_year
  FROM patients_with_age_group
),

-- Step 4: decide each comorbidity independently, each from its own draw.
patients_with_comorbidities AS (
  SELECT
    *,
    rand7 < ht_prob AS has_hypertension,
    rand8 < dl_prob AS has_dyslipidemia,
    rand9 < ckd_prob AS has_ckd
  FROM patients_with_year
)

SELECT
  patient_id,
  sex,
  2020 - age AS birth_year,  -- age is interpreted as age in 2020
  age,
  age_group,

  -- Prefecture code as a zero-padded 2-character string; NULL for 2% of patients.
  -- The ten listed prefectures get explicit weights (56% in total); the
  -- remaining 44% is spread uniformly over codes 01-47.
  CASE
    WHEN rand_missing3 < 0.02 THEN NULL
    ELSE LPAD(CAST(
      CASE
        WHEN rand5 < 0.11 THEN 13  -- Tokyo 11%
        WHEN rand5 < 0.18 THEN 14  -- Kanagawa 7%
        WHEN rand5 < 0.25 THEN 27  -- Osaka 7%
        WHEN rand5 < 0.31 THEN 23  -- Aichi 6%
        WHEN rand5 < 0.36 THEN 11  -- Saitama 5%
        WHEN rand5 < 0.41 THEN 12  -- Chiba 5%
        WHEN rand5 < 0.45 THEN 28  -- Hyogo 4%
        WHEN rand5 < 0.49 THEN 1   -- Hokkaido 4%
        WHEN rand5 < 0.53 THEN 40  -- Fukuoka 4%
        WHEN rand5 < 0.56 THEN 22  -- Shizuoka 3%
        -- Here rand5 is in [0.56, 1). Rescale it back to [0, 1) before mapping
        -- to 1..47; using rand5 directly would only ever produce codes >= 27.
        ELSE 1 + CAST((rand5 - 0.56) / 0.44 * 47 AS INT)
      END AS STRING), 2, '0')
  END AS prefecture_code,

  -- Diagnosis date: uniformly distributed over every day of the diagnosis year.
  -- The offset is scaled by the actual year length (365 or 366) so that the
  -- last days of the year are reachable and the date never leaves the year.
  DATE_ADD(
    MAKE_DATE(diagnosis_year, 1, 1),
    CAST(
      rand6 * DATEDIFF(MAKE_DATE(diagnosis_year + 1, 1, 1), MAKE_DATE(diagnosis_year, 1, 1))
      AS INT
    )
  ) AS diabetes_diagnosis_date,

  -- Comorbidity flags.
  has_hypertension,
  has_dyslipidemia,
  has_ckd,

  -- Comorbidity pattern label; the most specific combination is matched first.
  CASE
    WHEN has_hypertension AND has_dyslipidemia AND has_ckd THEN '高血圧+脂質異常症+慢性腎臓病'
    WHEN has_hypertension AND has_ckd THEN '高血圧+慢性腎臓病'
    WHEN has_dyslipidemia AND has_ckd THEN '脂質異常症+慢性腎臓病'
    WHEN has_hypertension AND has_dyslipidemia THEN '高血圧+脂質異常症'
    WHEN has_ckd THEN '慢性腎臓病のみ'
    WHEN has_hypertension THEN '高血圧のみ'
    WHEN has_dyslipidemia THEN '脂質異常症のみ'
    ELSE '併存疾患なし'
  END AS comorbidity_pattern,

  -- Cardiovascular event: probability varies with age and comorbidities
  -- (between 0.01 and 0.075). Compared against a dedicated draw so the outcome
  -- is not tied to the draw that determines sex.
  rand_cv < (
    0.01  -- base risk
    + CASE WHEN age >= 80 THEN 0.03 WHEN age >= 70 THEN 0.02 WHEN age >= 60 THEN 0.01 ELSE 0 END
    + CASE WHEN has_hypertension THEN 0.015 ELSE 0 END
    + CASE WHEN has_ckd THEN 0.02 ELSE 0 END
  ) AS cv_event_occurred,

  -- Follow-up duration in months (tends to be longer for earlier diagnosis years).
  CASE diagnosis_year
    WHEN 2015 THEN CAST(36 + rand_follow_up * 24 AS INT)
    WHEN 2016 THEN CAST(30 + rand_follow_up * 24 AS INT)
    WHEN 2017 THEN CAST(24 + rand_follow_up * 20 AS INT)
    WHEN 2018 THEN CAST(12 + rand_follow_up * 20 AS INT)
    ELSE CAST(6 + rand_follow_up * 12 AS INT)
  END AS follow_up_months,

  -- HbA1c, rounded to one decimal place (8% missing).
  -- Two independent draws are summed for the random component; age and sex
  -- enter only through the explicit adjustments below.
  CASE
    WHEN rand_missing1 < 0.08 THEN NULL
    ELSE ROUND(
      6.5
      + rand_hba1c_a * 2.5
      + CASE WHEN age < 50 THEN 0.3 ELSE 0 END
      + CASE WHEN sex = '男' THEN 0.2 ELSE 0 END
      + rand_hba1c_b * 1.0,
      1
    )
  END AS hba1c,

  -- BMI, rounded to one decimal place (5% missing).
  -- Sex selects the base and spread; patients aged 70+ / 80+ are shifted down.
  CASE
    WHEN rand_missing2 < 0.05 THEN NULL
    ELSE ROUND(
      CASE
        WHEN sex = '男' THEN 24.5 + rand_bmi_a * 6 + rand_bmi_b * 4
        ELSE 23.0 + rand_bmi_a * 7 + rand_bmi_b * 5
      END
      - CASE WHEN age >= 80 THEN 2.0 WHEN age >= 70 THEN 1.0 ELSE 0 END,
      1
    )
  END AS bmi

FROM patients_with_comorbidities;