# 02. Silver Layer: RA患者定義とデータ変換

## 概要
Bronzeデータに対して論文のRA定義を適用し、分析用のSilverデータを作成します。

## 論文の7つのRA定義

| 定義 | DMARDs処方条件 | CS処方条件 | 患者数 | 有病率(%) |
|------|---------------|-----------|--------|----------|
| Definition 0 | 条件なし（ICD-10コードのみ） | 条件なし | 1,116,122 | 0.88 |
| Definition 1 | csDMARDs/bDMARDs/tsDMARDs 1ヶ月以上 | または CS 2ヶ月以上（DMARDs条件とのOR） | 1,026,634 | 0.81 |
| Definition 2 | csDMARDs/bDMARDs/tsDMARDs 1ヶ月以上 | 条件なし | 869,340 | 0.69 |
| **Definition 3** | **csDMARDs/bDMARDs/tsDMARDs 2ヶ月以上** | **条件なし** | **825,772** | **0.65** |
| Definition 4 | csDMARDs/bDMARDs/tsDMARDs 6ヶ月以上 | 条件なし | 583,137 | 0.46 |
| Definition 5 | MTX/SSZ/TAC/BUC/IGT/bDMARDs/tsDMARDs 1ヶ月以上 | 条件なし | 841,599 | 0.66 |
| Definition 6 | MTX/SSZ/TAC/BUC/IGT/bDMARDs/tsDMARDs 2ヶ月以上 | 条件なし | 798,114 | 0.63 |

**本分析ではDefinition 3を採用**（論文と同様）。なお、本ノートブックで実際に算出するのは Definition 0・2・3・4 のみで、Definition 1・5・6 は未実装です。

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# Input (Bronze) and output (Silver) data locations, relative to the notebook.
BRONZE_DIR = "../data/bronze"
SILVER_DIR = "../data/silver"

# Create the Silver output directory up front; an already existing
# directory is not an error.
os.makedirs(name=SILVER_DIR, exist_ok=True)

print("Silver Layer処理を開始します...")

## 1. Bronzeデータの読み込み

In [None]:
# Load the Bronze tables; each one is stored as <BRONZE_DIR>/<name>.parquet.
def _read_bronze(table_name):
    """Read one Bronze parquet table identified by its file stem."""
    return pd.read_parquet(f"{BRONZE_DIR}/{table_name}.parquet")


df_patients = _read_bronze("patients")
df_re = _read_bronze("re_receipt")
df_sy = _read_bronze("sy_disease")
df_iy = _read_bronze("iy_medication")
df_si = _read_bronze("si_procedure")
df_ho = _read_bronze("ho_insurer")

# Report the row count of every table that was just loaded.
print("Bronzeデータを読み込みました:")
for _table_label, _table in (
    ("patients", df_patients),
    ("re_receipt", df_re),
    ("sy_disease", df_sy),
    ("iy_medication", df_iy),
    ("si_procedure", df_si),
    ("ho_insurer", df_ho),
):
    print(f"  - {_table_label}: {len(_table)} records")

## 2. RA関連コードの定義

In [None]:
# RA-related ICD-10 codes (same definition as the paper), built per family
# from the fourth-character suffixes that are kept.
RA_ICD10_CODES = (
    # M05.x: seropositive rheumatoid arthritis
    ["M05" + suffix for suffix in "012389"]
    # M06.x: other rheumatoid arthritis (M061 and M064 are excluded)
    + ["M06" + suffix for suffix in "02389"]
    # M08.x: juvenile arthritis (M081 and M082 are excluded)
    + ["M08" + suffix for suffix in "03489"]
)

# DMARD drug codes, one list per drug class.
CSDMARD_CODES = [
    "1199101", "1199102", "1199201", "1199301",
    "1199401", "1199501", "1199601",
]
BDMARD_TNFI_CODES = [
    "4400101", "4400102", "4400103", "4400104", "4400105",
]
BDMARD_IL6I_CODES = ["4400201", "4400202"]
BDMARD_ABT_CODES = ["4400301"]
TSDMARD_JAKI_CODES = ["4400401", "4400402"]

# Every DMARD code, concatenated in class order.
ALL_DMARD_CODES = [
    *CSDMARD_CODES,
    *BDMARD_TNFI_CODES,
    *BDMARD_IL6I_CODES,
    *BDMARD_ABT_CODES,
    *TSDMARD_JAKI_CODES,
]

# Oral corticosteroid codes.
CS_CODES = ["2454001", "2454002", "2454003"]

print(f"RA関連ICD-10コード数: {len(RA_ICD10_CODES)}")
print(f"DMARDsコード数: {len(ALL_DMARD_CODES)}")

## 3. RA関連ICD-10コードを持つ患者の抽出（Definition 0）

In [None]:
def extract_ra_icd10_patients(df_sy: pd.DataFrame, ra_codes: list) -> set:
    """Return the patient keys that have at least one RA-related diagnosis.

    Parameters
    ----------
    df_sy : disease table with an 'ICD10コード' column and a '共通キー'
        (patient key) column.
    ra_codes : ICD-10 codes that count as RA-related.

    Returns
    -------
    Set of distinct '共通キー' values from rows whose code is in ``ra_codes``.
    """
    has_ra_code = df_sy['ICD10コード'].isin(ra_codes)
    matching_keys = df_sy.loc[has_ra_code, '共通キー']
    return set(matching_keys.unique())

# Definition 0: membership is decided by the ICD-10 code alone.
definition_0_patients = extract_ra_icd10_patients(
    df_sy=df_sy,
    ra_codes=RA_ICD10_CODES,
)
print(f"Definition 0 (ICD-10のみ): {len(definition_0_patients)} patients")

## 4. DMARDs処方月数の計算

In [None]:
def calculate_prescription_months(
    df_iy: pd.DataFrame,
    df_re: pd.DataFrame,
    drug_codes: list,
    patient_set: set,
) -> pd.DataFrame:
    """Count, per patient, the distinct treatment months with a target drug.

    Parameters
    ----------
    df_iy : medication table with '共通キー', '検索番号' and '医薬品コード'.
    df_re : receipt table with '検索番号' and '診療年月' (treatment year-month).
    drug_codes : drug codes to count.
    patient_set : patient keys ('共通キー') to restrict the count to.

    Returns
    -------
    DataFrame with columns '共通キー' and 'prescription_months'. Only
    patients with at least one matching prescription row appear; when there
    are none, an empty frame with those two columns is returned.
    """
    in_cohort = df_iy['共通キー'].isin(patient_set)
    is_target_drug = df_iy['医薬品コード'].isin(drug_codes)
    target_rx = df_iy.loc[in_cohort & is_target_drug]

    if target_rx.empty:
        return pd.DataFrame(columns=['共通キー', 'prescription_months'])

    # Attach the treatment year-month of the receipt each prescription is on.
    # Prescriptions without a matching receipt keep a missing month, which
    # nunique() below does not count.
    receipt_months = df_re[['検索番号', '診療年月']].drop_duplicates()
    target_rx = target_rx.merge(receipt_months, on='検索番号', how='left')

    return (
        target_rx.groupby('共通キー')['診療年月']
        .nunique()
        .rename('prescription_months')
        .reset_index()
    )

# Distinct DMARD prescription months for every Definition 0 patient.
df_dmard_months = calculate_prescription_months(
    df_iy=df_iy,
    df_re=df_re,
    drug_codes=ALL_DMARD_CODES,
    patient_set=definition_0_patients,
)

print(f"DMARDs処方のある患者数: {len(df_dmard_months)}")
print("\n処方月数の分布:")
print(df_dmard_months['prescription_months'].describe())

## 5. 各RA定義の適用

In [None]:
def apply_ra_definitions(
    definition_0_patients: set,
    df_dmard_months: pd.DataFrame,
    df_iy: pd.DataFrame,
    df_re: pd.DataFrame,
) -> dict:
    """Build the patient sets for the RA definitions computed here.

    Only Definitions 0, 2, 3 and 4 are produced; Definitions 1, 5 and 6 of
    the paper are not computed by this function.

    Parameters
    ----------
    definition_0_patients : patient keys with an RA-related ICD-10 code.
    df_dmard_months : frame with '共通キー' and 'prescription_months'.
    df_iy : accepted for interface compatibility; not used.
    df_re : accepted for interface compatibility; not used.

    Returns
    -------
    Dict with keys 'def_0', 'def_2', 'def_3', 'def_4' (in that order), each
    mapping to a set of patient keys. 'def_0' is the input set itself; the
    others are Definition 0 patients with at least 1, 2 and 6 prescription
    months respectively.
    """
    # Minimum number of distinct DMARD prescription months per definition
    # (Definition 3 is the primary definition of the analysis).
    min_months_by_definition = {'def_2': 1, 'def_3': 2, 'def_4': 6}

    results = {'def_0': definition_0_patients}
    months = df_dmard_months['prescription_months']
    for def_name, min_months in min_months_by_definition.items():
        qualifying = set(df_dmard_months.loc[months >= min_months, '共通キー'])
        results[def_name] = definition_0_patients & qualifying

    return results

# Apply every implemented definition to the Definition 0 cohort.
ra_definitions = apply_ra_definitions(
    definition_0_patients=definition_0_patients,
    df_dmard_months=df_dmard_months,
    df_iy=df_iy,
    df_re=df_re,
)

print("=" * 50, "RA定義別の患者数", "=" * 50, sep="\n")

# Prevalence is expressed relative to all rows of the patient table.
total_population = len(df_patients)
for def_name, patients in ra_definitions.items():
    prevalence = len(patients) / total_population * 100
    print(f"{def_name}: {len(patients):,} patients ({prevalence:.2f}%)")

## 6. Definition 3に基づくRA患者データの作成

In [None]:
# Patient keys that satisfy Definition 3.
ra_patients_def3 = ra_definitions['def_3']

# Basic patient attributes restricted to the Definition 3 cohort; copied so
# that columns added later do not touch df_patients.
is_def3_patient = df_patients['共通キー'].isin(ra_patients_def3)
df_ra_patients = df_patients.loc[is_def3_patient].copy()

# 年齢群の再計算（基準日: 2017年10月1日）
def assign_age_group(age: int) -> str:
    """Map an age in years to its age-group label.

    Labels for ages 16 and above are "16-19", "20-29", ..., "70-79",
    "80-84" and "85+". Bands are matched with exclusive upper bounds, so a
    fractional age such as 19.5 falls into "16-19" instead of slipping
    between two integer bands.

    Values that previously fell through to "85+" by accident are now
    labelled explicitly:

    * ages below 16 return "<16";
    * missing ages (None / NaN) return "unknown".

    Parameters
    ----------
    age : age in years (int or float), or a missing value.

    Returns
    -------
    The age-group label as a string.
    """
    if pd.isna(age):
        return "unknown"
    if age < 16:
        return "<16"

    # (exclusive upper bound, label), in ascending order.
    bands = (
        (20, "16-19"),
        (30, "20-29"),
        (40, "30-39"),
        (50, "40-49"),
        (60, "50-59"),
        (70, "60-69"),
        (80, "70-79"),
        (85, "80-84"),
    )
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    return "85+"

# Derive the age-group label for every cohort member.
df_ra_patients['age_group'] = df_ra_patients['age'].apply(assign_age_group)

# Sex code '2' is the one reported as female by the ratio printed below.
is_female = df_ra_patients['sex'] == '2'

print(f"Definition 3 RA患者数: {len(df_ra_patients)}")
print("\n性別分布:")
print(df_ra_patients['sex'].value_counts())
print(f"\n女性比率: {is_female.mean():.1%}")

## 7. 薬剤使用パターンの集計

In [None]:
def calculate_medication_usage(df_iy: pd.DataFrame, ra_patients: set,
                               drug_categories: dict = None) -> pd.DataFrame:
    """Flag, per RA patient, which drug categories were ever prescribed.

    The flags are computed once per category with vectorized membership
    tests instead of filtering the medication table once per patient, and
    the category columns are always present, even for an empty cohort.

    Parameters
    ----------
    df_iy : medication table with '共通キー' (patient key) and
        '医薬品コード' (drug code) columns.
    ra_patients : patient keys to report on.
    drug_categories : optional mapping of category name to a list of drug
        codes. Defaults to the built-in csDMARD / bDMARD / JAK inhibitor /
        corticosteroid categories below.

    Returns
    -------
    DataFrame with one row per key in ``ra_patients`` (in its iteration
    order): column '共通キー' followed by one boolean column per category.
    Patients without any medication row get False everywhere.
    """
    if drug_categories is None:
        drug_categories = {
            'MTX': ['1199101', '1199102'],
            'SSZ': ['1199201'],
            'BUC': ['1199401'],
            'TAC': ['1199301'],
            'IGT': ['1199501'],
            'LEF': ['1199601'],
            'TNFI': ['4400101', '4400102', '4400103', '4400104', '4400105'],
            'IL6I': ['4400201', '4400202'],
            'ABT': ['4400301'],
            'JAKi': ['4400401', '4400402'],
            'CS': ['2454001', '2454002', '2454003'],
        }

    patients = list(ra_patients)

    # Medication rows of the cohort; only the two columns used below.
    df_ra_meds = df_iy.loc[
        df_iy['共通キー'].isin(patients), ['共通キー', '医薬品コード']
    ]

    df_drug_usage = pd.DataFrame({'共通キー': patients})
    for cat_name, codes in drug_categories.items():
        # Patients having at least one prescription row in this category.
        users = set(
            df_ra_meds.loc[df_ra_meds['医薬品コード'].isin(codes), '共通キー']
        )
        df_drug_usage[cat_name] = df_drug_usage['共通キー'].isin(users)

    return df_drug_usage

# Per-patient drug category flags for the Definition 3 cohort.
df_drug_usage = calculate_medication_usage(df_iy, ra_patients=ra_patients_def3)

print("薬剤使用率（全RA患者）:")
print("=" * 40)
# Every column except the patient key is a usage flag; its mean is the
# share of patients with that flag set.
for col in [name for name in df_drug_usage.columns if name != '共通キー']:
    usage_rate = df_drug_usage[col].mean() * 100
    print(f"{col}: {usage_rate:.1f}%")

## 8. 診療行為（手術・検査）の集計

In [None]:
def calculate_procedure_rates(df_si: pd.DataFrame, ra_patients: set,
                              procedure_categories: dict = None) -> pd.DataFrame:
    """Flag, per RA patient, which surgery/examination types were recorded.

    The flags are computed once per category with vectorized membership
    tests instead of filtering the procedure table once per patient, and
    the category columns are always present, even for an empty cohort.

    Parameters
    ----------
    df_si : procedure table with '共通キー' (patient key) and
        'procedure_type' columns.
    ra_patients : patient keys to report on.
    procedure_categories : optional mapping of output column name to the
        'procedure_type' value it stands for. Defaults to the built-in
        categories below.

    Returns
    -------
    DataFrame with one row per key in ``ra_patients`` (in its iteration
    order): column '共通キー' followed by one boolean column per category.
    Patients without any procedure row get False everywhere.
    """
    if procedure_categories is None:
        procedure_categories = {
            'TJR': 'TJR',                    # total joint replacement
            'ARTHROPLASTY': 'ARTHROPLASTY',  # arthroplasty
            'SYNOVECTOMY': 'SYNOVECTOMY',    # synovectomy
            'ULTRASOUND': 'ULTRASOUND',      # joint ultrasound
            'BMD': 'BMD',                    # bone mineral density measurement
        }

    patients = list(ra_patients)

    # Procedure rows of the cohort; only the two columns used below.
    df_ra_proc = df_si.loc[
        df_si['共通キー'].isin(patients), ['共通キー', 'procedure_type']
    ]

    df_proc_usage = pd.DataFrame({'共通キー': patients})
    for cat_name, proc_type in procedure_categories.items():
        # Patients having at least one row of this procedure type.
        treated = set(
            df_ra_proc.loc[df_ra_proc['procedure_type'] == proc_type, '共通キー']
        )
        df_proc_usage[cat_name] = df_proc_usage['共通キー'].isin(treated)

    return df_proc_usage

# Per-patient surgery/examination flags for the Definition 3 cohort.
df_proc_usage = calculate_procedure_rates(df_si, ra_patients=ra_patients_def3)

print("手術・検査実施率（全RA患者）:")
print("=" * 40)
# Every column except the patient key is a flag; its mean is the share of
# patients with that flag set.
for col in [name for name in df_proc_usage.columns if name != '共通キー']:
    usage_rate = df_proc_usage[col].mean() * 100
    print(f"{col}: {usage_rate:.2f}%")

## 9. Silverデータの作成と保存

In [None]:
# RA patient master: cohort attributes, then drug flags, procedure flags and
# DMARD prescription months, each left-joined on the patient key.
df_ra_master = (
    df_ra_patients
    .merge(df_drug_usage, on='共通キー', how='left')
    .merge(df_proc_usage, on='共通キー', how='left')
    .merge(df_dmard_months, on='共通キー', how='left')
)

# Any biologic DMARD (TNF inhibitor, IL-6 inhibitor or abatacept).
df_ra_master['bDMARDs'] = (
    df_ra_master['TNFI'] | df_ra_master['IL6I'] | df_ra_master['ABT']
)

# Any DMARD at all: OR the individual flags together from left to right.
any_dmard = df_ra_master['MTX']
for flag_col in ('SSZ', 'BUC', 'TAC', 'IGT', 'LEF', 'bDMARDs', 'JAKi'):
    any_dmard = any_dmard | df_ra_master[flag_col]
df_ra_master['any_DMARD'] = any_dmard

# Any RA-related surgery.
any_surgery = df_ra_master['TJR']
for flag_col in ('ARTHROPLASTY', 'SYNOVECTOMY'):
    any_surgery = any_surgery | df_ra_master[flag_col]
df_ra_master['any_RA_surgery'] = any_surgery

print(f"RA患者マスタのカラム: {df_ra_master.columns.tolist()}")
print(f"\nレコード数: {len(df_ra_master)}")

In [None]:
# Summary table: patient count and prevalence (% of all rows in the patient
# table) for each definition that is computed.
summary_definitions = ['def_0', 'def_2', 'def_3', 'def_4']
patient_counts = [len(ra_definitions[name]) for name in summary_definitions]

df_definitions = pd.DataFrame({
    'definition': summary_definitions,
    'n_patients': patient_counts,
    'prevalence_pct': [n / len(df_patients) * 100 for n in patient_counts],
})

print("RA定義別サマリー:")
print(df_definitions)

In [None]:
# Persist the Silver tables: the Definition 3 master and the summary.
for _frame, _file_name in (
    (df_ra_master, "ra_patients_def3.parquet"),
    (df_definitions, "ra_definitions_summary.parquet"),
):
    _frame.to_parquet(f"{SILVER_DIR}/{_file_name}", index=False)

# Persist the bare patient-key list of every definition. These files are
# named ra_patients_def_<n>.parquet (with an underscore), so the def_3 key
# list does not overwrite the ra_patients_def3.parquet master written above.
for def_name, patients in ra_definitions.items():
    df_def = pd.DataFrame({'共通キー': list(patients)})
    df_def.to_parquet(f"{SILVER_DIR}/ra_patients_{def_name}.parquet", index=False)

print("\nSilverデータを保存しました:")
print(f"  - ra_patients_def3.parquet: {len(df_ra_master)} records")
print(f"  - ra_definitions_summary.parquet: {len(df_definitions)} records")
for def_name in ra_definitions:
    print(f"  - ra_patients_{def_name}.parquet")

## 10. Silver Layer データ品質チェック

In [None]:
banner = "=" * 60
print(banner)
print("Silver Layer データ品質サマリー")
print(banner)

n_master = len(df_ra_master)
# Sex code '2' is the one reported as female below.
female_mask = df_ra_master['sex'] == '2'

print("\n【Definition 3 RA患者の基本統計】")
print(f"総患者数: {n_master:,}")
print(f"女性患者数: {female_mask.sum():,}")
print(f"女性比率: {female_mask.mean():.1%}")

# Age-group distribution, ordered by label.
print("\n【年齢分布】")
age_dist = df_ra_master['age_group'].value_counts().sort_index()
for group, count in age_dist.items():
    pct = count / n_master * 100
    print(f"  {group}: {count} ({pct:.1f}%)")

# Share of patients aged 65 and over.
print("\n【65歳以上の患者割合】")
elderly_count = (df_ra_master['age'] >= 65).sum()
print(f"  {elderly_count} ({elderly_count/n_master*100:.1f}%)")

# Share of patients aged 85 and over.
print("\n【85歳以上の患者割合】")
very_elderly = (df_ra_master['age'] >= 85).sum()
print(f"  {very_elderly} ({very_elderly/n_master*100:.1f}%)")

drug_cols = ['MTX', 'SSZ', 'BUC', 'TAC', 'IGT', 'LEF', 'TNFI', 'IL6I', 'ABT', 'JAKi', 'CS', 'bDMARDs']
proc_cols = ['TJR', 'ARTHROPLASTY', 'SYNOVECTOMY', 'ULTRASOUND', 'BMD', 'any_RA_surgery']

# Both rate sections share the same layout; only the heading, the columns
# and the number of decimals differ. Columns absent from the master are
# skipped.
for heading, cols, digits in (
    ("\n【薬剤使用率】", drug_cols, 1),
    ("\n【手術・検査実施率】", proc_cols, 2),
):
    print(heading)
    for col in cols:
        if col in df_ra_master.columns:
            rate = df_ra_master[col].mean() * 100
            print(f"  {col}: {rate:.{digits}f}%")

print("\n" + banner)
print("Silver Layer 処理完了")
print(banner)