In [22]:
import os
import pandas as pd
from dotenv import load_dotenv

# config

In [23]:
season = "August2021-August2023"
data_root = f"../../data/NHANES/{season}"
os.makedirs(data_root, exist_ok=True)
seed = 42
download_data = False

load_dotenv()

True

# data download

In [6]:
if download_data:
    # Demographic Variables and Sample Weights https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt 
    # Dietary Interview - Individual Foods, First Day https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DR1IFF_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DR1IFF_L.xpt
    # Balance https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BAX_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BAX_L.xpt
    # Blood Pressure - Oscillometric Measurements https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BPXO_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BPXO_L.xpt
    # Body Measures https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BMX_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BMX_L.xpt
    # Liver Ultrasound Transient Elastography https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/LUX_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/LUX_L.xpt
    # Early Childhood https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/ECQ_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/ECQ_L.xpt
    # Income https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/INQ_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/INQ_L.xpt
    # Medical Conditions https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/MCQ_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/MCQ_L.xpt
    # Occupation https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/OCQ_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/OCQ_L.xpt
    # Physical Activity https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/PAQ_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/PAQ_L.xpt
    # Smoking - Cigarette Use https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/SMQ_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/SMQ_L.xpt
    # Weight History https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/WHQ_L.htm
    !curl --output-dir $data_root -O https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/WHQ_L.xpt

In [7]:
data = {}
for data_file in os.listdir(data_root):
    data[data_file.replace(".xpt", "")] = pd.read_sas(f"{data_root}/{data_file}")

In [215]:
df_demographic = data["DEMO_L"][[
    "SEQN",       # Respondent sequence number (subject id)
    # Basic demographics
    "RIDAGEYR",   # Age in years at screening
    "RIAGENDR",   # Gender
    "RIDRETH1",   # Race/Hispanic origin
    # Socioeconomic
    "DMDEDUC2",   # Education level - Adults 20+
    "DMDMARTZ",   # Marital status
    "INDFMPIR",   # Family poverty income ratio
]]

df_food = data["DR1IFF_L"][[
    "SEQN",        # subject id
    # Food item identifier
    "DR1IFDCD",    # USDA food code
    "DR1IGRMS",    # grams consumed
    # Meal context
    "DR1_030Z",    # Name of eating occasion (breakfast/lunch etc.)
    "DR1_040Z",    # Time of eating occasion
    # Where obtained / source can correlate with diet pattern
    "DR1FS",       # Food source
]]

df_blood_pressure = data["BPXO_L"][[
    "SEQN",          # subject id
    # Systolic/Diastolic (oscillometric)
    "BPXOSY1", "BPXOSY2", "BPXOSY3",   # systolic readings
    "BPXODI1", "BPXODI2", "BPXODI3",   # diastolic readings
    # Pulse
    "BPXOPLS1", "BPXOPLS2", "BPXOPLS3",
]]
df_body_measure = data["BMX_L"][[
    "SEQN",     # subject id
    # Targets
    "BMXHT",    # Standing height (cm)
    "BMXWT",    # Weight (kg)
]]
df_income = data["INQ_L"][[
    "SEQN",        # subject id
    "INQ300",      # Total family income
]]
df_occupation = data["OCQ_L"][[
    "SEQN",        # subject id
    "OCD150",      # Work activity level (typical physical activity at work)
]]
df_physical_activity = data["PAQ_L"][[
    "SEQN",        # subject id
    "PAD790Q",      # Vigorous work activity
    "PAD790U",      # Moderate work activity
    "PAD810Q",      # Moderate recreational activity
    "PAD810U",      # Sedentary activity (minutes)
    "PAD820",       # Minutes vigorous LTPA
    "PAD680",      # Minutes sedentary activity
]]
df_smoking = data["SMQ_L"][[
    "SEQN",        # subject id
    "SMQ020",      # Smoked at least 100 cigarettes in life?
]]

In [None]:
df_nhanes = df_demographic.merge(
    df_food,
    on="SEQN",
).merge(
    df_blood_pressure,
    on="SEQN",
).merge(
    df_body_measure,
    on="SEQN",
).merge(
    df_income,
    on="SEQN",
).merge(
    df_occupation,
    on="SEQN",
).merge(
    df_physical_activity,
    on="SEQN",
).merge(
    df_smoking,
    on="SEQN",
)
df_nhanes

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,RIDRETH1,DMDEDUC2,DMDMARTZ,INDFMPIR,DR1IFDCD,DR1IGRMS,DR1_030Z,...,MCQ160C,MCQ160F,OCD150,PAD790Q,PAD790U,PAD810Q,PAD810U,PAD820,PAD680,SMQ020
0,130378.0,43.0,1.0,5.0,5.0,1.0,5.0,94000100.0,120.00,7.0,...,2.0,2.0,1.0,3.000000e+00,b'W',3.000000e+00,b'W',45.0,360.0,1.0
1,130378.0,43.0,1.0,5.0,5.0,1.0,5.0,94000100.0,120.00,7.0,...,2.0,2.0,1.0,3.000000e+00,b'W',3.000000e+00,b'W',45.0,360.0,1.0
2,130378.0,43.0,1.0,5.0,5.0,1.0,5.0,92101000.0,300.00,1.0,...,2.0,2.0,1.0,3.000000e+00,b'W',3.000000e+00,b'W',45.0,360.0,1.0
3,130378.0,43.0,1.0,5.0,5.0,1.0,5.0,94000100.0,240.00,1.0,...,2.0,2.0,1.0,3.000000e+00,b'W',3.000000e+00,b'W',45.0,360.0,1.0
4,130378.0,43.0,1.0,5.0,5.0,1.0,5.0,83102000.0,4.90,2.0,...,2.0,2.0,1.0,3.000000e+00,b'W',3.000000e+00,b'W',45.0,360.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76035,142310.0,80.0,2.0,3.0,3.0,1.0,5.0,75224023.0,20.00,3.0,...,2.0,2.0,4.0,5.397605e-79,b'',5.397605e-79,b'',,360.0,1.0
76036,142310.0,80.0,2.0,3.0,3.0,1.0,5.0,94100100.0,240.00,3.0,...,2.0,2.0,4.0,5.397605e-79,b'',5.397605e-79,b'',,360.0,1.0
76037,142310.0,80.0,2.0,3.0,3.0,1.0,5.0,93505000.0,28.00,3.0,...,2.0,2.0,4.0,5.397605e-79,b'',5.397605e-79,b'',,360.0,1.0
76038,142310.0,80.0,2.0,3.0,3.0,1.0,5.0,61204000.0,0.65,3.0,...,2.0,2.0,4.0,5.397605e-79,b'',5.397605e-79,b'',,360.0,1.0


In [203]:
df_samples = df_nhanes.sample(n=50, random_state=seed).reset_index(drop=True)

In [205]:
df_samples.columns

Index(['SEQN', 'RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'DMDEDUC2', 'DMDMARTZ',
       'INDFMPIR', 'DR1IFDCD', 'DR1IGRMS', 'DR1_030Z', 'DR1_040Z', 'DR1FS',
       'BPXOSY1', 'BPXOSY2', 'BPXOSY3', 'BPXODI1', 'BPXODI2', 'BPXODI3',
       'BPXOPLS1', 'BPXOPLS2', 'BPXOPLS3', 'BMXHT', 'BMXWT', 'INQ300',
       'MCQ010', 'MCQ160A', 'MCQ160B', 'MCQ160C', 'MCQ160F', 'OCD150',
       'PAD790Q', 'PAD790U', 'PAD810Q', 'PAD810U', 'PAD820', 'PAD680',
       'SMQ020'],
      dtype='str')

# summarize subject

In [206]:
personalities = []
for idx, row in enumerate(df_samples.iterrows()):
    personality = ""
    for col, val in row[1].items():
        if col == "BMXHT" or col == "BMXWT":  # exclude height, weight
            continue
        # reduce token
        if str(val).endswith(".0"):
            val = str(val).replace(".0", "")
        personality += f"{col} {val} "
    personalities.append(personality)

personalities

["SEQN 133066 RIDAGEYR 70 RIAGENDR 2 RIDRETH1 3 DMDEDUC2 4 DMDMARTZ 1 INDFMPIR 3.71 DR1IFDCD 92552010 DR1IGRMS 240 DR1_030Z 3 DR1_040Z 1 DR1FS 1 BPXOSY1 155 BPXOSY2 158 BPXOSY3 158 BPXODI1 92 BPXODI2 75 BPXODI3 77 BPXOPLS1 51 BPXOPLS2 52 BPXOPLS3 73 INQ300 1 MCQ010 1 MCQ160A 1 MCQ160B 2 MCQ160C 2 MCQ160F 2 OCD150 4 PAD790Q 5.397605346934028e-79 PAD790U b'' PAD810Q 5.397605346934028e-79 PAD810U b'' PAD820 nan PAD680 300 SMQ020 1 ",
 "SEQN 131913 RIDAGEYR 80 RIAGENDR 1 RIDRETH1 4 DMDEDUC2 3 DMDMARTZ 2 INDFMPIR nan DR1IFDCD 71508020 DR1IGRMS 130 DR1_030Z 4 DR1_040Z 1 DR1FS 1 BPXOSY1 144 BPXOSY2 144 BPXOSY3 142 BPXODI1 60 BPXODI2 62 BPXODI3 55 BPXOPLS1 56 BPXOPLS2 58 BPXOPLS3 58 INQ300 9 MCQ010 2 MCQ160A 1 MCQ160B 2 MCQ160C 2 MCQ160F 2 OCD150 4 PAD790Q 1 PAD790U b'W' PAD810Q 5.397605346934028e-79 PAD810U b'' PAD820 nan PAD680 180 SMQ020 1 ",
 "SEQN 130642 RIDAGEYR 43 RIAGENDR 1 RIDRETH1 5 DMDEDUC2 5 DMDMARTZ 1 INDFMPIR 5 DR1IFDCD 94000100 DR1IGRMS 276 DR1_030Z 7 DR1_040Z 1 DR1FS nan BPXOSY

In [180]:
limit = 50

prompt = f"""
# 命令
被験者に関する "パーソナリティ" で与えられる情報と、それらの情報の enum である "データソース" の URL から得られる情報を合わせて、 "回答例" で示されているようなその人を表す文章を英語で作成してください。
"パーソナリティ" の情報の定義については "データソース" の URL から得られる情報を確認した上で活用してください。
必ずしもすべての情報を使わなくても問題ありません。
最終的な回答では、"パーソナリティ" の SEQN ごとに、結果のみを回答に含むようにしてください。
被験者の呼称は性別に合わせて He または She としてください。
出力形式は json 形式としてください。

# パーソナリティ
{'\n--\n'.join(personalities[:limit])}

# データソース
Demographic Variables and Sample Weights https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.htm
Dietary Interview - Individual Foods, First Day https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DR1IFF_L.htm
Blood Pressure - Oscillometric Measurements https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BPXO_L.htm
Body Measures https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BMX_L.htm
Income https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/INQ_L.htm
Occupation https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/OCQ_L.htm
Physical Activity https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/PAQ_L.htm
Smoking - Cigarette Use https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/SMQ_L.htm

# 回答例
{{
    "1234": "She is a 32-year-old Norwegian woman who works as a software engineer. She mentioned playing volleyball in college and still plays recreationally on weekends. She describes herself as taller than most of her female friends and maintains an active lifestyle with regular gym sessions."
}}
"""
print(prompt)


# 命令
被験者に関する "パーソナリティ" で与えられる情報と、それらの情報の enum である "データソース" の URL から得られる情報を合わせて、 "回答例" で示されているようなその人を表す文章を英語で作成してください。
"パーソナリティ" の情報の定義については "データソース" の URL から得られる情報を確認した上で活用してください。
必ずしもすべての情報を使わなくても問題ありません。
最終的な回答では、"パーソナリティ" の SEQN ごとに、結果のみを回答に含むようにしてください。
被験者の呼称は性別に合わせて He または She としてください。
出力形式は json 形式としてください。

# パーソナリティ
SEQN 133066 RIDAGEYR 70 RIAGENDR 2 RIDRETH1 3 DMDEDUC2 4 DMDMARTZ 1 INDFMPIR 3.71 DR1IFDCD 92552010 DR1IGRMS 240 DR1_030Z 3 DR1_040Z 1 DR1FS 1 BPXOSY1 155 BPXOSY2 158 BPXOSY3 158 BPXODI1 92 BPXODI2 75 BPXODI3 77 BPXOPLS1 51 BPXOPLS2 52 BPXOPLS3 73 INQ300 1 MCQ010 1 MCQ160A 1 MCQ160B 2 MCQ160C 2 MCQ160F 2 OCD150 4 PAD790Q 5.397605346934028e-79 PAD790U b'' PAD810Q 5.397605346934028e-79 PAD810U b'' PAD820 nan PAD680 300 SMQ020 1 
--
SEQN 131913 RIDAGEYR 80 RIAGENDR 1 RIDRETH1 4 DMDEDUC2 3 DMDMARTZ 2 INDFMPIR nan DR1IFDCD 71508020 DR1IGRMS 130 DR1_030Z 4 DR1_040Z 1 DR1FS 1 BPXOSY1 144 BPXOSY2 144 BPXOSY3 142 BPXODI1 60 BPXODI2 62 BPXODI3 55 BPXOPLS1 56 BPXOPLS2 58 BPXOP

In [181]:
import anthropic

client = anthropic.Anthropic()

response = client.messages.create(
    model="claude-sonnet-4-5",
    max_tokens=8192,
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    tools=[
        {
            "type": "web_fetch_20250910",
            "name": "web_fetch",
            "max_uses": 5,
            "citations": {"enabled": True}
        }
    ],
    extra_headers={
        "anthropic-beta": "web-fetch-2025-09-10"
    }
)

In [182]:
with open("./result.txt", "w") as fw:
    fw.write(response.content[-1].text)

# result

In [186]:
with open("./result.json", "r") as fr:
    result = json.load(fr)

In [210]:
df_text = pd.DataFrame({
    "SEQN": result.keys(),
    "text_description": result.values(),
})
df_text["SEQN"] = df_text["SEQN"].apply(int)

In [224]:
df_result = df_samples.merge(
    df_text,
    on="SEQN",
)

final_result = []
for row in df_result[["SEQN", "text_description", "BMXHT", "BMXWT",]].iterrows():
    final_result.append({
        "subject_id": str(int(row[1]["SEQN"])),
        "text_description": row[1]["text_description"],
        "height_cm": row[1]["BMXHT"],
        "weight_kg": row[1]["BMXWT"],
    })

with open(f"{data_root}/subject.json", "w") as fw:
    json.dump(final_result, fp=fw, indent=4)
