# NHANES Ground Truth Distributions

## Required Variables (all combinations)
1. RIDAGEYR (Age) → 5 bins
2. RIAGENDR (Gender)
3. RIDRETH1 (Race)
4. DMDEDUC2 (Education)
5. INDFMPIR (Poverty Income Ratio) → 4 bins
6. OCD150 (Work Activity Level)

## Additional Variables
- SMQ020 (Smoking)

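## Key Format
`VAR=value__VAR=value...`, e.g. `RIAGENDR=Male__RIDAGEYR=18-37__RIDRETH1=White`

Variables appear in the order in which each combination is generated below, and age-bin labels are derived from the data (quantile bins), so the exact labels may differ from this example.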

## Minimum Sample Size
Each entry requires N >= 50 unique subjects; combinations with fewer subjects are omitted.

In [None]:
import os
import json
import pandas as pd
import numpy as np
import sys
from dotenv import load_dotenv
from itertools import product

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably consumed by load_nhanes_data (e.g. data paths) — confirm.
load_dotenv()
# Put the directory two levels above the working directory on the import path
# so that the project-local `src` package imported on the next line resolves.
sys.path.append("../..")
from src.data.nhanes import load_nhanes_data

# Minimum number of unique subjects a subgroup needs before statistics are reported.
MIN_SAMPLES = 50

# Load the NHANES table and print a quick overview of its size and columns.
df_nhanes = load_nhanes_data()
print(f"Data shape: {df_nhanes.shape}")
print(f"Unique subjects: {df_nhanes['SEQN'].nunique()}")
print(f"Columns: {df_nhanes.columns.tolist()}")

In [None]:
# Explore variables: print ranges for the continuous columns and value counts
# for the coded (categorical) columns.
rule = "=" * 60
print(rule)
print("Variable Exploration")
print(rule)

print("\n1. RIDAGEYR (Age):")
age_years = df_nhanes['RIDAGEYR']
print(f"   Range: {age_years.min()} - {age_years.max()}")

# Coded demographic columns: heading followed by the frequency table.
for heading, column in (
    ("\n2. RIAGENDR (Gender):", 'RIAGENDR'),
    ("\n3. RIDRETH1 (Race):", 'RIDRETH1'),
    ("\n4. DMDEDUC2 (Education):", 'DMDEDUC2'),
):
    print(heading)
    print(df_nhanes[column].value_counts())

print("\n5. INDFMPIR (Poverty Income Ratio, 0-5 scale):")
income_ratio = df_nhanes['INDFMPIR']
print(f"   Range: {income_ratio.min():.2f} - {income_ratio.max():.2f}")
print(f"   Mean: {income_ratio.mean():.2f}")
print(f"   Missing: {income_ratio.isna().sum()}")

# Remaining coded columns.
for heading, column in (
    ("\n6. OCD150 (Work Activity):", 'OCD150'),
    ("\n7. SMQ020 (Smoking):", 'SMQ020'),
):
    print(heading)
    print(df_nhanes[column].value_counts())

In [None]:
# Create categorical variables
# Work on a copy so the loaded NHANES frame itself is not modified.
df = df_nhanes.copy()

# 1. RIDAGEYR: quantile-based age bins (q=5; duplicate bin edges are dropped)
age_codes = pd.qcut(df["RIDAGEYR"], q=5, labels=False, duplicates='drop')
df["age_bin"] = age_codes
age_ranges = df.groupby("age_bin", observed=True)["RIDAGEYR"].agg(["min", "max"])

# Label each bin with the youngest and oldest age actually observed in it.
age_labels = {
    bin_id: f"{int(span['min'])}-{int(span['max'])}"
    for bin_id, span in age_ranges.iterrows()
}
df["RIDAGEYR_cat"] = df["age_bin"].map(age_labels)

print("RIDAGEYR bins:")
for bin_id, span_label in age_labels.items():
    print(f"  Bin {bin_id}: {span_label}")

# 2. RIAGENDR: numeric code -> gender label
gender_map = {1.0: "Male", 2.0: "Female"}
df["RIAGENDR_cat"] = df["RIAGENDR"].map(gender_map)

# 3. RIDRETH1: numeric code -> race/ethnicity label
race_map = dict(zip(
    (1.0, 2.0, 3.0, 4.0, 5.0),
    ("MexicanAmerican", "OtherHispanic", "White", "Black", "Other"),
))
df["RIDRETH1_cat"] = df["RIDRETH1"].map(race_map)

# 4. DMDEDUC2: numeric code -> education label
edu_map = dict(zip(
    (1.0, 2.0, 3.0, 4.0, 5.0),
    ("LessThan9th", "9thTo11th", "HighSchool", "SomeCollege", "CollegeGrad"),
))
df["DMDEDUC2_cat"] = df["DMDEDUC2"].map(edu_map)

# 5. INDFMPIR: Poverty Income Ratio (0-5 scale) -> 4 bins
#   below 1          -> BelowPoverty (below poverty line)
#   1 up to 2.5      -> LowIncome
#   2.5 up to 4      -> MiddleIncome
#   4 and above      -> HighIncome
def map_income(x):
    """Return the income category label for a poverty income ratio.

    Missing values (as detected by ``pd.isna``) map to ``None``. Each upper
    bound is exclusive, so a ratio of exactly 1, 2.5 or 4 falls into the
    higher category.
    """
    if pd.isna(x):
        return None

    # (exclusive upper bound, label), checked from the lowest bin upwards.
    for upper_bound, label in (
        (1, "BelowPoverty"),
        (2.5, "LowIncome"),
        (4, "MiddleIncome"),
    ):
        if x < upper_bound:
            return label
    return "HighIncome"

# Bin the poverty income ratio and show how many rows fall in each category.
income_categories = df["INDFMPIR"].apply(map_income)
df["INDFMPIR_cat"] = income_categories
print("\nINDFMPIR distribution:")
print(df["INDFMPIR_cat"].value_counts())

# 6. OCD150: numeric code -> work activity label
activity_map = dict(zip(
    (1.0, 2.0, 3.0, 4.0),
    ("Sedentary", "Light", "Moderate", "Heavy"),
))
df["OCD150_cat"] = df["OCD150"].map(activity_map)

# 7. SMQ020: numeric code -> Yes/No label
smoking_map = {1.0: "Yes", 2.0: "No"}
df["SMQ020_cat"] = df["SMQ020"].map(smoking_map)

print("\nPreprocessing complete.")

In [None]:
# Variable mapping: original column name -> categorical column name.
# Every categorical column is the original name with a "_cat" suffix.
var_mapping = {
    name: f"{name}_cat"
    for name in (
        "RIDAGEYR",
        "RIAGENDR",
        "RIDRETH1",
        "DMDEDUC2",
        "INDFMPIR",
        "OCD150",
        "SMQ020",
    )
}

cat_cols = [*var_mapping.values()]

print("Category summary:")
print("=" * 60)
# Running product of the per-variable category counts.
total_combinations = 1
for source_col, derived_col in var_mapping.items():
    observed = df[derived_col].dropna()
    n_cats = observed.nunique()
    cats = sorted(observed.unique().tolist())
    print(f"{source_col}: {n_cats} categories - {cats}")
    total_combinations *= n_cats

print(f"\nTheoretical max combinations (7 vars): {total_combinations:,}")

In [None]:
def compute_stats(data, min_samples=None):
    """Summarise height and weight over the unique subjects in ``data``.

    Args:
        data: DataFrame with ``SEQN`` (subject id), ``BMXHT`` (height) and
            ``BMXWT`` (weight) columns.
        min_samples: Minimum number of unique subjects, and of non-missing
            height and weight values, needed to report statistics. When
            omitted, the module-level ``MIN_SAMPLES`` is used.

    Returns:
        A dict with ``height_mean``, ``height_std``, ``height_var``,
        ``weight_mean``, ``weight_std``, ``weight_var`` (each rounded to two
        decimals, using the pandas defaults for ``std``/``var``) and ``n``,
        the number of unique subjects; or ``None`` when the group is smaller
        than the threshold.
    """
    threshold = MIN_SAMPLES if min_samples is None else min_samples

    # Keep one row per subject so repeated rows do not inflate the counts.
    unique_data = data.drop_duplicates(subset=["SEQN"])
    if len(unique_data) < threshold:
        return None

    summary = {}
    for prefix, column in (("height", "BMXHT"), ("weight", "BMXWT")):
        values = unique_data[column].dropna()
        # Each measurement needs enough non-missing values on its own.
        if len(values) < threshold:
            return None
        summary[f"{prefix}_mean"] = round(float(values.mean()), 2)
        summary[f"{prefix}_std"] = round(float(values.std()), 2)
        summary[f"{prefix}_var"] = round(float(values.var()), 2)

    # n counts unique subjects, not non-missing measurements.
    summary["n"] = int(len(unique_data))
    return summary

def make_key(var_value_pairs):
    """Build a lookup key from (variable, value) pairs.

    Each pair is rendered as ``variable=value`` and the parts are joined with
    a double underscore, e.g. ``RIDAGEYR=20-39__RIAGENDR=Male``.
    """
    parts = []
    for name, value in var_value_pairs:
        parts.append(f"{name}={value}")
    return "__".join(parts)

In [None]:
# Build all distributions
# Keep only rows where both height and weight are present.
has_measurements = df["BMXHT"].notna() & df["BMXWT"].notna()
df_valid = df[has_measurements]
print(f"Rows with valid height/weight: {len(df_valid)}")
print(f"Unique subjects: {df_valid['SEQN'].nunique()}")
print(f"Minimum sample size: {MIN_SAMPLES}")

# Key -> statistics dict; filled by this cell and the ones that follow.
distributions = {}

# Overall (no stratification)
stats = compute_stats(df_valid)
if stats is not None:
    distributions["Overall"] = stats
    print(f"\nOverall: n={stats['n']} unique subjects")

In [None]:
# Single variable marginals: one entry per observed category of each variable.
print("\nComputing single variable marginals...")
single_count = 0

for source_col, derived_col in var_mapping.items():
    column = df_valid[derived_col]
    for level in column.dropna().unique():
        stats = compute_stats(df_valid[column == level])
        if not stats:
            # Too few subjects in this category.
            continue
        distributions[make_key([(source_col, level)])] = stats
        single_count += 1

print(f"  Added {single_count} single variable entries")

In [None]:
# Two variable combinations
print("\nComputing two variable combinations...")
pair_count = 0

# Define useful pairs; the order within a pair is the order used in the key.
pairs = [
    ("RIAGENDR", "RIDAGEYR"),
    ("RIAGENDR", "RIDRETH1"),
    ("RIDAGEYR", "RIDRETH1"),
    ("RIAGENDR", "SMQ020"),
    ("RIAGENDR", "DMDEDUC2"),
    ("RIDAGEYR", "DMDEDUC2"),
]

for first_var, second_var in pairs:
    first_col = df_valid[var_mapping[first_var]]
    second_col = df_valid[var_mapping[second_var]]
    first_levels = first_col.dropna().unique()
    second_levels = second_col.dropna().unique()

    # product() walks the levels in the same order as two nested loops.
    for first_level, second_level in product(first_levels, second_levels):
        selected = (first_col == first_level) & (second_col == second_level)
        stats = compute_stats(df_valid[selected])
        if not stats:
            continue
        key = make_key([(first_var, first_level), (second_var, second_level)])
        distributions[key] = stats
        pair_count += 1

print(f"  Added {pair_count} two variable entries")

In [None]:
# Three variable combinations (Gender x Age x Race)
print("\nComputing three variable combinations...")
triple_count = 0

gender_col = df_valid["RIAGENDR_cat"]
age_col = df_valid["RIDAGEYR_cat"]
race_col = df_valid["RIDRETH1_cat"]

# product() walks gender, then age, then race, like three nested loops.
for gender, age, race in product(
    gender_col.dropna().unique(),
    age_col.dropna().unique(),
    race_col.dropna().unique(),
):
    selected = (gender_col == gender) & (age_col == age) & (race_col == race)
    stats = compute_stats(df_valid[selected])
    if not stats:
        continue
    key = make_key([("RIAGENDR", gender), ("RIDAGEYR", age), ("RIDRETH1", race)])
    distributions[key] = stats
    triple_count += 1

print(f"  Added {triple_count} three variable entries")

In [None]:
# Full 6 variable combinations
print("\nComputing full 6 variable combinations...")
main_vars = ["RIDAGEYR", "RIAGENDR", "RIDRETH1", "DMDEDUC2", "INDFMPIR", "OCD150"]
main_cats = [var_mapping[v] for v in main_vars]

# Keep only rows where all six categorical variables are known.
df_main = df_valid.dropna(subset=main_cats)
print(f"Rows with all 6 main variables: {len(df_main)}")
print(f"Unique subjects: {df_main['SEQN'].nunique()}")

# Sorted observed values per variable. No longer needed by the loop below;
# retained in case other cells reference it.
var_values = {v: sorted(df_main[var_mapping[v]].dropna().unique().tolist()) for v in main_vars}

full_count = 0
# A single groupby pass visits only the combinations that actually occur,
# instead of building a boolean mask over the whole frame for every element
# of the Cartesian product. Combinations with no rows could never reach the
# sample-size threshold, and sort=True yields the keys in the same
# lexicographic order as a product over the sorted values, so the resulting
# entries and their insertion order are unchanged.
for combo, subset in df_main.groupby(main_cats, sort=True):
    stats = compute_stats(subset)
    if stats:
        key = make_key(list(zip(main_vars, combo)))
        distributions[key] = stats
        full_count += 1

print(f"  Added {full_count} full 6 variable entries")

In [None]:
# Summary: number of entries and approximate size of the serialized result.
rule = "=" * 60
print("\n" + rule)
print("Distribution Summary")
print(rule)
print(f"Total entries: {len(distributions)}")
print(f"Minimum sample size: {MIN_SAMPLES}")

# Serialize in memory only to measure the size.
json_str = json.dumps(distributions, indent=2)
size_kb = len(json_str) / 1024
print(f"Estimated JSON size: {size_kb:.1f} KB")

In [None]:
# Create final output with metadata
# Per-variable description and the category labels used in the keys.
variable_info = dict(
    RIDAGEYR=dict(
        description="Age in years (binned by quantiles)",
        values=[*age_labels.values()],
    ),
    RIAGENDR=dict(
        description="Gender",
        values=["Male", "Female"],
    ),
    RIDRETH1=dict(
        description="Race/Ethnicity",
        values=["MexicanAmerican", "OtherHispanic", "White", "Black", "Other"],
    ),
    DMDEDUC2=dict(
        description="Education level",
        values=["LessThan9th", "9thTo11th", "HighSchool", "SomeCollege", "CollegeGrad"],
    ),
    INDFMPIR=dict(
        description="Family poverty income ratio (0-5 scale, binned)",
        values=["BelowPoverty", "LowIncome", "MiddleIncome", "HighIncome"],
        bins="0-1: BelowPoverty, 1-2.5: LowIncome, 2.5-4: MiddleIncome, 4-5: HighIncome",
    ),
    OCD150=dict(
        description="Work activity level",
        values=["Sedentary", "Light", "Moderate", "Heavy"],
    ),
    SMQ020=dict(
        description="Smoked 100+ cigarettes in life",
        values=["Yes", "No"],
    ),
)

# Dataset-level information stored next to the distributions.
metadata = dict(
    source="NHANES August2021-August2023",
    description="Ground truth distributions for height (cm) and weight (kg)",
    total_unique_subjects=int(df_nhanes["SEQN"].nunique()),
    min_samples_per_entry=MIN_SAMPLES,
    variables=variable_info,
    key_format="COLNAME=value__COLNAME=value (e.g., RIAGENDR=Male__RIDAGEYR=18-37)",
)

output = {"metadata": metadata, "distributions": distributions}

In [None]:
# Save to JSON (the target directory is created if it does not exist yet)
output_dir = "../../data/processed"
os.makedirs(output_dir, exist_ok=True)

output_path = output_dir + "/nhanes_ground_truth.json"
with open(output_path, mode="w") as out_file:
    json.dump(output, out_file, indent=2)

print(f"Saved to: {output_path}")
size_kb = os.path.getsize(output_path) / 1024
print(f"File size: {size_kb:.1f} KB")

In [None]:
# Preview sample entries: the first ten keys in insertion order.
print("Sample entries:")
print("=" * 60)

sample_keys = list(distributions)[:10]

for key in sample_keys:
    entry = distributions[key]
    height_mean, height_std = entry['height_mean'], entry['height_std']
    weight_mean, weight_std = entry['weight_mean'], entry['weight_std']
    print(f"\n{key}")
    print(f"  Height: {height_mean} +/- {height_std} cm")
    print(f"  Weight: {weight_mean} +/- {weight_std} kg")
    print(f"  N: {entry['n']} unique subjects")

In [None]:
# Usage example: print a snippet showing how to read the saved JSON file.
# The snippet is plain text; the braces inside it are not interpolated here.
usage_snippet = """
import json

with open('data/processed/nhanes_ground_truth.json', 'r') as f:
    gt = json.load(f)

# Get distribution for a 18-37 year old male
key = 'RIAGENDR=Male__RIDAGEYR=18-37'
if key in gt['distributions']:
    stats = gt['distributions'][key]
    print(f"Height: {stats['height_mean']} +/- {stats['height_std']} cm")
    print(f"Weight: {stats['weight_mean']} +/- {stats['weight_std']} kg")
    print(f"N: {stats['n']} unique subjects")
"""

print("\nUsage Example:")
print("=" * 60)
print(usage_snippet)