# NHANES Ground Truth Distributions

## Required Variables (all combinations)
1. RIDAGEYR (Age) → 5 bins
2. RIAGENDR (Gender)
3. RIDRETH1 (Race)
4. DMDEDUC2 (Education)
5. INDFMPIR (Poverty Income Ratio) → 4 bins
6. OCD150 (Work Activity Level)

## Additional Variables
- SMQ020 (Smoking)

## Key Format
`RIDAGEYR=20-39__RIAGENDR=Male__RIDRETH1=White`

## Minimum Sample Size
N >= 50 unique subjects

In [1]:
import os
import json
import pandas as pd
import numpy as np
import sys
from dotenv import load_dotenv
from itertools import product

load_dotenv()
sys.path.append("../..")
from src.data.nhanes import load_nhanes_data

MIN_SAMPLES = 50

df_nhanes = load_nhanes_data()
print(f"Data shape: {df_nhanes.shape}")
print(f"Unique subjects: {df_nhanes['SEQN'].nunique()}")
print(f"Columns: {df_nhanes.columns.tolist()}")

Data shape: (6337, 26)
Unique subjects: 6337
Columns: ['SEQN', 'RIDAGEYR', 'RIAGENDR', 'RIDRETH1', 'DMDEDUC2', 'DMDMARTZ', 'INDFMPIR', 'BPXOSY1', 'BPXOSY2', 'BPXOSY3', 'BPXODI1', 'BPXODI2', 'BPXODI3', 'BPXOPLS1', 'BPXOPLS2', 'BPXOPLS3', 'BMXHT', 'BMXWT', 'OCD150', 'PAD790Q', 'PAD790U', 'PAD810Q', 'PAD810U', 'PAD820', 'PAD680', 'SMQ020']


In [2]:
# Explore variables
print("=" * 60)
print("Variable Exploration")
print("=" * 60)

print("\n1. RIDAGEYR (Age):")
print(f"   Range: {df_nhanes['RIDAGEYR'].min()} - {df_nhanes['RIDAGEYR'].max()}")

print("\n2. RIAGENDR (Gender):")
print(df_nhanes['RIAGENDR'].value_counts())

print("\n3. RIDRETH1 (Race):")
print(df_nhanes['RIDRETH1'].value_counts())

print("\n4. DMDEDUC2 (Education):")
print(df_nhanes['DMDEDUC2'].value_counts())

print("\n5. INDFMPIR (Poverty Income Ratio, 0-5 scale):")
print(f"   Range: {df_nhanes['INDFMPIR'].min():.2f} - {df_nhanes['INDFMPIR'].max():.2f}")
print(f"   Mean: {df_nhanes['INDFMPIR'].mean():.2f}")
print(f"   Missing: {df_nhanes['INDFMPIR'].isna().sum()}")

print("\n6. OCD150 (Work Activity):")
print(df_nhanes['OCD150'].value_counts())

print("\n7. SMQ020 (Smoking):")
print(df_nhanes['SMQ020'].value_counts())

Variable Exploration

1. RIDAGEYR (Age):
   Range: 18.0 - 80.0

2. RIAGENDR (Gender):
RIAGENDR
2.0    3479
1.0    2858
Name: count, dtype: int64

3. RIDRETH1 (Race):
RIDRETH1
3.0    3684
4.0     791
5.0     767
2.0     647
1.0     448
Name: count, dtype: int64

4. DMDEDUC2 (Education):
DMDEDUC2
5.0    2143
4.0    1846
3.0    1297
2.0     482
1.0     292
9.0       4
Name: count, dtype: int64

5. INDFMPIR (Poverty Income Ratio, 0-5 scale):
   Range: 0.00 - 5.00
   Mean: 2.89
   Missing: 831

6. OCD150 (Work Activity):
OCD150
1.0    3229
4.0    2640
3.0     264
2.0     203
9.0       1
Name: count, dtype: int64

7. SMQ020 (Smoking):
SMQ020
2.0    3810
1.0    2518
9.0       4
7.0       3
Name: count, dtype: int64


In [3]:
# Create categorical variables
df = df_nhanes.copy()

# 1. RIDAGEYR: 5 bins using quantiles
df["age_bin"] = pd.qcut(df["RIDAGEYR"], q=5, labels=False, duplicates='drop')
age_ranges = df.groupby("age_bin", observed=True)["RIDAGEYR"].agg(["min", "max"])

age_labels = {}
for idx, row in age_ranges.iterrows():
    age_labels[idx] = f"{int(row['min'])}-{int(row['max'])}"
df["RIDAGEYR_cat"] = df["age_bin"].map(age_labels)

print("RIDAGEYR bins:")
for idx, label in age_labels.items():
    print(f"  Bin {idx}: {label}")

# 2. RIAGENDR
df["RIAGENDR_cat"] = df["RIAGENDR"].map({1.0: "Male", 2.0: "Female"})

# 3. RIDRETH1
race_map = {
    1.0: "MexicanAmerican",
    2.0: "OtherHispanic",
    3.0: "White",
    4.0: "Black",
    5.0: "Other"
}
df["RIDRETH1_cat"] = df["RIDRETH1"].map(race_map)

# 4. DMDEDUC2
edu_map = {
    1.0: "LessThan9th",
    2.0: "9thTo11th",
    3.0: "HighSchool",
    4.0: "SomeCollege",
    5.0: "CollegeGrad"
}
df["DMDEDUC2_cat"] = df["DMDEDUC2"].map(edu_map)

# 5. INDFMPIR: Poverty Income Ratio (0-5 scale) -> 4 bins
# 0-1: Below poverty line
# 1-2.5: Low income
# 2.5-4: Middle income
# 4-5: High income
def map_income(x):
    if pd.isna(x):
        return None
    elif x < 1:
        return "BelowPoverty"
    elif x < 2.5:
        return "LowIncome"
    elif x < 4:
        return "MiddleIncome"
    else:
        return "HighIncome"

df["INDFMPIR_cat"] = df["INDFMPIR"].apply(map_income)
print("\nINDFMPIR distribution:")
print(df["INDFMPIR_cat"].value_counts())

# 6. OCD150
activity_map = {
    1.0: "Sedentary",
    2.0: "Light",
    3.0: "Moderate",
    4.0: "Heavy"
}
df["OCD150_cat"] = df["OCD150"].map(activity_map)

# 7. SMQ020
df["SMQ020_cat"] = df["SMQ020"].map({1.0: "Yes", 2.0: "No"})

print("\nPreprocessing complete.")

RIDAGEYR bins:
  Bin 0: 18-33
  Bin 1: 34-48
  Bin 2: 49-61
  Bin 3: 62-70
  Bin 4: 71-80

INDFMPIR distribution:
INDFMPIR_cat
HighIncome      1884
LowIncome       1605
MiddleIncome    1118
BelowPoverty     899
Name: count, dtype: int64

Preprocessing complete.


In [4]:
# Variable mapping: original column name -> categorical column name
var_mapping = {
    "RIDAGEYR": "RIDAGEYR_cat",
    "RIAGENDR": "RIAGENDR_cat",
    "RIDRETH1": "RIDRETH1_cat",
    "DMDEDUC2": "DMDEDUC2_cat",
    "INDFMPIR": "INDFMPIR_cat",
    "OCD150": "OCD150_cat",
    "SMQ020": "SMQ020_cat"
}

cat_cols = list(var_mapping.values())

print("Category summary:")
print("=" * 60)
total_combinations = 1
for orig_name, cat_name in var_mapping.items():
    n_cats = df[cat_name].dropna().nunique()
    cats = sorted(df[cat_name].dropna().unique().tolist())
    print(f"{orig_name}: {n_cats} categories - {cats}")
    total_combinations *= n_cats

print(f"\nTheoretical max combinations (7 vars): {total_combinations:,}")

Category summary:
RIDAGEYR: 5 categories - ['18-33', '34-48', '49-61', '62-70', '71-80']
RIAGENDR: 2 categories - ['Female', 'Male']
RIDRETH1: 5 categories - ['Black', 'MexicanAmerican', 'Other', 'OtherHispanic', 'White']
DMDEDUC2: 5 categories - ['9thTo11th', 'CollegeGrad', 'HighSchool', 'LessThan9th', 'SomeCollege']
INDFMPIR: 4 categories - ['BelowPoverty', 'HighIncome', 'LowIncome', 'MiddleIncome']
OCD150: 4 categories - ['Heavy', 'Light', 'Moderate', 'Sedentary']
SMQ020: 2 categories - ['No', 'Yes']

Theoretical max combinations (7 vars): 8,000


In [5]:
def compute_stats(data):
    """Compute statistics for height and weight from unique subjects."""
    # Drop duplicate subjects to get accurate counts
    unique_data = data.drop_duplicates(subset=["SEQN"])
    
    if len(unique_data) < MIN_SAMPLES:
        return None
    
    height_data = unique_data["BMXHT"].dropna()
    weight_data = unique_data["BMXWT"].dropna()
    
    if len(height_data) < MIN_SAMPLES or len(weight_data) < MIN_SAMPLES:
        return None
    
    return {
        "height_mean": round(float(height_data.mean()), 2),
        "height_std": round(float(height_data.std()), 2),
        "height_var": round(float(height_data.var()), 2),
        "weight_mean": round(float(weight_data.mean()), 2),
        "weight_std": round(float(weight_data.std()), 2),
        "weight_var": round(float(weight_data.var()), 2),
        "n": int(len(unique_data))
    }

def make_key(var_value_pairs):
    """Create key in format: RIDAGEYR=20-39__RIAGENDR=Male__..."""
    return "__".join([f"{var}={val}" for var, val in var_value_pairs])

In [6]:
# Build all distributions
df_valid = df.dropna(subset=["BMXHT", "BMXWT"])
print(f"Rows with valid height/weight: {len(df_valid)}")
print(f"Unique subjects: {df_valid['SEQN'].nunique()}")
print(f"Minimum sample size: {MIN_SAMPLES}")

distributions = {}

# Overall
stats = compute_stats(df_valid)
if stats:
    distributions["Overall"] = stats
    print(f"\nOverall: n={stats['n']} unique subjects")

Rows with valid height/weight: 6235
Unique subjects: 6235
Minimum sample size: 50

Overall: n=6235 unique subjects


In [7]:
# Single variable marginals
print("\nComputing single variable marginals...")
single_count = 0

for orig_name, cat_name in var_mapping.items():
    for val in df_valid[cat_name].dropna().unique():
        subset = df_valid[df_valid[cat_name] == val]
        stats = compute_stats(subset)
        if stats:
            key = make_key([(orig_name, val)])
            distributions[key] = stats
            single_count += 1

print(f"  Added {single_count} single variable entries")


Computing single variable marginals...
  Added 27 single variable entries


In [8]:
# Two variable combinations
print("\nComputing two variable combinations...")
pair_count = 0

# Define useful pairs
pairs = [
    ("RIAGENDR", "RIDAGEYR"),
    ("RIAGENDR", "RIDRETH1"),
    ("RIDAGEYR", "RIDRETH1"),
    ("RIAGENDR", "SMQ020"),
    ("RIAGENDR", "DMDEDUC2"),
    ("RIDAGEYR", "DMDEDUC2"),
]

for var1, var2 in pairs:
    cat1, cat2 = var_mapping[var1], var_mapping[var2]
    for val1 in df_valid[cat1].dropna().unique():
        for val2 in df_valid[cat2].dropna().unique():
            subset = df_valid[(df_valid[cat1] == val1) & (df_valid[cat2] == val2)]
            stats = compute_stats(subset)
            if stats:
                key = make_key([(var1, val1), (var2, val2)])
                distributions[key] = stats
                pair_count += 1

print(f"  Added {pair_count} two variable entries")


Computing two variable combinations...
  Added 81 two variable entries


In [9]:
# Three variable combinations (Gender x Age x Race)
print("\nComputing three variable combinations...")
triple_count = 0

for gender in df_valid["RIAGENDR_cat"].dropna().unique():
    for age in df_valid["RIDAGEYR_cat"].dropna().unique():
        for race in df_valid["RIDRETH1_cat"].dropna().unique():
            subset = df_valid[
                (df_valid["RIAGENDR_cat"] == gender) & 
                (df_valid["RIDAGEYR_cat"] == age) & 
                (df_valid["RIDRETH1_cat"] == race)
            ]
            stats = compute_stats(subset)
            if stats:
                key = make_key([
                    ("RIAGENDR", gender),
                    ("RIDAGEYR", age),
                    ("RIDRETH1", race)
                ])
                distributions[key] = stats
                triple_count += 1

print(f"  Added {triple_count} three variable entries")


Computing three variable combinations...
  Added 39 three variable entries


In [10]:
# Full 6 variable combinations
print("\nComputing full 6 variable combinations...")
main_vars = ["RIDAGEYR", "RIAGENDR", "RIDRETH1", "DMDEDUC2", "INDFMPIR", "OCD150"]
main_cats = [var_mapping[v] for v in main_vars]

df_main = df_valid.dropna(subset=main_cats)
print(f"Rows with all 6 main variables: {len(df_main)}")
print(f"Unique subjects: {df_main['SEQN'].nunique()}")

var_values = {v: sorted(df_main[var_mapping[v]].dropna().unique().tolist()) for v in main_vars}

full_count = 0
for combo in product(*[var_values[v] for v in main_vars]):
    mask = pd.Series([True] * len(df_main), index=df_main.index)
    for var, val in zip(main_vars, combo):
        mask &= (df_main[var_mapping[var]] == val)
    
    subset = df_main[mask]
    stats = compute_stats(subset)
    
    if stats:
        key = make_key(list(zip(main_vars, combo)))
        distributions[key] = stats
        full_count += 1

print(f"  Added {full_count} full 6 variable entries")


Computing full 6 variable combinations...
Rows with all 6 main variables: 5191
Unique subjects: 5191
  Added 9 full 6 variable entries


In [11]:
# Summary
print("\n" + "=" * 60)
print("Distribution Summary")
print("=" * 60)
print(f"Total entries: {len(distributions)}")
print(f"Minimum sample size: {MIN_SAMPLES}")

json_str = json.dumps(distributions, indent=2)
print(f"Estimated JSON size: {len(json_str) / 1024:.1f} KB")


Distribution Summary
Total entries: 157
Minimum sample size: 50
Estimated JSON size: 33.2 KB


In [12]:
# Create final output with metadata
output = {
    "metadata": {
        "source": "NHANES August2021-August2023",
        "description": "Ground truth distributions for height (cm) and weight (kg)",
        "total_unique_subjects": int(df_nhanes["SEQN"].nunique()),
        "min_samples_per_entry": MIN_SAMPLES,
        "variables": {
            "RIDAGEYR": {
                "description": "Age in years (binned by quantiles)",
                "values": list(age_labels.values())
            },
            "RIAGENDR": {
                "description": "Gender",
                "values": ["Male", "Female"]
            },
            "RIDRETH1": {
                "description": "Race/Ethnicity",
                "values": ["MexicanAmerican", "OtherHispanic", "White", "Black", "Other"]
            },
            "DMDEDUC2": {
                "description": "Education level",
                "values": ["LessThan9th", "9thTo11th", "HighSchool", "SomeCollege", "CollegeGrad"]
            },
            "INDFMPIR": {
                "description": "Family poverty income ratio (0-5 scale, binned)",
                "values": ["BelowPoverty", "LowIncome", "MiddleIncome", "HighIncome"],
                "bins": "0-1: BelowPoverty, 1-2.5: LowIncome, 2.5-4: MiddleIncome, 4-5: HighIncome"
            },
            "OCD150": {
                "description": "Work activity level",
                "values": ["Sedentary", "Light", "Moderate", "Heavy"]
            },
            "SMQ020": {
                "description": "Smoked 100+ cigarettes in life",
                "values": ["Yes", "No"]
            }
        },
        "key_format": "COLNAME=value__COLNAME=value (e.g., RIAGENDR=Male__RIDAGEYR=18-37)"
    },
    "distributions": distributions
}

In [13]:
# Save to JSON
output_dir = "../../data/processed"
os.makedirs(output_dir, exist_ok=True)

output_path = f"{output_dir}/nhanes_ground_truth.json"
with open(output_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"Saved to: {output_path}")
print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")

Saved to: ../../data/processed/nhanes_ground_truth.json
File size: 37.7 KB


In [14]:
# Preview sample entries
print("Sample entries:")
print("=" * 60)

sample_keys = list(distributions.keys())[:10]

for key in sample_keys:
    d = distributions[key]
    print(f"\n{key}")
    print(f"  Height: {d['height_mean']} +/- {d['height_std']} cm")
    print(f"  Weight: {d['weight_mean']} +/- {d['weight_std']} kg")
    print(f"  N: {d['n']} unique subjects")

Sample entries:

Overall
  Height: 166.97 +/- 10.03 cm
  Weight: 82.87 +/- 22.45 kg
  N: 6235 unique subjects

RIDAGEYR=34-48
  Height: 167.95 +/- 9.9 cm
  Weight: 86.09 +/- 24.48 kg
  N: 1230 unique subjects

RIDAGEYR=62-70
  Height: 166.32 +/- 9.74 cm
  Weight: 82.74 +/- 20.28 kg
  N: 1296 unique subjects

RIDAGEYR=18-33
  Height: 168.81 +/- 9.92 cm
  Weight: 80.74 +/- 23.74 kg
  N: 1289 unique subjects

RIDAGEYR=49-61
  Height: 167.03 +/- 10.05 cm
  Weight: 85.59 +/- 23.33 kg
  N: 1305 unique subjects

RIDAGEYR=71-80
  Height: 164.45 +/- 10.05 cm
  Weight: 78.75 +/- 18.69 kg
  N: 1115 unique subjects

RIAGENDR=Male
  Height: 174.47 +/- 7.73 cm
  Weight: 88.78 +/- 21.84 kg
  N: 2817 unique subjects

RIAGENDR=Female
  Height: 160.79 +/- 7.05 cm
  Weight: 77.99 +/- 21.76 kg
  N: 3418 unique subjects

RIDRETH1=Other
  Height: 165.26 +/- 10.18 cm
  Weight: 76.16 +/- 23.38 kg
  N: 759 unique subjects

RIDRETH1=White
  Height: 168.24 +/- 9.93 cm
  Weight: 83.92 +/- 22.2 kg
  N: 3639 unique

In [15]:
# Usage example
print("\nUsage Example:")
print("=" * 60)
print("""
import json

with open('data/processed/nhanes_ground_truth.json', 'r') as f:
    gt = json.load(f)

# Get distribution for a 18-37 year old male
key = 'RIAGENDR=Male__RIDAGEYR=18-37'
if key in gt['distributions']:
    stats = gt['distributions'][key]
    print(f"Height: {stats['height_mean']} +/- {stats['height_std']} cm")
    print(f"Weight: {stats['weight_mean']} +/- {stats['weight_std']} kg")
    print(f"N: {stats['n']} unique subjects")
""")


Usage Example:

import json

with open('data/processed/nhanes_ground_truth.json', 'r') as f:
    gt = json.load(f)

# Get distribution for a 18-37 year old male
key = 'RIAGENDR=Male__RIDAGEYR=18-37'
if key in gt['distributions']:
    stats = gt['distributions'][key]
    print(f"Height: {stats['height_mean']} +/- {stats['height_std']} cm")
    print(f"Weight: {stats['weight_mean']} +/- {stats['weight_std']} kg")
    print(f"N: {stats['n']} unique subjects")

