In [4]:
import numpy as np
import pandas as pd

def generate_synthetic_nutrition_nl(n=5000, seed=42, max_zips=30):
    """Generate a synthetic per-person daily nutrition table keyed by zip prefix.

    Each row is one simulated person. Calories are built from a fixed base
    plus age, gender, per-zip, activity and noise terms; macro shares are
    drawn from a Dirichlet whose concentration depends on a per-zip latent
    "health index"; fruit portions are Poisson with a rate that depends on
    age, gender and the same health index.

    Parameters
    ----------
    n : int, default 5000
        Number of rows to generate. ``0`` returns an empty frame that still
        carries every column.
    seed : int, default 42
        Passed to ``np.random.default_rng``; the same arguments produce the
        same frame.
    max_zips : int, default 30
        Upper bound on the number of distinct zip prefixes that can appear.
        Capped at the size of the built-in prefix list.

    Returns
    -------
    pandas.DataFrame
        Columns ``age``, ``gender``, ``zip_code``, ``daily_kcal``,
        ``protein_g``, ``carbs_g``, ``fat_g``, ``fruit_portions``.
        ``daily_kcal`` is truncated to int, whereas the macro columns are
        rounded. Macro grams are noised and clipped independently, so they
        are not guaranteed to add up to ``daily_kcal``.

    Raises
    ------
    ValueError
        If ``n`` is negative or ``max_zips`` is smaller than 1.
    """
    # Fail early with a clear message instead of a cryptic numpy error.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if max_zips < 1:
        raise ValueError(f"max_zips must be at least 1, got {max_zips}")

    rng = np.random.default_rng(seed)

    # 4-digit zip prefixes, grouped by city. The list order matters: it feeds
    # rng.choice below, so reordering changes the seeded output.
    zip_prefixes = [
        # Amsterdam
        "1011","1012","1013","1014","1015","1016","1017","1018","1019",
        "1021","1022","1023","1024","1025","1026","1027","1028",
        "1031","1032","1033","1034","1035","1036","1037",
        "1041","1042","1043","1044","1045","1046","1047",
        # Rotterdam
        "3011","3012","3013","3014","3015","3016","3017","3018","3019",
        "3021","3022","3023","3024","3025","3026","3027","3028",
        "3031","3032","3033","3034","3035","3036","3037","3038",
        "3041","3042","3043","3044","3045","3046","3047","3048",
        # Den Haag
        "2511","2512","2513","2514","2515","2516","2517","2518",
        "2521","2522","2523","2524","2525","2526",
        "2531","2532","2533","2534","2535","2536",
        "2541","2542","2543","2544","2545","2546","2547","2548",
        # Utrecht
        "3511","3512","3513","3514","3515","3516","3517",
        "3521","3522","3523","3524","3525","3526","3527",
        "3531","3532","3533","3534",
        "3541","3542","3543","3544","3545","3546","3547","3548",
        # Groningen
        "9711","9712","9713","9714","9715","9716","9717","9718",
        "9721","9722","9723","9724","9725","9726","9727","9728",
        "9731","9732","9733","9734","9735","9736","9737","9738",
        # Leeuwarden
        "8911","8912","8913","8914","8915","8916","8917",
        "8921","8922","8923","8924","8925","8926","8927",
        # Maastricht
        "6211","6212","6213","6214","6215","6216","6217","6218","6219",
        "6221","6222","6223","6224","6225","6226","6227","6228","6229",
        # Eindhoven
        "5611","5612","5613","5614","5615","5616","5617",
        "5621","5622","5623","5624","5625","5626",
        "5631","5632","5633","5634","5635",
        "5641","5642","5643","5644","5645","5646","5647",
        # Tilburg
        "5011","5012","5013","5014",
        "5021","5022","5023","5024","5025",
        "5031","5032","5033","5034","5035",
        "5041","5042","5043","5044","5045","5046","5047","5048","5049",
        # Breda
        "4811","4812","4813","4814","4815","4816","4817","4818","4819",
        "4821","4822","4823","4824","4825",
        # Nijmegen
        "6511","6512","6513","6514","6515",
        "6521","6522","6523","6524","6525",
        "6531","6532","6533","6534","6535",
        # Haarlem
        "2011","2012","2013","2014","2015",
        "2021","2022","2023","2024","2025",
        "2031","2032","2033","2034","2035",
        # Alkmaar
        "1811","1812","1813","1814","1815",
        "1821","1822","1823","1824","1825"
    ]

    # Restrict the pool to at most max_zips distinct prefixes.
    pool_size = min(max_zips, len(zip_prefixes))
    selected_zips = rng.choice(zip_prefixes, size=pool_size, replace=False)

    # Demographics. rng.integers excludes the upper bound, so ages are 18..89.
    age = rng.integers(18, 90, size=n)
    gender = rng.choice(["male", "female", "other"], size=n, p=[0.48, 0.48, 0.04])
    zip_code = rng.choice(selected_zips, size=n)

    # Zip-level latent effects: one draw per zip that actually occurs.
    # zip_idx maps every row to its position in unique_zips, which replaces
    # a per-row Python dict lookup with plain array indexing.
    unique_zips, zip_idx = np.unique(zip_code, return_inverse=True)
    zip_kcal_shift = np.array([rng.normal(0, 150) for _ in unique_zips], dtype=float)
    zip_health_index = np.array([rng.normal(0, 1.0) for _ in unique_zips], dtype=float)

    # Individual latent variables.
    activity = rng.normal(0, 1, size=n)
    noise_kcal = rng.normal(0, 150, size=n)

    # Gender and age effects on calories (male is the reference level).
    gender_kcal = np.select(
        [gender == "male", gender == "female"],
        [0, -200],
        default=-100,
    )
    age_kcal = -4.0 * (age - 40)

    # Build kcal and keep it inside a fixed 1200-4200 range.
    kcal_base = 2300
    kcal = (kcal_base + age_kcal + gender_kcal +
            zip_kcal_shift[zip_idx] +
            180 * activity + noise_kcal)
    kcal = np.clip(kcal, 1200, 4200)

    # Macro shares (carb, protein, fat): the zip health index raises the
    # carb/protein concentration and lowers the fat one. The floor of 0.2
    # keeps every Dirichlet parameter strictly positive.
    h = zip_health_index[zip_idx]
    alpha = np.array([5.0, 2.0, 2.0]) + np.outer(h, [0.6, 0.3, -0.7])
    alpha = np.clip(alpha, 0.2, None)

    # One Dirichlet draw per row, each with its own alpha. Preallocating an
    # (n, 3) array keeps the result 2-D when n == 0; building it from an empty
    # list would give shape (0,) and break the column slicing below.
    macro_shares = np.empty((n, 3), dtype=float)
    for row, row_alpha in enumerate(alpha):
        macro_shares[row] = rng.dirichlet(row_alpha)
    carb_share, prot_share, fat_share = macro_shares[:, 0], macro_shares[:, 1], macro_shares[:, 2]

    # Convert shares to grams (4 kcal/g for carbs and protein, 9 kcal/g for
    # fat), add measurement-style noise, then clip each macro on its own.
    carbs_g = np.clip((carb_share * kcal / 4.0) + rng.normal(0, 5, size=n), 80, 600)
    protein_g = np.clip((prot_share * kcal / 4.0) + rng.normal(0, 4, size=n), 30, 220)
    fat_g = np.clip((fat_share * kcal / 9.0) + rng.normal(0, 3, size=n), 20, 180)

    # Fruit portions: Poisson rate rises with age, health index and for women.
    lam = 3.0 + 0.02 * (age - 40) + 0.6 * h + np.where(gender == "female", 0.2, 0.0)
    lam = np.clip(lam, 0.1, 8.0)
    fruit_portions = rng.poisson(lam).clip(0, 10)

    df = pd.DataFrame({
        "age": age,
        "gender": gender,
        "zip_code": zip_code,
        # astype(int) truncates; kept as is so seeded output does not change.
        "daily_kcal": kcal.astype(int),
        "protein_g": protein_g.round(0).astype(int),
        "carbs_g": carbs_g.round(0).astype(int),
        "fat_g": fat_g.round(0).astype(int),
        "fruit_portions": fruit_portions
    })

    return df


# Build the synthetic dataset once and write it to a CSV in the working directory.
df_fake = generate_synthetic_nutrition_nl(
    n=5000,
    seed=42,
    max_zips=60,
)
df_fake.to_csv("fake_nutrition_nl.csv", index=False)
print("Saved fake_nutrition_nl.csv")

# Plain-text preview of the first five rows.
print(df_fake.head(5))

# Quick sanity check: pairwise correlations over the numeric columns only
# (gender and zip_code are categorical and therefore left out).
num_cols = [
    "age",
    "daily_kcal",
    "protein_g",
    "carbs_g",
    "fat_g",
    "fruit_portions",
]
print("\nCorrelation matrix (numeric):")
print(df_fake.loc[:, num_cols].corr().round(decimals=2))


Saved fake_nutrition_nl.csv
   age  gender zip_code  daily_kcal  protein_g  carbs_g  fat_g  fruit_portions
0   26  female     1028        1706         88      240     46               2
1   49  female     3531        2188        106      294     64               6
2   66  female     1046        1796        157      270     20               1
3   65  female     5621        1776        111      310     20               8
4   51  female     1031        2037         52      112    157               1

Correlation matrix (numeric):
                 age  daily_kcal  protein_g  carbs_g  fat_g  fruit_portions
age             1.00       -0.28      -0.09    -0.10  -0.05            0.21
daily_kcal     -0.28        1.00       0.23     0.43   0.17           -0.07
protein_g      -0.09        0.23       1.00    -0.34  -0.27            0.02
carbs_g        -0.10        0.43      -0.34     1.00  -0.48            0.06
fat_g          -0.05        0.17      -0.27    -0.48   1.00           -0.15
fruit_porti