# Data Extraction

In [1]:
import pyreadstat
import os
import pandas as pd
import openpyxl
import numpy as np

In [2]:
# Every file contributes the shared respondent key plus its own variables.
# Only the non-key variables are listed per file; the key is prepended below
# so it is spelled out once instead of on every row.
# NOTE(review): file names look like NHANES component files — confirm the
# survey cycle against the contents of the data directory.
_join_key = "SEQN"

_variables_by_file = {
    "ALQ_L.XPT": ["ALQ121"],
    "BMX_L.XPT": ["BMXWT", "BMXHT", "BMXBMI"],
    "BPQ_L.XPT": ["BPQ020"],
    "BPXO_L.XPT": ["BPXOSY1", "BPXODI1", "BPXOPLS1"],
    "CBC_L.XPT": ["LBXHGB"],
    "DEMO_L.XPT": [
        "RIAGENDR",
        "RIDAGEYR",
        "DMDBORN4",
        "DMDEDUC2",
        "DMDMARTZ",
        "RIDEXPRG",
        "DMDHHSIZ",
    ],
    "DIQ_L.XPT": ["DIQ010"],
    "DPQ_L.XPT": [
        "DPQ010",
        "DPQ020",
        "DPQ030",
        "DPQ040",
        "DPQ050",
        "DPQ060",
        "DPQ070",
        "DPQ080",
        "DPQ090",
    ],
    "FNQ_L.XPT": [
        "FNQ410",
        "FNQ430",
        "FNQ440",
        "FNQ450",
        "FNQ460",
        "FNQ470",
        "FNQ490",
    ],
    "FOLATE_L.XPT": ["LBDRFOSI"],
    "HDL_L.XPT": ["LBDHDDSI"],
    "HIQ_L.XPT": ["HIQ011"],
    "HOQ_L.XPT": ["HOD051"],
    "HSCRP_L.XPT": ["LBXHSCRP"],
    "HUQ_L.XPT": ["HUQ010", "HUQ030", "HUQ090"],
    "INQ_L.XPT": ["INDFMMPI", "INQ300"],
    "MCQ_L.XPT": ["MCQ160B", "MCQ160E", "MCQ160F", "MCQ160P", "MCQ220"],
    "OCQ_L.XPT": ["OCD150", "OCQ180", "OCQ215"],
    "OHQ_L.XPT": ["OHQ845", "OHQ680"],
    "PAQ_L.XPT": ["PAD790Q", "PAD790U", "PAD680"],
    "PBCD_L.XPT": ["LBXBPB", "LBXTHG"],
    "SLQ_L.XPT": ["SLD012", "SLD013"],
    "VID_L.XPT": ["LBXVD2MS"],
    "WHQ_L.XPT": ["WHQ070"],
}

# Maps each XPT file name to the ordered list of columns to keep from it,
# always starting with the join key.
columns_to_use = {
    xpt_name: [_join_key, *variables]
    for xpt_name, variables in _variables_by_file.items()
}

In [4]:
# Load every XPT file listed in `columns_to_use`, keep only the requested
# columns, outer-merge the pieces on SEQN, drop respondents with no DPQ
# answers at all, and write the result to CSV.
DATA_DIR = "Datasets"
OUTPUT_CSV = "merged_nhanes.csv"

all_dfs = []

for filename, selected_columns in columns_to_use.items():
    file_path = os.path.join(DATA_DIR, filename)
    # `usecols` lets pyreadstat parse only the columns we keep instead of
    # materialising the whole file and discarding most of it afterwards.
    # The returned metadata object is not used in this cell.
    df, meta = pyreadstat.read_xport(
        file_path, encoding='cp1252', usecols=selected_columns
    )
    # Fail with a message that names the offending file rather than a bare
    # pandas KeyError from the column selection below.
    missing_columns = [col for col in selected_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{filename} is missing expected columns: {missing_columns}")
    # Re-select so the column order follows the configuration, not the file.
    df = df[selected_columns]
    all_dfs.append(df)

# Outer join keeps every SEQN that appears in at least one file.
# NOTE(review): this assumes SEQN is unique within each file; duplicates would
# multiply rows in the merge — confirm, or pass validate="one_to_one".
merged_df = all_dfs[0]
for df in all_dfs[1:]:
    merged_df = pd.merge(merged_df, df, on="SEQN", how="outer")

# Keep only rows where at least one of the DPQ items is present.
dpq_cols = ["DPQ010", "DPQ020", "DPQ030", "DPQ040", "DPQ050", "DPQ060", "DPQ070", "DPQ080", "DPQ090"]
merged_df = merged_df.dropna(subset=dpq_cols, how="all")

merged_df.to_csv(OUTPUT_CSV, index=False)