This code is used to download the data from the NHANES website (https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear=2013). 

The NHANES dataset is stored in numerous .xpt files. These files are located in 5 different folders:
'Demographics', 'Dietary', 'Examination', 'Laboratory', 'Questionnaire'

A detailed explanation of each variable can be found here:
- Demographics: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Demographics&CycleBeginYear=2013
- Dietary: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Dietary&CycleBeginYear=2013
- Examination: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination&CycleBeginYear=2013
- Laboratory: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory&CycleBeginYear=2013
- Questionnaire: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2013

In [1]:
import threading
from os import getcwd
from os import makedirs
from os import mkdir
from os.path import join
from urllib import request

import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Value substituted into the `BeginYear` query parameter of the NHANES landing-page URL;
# 2013 selects the 2013-2014 survey cycle.
year = 2013

In [3]:
# Landing page of the selected NHANES cycle; it links to one data page per component.
url = f"https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?BeginYear={year}"
# Parse inside the `with` block so the HTTP response is closed as soon as it has been read.
with request.urlopen(url) as html:
    soup = BeautifulSoup(html, "html.parser")

# The component links are the <a> tags that share a parent with the section title.
section_title = soup.find("div", string="Data, Documentation, Codebooks")
if section_title is None:
    raise ValueError(f'No "Data, Documentation, Codebooks" section found at {url}')

# `href=True` skips anchors that carry no link target.
pages = [link["href"] for link in section_title.parent.find_all("a", href=True)]
pages

['../search/datapage.aspx?Component=Demographics&CycleBeginYear=2013',
 '../search/datapage.aspx?Component=Dietary&CycleBeginYear=2013',
 '../search/datapage.aspx?Component=Examination&CycleBeginYear=2013',
 '../search/datapage.aspx?Component=Laboratory&CycleBeginYear=2013',
 '../search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2013',
 '../search/datapage.aspx?Component=LimitedAccess&CycleBeginYear=2013']

In [4]:
def extract_url(line):
    """Return the first double-quoted substring of an HTML line, lower-cased.

    Accepts the raw bytes yielded by the HTTP response or an already decoded str.
    Raises IndexError when the line contains no double quote.
    """
    if isinstance(line, bytes):
        line = line.decode("utf-8", errors="replace")
    return line.lower().split('"')[1]


# Maps each component name (taken from the `Component=` query parameter) to the
# list of lower-cased .xpt paths found on that component's data page.
xpts_files = {}
for page in pages:
    page_name = page.split("Component=")[1].split("&")[0]
    print(f"Finding {page_name} data")
    # `page` is a relative link ("../search/..."); dropping the leading ".."
    # leaves a path that is appended to the NHANES root.
    url = f"https://wwwn.cdc.gov/nchs/nhanes{page[2:]}"
    # The context manager closes the HTTP response once all lines are scanned.
    with request.urlopen(url) as html:
        xpts_files[page_name] = [extract_url(line) for line in html if b".xpt" in line.lower()]

print()
print("\n".join((f"{key}: {value}" for key, value in xpts_files.items())))

Finding Demographics data
Finding Dietary data
Finding Examination data
Finding Laboratory data
Finding Questionnaire data
Finding LimitedAccess data

Demographics: ['/nchs/nhanes/2013-2014/demo_h.xpt']
Dietary: ['/nchs/nhanes/2013-2014/dr1iff_h.xpt', '/nchs/nhanes/2013-2014/dr2iff_h.xpt', '/nchs/nhanes/2013-2014/dr1tot_h.xpt', '/nchs/nhanes/2013-2014/dr2tot_h.xpt', '/nchs/nhanes/2013-2014/drxfcd_h.xpt', '/nchs/nhanes/1999-2000/dsbi.xpt', '/nchs/nhanes/1999-2000/dsii.xpt', '/nchs/nhanes/1999-2000/dspi.xpt', '/nchs/nhanes/2013-2014/ds1ids_h.xpt', '/nchs/nhanes/2013-2014/ds2ids_h.xpt', '/nchs/nhanes/2013-2014/ds1tot_h.xpt', '/nchs/nhanes/2013-2014/ds2tot_h.xpt', '/nchs/nhanes/2013-2014/dsqids_h.xpt', '/nchs/nhanes/2013-2014/dsqtot_h.xpt']
Examination: ['/nchs/nhanes/2013-2014/bpx_h.xpt', '/nchs/nhanes/2013-2014/bmx_h.xpt', '/nchs/nhanes/2013-2014/dxxaac_h.xpt', '/nchs/nhanes/2013-2014/dxxag_h.xpt', '/nchs/nhanes/2013-2014/dxxfem_h.xpt', '/nchs/nhanes/2013-2014/dxxfrx_h.xpt', '/nchs/nhane

In [5]:
def download_file(xpt_file, location, name, base_url="https://wwwn.cdc.gov"):
    """Download one file and store it as `name` inside `location`.

    Args:
        xpt_file: Server path of the file, e.g. "/nchs/nhanes/2013-2014/demo_h.xpt".
            A leading slash is optional.
        location: Existing local directory the file is written into.
        name: File name to use inside `location`.
        base_url: Prefix that `xpt_file` is appended to. Defaults to the CDC host.

    Returns:
        None. Errors raised by `urllib.request.urlretrieve` propagate to the caller.
    """
    print(f"\tDownloading {name}")
    # Join with exactly one slash: the scraped paths already start with "/", so
    # plain concatenation produced "https://wwwn.cdc.gov//nchs/...".
    url = f"{base_url.rstrip('/')}/{xpt_file.lstrip('/')}"
    request.urlretrieve(url, join(location, name))

In [6]:
# Downloads are stored under <current working directory>/data/xpt.
data_location = join(getcwd(), "data")
xpt_location = join(data_location, "xpt")
# makedirs(..., exist_ok=True) creates the missing parent ("data") as well and does
# nothing when the folders already exist, so this cell can be re-run safely
# (plain mkdir raised FileExistsError on the second run).
makedirs(xpt_location, exist_ok=True)

In [7]:
# Start one download thread per file, grouped into one sub-folder per component,
# then wait for all of them to finish.
threads = []
for key, value in xpts_files.items():
    # Components for which no .xpt link was found get no folder.
    if not value:
        continue

    key_location = join(xpt_location, key)
    # exist_ok=True keeps the cell re-runnable; mkdir raised FileExistsError
    # when the component folder was left over from a previous run.
    makedirs(key_location, exist_ok=True)
    for file_location in value:
        # The last path segment is used as the local file name.
        name = file_location.split("/")[-1]
        x = threading.Thread(target=download_file, args=(file_location, key_location, name))
        threads.append(x)
        x.start()

# NOTE: join() only waits; an exception raised inside a thread is not re-raised here.
for thread in threads:
    thread.join()

	Downloading demo_h.xpt
	Downloading dr1iff_h.xpt
	Downloading dr2iff_h.xpt
	Downloading dr1tot_h.xpt
	Downloading dr2tot_h.xpt
	Downloading drxfcd_h.xpt
	Downloading dsbi.xpt
	Downloading dsii.xpt
	Downloading dspi.xpt
	Downloading ds1ids_h.xpt
	Downloading ds2ids_h.xpt
	Downloading ds1tot_h.xpt
	Downloading ds2tot_h.xpt
	Downloading dsqids_h.xpt
	Downloading dsqtot_h.xpt
	Downloading bpx_h.xpt
	Downloading bmx_h.xpt
	Downloading dxxaac_h.xpt
	Downloading dxxag_h.xpt
	Downloading dxxfem_h.xpt
	Downloading dxxfrx_h.xpt
	Downloading dxxl1_h.xpt
	Downloading dxxl2_h.xpt
	Downloading dxxl3_h.xpt
	Downloading dxxl4_h.xpt
	Downloading dxxspn_h.xpt
	Downloading dxxt10_h.xpt
	Downloading dxxt11_h.xpt
	Downloading dxxt12_h.xpt
	Downloading dxxt4_h.xpt
	Downloading dxxt5_h.xpt
	Downloading dxxt6_h.xpt
	Downloading dxxt7_h.xpt
	Downloading dxxt8_h.xpt
	Downloading dxxt9_h.xpt
	Downloading dxxvfa_h.xpt
	Downloading dxx_h.xpt
	Downloading flxcln_h.xpt
	Downloading mgx_h.xpt
	Downloading ohxden_h.x