In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Absolute path of the kernel's current working directory (Path() is ".").
BASE_DIR = Path().resolve()

In [None]:
# Index page that is scraped for per-year sub-folders.
url = "https://data.nber.org/nvss/natality/csv/"

# (connect, read) timeouts in seconds, so a stalled server cannot hang the kernel.
resp = requests.get(url, timeout=(10, 60))
# Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

# Resolve every anchor against the index URL, then keep only the links that
# end in a four-digit folder name (treated as year directories downstream).
hrefs = [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]
year_links = [h for h in hrefs if re.search(r"/\d{4}/$", h)]

print(year_links)

In [None]:
# Year folders older than this are skipped.
MIN_YEAR = 2000

csv_links = []

for link in year_links:
    # Folder URLs look like ".../<year>/", so the year is the second-to-last piece.
    year = int(link.split("/")[-2])
    if year < MIN_YEAR:
        continue

    # (connect, read) timeouts in seconds; raise on HTTP errors rather than
    # scraping an error page.
    resp = requests.get(link, timeout=(10, 60))
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    hrefs = [urljoin(link, a["href"]) for a in soup.find_all("a", href=True)]

    # Take the first .csv whose *file name* contains "us" (presumably the
    # US-wide file -- confirm against the archive's naming). Matching on the
    # file name keeps a "us" elsewhere in the URL from selecting the wrong file.
    csv_link = next(
        (
            h
            for h in hrefs
            if h.endswith(".csv") and "us" in Path(urlparse(h).path).name
        ),
        None,
    )
    if csv_link is None:
        # A folder without a matching file should not abort the whole scan.
        print(f"No matching CSV found for {year}; skipping {link}")
        continue
    csv_links.append(csv_link)

csv_links

In [None]:
# Target folder for the downloads: <BASE_DIR>/data/natality_data.
# Missing parent folders are created; an existing folder is left alone.
out_dir = BASE_DIR.joinpath("data", "natality_data")
out_dir.mkdir(exist_ok=True, parents=True)

def download_file(link):
    """Stream ``link`` into ``out_dir`` and report whether it succeeded.

    The response is first written to a sibling ``<name>.part`` file and only
    renamed to its final name once the whole body has been received, so an
    interrupted transfer never leaves a truncated file that looks complete.

    Parameters
    ----------
    link : str
        URL of the file to fetch. The last component of its path is used as
        the local file name.

    Returns
    -------
    tuple[str, bool]
        ``(file name, True)`` on success, ``(file name, False)`` on failure.
        Errors are reported through ``tqdm.write`` instead of being raised.
    """
    fname = Path(urlparse(link).path).name
    out_path = out_dir / fname
    part_path = out_dir / f"{fname}.part"
    try:
        # timeout is (connect, read) in seconds.
        with requests.get(link, stream=True, timeout=(10, 3600)) as r:
            r.raise_for_status()
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Publish the file under its final name only after a complete transfer.
        part_path.replace(out_path)
        return fname, True
    except Exception as e:
        # Best-effort by design: surface the cause, then signal failure to the caller.
        tqdm.write(f"Error downloading {fname}: {e!r}")
        try:
            if part_path.exists():
                part_path.unlink()
        except OSError:
            # Cleanup is opportunistic; a leftover .part file is harmless.
            pass
        return fname, False

# Fetch the files with up to three concurrent workers and log each outcome
# as soon as its download finishes.
with ThreadPoolExecutor(max_workers=3) as executor:
    # Map each pending future back to the URL it was submitted for.
    futures = {
        executor.submit(download_file, csv_url): csv_url for csv_url in csv_links
    }

    completed = as_completed(futures)
    for future in tqdm(completed, total=len(futures), desc="Downloading files"):
        fname, success = future.result()
        # tqdm.write prints the message without breaking the progress bar.
        tqdm.write(f"Downloaded {fname}" if success else f"Failed to download {fname}")