# Obtaining Raw Data 
This project uses the American Community Survey (ACS) PUMS 1-Year national person-level microdata
(i.e., U.S. Census Bureau data)
for 2018–2023 (excluding 2020 due to the unusual circumstances of the pandemic).

Due to size constraints (>10 GB of raw CSVs), the raw datasets are not tracked in GitHub.
<br>
All data can be fully reproduced by running the script; see the README file for more detail on reproducibility.

Raw data source:
<br>
https://www.census.gov/programs-surveys/acs/microdata.html
<br>
https://www2.census.gov/programs-surveys/acs/data/pums/

# CA Data 

In [1]:
import os
import requests
import zipfile
import io
import pandas as pd
from pathlib import Path

In [2]:
def download_acs_1year_person_data(state_abbr="ca", years=(2018, 2019, 2021, 2022, 2023)):
    """
    Download and extract 1-Year ACS PUMS person files, one archive per year.

    For every year the archive ``csv_p{state_abbr}.zip`` is requested from the
    Census PUMS file server and unpacked into
    ``data_persons_{state_abbr}_1yr/{year}`` (created if missing), relative to
    the current working directory.

    Parameters
    ----------
    state_abbr : str
        Suffix used in the archive name (``"ca"`` -> ``csv_pca.zip``).
    years : iterable of int
        Survey years to fetch. The default does not include 2020.

    Returns
    -------
    None
        Progress is reported with ``print``. A year whose download or
        extraction fails is reported as skipped and the loop continues
        (best-effort); no exception is propagated.
    """
    for year in years:
        url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_p{state_abbr}.zip"
        dest_folder = f"data_persons_{state_abbr}_1yr/{year}"
        os.makedirs(dest_folder, exist_ok=True)

        print(f"Downloading {year} 1-Year data...")
        try:
            # A timeout keeps a stalled connection from hanging the notebook
            # forever. The body is buffered in full via r.content below, so
            # streaming mode would not change anything and is not requested.
            r = requests.get(url, timeout=120)
            r.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                z.extractall(dest_folder)
                print(f"Done: {year}")
        except Exception as e:
            # Deliberately best-effort: report the failure and try the next year.
            print(f"Skipping {year}: {e}")

# One-off download with the defaults (CA, all configured years) into
# data_persons_ca_1yr/<year>/. It only needs to be run once.
download_acs_1year_person_data()

Downloading 2018 1-Year data...
Done: 2018
Downloading 2019 1-Year data...
Done: 2019
Downloading 2021 1-Year data...
Done: 2021
Downloading 2022 1-Year data...
Done: 2022
Downloading 2023 1-Year data...
Done: 2023


In [5]:
import os
import pandas as pd

# Folder holding one sub-folder per survey year for the CA person files
BASE_DIR = "data_persons_ca_1yr"
# Survey years to combine (2020 is not included)
YEARS = [2018, 2019, 2021, 2022, 2023]
# Person-level CSV expected inside every year folder
FILENAME = "psam_p06.csv"
# Destination of the combined multi-year CSV
OUTPUT_PATH = os.path.join(BASE_DIR, "psam_p06_master.csv")

def build_master_common_columns(base_dir=BASE_DIR, years=YEARS, filename=FILENAME, output_path=OUTPUT_PATH):
    """
    Combine the per-year person CSVs into one master CSV.

    Only columns present in every year's file are kept, and a ``year`` column
    is added to record which survey year each row came from. All years are
    loaded into memory before being concatenated and written out.

    Parameters
    ----------
    base_dir : str
        Folder containing one sub-folder per year.
    years : iterable of int
        Years to combine; each needs ``<base_dir>/<year>/<filename>``.
    filename : str
        CSV file name expected inside every year folder.
    output_path : str
        Destination of the combined CSV (written without the index).

    Returns
    -------
    (pandas.DataFrame, list of str)
        The combined frame and the sorted list of shared column names
        (``year`` itself is not part of that list).

    Raises
    ------
    ValueError
        If ``years`` is empty.
    FileNotFoundError
        If the CSV for any requested year is missing.
    """
    years = list(years)
    if not years:
        # Without this guard an empty input would surface later as an opaque
        # "NoneType is not iterable" error from sorted().
        raise ValueError("years must contain at least one year")

    # 1) Collect file paths + compute the intersection of column names across years
    paths = {}
    common_cols = None

    for y in years:
        path = os.path.join(base_dir, str(y), filename)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Missing file for {y}: {path}")
        paths[y] = path

        # Read only the header row to get the column names (fast)
        cols_set = set(pd.read_csv(path, nrows=0).columns)
        common_cols = cols_set if common_cols is None else common_cols.intersection(cols_set)

    # Sorted so the returned list is deterministic; "year" is reserved for the
    # column added below.
    common_cols = sorted(common_cols)
    if "year" in common_cols:
        # unlikely, but just in case
        common_cols.remove("year")

    print(f"Common columns across {len(years)} years: {len(common_cols)}")

    # 2) Read each year using only the common columns, add year, concat
    dfs = []
    for y in years:
        df = pd.read_csv(
            paths[y],
            usecols=common_cols,     # selects columns only; each frame keeps its file's column order
            low_memory=False
        )
        df["year"] = y
        dfs.append(df)
        print(f"Loaded {y}: {df.shape[0]:,} rows, {df.shape[1]} cols (incl year)")

    # concat aligns the frames by column name, so differing file layouts are safe here
    master = pd.concat(dfs, ignore_index=True)
    print(f"Master shape: {master.shape[0]:,} rows, {master.shape[1]} cols")

    # 3) Write out
    master.to_csv(output_path, index=False)
    print(f"Saved -> {output_path}")

    return master, common_cols

# Build the CA master CSV; keep the combined frame and the shared-column list for inspection
master_df, common_cols = build_master_common_columns()

Common columns across 5 years: 277
Loaded 2018: 378,817 rows, 278 cols (incl year)
Loaded 2019: 380,091 rows, 278 cols (incl year)
Loaded 2021: 386,061 rows, 278 cols (incl year)
Loaded 2022: 391,171 rows, 278 cols (incl year)
Loaded 2023: 392,318 rows, 278 cols (incl year)
Master shape: 1,928,458 rows, 278 cols
Saved -> data_persons_ca_1yr/psam_p06_master.csv


In [6]:
import os

# Shape of the in-memory CA master frame
n_rows, n_cols = master_df.shape
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")

# Size of the CSV on disk, in units of 1024**3 bytes
path = "data_persons_ca_1yr/psam_p06_master.csv"
size_bytes = os.path.getsize(path)
size_gb = size_bytes / 1024**3
print(f"On-disk size: {size_gb:.2f} GB")

Rows: 1,928,458
Columns: 278
On-disk size: 1.37 GB


# USA Data

In [7]:
def download_acs_1yr_us_person(years, base_dir="data_persons_us_1yr"):
    """
    Download and unpack the national 1-Year ACS PUMS person archive per year.

    Each year's ``csv_pus.zip`` is fetched (120 s request timeout), held in
    memory, and extracted into ``<base_dir>/<year>/`` (created if missing).
    Any failure for a year is printed as "Skipping ..." and the remaining
    years are still attempted. Returns ``None``.
    """

    def _fetch_and_unpack(zip_url, target_dir):
        # Buffer the full archive in memory, then extract it into target_dir.
        response = requests.get(zip_url, timeout=120)
        response.raise_for_status()
        payload = io.BytesIO(response.content)
        with zipfile.ZipFile(payload) as archive:
            archive.extractall(target_dir)

    for year in years:
        target_dir = Path(base_dir) / str(year)
        zip_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip"
        target_dir.mkdir(parents=True, exist_ok=True)

        print(f"Downloading {year} US 1-Year person data...")
        try:
            _fetch_and_unpack(zip_url, target_dir)
            print(f"Done: {year} -> {target_dir}")
        except Exception as failure:
            print(f"Skipping {year}: {failure}")

# Survey years to fetch (2020 is not included)
YEARS = [2018, 2019, 2021, 2022, 2023]
# Fetch and unpack the national person archives into data_persons_us_1yr/<year>/
download_acs_1yr_us_person(YEARS)

Downloading 2018 US 1-Year person data...
Done: 2018 -> data_persons_us_1yr/2018
Downloading 2019 US 1-Year person data...
Done: 2019 -> data_persons_us_1yr/2019
Downloading 2021 US 1-Year person data...
Done: 2021 -> data_persons_us_1yr/2021
Downloading 2022 US 1-Year person data...
Done: 2022 -> data_persons_us_1yr/2022
Downloading 2023 US 1-Year person data...
Done: 2023 -> data_persons_us_1yr/2023


In [8]:
import os
import pandas as pd
from pathlib import Path

# NOTE: these rebind BASE_DIR / YEARS / OUTPUT_PATH from the CA section above
# (there plain strings, here pathlib.Path objects).
# Root folder of the extracted national files, one sub-folder per year
BASE_DIR = Path("data_persons_us_1yr")
# Survey years to combine (2020 is not included)
YEARS = [2018, 2019, 2021, 2022, 2023]
# Destination of the combined national CSV
OUTPUT_PATH = BASE_DIR / "psam_pus_master.csv"

def find_person_csvs_for_year(year_folder: Path):
    """
    Return the sorted person CSV paths found directly inside ``year_folder``.

    The national person file is split into parts (A/B), e.g. psam_pusa.csv
    and psam_pusb.csv, so all files matching ``psam_pus*.csv`` are returned.
    If none match, the looser ``psam_p*.csv`` pattern is tried instead.

    Raises FileNotFoundError when neither pattern matches anything.
    """
    # Preferred national naming first, then the fallback in case naming differs
    for pattern in ("psam_pus*.csv", "psam_p*.csv"):
        matches = sorted(year_folder.glob(pattern))
        if matches:
            return matches

    raise FileNotFoundError(f"No person CSVs found in {year_folder}")

def get_common_columns_across_all_files(files):
    """
    Return the sorted column names shared by every CSV in ``files``.

    Only the header row of each file is parsed. A column named ``year`` is
    left out of the result even if every file has one.

    Parameters
    ----------
    files : iterable of path-like
        CSV files to compare; must yield at least one path.

    Returns
    -------
    list of str
        Alphabetically sorted names present in all files, without ``year``.

    Raises
    ------
    ValueError
        If ``files`` yields no paths (an intersection over nothing is
        undefined; this used to surface as an opaque TypeError).
    """
    common = None
    for path in files:
        header = set(pd.read_csv(path, nrows=0).columns)
        common = header if common is None else common & header

    # Checked after the loop so one-shot iterables are handled too
    if common is None:
        raise ValueError("files must contain at least one CSV path")

    common.discard("year")
    return sorted(common)

def build_master_us_streaming(base_dir=BASE_DIR, years=YEARS, output_path=OUTPUT_PATH, chunksize=200_000):
    """
    Stream the national person CSVs of all years into one master CSV.

    Every part file of every year is read in chunks, restricted to the
    columns shared by all files, tagged with a ``year`` column and appended
    to ``output_path``, so the full data set is never held in memory at once.
    An existing ``output_path`` is deleted first.

    Parameters
    ----------
    base_dir : pathlib.Path
        Folder containing one sub-folder per year.
    years : sequence of int
        Years to combine, processed in the given order.
    output_path : pathlib.Path
        Destination CSV.
    chunksize : int
        Number of rows read and written per chunk.

    Returns
    -------
    list of str
        Sorted names of the columns shared by all input files
        (``year`` is not part of that list).

    Raises
    ------
    FileNotFoundError
        If a year folder contains no person CSVs.
    """
    # 1) Build list of all input files (includes A/B per year)
    files_by_year = {}
    all_files = []

    for y in years:
        year_folder = base_dir / str(y)
        year_files = find_person_csvs_for_year(year_folder)
        files_by_year[y] = year_files
        all_files.extend(year_files)

    # 2) Compute common columns across ALL files (all years, all parts)
    common_cols = get_common_columns_across_all_files(all_files)
    print(f"Common columns across {len(years)} years (all parts): {len(common_cols)}")

    # 3) Write streaming
    if output_path.exists():
        output_path.unlink()

    wrote_header = False
    total_rows = 0
    # Layout of the first chunk written. The header is emitted only once, so
    # every later chunk has to use exactly this column order.
    column_order = None

    for y in years:
        for path in files_by_year[y]:
            print(f"\nProcessing {y} from {path.name} ...")

            for chunk in pd.read_csv(path, usecols=common_cols, chunksize=chunksize, low_memory=False):
                # usecols only selects columns; each chunk keeps the column
                # order of its own file. Realign when a file orders the shared
                # columns differently, otherwise values would be appended
                # under the wrong header.
                if column_order is None:
                    column_order = list(chunk.columns)
                elif list(chunk.columns) != column_order:
                    chunk = chunk.reindex(columns=column_order)

                chunk["year"] = y
                total_rows += len(chunk)

                chunk.to_csv(
                    output_path,
                    mode="a",
                    index=False,
                    header=not wrote_header
                )
                wrote_header = True

            print(f"Done {y} {path.name}. Total rows written so far: {total_rows:,}")

    print(f"\nSaved -> {output_path}")
    return common_cols

# Build the national master CSV on disk; only the shared-column list is returned
common_cols = build_master_us_streaming()


Common columns across 5 years (all parts): 277

Processing 2018 from psam_pusa.csv ...
Done 2018 psam_pusa.csv. Total rows written so far: 1,648,512

Processing 2018 from psam_pusb.csv ...
Done 2018 psam_pusb.csv. Total rows written so far: 3,214,539

Processing 2019 from psam_pusa.csv ...
Done 2019 psam_pusa.csv. Total rows written so far: 4,873,347

Processing 2019 from psam_pusb.csv ...
Done 2019 psam_pusb.csv. Total rows written so far: 6,454,092

Processing 2021 from psam_pusa.csv ...
Done 2021 psam_pusa.csv. Total rows written so far: 8,123,471

Processing 2021 from psam_pusb.csv ...
Done 2021 psam_pusb.csv. Total rows written so far: 9,706,691

Processing 2022 from psam_pusa.csv ...
Done 2022 psam_pusa.csv. Total rows written so far: 11,430,845

Processing 2022 from psam_pusb.csv ...
Done 2022 psam_pusb.csv. Total rows written so far: 13,080,069

Processing 2023 from psam_pusa.csv ...
Done 2023 psam_pusa.csv. Total rows written so far: 14,812,412

Processing 2023 from psam_pusb.

In [9]:
import pandas as pd
import os

path = "data_persons_us_1yr/psam_pus_master.csv"

# Column count: parse nothing but the header row
header_only = pd.read_csv(path, nrows=0)
n_cols = header_only.shape[1]

# Row count: tally physical lines in binary mode, then drop the header line
with open(path, "rb") as f:
    line_total = sum(1 for _ in f)
n_rows = line_total - 1

# File size in units of 1024**3 bytes
size_gb = os.path.getsize(path) / 1024**3

print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")
print(f"On-disk size: {size_gb:.2f} GB")

Rows: 16,485,878
Columns: 278
On-disk size: 11.62 GB
