In [1]:
import os
import requests
import zipfile
import io
import pandas as pd
from pathlib import Path

In [2]:
def download_acs_1year_person_data(state_abbr="ca", years=[2018,2019, 2021, 2022, 2023], timeout=120):
    """
    Downloads 1-Year ACS PUMS person files.

    For every year in ``years`` the archive ``csv_p{state_abbr}.zip`` is fetched
    from the Census Bureau PUMS directory and extracted into
    ``data_persons_{state_abbr}_1yr/{year}`` (relative to the current working
    directory).

    Parameters
    ----------
    state_abbr : str
        Abbreviation inserted verbatim into both the archive name and the
        destination folder name.
    years : iterable of int
        Survey years to download. The default list is only read, never mutated.
    timeout : float or tuple
        Passed to ``requests.get`` so a stalled connection cannot block the
        notebook indefinitely.

    Returns
    -------
    None
        Progress is reported with ``print``. A year whose download or
        extraction fails is reported as skipped and the loop continues with
        the next year (deliberate best-effort behaviour).
    """
    for year in years:
        url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_p{state_abbr}.zip"
        dest_folder = f"data_persons_{state_abbr}_1yr/{year}"

        print(f"Downloading {year} 1-Year data...")
        try:
            # The whole body is buffered in memory below (ZipFile needs a
            # seekable file object), so stream=True would add nothing here.
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                # Create the folder only once a valid archive is in hand, so a
                # failed year does not leave an empty directory behind.
                os.makedirs(dest_folder, exist_ok=True)
                z.extractall(dest_folder)
            print(f"Done: {year}")
        except Exception as e:
            # Best effort: report the failure and move on to the next year.
            print(f"Skipping {year}: {e}")

# Run this cell only once: the function has no "already downloaded" check, so
# every re-run fetches and re-extracts all archives for the default state/years.
download_acs_1year_person_data()

Downloading 2018 1-Year data...
Done: 2018
Downloading 2019 1-Year data...
Done: 2019
Downloading 2021 1-Year data...
Done: 2021
Downloading 2022 1-Year data...
Done: 2022
Downloading 2023 1-Year data...
Done: 2023


In [5]:
import os
import pandas as pd

# Default configuration for the master-file build.
BASE_DIR = "data_persons_ca_1yr"   # root folder containing one sub-folder per year
YEARS = [2018, 2019, 2021, 2022, 2023]
FILENAME = "psam_p06.csv"          # person-level CSV expected inside each year folder
OUTPUT_PATH = os.path.join(BASE_DIR, "psam_p06_master.csv")

def build_master_common_columns(base_dir=BASE_DIR, years=YEARS, filename=FILENAME, output_path=OUTPUT_PATH):
    """
    Stack the per-year person files into one table restricted to shared columns.

    The header of ``{base_dir}/{year}/{filename}`` is read for every year, the
    intersection of the column names is taken, and each file is then loaded
    with only those columns. A ``year`` column is appended to every frame
    before they are concatenated and written to ``output_path`` as CSV.

    Parameters
    ----------
    base_dir : str
        Folder holding one sub-folder per year.
    years : iterable
        Years to combine, in the order their rows should appear.
    filename : str
        CSV file name expected inside every year folder.
    output_path : str or path-like
        Destination of the combined CSV; its parent folder is created if needed.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The combined frame, whose columns are ``common_cols + ["year"]`` in
        exactly that order, and the alphabetically sorted list of shared
        column names (without ``"year"``).

    Raises
    ------
    ValueError
        If ``years`` is empty.
    FileNotFoundError
        If the expected CSV is missing for any year.
    """
    # Materialise once: `years` is walked twice and may be a one-shot iterable.
    years = list(years)
    if not years:
        raise ValueError("`years` must contain at least one year")

    # 1) Collect file paths + compute the intersection of column names across years
    paths = {}
    common_cols = None

    for y in years:
        path = os.path.join(base_dir, str(y), filename)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Missing file for {y}: {path}")
        paths[y] = path

        # Read only the header to get the column names (fast)
        cols_set = set(pd.read_csv(path, nrows=0).columns)
        common_cols = cols_set if common_cols is None else common_cols & cols_set

    # We will keep these columns (sorted for stable order) + add "year"
    common_cols = sorted(common_cols)
    if "year" in common_cols:
        # unlikely, but just in case: the name is reserved for the column added below
        common_cols.remove("year")

    print(f"Common columns across {len(years)} years: {len(common_cols)}")

    # 2) Read each year using only the common columns, add year, concat
    dfs = []
    for y in years:
        df = pd.read_csv(
            paths[y],
            usecols=common_cols,     # restricts the columns; does NOT fix their order
            low_memory=False
        )
        # read_csv returns columns in file order regardless of `usecols` order,
        # so enforce the sorted order explicitly to get an identical layout per year.
        df = df.reindex(columns=common_cols)
        df["year"] = y
        dfs.append(df)
        print(f"Loaded {y}: {df.shape[0]:,} rows, {df.shape[1]} cols (incl year)")

    master = pd.concat(dfs, ignore_index=True)
    print(f"Master shape: {master.shape[0]:,} rows, {master.shape[1]} cols")

    # 3) Write out (create the destination folder when it does not exist yet)
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    master.to_csv(output_path, index=False)
    print(f"Saved -> {output_path}")

    return master, common_cols

# Build the combined table with the default configuration; keep both the
# resulting frame and the list of column names shared by every year.
master_df, common_cols = build_master_common_columns()

Common columns across 5 years: 277
Loaded 2018: 378,817 rows, 278 cols (incl year)
Loaded 2019: 380,091 rows, 278 cols (incl year)
Loaded 2021: 386,061 rows, 278 cols (incl year)
Loaded 2022: 391,171 rows, 278 cols (incl year)
Loaded 2023: 392,318 rows, 278 cols (incl year)
Master shape: 1,928,458 rows, 278 cols
Saved -> data_persons_ca_1yr/psam_p06_master.csv


In [6]:
import os

# Dimensions of the combined frame
print(f"Rows: {len(master_df):,}")
print(f"Columns: {len(master_df.columns)}")

# Footprint of the saved master CSV, converted from bytes using 2**30 bytes per unit
path = "data_persons_ca_1yr/psam_p06_master.csv"

size_bytes = os.stat(path).st_size
size_gb = size_bytes / 2**30

print(f"On-disk size: {size_gb:.2f} GB")

Rows: 1,928,458
Columns: 278
On-disk size: 1.37 GB


In [None]:
# def download_acs_1yr_us_person(years, base_dir="data_persons_us_1yr"):
#     for year in years:
#         url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip"
#         dest = Path(base_dir) / str(year)
#         dest.mkdir(parents=True, exist_ok=True)

#         print(f"Downloading {year} US 1-Year person data...")
#         try:
#             r = requests.get(url, timeout=120)
#             r.raise_for_status()

#             with zipfile.ZipFile(io.BytesIO(r.content)) as z:
#                 z.extractall(dest)

#             print(f"Done: {year} -> {dest}")
#         except Exception as e:
#             print(f"Skipping {year}: {e}")

# YEARS = [2018, 2019, 2021, 2022, 2023]
# download_acs_1yr_us_person(YEARS)

In [None]:
# import os
# import pandas as pd
# from pathlib import Path

# BASE_DIR = Path("data_persons_us_1yr")
# YEARS = [2018, 2019, 2021, 2022, 2023]
# OUTPUT_PATH = BASE_DIR / "psam_pus_master.csv"

# def find_person_csv(year_folder: Path) -> Path:
#     # Find the big person file inside the extracted zip (usually starts with psam_p)
#     candidates = list(year_folder.glob("psam_p*.csv"))
#     if not candidates:
#         raise FileNotFoundError(f"No psam_p*.csv found in {year_folder}")
#     # pick the largest (safest if there are multiple)
#     return max(candidates, key=lambda p: p.stat().st_size)

# def get_common_columns(paths_by_year):
#     common = None
#     for y, path in paths_by_year.items():
#         cols = pd.read_csv(path, nrows=0).columns
#         common = set(cols) if common is None else common.intersection(set(cols))
#     common = sorted(common)
#     if "year" in common:
#         common.remove("year")
#     return common

# def build_master_us_streaming(base_dir=BASE_DIR, years=YEARS, output_path=OUTPUT_PATH, chunksize=200_000):
#     # 1) Resolve actual CSV paths for each year
#     paths = {}
#     for y in years:
#         year_folder = base_dir / str(y)
#         paths[y] = find_person_csv(year_folder)

#     # 2) Compute common columns
#     common_cols = get_common_columns(paths)
#     print(f"Common columns across {len(years)} years: {len(common_cols)}")

#     # 3) Write streaming
#     if output_path.exists():
#         output_path.unlink()

#     wrote_header = False
#     total_rows = 0

#     for y in years:
#         print(f"\nProcessing {y} from {paths[y].name} ...")
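#         for chunk in pd.read_csv(paths[y], usecols=common_cols, chunksize=chunksize, low_memory=False):
#             # NOTE: read_csv ignores the order of `usecols`; the header is written only once,
#             # so reorder every chunk or later years may land under the wrong header columns.
#             chunk = chunk.reindex(columns=common_cols)
#             chunk["year"] = y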
#             total_rows += len(chunk)
#             chunk.to_csv(output_path, mode="a", index=False, header=not wrote_header)
#             wrote_header = True
#         print(f"Done {y}. Total rows written so far: {total_rows:,}")

#     print(f"\nSaved -> {output_path}")
#     return common_cols

# common_cols = build_master_us_streaming()
