# Obtaining Raw Data 
This project uses the American Community Survey (ACS) PUMS 1-Year national person-level microdata
(i.e., Census data)
for 2018–2024 (excluding 2020 due to the unusual circumstances of the pandemic).

Due to size constraints, the raw datasets are not tracked in GitHub.
<br>
All data can be fully reproduced by running the script; see the README file for more detail on reproducibility.

Raw data source:
<br>
https://www.census.gov/programs-surveys/acs/microdata.html
<br>
https://www2.census.gov/programs-surveys/acs/data/pums/

# CA Data 

In [None]:
import os
import requests
import zipfile
import io
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [None]:
def download_acs_1year_person_data(state_abbr="ca", years=(2018, 2019, 2021, 2022, 2023, 2024), timeout=60):
    """
    Downloads 1-Year ACS PUMS person files.

    For each year, fetches ``csv_p{state_abbr}.zip`` from the Census PUMS
    server and extracts it into ``data_persons_{state_abbr}_1yr/{year}``
    (the folder is created if it does not exist). A year whose download or
    extraction fails is reported and skipped, so the remaining years are
    still attempted.

    Args:
        state_abbr: State code used in the archive name and in the
            destination folder name.
        years: Iterable of survey years to download. The default is an
            immutable tuple so it cannot be modified between calls.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            fails (and the year is skipped) instead of hanging forever.

    Returns:
        None. Progress is printed and files are written to disk.
    """
    for year in years:
        url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_p{state_abbr}.zip"
        dest_folder = f"data_persons_{state_abbr}_1yr/{year}"
        os.makedirs(dest_folder, exist_ok=True)

        print(f"Downloading {year} 1-Year data...")
        try:
            # The archive is buffered fully in memory because ZipFile needs
            # a seekable file object; streaming would bring no benefit here.
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                z.extractall(dest_folder)
            print(f"Done: {year}")
        except (requests.RequestException, zipfile.BadZipFile, OSError) as e:
            # Best effort: network, HTTP, corrupt-archive and disk errors
            # skip this year only; programming errors still surface.
            print(f"Skipping {year}: {e}")

# Run this only once: it downloads and extracts the archive for every
# default year (network access required).
download_acs_1year_person_data()

In [None]:
import os
import pandas as pd

# Folder that holds one sub-folder per survey year.
BASE_DIR = "data_persons_ca_1yr"
# Survey years to combine; 2020 is intentionally absent.
YEARS = [2018, 2019, 2021, 2022, 2023, 2024]
# Person-level CSV expected inside each year folder.
FILENAME = "psam_p06.csv"
# Destination of the combined file.
OUTPUT_PATH = os.path.join(BASE_DIR, "persons_master.csv")


def build_master_common_columns(base_dir=BASE_DIR, years=YEARS, filename=FILENAME, output_path=OUTPUT_PATH):
    """
    Combine the per-year person files into one master CSV.

    Only the columns present in every year's file are kept. A ``year``
    column is added to each row, and the result is written to
    ``output_path``.

    Args:
        base_dir: Folder containing one sub-folder per year.
        years: Iterable of years; each must have ``base_dir/<year>/<filename>``.
        filename: Name of the CSV file inside each year folder.
        output_path: Where the combined CSV is written. Its parent folder
            is created if it does not exist.

    Returns:
        Tuple ``(master, common_cols)``: the combined DataFrame, whose
        columns are ``common_cols`` in sorted order followed by ``year``,
        and the sorted list of shared column names (without ``year``).

    Raises:
        ValueError: If ``years`` is empty.
        FileNotFoundError: If the file for any year is missing.
    """
    years = list(years)
    if not years:
        # Fail clearly here rather than with a TypeError from sorted(None).
        raise ValueError("years must contain at least one year")

    # 1) Collect file paths + compute the intersection of column names across years
    paths = {}
    shared = None
    for y in years:
        path = os.path.join(base_dir, str(y), filename)
        if not os.path.exists(path):
            raise FileNotFoundError(f"Missing file for {y}: {path}")
        paths[y] = path

        # Read only the header row to get the column names (fast)
        header = set(pd.read_csv(path, nrows=0).columns)
        shared = header if shared is None else shared & header

    # Sorted for a stable order; a pre-existing "year" column is dropped
    # because the per-file year value is added below.
    common_cols = sorted(shared - {"year"})
    ordered_cols = [*common_cols, "year"]

    print(f"Common columns across {len(years)} years: {len(common_cols)}")

    # 2) Read each year using only the common columns, add year, concat
    dfs = []
    for y in years:
        df = pd.read_csv(
            paths[y],
            usecols=common_cols,     # ensures matching schema
            low_memory=False
        )
        df["year"] = y
        # read_csv ignores the order of `usecols` and keeps the file's own
        # column order, so enforce the sorted order explicitly.
        if list(df.columns) != ordered_cols:
            df = df[ordered_cols]
        dfs.append(df)
        print(f"Loaded {y}: {df.shape[0]:,} rows, {df.shape[1]} cols (incl year)")

    master = pd.concat(dfs, ignore_index=True)
    print(f"Master shape: {master.shape[0]:,} rows, {master.shape[1]} cols")

    # 3) Write out (create the destination folder when it lies outside base_dir)
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    master.to_csv(output_path, index=False)
    print(f"Saved -> {output_path}")

    return master, common_cols

# Build the combined dataset with the default folder, years and file name;
# keep the returned DataFrame and the list of shared column names.
master_df, common_cols = build_master_common_columns()

In [None]:
# Size: report the number of rows and columns in the combined DataFrame
print(f"Rows: {master_df.shape[0]:,}")
print(f"Columns: {master_df.shape[1]}")

In [5]:
# Restrict the sample to adults: drop every person aged 18 or younger.
original_count = master_df.shape[0]
master_df = master_df.loc[master_df['AGEP'].gt(18)]
filtered_count = master_df.shape[0]

print(f"Original rows: {original_count:,}")
print(f"Removed (age ≤18): {original_count - filtered_count:,}")
print(f"Final rows (ages 19+): {filtered_count:,}")

# Persist the filtered frame, replacing the unfiltered persons_master.csv.
master_df.to_csv(OUTPUT_PATH, index=False)
print(f"Updated -> {OUTPUT_PATH}")

Original rows: 2,322,183
Removed (age ≤18): 479,789
Final rows (ages 19+): 1,842,394
Updated -> data_persons_ca_1yr/persons_master.csv
