In [1]:
##taking a look at variables by mapping encoded column names to label

import pandas as pd

p_file = "data_persons_ca_1yr/2023/psam_p06.csv"
all_headers_persons = pd.read_csv(p_file, nrows=0).columns.tolist()

print("Number of columns:", len(all_headers_persons))
print("First 20 columns:", all_headers_persons[:20])
print("Last 10 columns:", all_headers_persons[-10:])


Number of columns: 287
First 20 columns: ['RT', 'SERIALNO', 'DIVISION', 'SPORDER', 'PUMA', 'REGION', 'STATE', 'ADJINC', 'PWGTP', 'AGEP', 'CIT', 'CITWP', 'COW', 'DDRS', 'DEAR', 'DEYE', 'DOUT', 'DPHY', 'DRAT', 'DRATX']
Last 10 columns: ['PWGTP71', 'PWGTP72', 'PWGTP73', 'PWGTP74', 'PWGTP75', 'PWGTP76', 'PWGTP77', 'PWGTP78', 'PWGTP79', 'PWGTP80']


In [2]:
import pandas as pd
import requests
import json

# 1) get column names
p_file = "data_persons_ca_1yr/2023/psam_p06.csv"
cols = pd.read_csv(p_file, nrows=0).columns.tolist()

print("Columns in file:", len(cols))
print("Example columns:", cols[:12])

# 2) fetch Census metadata dictionary
url = "https://api.census.gov/data/2023/acs/acs5/pums/variables.json"
meta = requests.get(url)
meta.raise_for_status()
vardict = meta.json()["variables"]

# 3) map each column -> explanation
rows = []
for c in cols:
    info = vardict.get(c, {})
    values_obj = info.get("values")
    rows.append({
        "column": c,
        "label": info.get("label"),
        "predicateType": info.get("predicateType"),
        "values_json": json.dumps(values_obj) if values_obj is not None else None
    })

df_dict = pd.DataFrame(rows)

# 5) save
df_dict.to_csv("Variable_exploration_data/all_pums_person_2023_data_dictionary.csv", index=False)
df_dict.head(20)


Columns in file: 287
Example columns: ['RT', 'SERIALNO', 'DIVISION', 'SPORDER', 'PUMA', 'REGION', 'STATE', 'ADJINC', 'PWGTP', 'AGEP', 'CIT', 'CITWP']


Unnamed: 0,column,label,predicateType,values_json
0,RT,Record Type,string,"{""item"": {""H"": ""Housing Record or Group Quarte..."
1,SERIALNO,Housing unit/GQ person serial number,string,
2,DIVISION,Division code,,"{""item"": {""7"": ""West South Central (South Regi..."
3,SPORDER,Person number,int,"{""range"": [{""min"": ""1"", ""max"": ""20"", ""descript..."
4,PUMA,Public use microdata area code (PUMA) based on...,string,"{""item"": {""00605"": ""Public use microdata area ..."
5,REGION,Region code,,"{""item"": {""1"": ""Northeast"", ""9"": ""Puerto Rico""..."
6,STATE,State code,,"{""item"": {""47"": ""Tennessee/TN"", ""04"": ""Arizona..."
7,ADJINC,Adjustment factor for income and earnings doll...,int,"{""item"": {""1019518"": ""2023 factor (1.019518 * ..."
8,PWGTP,Person weight,int,"{""range"": [{""min"": ""1"", ""max"": ""9999"", ""descri..."
9,AGEP,Age,int,"{""item"": {""0"": ""Under 1 year""}, ""range"": [{""mi..."


In [3]:
## original labels 

import pandas as pd
import json

# load your existing dictionary
df_dict = pd.read_csv(
    "Variable_exploration_data/all_pums_person_2023_data_dictionary.csv"
)

print("Original dictionary shape:", df_dict.shape)
df_dict.head()


Original dictionary shape: (287, 4)


Unnamed: 0,column,label,predicateType,values_json
0,RT,Record Type,string,"{""item"": {""H"": ""Housing Record or Group Quarte..."
1,SERIALNO,Housing unit/GQ person serial number,string,
2,DIVISION,Division code,,"{""item"": {""7"": ""West South Central (South Regi..."
3,SPORDER,Person number,int,"{""range"": [{""min"": ""1"", ""max"": ""20"", ""descript..."
4,PUMA,Public use microdata area code (PUMA) based on...,string,"{""item"": {""00605"": ""Public use microdata area ..."


In [4]:
## dropping useless columns

def is_useless(col):
    if col.startswith("PWG"):      # person weights + replicates
        return True
    if col.startswith("F"):        # allocation flags
        return True
    if col in ["RT", "SERIALNO", "SPORDER"]:  # pure identifiers
        return True
    return False

df_clean = df_dict.loc[~df_dict["column"].apply(is_useless)].copy()

print("Clean dictionary shape:", df_clean.shape)
df_clean.head()


Clean dictionary shape: (123, 4)


Unnamed: 0,column,label,predicateType,values_json
2,DIVISION,Division code,,"{""item"": {""7"": ""West South Central (South Regi..."
4,PUMA,Public use microdata area code (PUMA) based on...,string,"{""item"": {""00605"": ""Public use microdata area ..."
5,REGION,Region code,,"{""item"": {""1"": ""Northeast"", ""9"": ""Puerto Rico""..."
6,STATE,State code,,"{""item"": {""47"": ""Tennessee/TN"", ""04"": ""Arizona..."
7,ADJINC,Adjustment factor for income and earnings doll...,int,"{""item"": {""1019518"": ""2023 factor (1.019518 * ..."


In [5]:
df_clean.to_csv(
    "Variable_exploration_data/suggested_clean_data_dictionary.csv",
    index=False
)

print("Saved: suggested_clean_data_dictionary.csv")


Saved: suggested_clean_data_dictionary.csv


In [6]:
## 123 variables is still too many, lets try to cut it down further
# lets try to remove: columns with too many null values, columns that ariables that measure the same concept at different resolutions 
import pandas as pd

df = pd.read_csv(
    "Variable_exploration_data/suggested_clean_data_dictionary.csv"
)

print("Starting columns:", df.shape[0])

Starting columns: 123


In [7]:
import pandas as pd

dict_path = "Variable_exploration_data/suggested_clean_data_dictionary.csv"
df_dict = pd.read_csv(dict_path)

candidate_cols = df_dict["column"].tolist()
print("Number of candidate columns:", len(candidate_cols))


Number of candidate columns: 123


In [8]:
p_file = "data_persons_ca_1yr/2023/psam_p06.csv"

df = pd.read_csv(
    p_file,
    usecols=candidate_cols
)

print("Shape of loaded data:", df.shape)


Shape of loaded data: (392318, 123)


In [9]:
null_summary = (
    df.isna()
      .sum()
      .reset_index()
      .rename(columns={"index": "column", 0: "n_null"})
)

null_summary["pct_null"] = null_summary["n_null"] / len(df)

null_summary = null_summary.sort_values("pct_null", ascending=False)

# load dictionary if not already loaded
df_dict = pd.read_csv(
    "Variable_exploration_data/suggested_clean_data_dictionary.csv"
)

# keep only what we need
labels = df_dict[["column", "label"]]

# merge labels into null summary
null_summary = null_summary.merge(
    labels,
    on="column",
    how="left"
)

# reorder columns for readability
null_summary = null_summary[
    ["column", "label", "n_null", "pct_null"]
]


In [10]:
null_summary.head(60)


Unnamed: 0,column,label,n_null,pct_null
0,GCM,Length of time responsible for grandchildren,390290,0.994831
1,DRAT,Veteran service connected disability rating (p...,387851,0.988614
2,GCR,Grandparents responsible for grandchildren,381734,0.973022
3,MLPA,Served September 2001 or later,375293,0.956604
4,MLPJ,Served World War II (December 1941 - December ...,375293,0.956604
5,MLPIK,Peacetime service before July 1950,375293,0.956604
6,MLPFG,Served February 1955 - July 1964,375293,0.956604
7,MLPE,Served Vietnam era (August 1964 - April 1975),375293,0.956604
8,MLPH,Served Korean War (July 1950 - January 1955),375293,0.956604
9,MLPCD,Served May 1975 - July 1990,375293,0.956604


In [11]:
# lets drop any columns that have over 85% null values 

HIGH_NULL_CUTOFF = 0.85

# 1) Load your current clean dictionary (the 123-col list)
df_dict = pd.read_csv(dict_path)
candidate_cols = df_dict["column"].tolist()

# 2) Load ONLY those columns from the person CSV
df = pd.read_csv(p_file, usecols=candidate_cols)

# 3) Compute null rates
null_summary = (
    df.isna()
      .sum()
      .reset_index()
      .rename(columns={"index": "column", 0: "n_null"})
)
null_summary["pct_null"] = null_summary["n_null"] / len(df)

# 4) Identify high-null columns to drop
high_null_cols = null_summary.loc[
    null_summary["pct_null"] >= HIGH_NULL_CUTOFF,
    "column"
].tolist()

print(f"Columns in dict before: {len(candidate_cols)}")
print(f"Dropping (pct_null >= {HIGH_NULL_CUTOFF}): {len(high_null_cols)}")
print("Dropped columns:", high_null_cols)

# 5) Prune dictionary and overwrite the same file
df_dict_pruned = df_dict.loc[~df_dict["column"].isin(high_null_cols)].copy()
df_dict_pruned.to_csv(dict_path, index=False)

print(f"Columns in dict after: {df_dict_pruned.shape[0]}")

Columns in dict before: 123
Dropping (pct_null >= 0.85): 17
Dropped columns: ['DRAT', 'DRATX', 'GCM', 'GCR', 'MLPA', 'MLPB', 'MLPCD', 'MLPE', 'MLPFG', 'MLPH', 'MLPIK', 'MLPJ', 'MIGPUMA', 'MIGSP', 'SFN', 'SFR', 'VPS']
Columns in dict after: 106


In [12]:
## 106 might still be too many... our best might be measuring feature importance or
## manually picking columns 