# ACS PUMS Weighted Analysis

This notebook uses **proper survey weights** for population-representative estimates.

- `PWGTP` = Person weight (for person-level estimates)
- `WGTP` = Household weight (for household-level estimates)

**Important**: Unweighted counts only describe the sample (how many records were collected). Weighted estimates approximate the US population that the sample represents.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import warnings

# Put the parent of the current working directory first on the import path
# (presumably where the `analysis` package lives — confirm the notebook is
# always launched from its own folder).
sys.path.insert(0, str(Path.cwd().parent))
# NOTE(review): star import — the *_CODES lookups and the education helpers
# used in later cells presumably come from this module; explicit names would
# make that traceable.
from analysis.code_mappings import *

# Silence every warning for the rest of the session (this also hides pandas
# and matplotlib deprecation notices).
warnings.filterwarnings('ignore')
# Global plot styling shared by all figures below.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
%matplotlib inline

def weighted_median(data, weights):
    """Calculate weighted median.

    Returns the smallest observed value whose cumulative weight reaches half
    of the total weight (no interpolation between observations).

    Parameters
    ----------
    data : array-like
        Observed values. ``None``/NaN entries are ignored.
    weights : array-like
        One weight per observation, same length as ``data``. Observations
        whose weight is ``None``/NaN are ignored.

    Returns
    -------
    float
        The weighted median, or ``np.nan`` when no valid observation remains
        or the valid weights sum to zero.
    """
    # Coerce to float so ints, lists containing None, and pandas Series are all
    # accepted; np.isnan raises on object-dtype arrays.
    values = np.asarray(data, dtype=float)
    wts = np.asarray(weights, dtype=float)
    valid = ~(np.isnan(values) | np.isnan(wts))
    values, wts = values[valid], wts[valid]
    # With zero total weight no value is "half way"; report NaN, as
    # weighted_mean does, instead of silently returning the minimum.
    if values.size == 0 or wts.sum() == 0:
        return np.nan
    order = np.argsort(values)
    values, wts = values[order], wts[order]
    running = np.cumsum(wts)
    cutoff = running[-1] / 2.0
    reached = np.flatnonzero(running >= cutoff)
    # Fall back to the largest value if nothing reaches the cutoff, rather
    # than raising IndexError.
    idx = reached[0] if reached.size else values.size - 1
    return values[idx]

def weighted_mean(data, weights):
    """Calculate weighted mean.

    Parameters
    ----------
    data : array-like
        Observed values. ``None``/NaN entries are ignored.
    weights : array-like
        One weight per observation, same length as ``data``. Observations
        whose weight is ``None``/NaN are ignored.

    Returns
    -------
    float
        ``sum(data * weights) / sum(weights)`` over the valid pairs, or
        ``np.nan`` when no valid pair remains or the weights sum to zero.
    """
    # Coerce to float so lists containing None (object dtype) do not make
    # np.isnan raise.
    values = np.asarray(data, dtype=float)
    wts = np.asarray(weights, dtype=float)
    valid = ~(np.isnan(values) | np.isnan(wts))
    values, wts = values[valid], wts[valid]
    # np.average raises ZeroDivisionError on zero total weight; return NaN.
    if values.size == 0 or wts.sum() == 0:
        return np.nan
    return np.average(values, weights=wts)

def weighted_percentile(data, weights, percentile):
    """Calculate weighted percentile.

    Returns the smallest observed value whose cumulative weight reaches
    ``percentile`` percent of the total weight (no interpolation).

    Parameters
    ----------
    data : array-like
        Observed values. ``None``/NaN entries are ignored.
    weights : array-like
        One weight per observation, same length as ``data``. Observations
        whose weight is ``None``/NaN are ignored.
    percentile : float
        Requested percentile on the 0-100 scale.

    Returns
    -------
    float
        The weighted percentile, or ``np.nan`` when no valid observation
        remains or the valid weights sum to zero.

    Raises
    ------
    ValueError
        If ``percentile`` is outside ``[0, 100]``.
    """
    # Reject out-of-range requests up front: values above 100 used to surface
    # as an opaque IndexError and negative values silently returned the minimum.
    if not 0 <= percentile <= 100:
        raise ValueError(f"percentile must be between 0 and 100, got {percentile!r}")
    # Coerce to float so ints, lists containing None, and pandas Series are all
    # accepted; np.isnan raises on object-dtype arrays.
    values = np.asarray(data, dtype=float)
    wts = np.asarray(weights, dtype=float)
    valid = ~(np.isnan(values) | np.isnan(wts))
    values, wts = values[valid], wts[valid]
    if values.size == 0 or wts.sum() == 0:
        return np.nan
    order = np.argsort(values)
    values, wts = values[order], wts[order]
    running = np.cumsum(wts)
    cutoff = running[-1] * percentile / 100.0
    reached = np.flatnonzero(running >= cutoff)
    # Floating-point rounding in the cutoff can leave nothing "reached" at the
    # very top of the range; fall back to the largest value.
    idx = reached[0] if reached.size else values.size - 1
    return values[idx]

In [None]:
DATA_DIR = Path('../data')
SAMPLE_FRAC = 0.1    # share of records kept so the notebook stays fast
RANDOM_STATE = 42    # fixed seed so the subsample is reproducible

# Household file.
# The subsample keeps only SAMPLE_FRAC of the records, so each kept record has
# to stand in for 1 / SAMPLE_FRAC times as many households. Rescaling the weight
# once here keeps every later weighted total (state totals, weighted counts) on
# the population scale; medians, means and shares are unaffected by the scaling.
# NOTE(review): if replicate-weight columns are ever used for standard errors,
# they would presumably need the same rescaling — confirm before relying on them.
df_hh = pd.read_csv(DATA_DIR / 'pums_household_2023.csv', low_memory=False)
df_hh_sample = (
    df_hh.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
    .assign(wgtp=lambda d: d['wgtp'] / SAMPLE_FRAC)
)
print(f"Household: {len(df_hh):,} records, using {len(df_hh_sample):,} sample")
print(f"  Weighted total: {df_hh_sample['wgtp'].sum():,.0f} households")

# Person file, same treatment with the person weight.
df_person = pd.read_csv(DATA_DIR / 'pums_person_2023.csv', low_memory=False)
df_person_sample = (
    df_person.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
    .assign(pwgtp=lambda d: d['pwgtp'] / SAMPLE_FRAC)
)
print(f"Person: {len(df_person):,} records, using {len(df_person_sample):,} sample")
print(f"  Weighted total: {df_person_sample['pwgtp'].sum():,.0f} persons")

In [None]:
# Attach human-readable label columns to both sample frames.
# Each spec is (new column, source column, Series method, lookup table or function).
hh_label_specs = [
    ('state_name', 'st', 'map', STATE_CODES),
    ('tenure_desc', 'ten', 'map', TENURE_CODES),
    ('building_type', 'bld', 'map', BUILDING_TYPE_CODES),
    ('household_type', 'hht', 'map', HOUSEHOLD_TYPE_CODES),
]
person_label_specs = [
    ('state_name', 'st', 'map', STATE_CODES),
    ('sex_desc', 'sex', 'map', SEX_CODES),
    ('race_desc', 'rac1p', 'map', RACE_CODES),
    ('education_category', 'schl', 'apply', get_education_category),
    ('employment_status', 'esr', 'map', EMPLOYMENT_STATUS_CODES),
    ('class_of_worker', 'cow', 'map', CLASS_OF_WORKER_CODES),
    ('education_years', 'schl', 'apply', estimate_education_years),
]

for frame, specs in ((df_hh_sample, hh_label_specs), (df_person_sample, person_label_specs)):
    for new_col, src_col, method, translator in specs:
        # Columns are added in spec order, directly on the existing frame.
        frame[new_col] = getattr(frame[src_col], method)(translator)

# Experience proxy: age minus estimated years of education minus 6, floored at 0.
years_since_school = df_person_sample['agep'] - df_person_sample['education_years'] - 6
df_person_sample['experience'] = years_since_school.clip(lower=0)
print("Labels added!")

## Weighted Population by State

In [None]:
# Sum the weights per state, keep the 20 largest states, and express in millions.
# NOTE(review): these are sums of whatever weights the sample frames carry; if
# the frames are a subsample with unscaled weights, the totals are understated
# by the sampling fraction — confirm.
state_hh = (
    df_hh_sample.groupby('state_name')['wgtp']
    .sum()
    .sort_values(ascending=True)
    .tail(20)
    / 1e6
)
state_pop = (
    df_person_sample.groupby('state_name')['pwgtp']
    .sum()
    .sort_values(ascending=True)
    .tail(20)
    / 1e6
)

# One horizontal bar panel per series: households on the left, persons on the right.
panels = [
    (state_hh, 'Estimated Households by State (Millions)', 'Households (Millions)'),
    (state_pop, 'Estimated Population by State (Millions)', 'Population (Millions)'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, (series, panel_title, x_label) in zip(axes, panels):
    series.plot(kind='barh', ax=ax, color='steelblue')
    ax.set_title(panel_title, fontsize=12)
    ax.set_xlabel(x_label)

plt.tight_layout()
plt.show()

print(f"\nTotal weighted households: {df_hh_sample['wgtp'].sum():,.0f}")
print(f"Total weighted population: {df_person_sample['pwgtp'].sum():,.0f}")

## Weighted Income Statistics

In [None]:
# Keep only households reporting positive income.
hh_income = df_hh_sample.loc[df_hh_sample['hincp'] > 0].copy()
income, hh_weight = hh_income['hincp'], hh_income['wgtp']

# Weighted summary statistics.
w_median = weighted_median(income, hh_weight)
w_mean = weighted_mean(income, hh_weight)
w_p25, w_p75, w_p90 = (weighted_percentile(income, hh_weight, q) for q in (25, 75, 90))

# Unweighted counterparts, for comparison only.
uw_median, uw_mean = income.median(), income.mean()

# Emit the whole table in a single call; one argument per output line.
print(
    "Household Income Statistics",
    "="*50,
    f"{'Statistic':<20} {'Weighted':>15} {'Unweighted':>15}",
    "-"*50,
    f"{'Median':<20} ${w_median:>14,.0f} ${uw_median:>14,.0f}",
    f"{'Mean':<20} ${w_mean:>14,.0f} ${uw_mean:>14,.0f}",
    f"{'25th Percentile':<20} ${w_p25:>14,.0f}",
    f"{'75th Percentile':<20} ${w_p75:>14,.0f}",
    f"{'90th Percentile':<20} ${w_p90:>14,.0f}",
    sep="\n",
)

## Weighted Income by State

In [None]:
# Calculate weighted median income by state.
# A single groupby pass hands over each state's rows directly, instead of
# re-filtering the whole frame once per state. Rows with a missing state_name
# are skipped by groupby, as they were by the previous dropna().
state_income = [
    {
        'state': state,
        'weighted_median': weighted_median(state_data['hincp'], state_data['wgtp']),
    }
    for state, state_data in hh_income.groupby('state_name')
]

# Ascending sort so the highest-income states end up at the top of the barh chart.
state_income_df = (
    pd.DataFrame(state_income)
    .set_index('state')
    .sort_values('weighted_median', ascending=True)
)

fig, ax = plt.subplots(figsize=(12, 10))
# tail(25) of the ascending frame = the 25 states with the highest median.
state_income_df.tail(25)['weighted_median'].plot(kind='barh', ax=ax, color='steelblue')
ax.set_title('Weighted Median Household Income by State (Top 25)', fontsize=14)
ax.set_xlabel('Median Income ($)')
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))
plt.tight_layout()
plt.show()

## Weighted Tenure Distribution

In [None]:
# Weighted tenure distribution: share of total household weight per tenure type.
tenure_weighted = df_hh_sample.groupby('tenure_desc')['wgtp'].sum()
tenure_pct = tenure_weighted / tenure_weighted.sum() * 100

# Unweighted share of records per tenure type, for comparison.
# value_counts orders by frequency while groupby orders alphabetically; align
# to the weighted order so each category gets the same wedge position and
# colour in both pies.
tenure_unweighted = (
    df_hh_sample['tenure_desc'].value_counts(normalize=True) * 100
).reindex(tenure_pct.index)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Weighted
ax = axes[0]
tenure_pct.plot(kind='pie', ax=ax, autopct='%1.1f%%')
ax.set_title('Tenure Distribution (Weighted)', fontsize=12)
ax.set_ylabel('')

# Unweighted for comparison
ax = axes[1]
tenure_unweighted.plot(kind='pie', ax=ax, autopct='%1.1f%%')
ax.set_title('Tenure Distribution (Unweighted)', fontsize=12)
ax.set_ylabel('')

plt.tight_layout()
plt.show()

## Weighted Wages by Education

In [None]:
# Workers = adults (18+) with positive wage income.
has_wages = df_person_sample['wagp'] > 0
is_adult = df_person_sample['agep'] >= 18
workers = df_person_sample[has_wages & is_adult].copy()
print(f"Analyzing {len(workers):,} workers (weighted: {workers['pwgtp'].sum():,.0f})")

# Display order for education categories, lowest to highest attainment.
edu_order = [
    'Less than HS', 'High School', 'Some College', "Associate's",
    "Bachelor's", "Master's", 'Professional', 'Doctorate',
]

# One summary record per education level that actually occurs among workers.
edu_wages = []
for edu in edu_order:
    edu_data = workers.loc[workers['education_category'] == edu]
    if edu_data.empty:
        continue
    edu_wages.append(dict(
        education=edu,
        weighted_median=weighted_median(edu_data['wagp'], edu_data['pwgtp']),
        unweighted_median=edu_data['wagp'].median(),
        weighted_count=edu_data['pwgtp'].sum(),
    ))

edu_wages_df = pd.DataFrame(edu_wages).set_index('education')

print(
    "\nMedian Wages by Education Level",
    "="*60,
    f"{'Education':<20} {'Weighted':>15} {'Unweighted':>15}",
    "-"*60,
    sep="\n",
)
# The frame was built in edu_order, so walking its rows prints the same sequence.
for edu, row in edu_wages_df.iterrows():
    print(f"{edu:<20} ${row['weighted_median']:>14,.0f} ${row['unweighted_median']:>14,.0f}")

In [None]:
def _dollars_in_thousands(x, p):
    """Tick formatter: render a dollar amount as whole thousands, e.g. $50K."""
    return f'${x/1000:.0f}K'


# Grouped bars: weighted median to the left of each tick, unweighted to the right.
width = 0.35
x = np.arange(len(edu_wages_df))

fig, ax = plt.subplots(figsize=(10, 6))
bars1 = ax.bar(
    x - width/2, edu_wages_df['weighted_median'], width,
    label='Weighted', color='steelblue',
)
bars2 = ax.bar(
    x + width/2, edu_wages_df['unweighted_median'], width,
    label='Unweighted', color='coral', alpha=0.7,
)

ax.set_title('Median Wages by Education: Weighted vs Unweighted', fontsize=14)
ax.set_ylabel('Median Wages ($)')
ax.set_xticks(x)
ax.set_xticklabels(edu_wages_df.index, rotation=45, ha='right')
ax.legend()
ax.yaxis.set_major_formatter(plt.FuncFormatter(_dollars_in_thousands))

plt.tight_layout()
plt.show()

## Weighted Experience-Earnings Profile

In [None]:
# Experience bands. Two details matter here:
#  * include_lowest=True keeps workers with exactly 0 years of experience
#    (experience is floored at 0 when it is created, so that value occurs);
#    pd.cut's default left-open first interval would drop them.
#  * the last edge is +inf so the '40+' band really is open-ended; an upper
#    edge of 50 would drop everyone above 50 years.
exp_bins = [0, 5, 10, 15, 20, 25, 30, 40, np.inf]
exp_labels = ['0-5', '5-10', '10-15', '15-20', '20-25', '25-30', '30-40', '40+']
workers['exp_group'] = pd.cut(
    workers['experience'], bins=exp_bins, labels=exp_labels, include_lowest=True
)

# A band is plotted only when it holds more than this many sample records;
# smaller cells are left as gaps (NaN) in the line.
MIN_CELL_OBS = 10

fig, ax = plt.subplots(figsize=(12, 6))
edu_levels = ['High School', "Bachelor's", "Master's", 'Doctorate']
colors = ['#3498db', '#2ecc71', '#9b59b6', '#e74c3c']

for edu, color in zip(edu_levels, colors):
    edu_data = workers[workers['education_category'] == edu]
    if len(edu_data) > 0:
        exp_wages = []
        for exp in exp_labels:
            exp_data = edu_data[edu_data['exp_group'] == exp]
            if len(exp_data) > MIN_CELL_OBS:
                w_med = weighted_median(exp_data['wagp'], exp_data['pwgtp'])
                exp_wages.append(w_med)
            else:
                exp_wages.append(np.nan)
        ax.plot(exp_labels, exp_wages, marker='o', label=edu, linewidth=2, markersize=8, color=color)

ax.set_title('Weighted Experience-Earnings Profile by Education', fontsize=14)
ax.set_xlabel('Years of Experience')
ax.set_ylabel('Weighted Median Wages ($)')
ax.legend(title='Education Level')
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Weighted Education Premium

In [None]:
# Prime-age workers: positive wages, ages 25 through 54 inclusive.
prime_workers = df_person_sample[
    (df_person_sample['wagp'] > 0) & 
    (df_person_sample['agep'] >= 25) & 
    (df_person_sample['agep'] <= 54)
].copy()

# Calculate weighted median by education (levels with no records are skipped).
edu_premium = []
for edu in edu_order:
    edu_data = prime_workers[prime_workers['education_category'] == edu]
    if len(edu_data) > 0:
        w_med = weighted_median(edu_data['wagp'], edu_data['pwgtp'])
        edu_premium.append({'education': edu, 'weighted_median': w_med})

edu_premium_df = pd.DataFrame(edu_premium).set_index('education')

# Calculate premium vs high school: percent difference of each level's
# weighted median from the High School median (High School itself is 0%).
hs_wage = edu_premium_df.loc['High School', 'weighted_median']
edu_premium_df['premium_pct'] = ((edu_premium_df['weighted_median'] / hs_wage) - 1) * 100

print("Weighted Education Premium (Prime-Age Workers 25-54)")
print("="*60)
print(f"Baseline: High School = ${hs_wage:,.0f}\n")
for edu in edu_order:
    if edu in edu_premium_df.index:
        row = edu_premium_df.loc[edu]
        # The '+' format flag keeps the sign attached to the digits
        # ("   +45%"); a separately printed sign ended up detached from the
        # right-padded number ("+   45%"). The field width is still 6.
        print(f"{edu:<20}: ${row['weighted_median']:>10,.0f}  ({row['premium_pct']:>+6.0f}% vs HS)")

fig, ax = plt.subplots(figsize=(10, 6))
# Red bars for levels earning less than the baseline, green otherwise.
colors = ['#e74c3c' if x < 0 else '#2ecc71' for x in edu_premium_df['premium_pct']]
edu_premium_df['premium_pct'].plot(kind='barh', ax=ax, color=colors)
ax.axvline(0, color='black', linewidth=0.5)
ax.set_title('Weighted Wage Premium Relative to High School', fontsize=14)
ax.set_xlabel('Premium (%)')
plt.tight_layout()
plt.show()

## Weighted Gender Wage Gap

In [None]:
# Weighted median wage for each sex, computed from the same worker frame.
wage_by_sex = {}
for sex_label in ('Male', 'Female'):
    sex_rows = workers.loc[workers['sex_desc'] == sex_label]
    wage_by_sex[sex_label] = weighted_median(sex_rows['wagp'], sex_rows['pwgtp'])
male_wage, female_wage = wage_by_sex['Male'], wage_by_sex['Female']

# Gap expressed as a percentage of the male median.
gap = (male_wage - female_wage) / male_wage * 100

print(
    "Weighted Gender Wage Analysis",
    "="*40,
    f"Male median wage:   ${male_wage:,.0f}",
    f"Female median wage: ${female_wage:,.0f}",
    f"Gender wage gap:    {gap:.1f}%",
    f"\nWomen earn ${female_wage/male_wage:.2f} for every $1 men earn",
    sep="\n",
)

wages = pd.Series(wage_by_sex)

fig, ax = plt.subplots(figsize=(8, 5))
wages.plot(kind='bar', ax=ax, color=['#3498db', '#e74c3c'])
ax.set_title('Weighted Median Wages by Sex', fontsize=14)
ax.set_ylabel('Median Wages ($)')
ax.set_xlabel('')
ax.tick_params(axis='x', rotation=0)
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))

# Value label placed $1,000 above the top of each bar.
for i, v in enumerate(wages):
    ax.text(i, v + 1000, f'${v/1000:.0f}K', ha='center', fontsize=11)

plt.tight_layout()
plt.show()

## Summary: Weighted vs Unweighted Comparison

In [None]:
banner = "="*70
print(banner)
print("WEIGHTED VS UNWEIGHTED COMPARISON")
print(banner)

# Household income: positive-income households only.
hh_pos = df_hh_sample.loc[lambda d: d['hincp'] > 0]
w_hh_med = weighted_median(hh_pos['hincp'], hh_pos['wgtp'])
uw_hh_med = hh_pos['hincp'].median()

print(
    f"\nHousehold Income Median:",
    f"  Weighted:   ${w_hh_med:,.0f}",
    f"  Unweighted: ${uw_hh_med:,.0f}",
    f"  Difference: {(w_hh_med - uw_hh_med) / uw_hh_med * 100:+.1f}%",
    sep="\n",
)

# Bachelor's wage: workers whose education category is Bachelor's.
bach = workers.loc[workers['education_category'] == "Bachelor's"]
w_bach = weighted_median(bach['wagp'], bach['pwgtp'])
uw_bach = bach['wagp'].median()

print(
    f"\nBachelor's Degree Median Wage:",
    f"  Weighted:   ${w_bach:,.0f}",
    f"  Unweighted: ${uw_bach:,.0f}",
    f"  Difference: {(w_bach - uw_bach) / uw_bach * 100:+.1f}%",
    sep="\n",
)

# Closing note on why the two sets of figures differ.
print("\n" + banner)
print("Note: Differences occur because the sample is not perfectly")
print("representative. Weights adjust for sampling design and nonresponse.")
print(banner)