In [None]:
import geopandas as gpd
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

# Root folder holding the Chicago configuration and its input data.
config_folder = Path("../configs/Chicago")

# --- Tabular inputs ---------------------------------------------------------

# Synthetic persons produced by PopulationSim (dtypes inferred by pandas).
popsim_path = config_folder / "data/populationsim/output/synthetic_persons.csv"
popsim_df = pd.read_csv(popsim_path)

# PUMS person records from POLARIS; every column is read as a string.
pums_path = config_folder / "data/populationsim/data/pums_person_chicago.csv"
pums_df = pd.read_csv(pums_path, dtype=str)

# --- Spatial inputs ---------------------------------------------------------

# PUMS PUMA geography (TIGER/Line 2019, 2010 PUMAs, Illinois):
# https://catalog.data.gov/dataset/tiger-line-shapefile-2019-2010-state-illinois-2010-census-public-use-microdata-area-puma-state-
puma_path = config_folder / "data/tl_2019_17_puma10.shp"
puma_gdf = gpd.read_file(puma_path)

# CMAP facility planning areas:
# https://datahub.cmap.illinois.gov/datasets/4834d52310d24e56a0300898a0cb23bc_0/explore
cmap_path = config_folder / "data/Facility_Planning_Areas_2016.shp"
cmap_gdf = gpd.read_file(cmap_path)

In [None]:
# Display the PUMS person table loaded above (all columns were read as strings).
pums_df

In [None]:
# List the dtype of every PUMS column.
pums_df.dtypes

In [None]:
# Keep only the PUMA areas that intersect the CMAP planning boundary, then
# draw them together with the boundary outline as a visual check.

# Reproject the CMAP layer into the PUMA CRS so both layers share coordinates.
# Re-binding the name (rather than inplace=True) keeps the cell safe to re-run.
cmap_gdf = cmap_gdf.to_crs(puma_gdf.crs)

# Dissolve all planning areas into a single geometry. The result is a Polygon
# when the areas are contiguous and a MultiPolygon otherwise.
cmap_boundary = cmap_gdf.geometry.union_all()

# `intersects` also keeps PUMAs that merely touch or straddle the boundary.
puma_in_cmap_gdf = puma_gdf[puma_gdf.geometry.intersects(cmap_boundary)].reset_index(drop=True)

fig, ax = plt.subplots()

puma_in_cmap_gdf.plot(ax=ax, alpha=0.5)

# A MultiPolygon has no `.exterior`, so draw the outer ring of each part.
# A plain Polygon has no `.geoms` and is handled as a single part.
for boundary_part in getattr(cmap_boundary, "geoms", [cmap_boundary]):
    x, y = boundary_part.exterior.xy
    ax.plot(x, y, color="C0")

ax.set_title("PUMAs intersecting the CMAP planning boundary")

plt.show()

In [None]:
# Attach the synthetic-population count to every PUMA inside the CMAP area,
# derive each PUMA's share of the total, and map the counts.

# STPUMA is the state FIPS code ("17", the Illinois shapefile loaded above)
# followed by the PUMA code, stored as an integer key.
puma_in_cmap_gdf["STPUMA"] = ("17" + puma_in_cmap_gdf["PUMACE10"].astype(str)).astype(int)

# Number of synthetic persons per STPUMA.
pop_totals = popsim_df.groupby("STPUMA").size().reset_index(name="POP_COUNT")

# Left merge: PUMAs with no synthetic persons keep POP_COUNT = NaN (and
# therefore SHARE = NaN); they are left blank on the map below.
puma_with_pop_gdf = puma_in_cmap_gdf.merge(pop_totals, how="left", on="STPUMA")

# Share of the total synthetic population living in each PUMA
# (`sum` ignores the NaN rows).
puma_with_pop_gdf["SHARE"] = puma_with_pop_gdf["POP_COUNT"] / puma_with_pop_gdf["POP_COUNT"].sum()

fig, ax = plt.subplots()

puma_with_pop_gdf.plot(ax=ax, alpha=0.75, column="POP_COUNT", legend=True)

# The dissolved boundary may be a MultiPolygon, which has no `.exterior`;
# draw the outer ring of each part instead.
for boundary_part in getattr(cmap_boundary, "geoms", [cmap_boundary]):
    x, y = boundary_part.exterior.xy
    ax.plot(x, y, color="C0")

ax.set_title("Synthetic population count by PUMA")

plt.show()

In [None]:
# Restrict the PUMS records to those whose SERIALNO occurs in the synthetic
# population. The PUMS table was read with dtype=str, so the popsim serial
# numbers are cast to str before matching.
popsim_serialnos = popsim_df["SERIALNO"].astype(str).unique()
is_in_popsim = pums_df["SERIALNO"].isin(popsim_serialnos)

pums_in_popsim_df = pums_df[is_in_popsim]
pums_in_popsim_df

In [None]:

# Draw a sample of PUMS person records stratified by PUMA: each PUMA
# contributes rows in proportion to its share of the synthetic population.

# Target size of the overall sample. The realised size can differ slightly
# because each PUMA's quota is truncated to an integer, floored at 1, and
# capped at the number of records available for that PUMA.
n_sample = 2000

samples = []
skipped_pumas = []  # PUMAs that could not contribute any record
for stpuma, share in zip(puma_with_pop_gdf["STPUMA"], puma_with_pop_gdf["SHARE"]):
    # PUMAs without synthetic persons have SHARE = NaN after the left merge;
    # int(NaN) would raise, so skip them explicitly.
    if pd.isna(share):
        skipped_pumas.append(stpuma)
        continue

    # The PUMS table stores STPUMA as text, hence the str() cast.
    candidates = pums_in_popsim_df[pums_in_popsim_df.STPUMA == str(stpuma)]

    # At least one record per PUMA, but never more than exist: sampling
    # without replacement raises when n exceeds the number of rows.
    n = min(max(int(share * n_sample), 1), len(candidates))
    if n == 0:
        skipped_pumas.append(stpuma)
        continue

    # Fixed seed so the sample is reproducible across runs.
    samples.append(candidates.sample(n=n, replace=False, random_state=0))

if skipped_pumas:
    print(f"No PUMS records sampled for STPUMA: {skipped_pumas}")

pums_sample = pd.concat(samples).reset_index(drop=True)




In [None]:
# Display the concatenated per-PUMA sample of PUMS records.
pums_sample