In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import geopandas as gpd
import contextily as cx

In [2]:
import os
os.getcwd()

'/Users/juandavidcaicedocastro/Dropbox/01_berkeley/22_UrbanSim/github/sensitivy_analysis_carb/notebooks'

In [3]:
hdf = pd.HDFStore("custom_mpo_06197001_model_data.h5")

In [4]:
#Statistics
blocks = hdf['/blocks']
households = hdf['/households']
persons = hdf['/persons']

num_blocks = blocks.shape[0]
num_households = households.shape[0]
num_persons = persons.shape[0]

print("2020 BASELINE SYNTHETIC POPULATION STATS")
print(f"Number of blocks: {num_blocks}")
print(f"Number of households: {num_households}")
print(f"Number of persons: {num_persons}")

2020 BASELINE SYNTHETIC POPULATION STATS
Number of blocks: 108469
Number of households: 2852721
Number of persons: 7448391


In [5]:
growth = pd.concat((hdf['/ect'], hdf['/hct']), axis = 1)
growth[growth.index % 5 == 0]

Unnamed: 0_level_0,total_number_of_jobs,total_number_of_households
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,4289724.0,2607987.0
2015,5041155.0,2743715.0
2020,5261920.0,2852721.0
2025,5364976.0,2927113.0
2030,5468408.0,3014649.0
2035,5575060.0,3096392.0
2040,5685031.0,3173057.0
2045,5798430.0,3238257.0
2050,5915370.0,3287392.0


In [6]:
bins = [-np.inf, 80000, 150000, np.inf]
labels = ['Low Income', 'Middle Income', 'High Income']
income_cat = pd.cut(households.income, bins = bins, labels = labels)
income_cat.value_counts(normalize = True)

Low Income       0.517210
Middle Income    0.266915
High Income      0.215875
Name: income, dtype: float64

For the Year 2019, MEDIAN Income in the San Francisco -Oakland - Berkeley Metropolitan Area was $106,025

San Francisco - Oakland - Berkeley
https://data.census.gov/table?q=ACS+5-Year+Estimates+Data+Profiles&g=310XX00US41860&tid=ACSDP5Y2019.DP03&hidePreview=false

(80\% Mediam Income of the Area) The low-income bracket of 0-$80k is reasonable because it includes households earning less than the MSA median income, and is approximately half of all households in the area. This bracket includes those who may struggle to afford the high cost of living in San Francisco, such as lower-income workers, retirees, and students.

(80% to 150% Mediam Income) The middle-income bracket you propose of $80k-$150k could be seen as capturing households who earn more than the median income, but who may still struggle to afford the high cost of living in San Francisco (80% to 150% median Value). This bracket includes many professionals and middle-class workers who may be able to afford a decent standard of living, but who may face challenges such as high housing costs.

(150% Mediam Housheold Income) The high-income bracket you propose of $150k+ captures households who earn significantly more than the median income, and who are likely to have a relatively high standard of living. This bracket includes many high-earning professionals, executives, and entrepreneurs, as well as those in the top 5-10\% of earners in the area.

In [26]:
# Possible Stats: 

#Income 
bins = [-np.inf, 80000, 150000, np.inf]
labels = ['Low Income', 'Middle Income', 'High Income']
income_cat = pd.cut(households.income, bins = bins, labels = labels)
print('Income')
print(income_cat.sort_values().value_counts(normalize = True))
print('')

#Number of Cars 
print('Car Ownership')
print(households.cars.value_counts(normalize = True))
print("")

#Number of Workers 
print('Household Workers')
print(households['hh_workers'].value_counts(normalize = True))
print("")

# HH Size
print('Household Size')
print(households['hh_size'].value_counts(normalize = True))
print("")

# HH Type
print('Household Type')
print(households['hh_type'].value_counts(normalize = True))
print("")

Income
Low Income       0.517210
Middle Income    0.266915
High Income      0.215875
Name: income, dtype: float64

Car Ownership
2    0.419162
1    0.289937
3    0.150222
0    0.076176
4    0.064502
Name: cars, dtype: float64

Household Workers
one            0.404223
two or more    0.372822
none           0.222955
Name: hh_workers, dtype: float64

Household Size
two             0.313318
one             0.266898
four or more    0.258015
three           0.161769
Name: hh_size, dtype: float64

Household Type
3    0.231966
7    0.221620
4    0.220931
2    0.084200
5    0.082485
6    0.068817
8    0.058585
1    0.031395
Name: hh_type, dtype: float64



In [48]:
#Race 
print("Race")
print(persons.race.value_counts(normalize = True))
print('')

#Hispanic 
print("Hispanic")
print(persons['p_hispanic'].value_counts(normalize = True))
print('')

#Sex 
print("Sex")
print(persons['person_sex'].value_counts(normalize = True))
print('')

#Education 
bins = [-np.inf, 15, 17, 21, np.inf]
labels = ['k-12', 'High School Diploma', 'Some College / Bachelor', 'Graudate']
edu_cat = pd.cut(persons['edu'], bins = bins, labels = labels)
print('Education')
print(edu_cat.sort_values().value_counts(normalize = True))
print('')

#Age
bins = [-np.inf, 20, 30, 40, 50, 60, np.inf]
labels = ['0- 20', '20-30', '30-40', '40-50', '50-60', '60+']
age_cat = pd.cut(persons.age, bins = bins, labels = labels)
print('Age')
print(age_cat.sort_values().value_counts(normalize = True))
print('')

Race
white    0.571237
asian    0.238087
other    0.134274
black    0.056403
Name: race, dtype: float64

Hispanic
no     0.777822
yes    0.222178
Name: p_hispanic, dtype: float64

Sex
female    0.508706
male      0.491294
Name: person_sex, dtype: float64

Education
Some College / Bachelor    0.420280
k-12                       0.316127
High School Diploma        0.137300
Graudate                   0.126293
Name: edu, dtype: float64

Age
0- 20    0.255820
60+      0.183662
50-60    0.152754
40-50    0.149938
30-40    0.135098
20-30    0.122727
Name: age, dtype: float64



Potential Things to add:
- Geographyc Distribution

# Plot Bay Area Residential Units Density Plot

In [None]:
blocks_fpath = 'blocks_sf/block_sfbay.shp'
blocks_geo = gpd.read_file(blocks_fpath)

In [None]:
blocks_merged = blocks_geo.merge(blocks, how = 'inner', left_on = "GEOID", right_index = True)
blocks_merged = blocks_merged.to_crs('EPSG:5070')
taz = blocks_merged.dissolve(by='taz_zone_id', aggfunc='sum')
taz['area_acres'] = taz.geometry.area * 0.000247105 # Area in Acres
taz['residential_density'] = taz['residential_unit_capacity']/taz['area_acres']
taz = taz.to_crs(epsg=3857)

In [None]:
# Densitiy Bins
bins = [0, 6, 30, 200]
labels = ['0-5', '5-30', '30+']
taz['bin'] = pd.cut(taz['residential_density'], bins=bins, labels=labels)

# Bay Area Boundary
bay_area_boundary = taz.dissolve().geometry.boundary

# Plot
ax = taz.plot(figsize = (20,20), column = 'bin', legend = True, cmap='OrRd', alpha = 0.6, categorical = True)
bay_area_boundary.plot(ax = ax, color = 'black')
cx.add_basemap(ax, source=cx.providers.Stamen.TonerLite)
cx.add_basemap(ax, source=cx.providers.Stamen.TonerLabels)

# Customize the legend properties
legend = ax.get_legend()
legend.set_title('Residential Units per Acre', prop={'family': 'serif'})
legend.get_title().set_fontsize(30)
legend.set_bbox_to_anchor((0.45, 0.2))


# legend.align = 'left'
for text in legend.get_texts():
    text.set_fontsize(30)
    text.set_fontfamily('serif')
    text.set_ha('left')
    
ax.set_axis_off()