In [1]:
import numpy as np
import pandas as pd
import math
import zipfile         # a core library for working with zip files
import requests        # third-party library for making HTTP requests
# Notebook display settings: show up to 100 columns and render floats with two decimals.
pd.options.display.max_columns = 100
pd.set_option('display.float_format', '{:.2f}'.format)

# Outline

### Part A. Sorting Households by Income Category
1. Bring in 2016 PUMS data
2. Assign county_id to each PUMA
3. Bring in, clean, and reformat county AMI data
4. Create dummy variables for each income category and assign them to households based on number of people and household income
5. Create count variables for households and people in each income category by multiplying each dummy by the weight variable and by the number of people in the household

### Part B. Counting People by Age Group and Income Category
1. Create count variables for each income category and age group (under 18, adult, senior?)
2. Calculate count variable by multiplying each income category dummy by the number of people within that age category
 
### Part C. Aggregating at PUMA Level
1. Aggregate at PUMA level using groupby

## Part A1. Bring in 2016 PUMS data

In [2]:
#load zipfile from PUMS website
url = "https://www2.census.gov/programs-surveys/acs/data/pums/2016/1-Year/csv_hca.zip"
# Fetch before opening the output file and fail loudly on an HTTP error, so an
# error page is never written to disk as if it were the archive.
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open('csv_hca.zip', 'wb') as f:
    f.write(r.content)

In [3]:
#open zipfile
# Read-only handle on the archive downloaded above; the CSV member is read from it in the next cell.
z = zipfile.ZipFile('csv_hca.zip', mode='r')

In [4]:
#import table to dataframe
# Dtypes to enforce on load. Entries for columns that are not in `columns`
# are filtered out below, so only loaded columns are affected.
variable_types = {"NP":"int64","NOC":"float","BDSP":"float","RMSP":"str","MV":"float","WGTP":"float","TAXP":"float"}
# Subset of PUMS housing-record columns to read from the CSV.
columns = ["NP","R65","NRC","PUMA","RT","BDSP","BLD","RNTP","MRGP","SMP","CONP","TEN","VACS","VALP","TAXP","GRPIP","GRNTP","HINCP","MV","WGTP"]
# Read the CSV member straight out of the zip archive; the context manager
# closes the member handle once pandas has consumed it.
with z.open('ss16hca.csv') as pums_csv:
    pums_df = pd.read_csv(pums_csv,
                          low_memory=False,
                          usecols=columns,
                          # variable_types was previously defined but never passed in
                          dtype={col: kind for col, kind in variable_types.items() if col in columns})
pums_df

Unnamed: 0,RT,PUMA,WGTP,NP,BDSP,BLD,CONP,MRGP,RNTP,SMP,TEN,VACS,VALP,GRNTP,GRPIP,HINCP,MV,NRC,R65,TAXP
0,H,10702,102,2,3.00,2.00,0.00,,,,2.00,,160000.00,,,54000.00,6.00,0.00,2.00,25.00
1,H,6514,122,3,4.00,2.00,0.00,1000.00,,,1.00,,290000.00,,,41500.00,7.00,0.00,0.00,28.00
2,H,3759,58,2,3.00,2.00,0.00,2000.00,,,1.00,,700000.00,,,204000.00,3.00,0.00,0.00,65.00
3,H,5917,73,2,2.00,2.00,300.00,2100.00,,,1.00,,400000.00,,,130000.00,1.00,0.00,0.00,30.00
4,H,3739,52,2,1.00,6.00,,,1100.00,,3.00,,,1150.00,18.00,76000.00,3.00,0.00,0.00,
5,H,5913,120,7,3.00,2.00,0.00,1500.00,,,1.00,,420000.00,,,126100.00,5.00,3.00,2.00,39.00
6,H,7501,72,2,2.00,4.00,0.00,5000.00,,,1.00,,2000000.00,,,131100.00,7.00,0.00,2.00,66.00
7,H,10703,66,4,3.00,2.00,0.00,100.00,,,1.00,,9500.00,,,35000.00,3.00,2.00,0.00,18.00
8,H,7322,0,1,,,,,,,,,,,,,,,,
9,H,8508,55,2,0.00,4.00,,,1600.00,,3.00,,,1690.00,101.00,6700.00,3.00,0.00,0.00,


In [3]:
#rename variables
# Map raw PUMS column codes to descriptive names. Codes not present in the
# frame are ignored by rename.
# NOTE(review): "N65" is not among the columns loaded from the CSV (the usecols
# list has "R65"), so that entry currently renames nothing -- confirm which
# code was intended before relying on "ppl_over_65".
pums_column_names = {
    "RT": "record_type",
    "BDSP": "number_bedrooms",
    "BLD": "units_in_structure",
    "RNTP": "monthly_rent",
    "MRGP": "first_mortgage",
    "SMP": "second_mortgage",
    "CONP": "condo_fee",
    "TEN": "tenure",
    "VACS": "vacancy_status",
    "VALP": "property_value",
    "GRPIP": "gross_rent_pct_of_income",
    "GRNTP": "gross_rent",
    "HINCP": "hh_income",
    "N65": "ppl_over_65",
    "NP": "ppl_in_hh",
    "NRC": "number_related_children",
    "WGTP": "weight",
    "TAXP": "prop_tax",
    "MV": "moved_in",
}
pums_df.rename(columns=pums_column_names, inplace=True)
pums_df

Unnamed: 0,record_type,PUMA,weight,ppl_in_hh,number_bedrooms,units_in_structure,condo_fee,first_mortgage,monthly_rent,second_mortgage,tenure,vacancy_status,property_value,gross_rent,gross_rent_pct_of_income,hh_income,moved_in,number_related_children,R65,prop_tax
0,H,10702,102.00,2,3.00,2.00,0.00,,,,2.00,,160000.00,,,54000.00,6.00,0.00,2.00,25.00
1,H,6514,122.00,3,4.00,2.00,0.00,1000.00,,,1.00,,290000.00,,,41500.00,7.00,0.00,0.00,28.00
2,H,3759,58.00,2,3.00,2.00,0.00,2000.00,,,1.00,,700000.00,,,204000.00,3.00,0.00,0.00,65.00
3,H,5917,73.00,2,2.00,2.00,300.00,2100.00,,,1.00,,400000.00,,,130000.00,1.00,0.00,0.00,30.00
4,H,3739,52.00,2,1.00,6.00,,,1100.00,,3.00,,,1150.00,18.00,76000.00,3.00,0.00,0.00,
5,H,5913,120.00,7,3.00,2.00,0.00,1500.00,,,1.00,,420000.00,,,126100.00,5.00,3.00,2.00,39.00
6,H,7501,72.00,2,2.00,4.00,0.00,5000.00,,,1.00,,2000000.00,,,131100.00,7.00,0.00,2.00,66.00
7,H,10703,66.00,4,3.00,2.00,0.00,100.00,,,1.00,,9500.00,,,35000.00,3.00,2.00,0.00,18.00
8,H,7322,0.00,1,,,,,,,,,,,,,,,,
9,H,8508,55.00,2,0.00,4.00,,,1600.00,,3.00,,,1690.00,101.00,6700.00,3.00,0.00,0.00,


## Part A2. Assign county id to each puma

In [4]:
# load in crosswalk file
# One row per PUMA with up to seven county columns (county1..county7).
crosswalk_df = pd.read_csv("PUMA_County_Crosswalk_v2.csv", sep=",")
crosswalk_df

Unnamed: 0,PUMA,county1,county2,county3,county4,county5,county6,county7
0,101,Alameda CA,,,,,,
1,102,Alameda CA,,,,,,
2,103,Alameda CA,,,,,,
3,104,Alameda CA,,,,,,
4,105,Alameda CA,,,,,,
5,106,Alameda CA,,,,,,
6,107,Alameda CA,,,,,,
7,108,Alameda CA,,,,,,
8,109,Alameda CA,,,,,,
9,110,Alameda CA,,,,,,


In [5]:
# add county name column to puma file
# Left join keeps every PUMS record, including any whose PUMA is missing from the crosswalk.
puma_county_df = pd.merge(pums_df, crosswalk_df, how='left', on="PUMA")
puma_county_df

Unnamed: 0,record_type,PUMA,weight,ppl_in_hh,number_bedrooms,units_in_structure,condo_fee,first_mortgage,monthly_rent,second_mortgage,tenure,vacancy_status,property_value,gross_rent,gross_rent_pct_of_income,hh_income,moved_in,number_related_children,R65,prop_tax,county1,county2,county3,county4,county5,county6,county7
0,H,10702,102.00,2,3.00,2.00,0.00,,,,2.00,,160000.00,,,54000.00,6.00,0.00,2.00,25.00,Tulare CA,,,,,,
1,H,6514,122.00,3,4.00,2.00,0.00,1000.00,,,1.00,,290000.00,,,41500.00,7.00,0.00,0.00,28.00,Riverside CA,,,,,,
2,H,3759,58.00,2,3.00,2.00,0.00,2000.00,,,1.00,,700000.00,,,204000.00,3.00,0.00,0.00,65.00,Los Angeles CA,,,,,,
3,H,5917,73.00,2,2.00,2.00,300.00,2100.00,,,1.00,,400000.00,,,130000.00,1.00,0.00,0.00,30.00,Orange CA,,,,,,
4,H,3739,52.00,2,1.00,6.00,,,1100.00,,3.00,,,1150.00,18.00,76000.00,3.00,0.00,0.00,,Los Angeles CA,,,,,,
5,H,5913,120.00,7,3.00,2.00,0.00,1500.00,,,1.00,,420000.00,,,126100.00,5.00,3.00,2.00,39.00,Orange CA,,,,,,
6,H,7501,72.00,2,2.00,4.00,0.00,5000.00,,,1.00,,2000000.00,,,131100.00,7.00,0.00,2.00,66.00,San Francisco CA,,,,,,
7,H,10703,66.00,4,3.00,2.00,0.00,100.00,,,1.00,,9500.00,,,35000.00,3.00,2.00,0.00,18.00,Tulare CA,,,,,,
8,H,7322,0.00,1,,,,,,,,,,,,,,,,,San Diego CA,,,,,,
9,H,8508,55.00,2,0.00,4.00,,,1600.00,,3.00,,,1690.00,101.00,6700.00,3.00,0.00,0.00,,Santa Clara CA,,,,,,


In [6]:
# remove " CA" from end of each county name
# The primary county goes into a new "County" column (county1 keeps its raw
# value); the remaining county columns are cleaned in place.
puma_county_df['County'] = puma_county_df['county1'].str.replace(r' CA', '')
for extra_county in ['county2', 'county3', 'county4', 'county5', 'county6', 'county7']:
    puma_county_df[extra_county] = puma_county_df[extra_county].str.replace(r' CA', '')
puma_county_df

Unnamed: 0,record_type,PUMA,weight,ppl_in_hh,number_bedrooms,units_in_structure,condo_fee,first_mortgage,monthly_rent,second_mortgage,tenure,vacancy_status,property_value,gross_rent,gross_rent_pct_of_income,hh_income,moved_in,number_related_children,R65,prop_tax,county1,county2,county3,county4,county5,county6,county7,County
0,H,10702,102.00,2,3.00,2.00,0.00,,,,2.00,,160000.00,,,54000.00,6.00,0.00,2.00,25.00,Tulare CA,,,,,,,Tulare
1,H,6514,122.00,3,4.00,2.00,0.00,1000.00,,,1.00,,290000.00,,,41500.00,7.00,0.00,0.00,28.00,Riverside CA,,,,,,,Riverside
2,H,3759,58.00,2,3.00,2.00,0.00,2000.00,,,1.00,,700000.00,,,204000.00,3.00,0.00,0.00,65.00,Los Angeles CA,,,,,,,Los Angeles
3,H,5917,73.00,2,2.00,2.00,300.00,2100.00,,,1.00,,400000.00,,,130000.00,1.00,0.00,0.00,30.00,Orange CA,,,,,,,Orange
4,H,3739,52.00,2,1.00,6.00,,,1100.00,,3.00,,,1150.00,18.00,76000.00,3.00,0.00,0.00,,Los Angeles CA,,,,,,,Los Angeles
5,H,5913,120.00,7,3.00,2.00,0.00,1500.00,,,1.00,,420000.00,,,126100.00,5.00,3.00,2.00,39.00,Orange CA,,,,,,,Orange
6,H,7501,72.00,2,2.00,4.00,0.00,5000.00,,,1.00,,2000000.00,,,131100.00,7.00,0.00,2.00,66.00,San Francisco CA,,,,,,,San Francisco
7,H,10703,66.00,4,3.00,2.00,0.00,100.00,,,1.00,,9500.00,,,35000.00,3.00,2.00,0.00,18.00,Tulare CA,,,,,,,Tulare
8,H,7322,0.00,1,,,,,,,,,,,,,,,,,San Diego CA,,,,,,,San Diego
9,H,8508,55.00,2,0.00,4.00,,,1600.00,,3.00,,,1690.00,101.00,6700.00,3.00,0.00,0.00,,Santa Clara CA,,,,,,,Santa Clara


## Part A3. Bring in, clean, and reformat county AMI data

In [7]:
#bring in 2016 county AMIs file
# Long format: one row per (County, Income_Category) with columns "1".."8".
df = pd.read_csv("2016_AMIs.csv", sep=",")
df

Unnamed: 0,County,Income_Category,1,2,3,4,5,6,7,8
0,Alameda County\n4-Per,Extremely Low,20500,23400,26350,29250,31600,33950,36730,40890
1,Alameda County\n4-Per,Very Low Income,34150,39000,43900,48750,52650,56550,60450,64350
2,Alameda County\n4-Per,Low Income,52650,60150,67650,75150,81200,87200,93200,99200
3,Alameda County\n4-Per,Median Income,65500,74900,84250,93600,101100,108600,116050,123550
4,Alameda County\n4-Per,Moderate Income,78600,89850,101050,112300,121300,130250,139250,148250
5,Alpine County\n4-Pers,Extremely Low,18150,20750,23350,25900,28440,32580,36730,40890
6,Alpine County\n4-Pers,Very Low Income,30250,34600,38900,43200,46700,50150,53600,57050
7,Alpine County\n4-Pers,Low Income,46100,52650,59250,65800,71100,76350,81600,86900
8,Alpine County\n4-Pers,Median Income,66450,75900,85400,94900,102500,110100,117700,125250
9,Alpine County\n4-Pers,Moderate Income,79750,91100,102500,113900,123000,132100,141250,150350


In [8]:
#clean county names variable
# The raw County values end in a " County" + line break + "4-Person" suffix
# that is cut off at varying lengths (e.g. "...4-Per", "...4-Pers"), so every
# truncation is stripped, longest first.
# NOTE(review): assumes the line break in the data is a real newline character
# (matched by the regex "\n") -- confirm against the raw CSV.
county_suffix_patterns = [
    r' County\n4-Person',
    r' County\n4-Perso',
    r' County\n4-Pers',
    r' County\n4-Per',
    r' County\n4-Pe',
    r' County\n4-P',
    r' County\n4-',
    r' County\n4',
    r' County\n',
    r' County\ ',
    r' County',
    r' Count',
    r' Coun',
]
for suffix_pattern in county_suffix_patterns:
    # regex=True is explicit because the patterns use regex escapes; pandas 2.0
    # switched the default to literal matching, under which "\n" and "\ " would
    # be treated as literal backslash sequences and the cleanup would not match.
    df['County'] = df['County'].str.replace(suffix_pattern, '', regex=True)
df

Unnamed: 0,County,Income_Category,1,2,3,4,5,6,7,8
0,Alameda,Extremely Low,20500,23400,26350,29250,31600,33950,36730,40890
1,Alameda,Very Low Income,34150,39000,43900,48750,52650,56550,60450,64350
2,Alameda,Low Income,52650,60150,67650,75150,81200,87200,93200,99200
3,Alameda,Median Income,65500,74900,84250,93600,101100,108600,116050,123550
4,Alameda,Moderate Income,78600,89850,101050,112300,121300,130250,139250,148250
5,Alpine,Extremely Low,18150,20750,23350,25900,28440,32580,36730,40890
6,Alpine,Very Low Income,30250,34600,38900,43200,46700,50150,53600,57050
7,Alpine,Low Income,46100,52650,59250,65800,71100,76350,81600,86900
8,Alpine,Median Income,66450,75900,85400,94900,102500,110100,117700,125250
9,Alpine,Moderate Income,79750,91100,102500,113900,123000,132100,141250,150350


In [9]:
#create df for each income level and rename variables
# "Extremely Low" rows only: columns "1".."8" become ELI_1..ELI_8 and the
# category label is dropped. The original row index is kept.
eli_limit_names = {str(size): "ELI_{}".format(size) for size in range(1, 9)}
eli_df = (
    df.loc[df["Income_Category"] == "Extremely Low"]
      .rename(columns=eli_limit_names)
      .drop(columns="Income_Category")
)
eli_df

Unnamed: 0,County,ELI_1,ELI_2,ELI_3,ELI_4,ELI_5,ELI_6,ELI_7,ELI_8
0,Alameda,20500,23400,26350,29250,31600,33950,36730,40890
5,Alpine,18150,20750,23350,25900,28440,32580,36730,40890
10,Amador,15200,17400,20160,24300,28440,32580,36730,40890
15,Butte,12400,16020,20160,24300,28440,32580,36550,38900
20,Calaveras,14750,16850,20160,24300,28440,32580,36730,40890
25,Colusa,12400,16020,20160,24300,28440,32580,36550,38900
30,Contra Costa,20500,23400,26350,29250,31600,33950,36730,40890
35,Del Norte,12400,16020,20160,24300,28440,32580,36550,38900
40,El Dorado,16000,18300,20600,24300,28440,32580,36730,40890
45,Fresno,12400,16020,20160,24300,28440,32580,36550,38900


In [10]:
# "Very Low Income" rows only: columns "1".."8" become VLI_1..VLI_8 and the
# category label is dropped. The original row index is kept.
vli_limit_names = {str(size): "VLI_{}".format(size) for size in range(1, 9)}
vli_df = (
    df.loc[df["Income_Category"] == "Very Low Income"]
      .rename(columns=vli_limit_names)
      .drop(columns="Income_Category")
)
vli_df

Unnamed: 0,County,VLI_1,VLI_2,VLI_3,VLI_4,VLI_5,VLI_6,VLI_7,VLI_8
1,Alameda,34150,39000,43900,48750,52650,56550,60450,64350
6,Alpine,30250,34600,38900,43200,46700,50150,53600,57050
11,Amador,25350,28950,32550,36150,39050,41950,44850,47750
16,Butte,20650,23600,26550,29450,31850,34200,36550,38900
21,Calaveras,24600,28100,31600,35100,37950,40750,43550,46350
26,Colusa,20650,23600,26550,29450,31850,34200,36550,38900
31,Contra Costa,34150,39000,43900,48750,52650,56550,60450,64350
36,Del Norte,20650,23600,26550,29450,31850,34200,36550,38900
41,El Dorado,26650,30450,34250,38050,41100,44150,47200,50250
46,Fresno,20650,23600,26550,29450,31850,34200,36550,38900


In [11]:
# "Low Income" rows only: columns "1".."8" become LI_1..LI_8 and the
# category label is dropped. The original row index is kept.
li_limit_names = {str(size): "LI_{}".format(size) for size in range(1, 9)}
li_df = (
    df.loc[df["Income_Category"] == "Low Income"]
      .rename(columns=li_limit_names)
      .drop(columns="Income_Category")
)
li_df

Unnamed: 0,County,LI_1,LI_2,LI_3,LI_4,LI_5,LI_6,LI_7,LI_8
2,Alameda,52650,60150,67650,75150,81200,87200,93200,99200
7,Alpine,46100,52650,59250,65800,71100,76350,81600,86900
12,Amador,40500,46300,52100,57850,62500,67150,71750,76400
17,Butte,33000,37700,42400,47100,50900,54650,58450,62200
22,Calaveras,39350,44950,50550,56150,60650,65150,69650,74150
27,Colusa,33000,37700,42400,47100,50900,54650,58450,62200
32,Contra Costa,52650,60150,67650,75150,81200,87200,93200,99200
37,Del Norte,33000,37700,42400,47100,50900,54650,58450,62200
42,El Dorado,42650,48750,54850,60900,65800,70650,75550,80400
47,Fresno,33000,37700,42400,47100,50900,54650,58450,62200


In [12]:
# "Median Income" rows only: columns "1".."8" become MI_1..MI_8 and the
# category label is dropped. The original row index is kept.
mi_limit_names = {str(size): "MI_{}".format(size) for size in range(1, 9)}
mi_df = (
    df.loc[df["Income_Category"] == "Median Income"]
      .rename(columns=mi_limit_names)
      .drop(columns="Income_Category")
)
mi_df

Unnamed: 0,County,MI_1,MI_2,MI_3,MI_4,MI_5,MI_6,MI_7,MI_8
3,Alameda,65500,74900,84250,93600,101100,108600,116050,123550
8,Alpine,66450,75900,85400,94900,102500,110100,117700,125250
13,Amador,50600,57850,65050,72300,78100,83850,89650,95450
18,Butte,41250,47100,53000,58900,63600,68300,73050,77750
23,Calaveras,49150,56150,63200,70200,75800,81450,87050,92650
28,Colusa,41250,47100,53000,58900,63600,68300,73050,77750
33,Contra Costa,65500,74900,84250,93600,101100,108600,116050,123550
38,Del Norte,41250,47100,53000,58900,63600,68300,73050,77750
43,El Dorado,53250,60900,68500,76100,82200,88300,94350,100450
48,Fresno,41250,47100,53000,58900,63600,68300,73050,77750


In [13]:
# "Moderate Income" rows only: columns "1".."8" become MoI_1..MoI_8 and the
# category label is dropped. The original row index is kept.
moi_limit_names = {str(size): "MoI_{}".format(size) for size in range(1, 9)}
moi_df = (
    df.loc[df["Income_Category"] == "Moderate Income"]
      .rename(columns=moi_limit_names)
      .drop(columns="Income_Category")
)
moi_df

Unnamed: 0,County,MoI_1,MoI_2,MoI_3,MoI_4,MoI_5,MoI_6,MoI_7,MoI_8
4,Alameda,78600,89850,101050,112300,121300,130250,139250,148250
9,Alpine,79750,91100,102500,113900,123000,132100,141250,150350
14,Amador,60700,69400,78100,86750,93700,100650,107550,114500
19,Butte,49500,56550,63650,70700,76350,82000,87650,93300
24,Calaveras,58950,67400,75850,84250,91000,97750,104450,111200
29,Colusa,49500,56550,63650,70700,76350,82000,87650,93300
34,Contra Costa,78600,89850,101050,112300,121300,130250,139250,148250
39,Del Norte,49500,56550,63650,70700,76350,82000,87650,93300
44,El Dorado,63900,73050,82150,91300,98600,105900,113200,120500
49,Fresno,49500,56550,63650,70700,76350,82000,87650,93300


In [14]:
#combine dfs
# Start from the ELI frame and left-join each remaining tier on County, giving
# one wide row per county with every tier's threshold columns.
all_amis_df = eli_df
for tier_df in (vli_df, li_df, mi_df, moi_df):
    all_amis_df = all_amis_df.merge(tier_df, how='left', on="County")
all_amis_df

Unnamed: 0,County,ELI_1,ELI_2,ELI_3,ELI_4,ELI_5,ELI_6,ELI_7,ELI_8,VLI_1,VLI_2,VLI_3,VLI_4,VLI_5,VLI_6,VLI_7,VLI_8,LI_1,LI_2,LI_3,LI_4,LI_5,LI_6,LI_7,LI_8,MI_1,MI_2,MI_3,MI_4,MI_5,MI_6,MI_7,MI_8,MoI_1,MoI_2,MoI_3,MoI_4,MoI_5,MoI_6,MoI_7,MoI_8
0,Alameda,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,52650,60150,67650,75150,81200,87200,93200,99200,65500,74900,84250,93600,101100,108600,116050,123550,78600,89850,101050,112300,121300,130250,139250,148250
1,Alpine,18150,20750,23350,25900,28440,32580,36730,40890,30250,34600,38900,43200,46700,50150,53600,57050,46100,52650,59250,65800,71100,76350,81600,86900,66450,75900,85400,94900,102500,110100,117700,125250,79750,91100,102500,113900,123000,132100,141250,150350
2,Amador,15200,17400,20160,24300,28440,32580,36730,40890,25350,28950,32550,36150,39050,41950,44850,47750,40500,46300,52100,57850,62500,67150,71750,76400,50600,57850,65050,72300,78100,83850,89650,95450,60700,69400,78100,86750,93700,100650,107550,114500
3,Butte,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300
4,Calaveras,14750,16850,20160,24300,28440,32580,36730,40890,24600,28100,31600,35100,37950,40750,43550,46350,39350,44950,50550,56150,60650,65150,69650,74150,49150,56150,63200,70200,75800,81450,87050,92650,58950,67400,75850,84250,91000,97750,104450,111200
5,Colusa,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300
6,Contra Costa,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,52650,60150,67650,75150,81200,87200,93200,99200,65500,74900,84250,93600,101100,108600,116050,123550,78600,89850,101050,112300,121300,130250,139250,148250
7,Del Norte,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300
8,El Dorado,16000,18300,20600,24300,28440,32580,36730,40890,26650,30450,34250,38050,41100,44150,47200,50250,42650,48750,54850,60900,65800,70650,75550,80400,53250,60900,68500,76100,82200,88300,94350,100450,63900,73050,82150,91300,98600,105900,113200,120500
9,Fresno,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300


## Part A4. Create dummy variables for each income category and assign to households by number of people and hh income

In [15]:
# merge datasets
# Attach the county's income thresholds to every household record. The join key
# is "County", which was derived from county1 only.
hh_df = pd.merge(puma_county_df, all_amis_df, how='left', on="County")
hh_df

Unnamed: 0,record_type,PUMA,weight,ppl_in_hh,number_bedrooms,units_in_structure,condo_fee,first_mortgage,monthly_rent,second_mortgage,tenure,vacancy_status,property_value,gross_rent,gross_rent_pct_of_income,hh_income,moved_in,number_related_children,R65,prop_tax,county1,county2,county3,county4,county5,county6,county7,County,ELI_1,ELI_2,ELI_3,ELI_4,ELI_5,ELI_6,ELI_7,ELI_8,VLI_1,VLI_2,VLI_3,VLI_4,VLI_5,VLI_6,VLI_7,VLI_8,LI_1,LI_2,LI_3,LI_4,LI_5,LI_6,LI_7,LI_8,MI_1,MI_2,MI_3,MI_4,MI_5,MI_6,MI_7,MI_8,MoI_1,MoI_2,MoI_3,MoI_4,MoI_5,MoI_6,MoI_7,MoI_8
0,H,10702,102.00,2,3.00,2.00,0.00,,,,2.00,,160000.00,,,54000.00,6.00,0.00,2.00,25.00,Tulare CA,,,,,,,Tulare,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300
1,H,6514,122.00,3,4.00,2.00,0.00,1000.00,,,1.00,,290000.00,,,41500.00,7.00,0.00,0.00,28.00,Riverside CA,,,,,,,Riverside,14100,16100,20160,24300,28440,32580,36730,40890,23450,26800,30150,33500,36200,38900,41550,44250,37550,42900,48250,53600,57900,62200,66500,70800,45500,52000,58500,65000,70200,75400,80600,85800,54600,62400,70200,78000,84250,90500,96700,102950
2,H,3759,58.00,2,3.00,2.00,0.00,2000.00,,,1.00,,700000.00,,,204000.00,3.00,0.00,0.00,65.00,Los Angeles CA,,,,,,,Los Angeles,18250,20850,23450,26050,28440,32580,36730,40890,30400,34750,39100,43400,46900,50350,53850,57300,48650,55600,62550,69450,75050,80600,86150,91700,45350,51850,58300,64800,70000,75150,80350,85550,54450,62200,70000,77750,83950,90200,96400,102650
3,H,5917,73.00,2,2.00,2.00,300.00,2100.00,,,1.00,,400000.00,,,130000.00,1.00,0.00,0.00,30.00,Orange CA,,,,,,,Orange,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,54600,62400,70200,78000,84250,90500,96750,103000,61050,69750,78500,87200,94200,101150,108150,115100,73250,83700,94200,104650,113000,121400,129750,138150
4,H,3739,52.00,2,1.00,6.00,,,1100.00,,3.00,,,1150.00,18.00,76000.00,3.00,0.00,0.00,,Los Angeles CA,,,,,,,Los Angeles,18250,20850,23450,26050,28440,32580,36730,40890,30400,34750,39100,43400,46900,50350,53850,57300,48650,55600,62550,69450,75050,80600,86150,91700,45350,51850,58300,64800,70000,75150,80350,85550,54450,62200,70000,77750,83950,90200,96400,102650
5,H,5913,120.00,7,3.00,2.00,0.00,1500.00,,,1.00,,420000.00,,,126100.00,5.00,3.00,2.00,39.00,Orange CA,,,,,,,Orange,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,54600,62400,70200,78000,84250,90500,96750,103000,61050,69750,78500,87200,94200,101150,108150,115100,73250,83700,94200,104650,113000,121400,129750,138150
6,H,7501,72.00,2,2.00,4.00,0.00,5000.00,,,1.00,,2000000.00,,,131100.00,7.00,0.00,2.00,66.00,San Francisco CA,,,,,,,San Francisco,25850,29550,33250,36900,39900,42850,45800,48750,43050,49200,55350,61500,66450,71350,76300,81200,68950,78800,88650,98500,106400,114300,122150,130050,75400,86150,96950,107700,116300,124950,133550,142150,90500,103400,116350,129250,139600,149950,160250,170600
7,H,10703,66.00,4,3.00,2.00,0.00,100.00,,,1.00,,9500.00,,,35000.00,3.00,2.00,0.00,18.00,Tulare CA,,,,,,,Tulare,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300
8,H,7322,0.00,1,,,,,,,,,,,,,,,,,San Diego CA,,,,,,,San Diego,17850,20400,22950,25500,28440,32580,36730,40890,29750,34000,38250,42500,45900,49300,52700,56100,47600,54400,61200,68000,73450,78900,84350,89800,53150,60700,68300,75900,81950,88050,94100,100200,63750,72900,82000,91100,98400,105700,112950,120250
9,H,8508,55.00,2,0.00,4.00,,,1600.00,,3.00,,,1690.00,101.00,6700.00,3.00,0.00,0.00,,Santa Clara CA,,,,,,,Santa Clara,23450,26800,30150,33500,36200,38900,41550,44250,39100,44650,50250,55800,60300,64750,69200,73700,59400,67900,76400,84900,91650,98450,105250,112050,74950,85700,96400,107100,115650,124250,132800,141350,89950,102800,115650,128500,138800,149050,159350,169600


In [16]:
# Flag every household with the income category it falls into (1 = in category, 0 = not).
#
# Each category is a band of household income bounded by the county income limits
# for the household's size (columns named <PREFIX>_<household size>, sizes 1-8):
#   ELI: income <= ELI limit
#   VLI: ELI limit < income <= VLI limit
#   LI : VLI limit < income <= LI limit
#   MoI: LI limit  < income <= MoI limit
#   HI : income > MoI limit
#
# NOTE(review): only household sizes 1 through 8 are matched, and comparisons against a
# missing income are False, so households with more than 8 people or with no hh_income
# get 0 in all five columns.
household_size = hh_df["ppl_in_hh"]
household_income = hh_df["hh_income"]

# (output column, prefix of the exclusive lower limit, prefix of the inclusive upper limit)
income_bands = [
    ("ELI_count", None, "ELI"),
    ("VLI_count", "ELI", "VLI"),
    ("LI_count", "VLI", "LI"),
    ("MoI_count", "LI", "MoI"),
    ("HI_count", "MoI", None),
]

for flag_column, lower_prefix, upper_prefix in income_bands:
    per_size_matches = []
    for size in range(1, 9):
        match = household_size == size
        if upper_prefix is not None:
            match = match & (household_income <= hh_df["{}_{}".format(upper_prefix, size)])
        if lower_prefix is not None:
            match = match & (household_income > hh_df["{}_{}".format(lower_prefix, size)])
        per_size_matches.append(match)
    # A household is in the band if it satisfies the test for its own size.
    in_band = np.logical_or.reduce(per_size_matches)
    hh_df[flag_column] = np.where(in_band, 1, 0)
hh_df

Unnamed: 0,record_type,PUMA,weight,ppl_in_hh,number_bedrooms,units_in_structure,condo_fee,first_mortgage,monthly_rent,second_mortgage,tenure,vacancy_status,property_value,gross_rent,gross_rent_pct_of_income,hh_income,moved_in,number_related_children,R65,prop_tax,county1,county2,county3,county4,county5,county6,county7,County,ELI_1,ELI_2,ELI_3,ELI_4,ELI_5,ELI_6,ELI_7,ELI_8,VLI_1,VLI_2,VLI_3,VLI_4,VLI_5,VLI_6,VLI_7,VLI_8,LI_1,LI_2,LI_3,LI_4,LI_5,LI_6,LI_7,LI_8,MI_1,MI_2,MI_3,MI_4,MI_5,MI_6,MI_7,MI_8,MoI_1,MoI_2,MoI_3,MoI_4,MoI_5,MoI_6,MoI_7,MoI_8,ELI_count,VLI_count,LI_count,MoI_count,HI_count
0,H,10702,102.00,2,3.00,2.00,0.00,,,,2.00,,160000.00,,,54000.00,6.00,0.00,2.00,25.00,Tulare CA,,,,,,,Tulare,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300,0,0,0,1,0
1,H,6514,122.00,3,4.00,2.00,0.00,1000.00,,,1.00,,290000.00,,,41500.00,7.00,0.00,0.00,28.00,Riverside CA,,,,,,,Riverside,14100,16100,20160,24300,28440,32580,36730,40890,23450,26800,30150,33500,36200,38900,41550,44250,37550,42900,48250,53600,57900,62200,66500,70800,45500,52000,58500,65000,70200,75400,80600,85800,54600,62400,70200,78000,84250,90500,96700,102950,0,0,1,0,0
2,H,3759,58.00,2,3.00,2.00,0.00,2000.00,,,1.00,,700000.00,,,204000.00,3.00,0.00,0.00,65.00,Los Angeles CA,,,,,,,Los Angeles,18250,20850,23450,26050,28440,32580,36730,40890,30400,34750,39100,43400,46900,50350,53850,57300,48650,55600,62550,69450,75050,80600,86150,91700,45350,51850,58300,64800,70000,75150,80350,85550,54450,62200,70000,77750,83950,90200,96400,102650,0,0,0,0,1
3,H,5917,73.00,2,2.00,2.00,300.00,2100.00,,,1.00,,400000.00,,,130000.00,1.00,0.00,0.00,30.00,Orange CA,,,,,,,Orange,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,54600,62400,70200,78000,84250,90500,96750,103000,61050,69750,78500,87200,94200,101150,108150,115100,73250,83700,94200,104650,113000,121400,129750,138150,0,0,0,0,1
4,H,3739,52.00,2,1.00,6.00,,,1100.00,,3.00,,,1150.00,18.00,76000.00,3.00,0.00,0.00,,Los Angeles CA,,,,,,,Los Angeles,18250,20850,23450,26050,28440,32580,36730,40890,30400,34750,39100,43400,46900,50350,53850,57300,48650,55600,62550,69450,75050,80600,86150,91700,45350,51850,58300,64800,70000,75150,80350,85550,54450,62200,70000,77750,83950,90200,96400,102650,0,0,0,0,1
5,H,5913,120.00,7,3.00,2.00,0.00,1500.00,,,1.00,,420000.00,,,126100.00,5.00,3.00,2.00,39.00,Orange CA,,,,,,,Orange,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,54600,62400,70200,78000,84250,90500,96750,103000,61050,69750,78500,87200,94200,101150,108150,115100,73250,83700,94200,104650,113000,121400,129750,138150,0,0,0,1,0
6,H,7501,72.00,2,2.00,4.00,0.00,5000.00,,,1.00,,2000000.00,,,131100.00,7.00,0.00,2.00,66.00,San Francisco CA,,,,,,,San Francisco,25850,29550,33250,36900,39900,42850,45800,48750,43050,49200,55350,61500,66450,71350,76300,81200,68950,78800,88650,98500,106400,114300,122150,130050,75400,86150,96950,107700,116300,124950,133550,142150,90500,103400,116350,129250,139600,149950,160250,170600,0,0,0,0,1
7,H,10703,66.00,4,3.00,2.00,0.00,100.00,,,1.00,,9500.00,,,35000.00,3.00,2.00,0.00,18.00,Tulare CA,,,,,,,Tulare,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300,0,0,1,0,0
8,H,7322,0.00,1,,,,,,,,,,,,,,,,,San Diego CA,,,,,,,San Diego,17850,20400,22950,25500,28440,32580,36730,40890,29750,34000,38250,42500,45900,49300,52700,56100,47600,54400,61200,68000,73450,78900,84350,89800,53150,60700,68300,75900,81950,88050,94100,100200,63750,72900,82000,91100,98400,105700,112950,120250,0,0,0,0,0
9,H,8508,55.00,2,0.00,4.00,,,1600.00,,3.00,,,1690.00,101.00,6700.00,3.00,0.00,0.00,,Santa Clara CA,,,,,,,Santa Clara,23450,26800,30150,33500,36200,38900,41550,44250,39100,44650,50250,55800,60300,64750,69200,73700,59400,67900,76400,84900,91650,98450,105250,112050,74950,85700,96400,107100,115650,124250,132800,141350,89950,102800,115650,128500,138800,149050,159350,169600,1,0,0,0,0


## Part A5. Create count variables for households and people in each income category by multiplying each dummy by the weight variable and the number of people in the household

In [17]:
# Turn each 0/1 income-category flag into weighted 2016 totals:
#   16_<category>_hh_count  = flag * household weight
#   16_<category>_ppl_count = flag * people in household * household weight
for category in ("ELI", "VLI", "LI", "MoI", "HI"):
    category_flag = hh_df[category + "_count"]
    hh_df["16_" + category + "_hh_count"] = category_flag * hh_df["weight"]
    hh_df["16_" + category + "_ppl_count"] = category_flag * hh_df["ppl_in_hh"] * hh_df["weight"]
hh_df

Unnamed: 0,record_type,PUMA,weight,ppl_in_hh,number_bedrooms,units_in_structure,condo_fee,first_mortgage,monthly_rent,second_mortgage,tenure,vacancy_status,property_value,gross_rent,gross_rent_pct_of_income,hh_income,moved_in,number_related_children,R65,prop_tax,county1,county2,county3,county4,county5,county6,county7,County,ELI_1,ELI_2,ELI_3,ELI_4,ELI_5,ELI_6,ELI_7,ELI_8,VLI_1,VLI_2,VLI_3,VLI_4,VLI_5,VLI_6,VLI_7,VLI_8,LI_1,LI_2,LI_3,LI_4,LI_5,LI_6,LI_7,LI_8,MI_1,MI_2,MI_3,MI_4,MI_5,MI_6,MI_7,MI_8,MoI_1,MoI_2,MoI_3,MoI_4,MoI_5,MoI_6,MoI_7,MoI_8,ELI_count,VLI_count,LI_count,MoI_count,HI_count,16_ELI_hh_count,16_ELI_ppl_count,16_VLI_hh_count,16_VLI_ppl_count,16_LI_hh_count,16_LI_ppl_count,16_MoI_hh_count,16_MoI_ppl_count,16_HI_hh_count,16_HI_ppl_count
0,H,10702,102.00,2,3.00,2.00,0.00,,,,2.00,,160000.00,,,54000.00,6.00,0.00,2.00,25.00,Tulare CA,,,,,,,Tulare,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300,0,0,0,1,0,0.00,0.00,0.00,0.00,0.00,0.00,102.00,204.00,0.00,0.00
1,H,6514,122.00,3,4.00,2.00,0.00,1000.00,,,1.00,,290000.00,,,41500.00,7.00,0.00,0.00,28.00,Riverside CA,,,,,,,Riverside,14100,16100,20160,24300,28440,32580,36730,40890,23450,26800,30150,33500,36200,38900,41550,44250,37550,42900,48250,53600,57900,62200,66500,70800,45500,52000,58500,65000,70200,75400,80600,85800,54600,62400,70200,78000,84250,90500,96700,102950,0,0,1,0,0,0.00,0.00,0.00,0.00,122.00,366.00,0.00,0.00,0.00,0.00
2,H,3759,58.00,2,3.00,2.00,0.00,2000.00,,,1.00,,700000.00,,,204000.00,3.00,0.00,0.00,65.00,Los Angeles CA,,,,,,,Los Angeles,18250,20850,23450,26050,28440,32580,36730,40890,30400,34750,39100,43400,46900,50350,53850,57300,48650,55600,62550,69450,75050,80600,86150,91700,45350,51850,58300,64800,70000,75150,80350,85550,54450,62200,70000,77750,83950,90200,96400,102650,0,0,0,0,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,58.00,116.00
3,H,5917,73.00,2,2.00,2.00,300.00,2100.00,,,1.00,,400000.00,,,130000.00,1.00,0.00,0.00,30.00,Orange CA,,,,,,,Orange,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,54600,62400,70200,78000,84250,90500,96750,103000,61050,69750,78500,87200,94200,101150,108150,115100,73250,83700,94200,104650,113000,121400,129750,138150,0,0,0,0,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,73.00,146.00
4,H,3739,52.00,2,1.00,6.00,,,1100.00,,3.00,,,1150.00,18.00,76000.00,3.00,0.00,0.00,,Los Angeles CA,,,,,,,Los Angeles,18250,20850,23450,26050,28440,32580,36730,40890,30400,34750,39100,43400,46900,50350,53850,57300,48650,55600,62550,69450,75050,80600,86150,91700,45350,51850,58300,64800,70000,75150,80350,85550,54450,62200,70000,77750,83950,90200,96400,102650,0,0,0,0,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,52.00,104.00
5,H,5913,120.00,7,3.00,2.00,0.00,1500.00,,,1.00,,420000.00,,,126100.00,5.00,3.00,2.00,39.00,Orange CA,,,,,,,Orange,20500,23400,26350,29250,31600,33950,36730,40890,34150,39000,43900,48750,52650,56550,60450,64350,54600,62400,70200,78000,84250,90500,96750,103000,61050,69750,78500,87200,94200,101150,108150,115100,73250,83700,94200,104650,113000,121400,129750,138150,0,0,0,1,0,0.00,0.00,0.00,0.00,0.00,0.00,120.00,840.00,0.00,0.00
6,H,7501,72.00,2,2.00,4.00,0.00,5000.00,,,1.00,,2000000.00,,,131100.00,7.00,0.00,2.00,66.00,San Francisco CA,,,,,,,San Francisco,25850,29550,33250,36900,39900,42850,45800,48750,43050,49200,55350,61500,66450,71350,76300,81200,68950,78800,88650,98500,106400,114300,122150,130050,75400,86150,96950,107700,116300,124950,133550,142150,90500,103400,116350,129250,139600,149950,160250,170600,0,0,0,0,1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,72.00,144.00
7,H,10703,66.00,4,3.00,2.00,0.00,100.00,,,1.00,,9500.00,,,35000.00,3.00,2.00,0.00,18.00,Tulare CA,,,,,,,Tulare,12400,16020,20160,24300,28440,32580,36550,38900,20650,23600,26550,29450,31850,34200,36550,38900,33000,37700,42400,47100,50900,54650,58450,62200,41250,47100,53000,58900,63600,68300,73050,77750,49500,56550,63650,70700,76350,82000,87650,93300,0,0,1,0,0,0.00,0.00,0.00,0.00,66.00,264.00,0.00,0.00,0.00,0.00
8,H,7322,0.00,1,,,,,,,,,,,,,,,,,San Diego CA,,,,,,,San Diego,17850,20400,22950,25500,28440,32580,36730,40890,29750,34000,38250,42500,45900,49300,52700,56100,47600,54400,61200,68000,73450,78900,84350,89800,53150,60700,68300,75900,81950,88050,94100,100200,63750,72900,82000,91100,98400,105700,112950,120250,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
9,H,8508,55.00,2,0.00,4.00,,,1600.00,,3.00,,,1690.00,101.00,6700.00,3.00,0.00,0.00,,Santa Clara CA,,,,,,,Santa Clara,23450,26800,30150,33500,36200,38900,41550,44250,39100,44650,50250,55800,60300,64750,69200,73700,59400,67900,76400,84900,91650,98450,105250,112050,74950,85700,96400,107100,115650,124250,132800,141350,89950,102800,115650,128500,138800,149050,159350,169600,1,0,0,0,0,55.00,110.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


## Part C1. Aggregating at the PUMA level

In [18]:
# Total the weighted household and person counts for each PUMA.
#
# Only the ten count columns are aggregated. Summing the whole frame would also
# aggregate the text columns (record type, county names) and the income-limit columns;
# how pandas treats text columns in a groupby sum differs between versions, and none
# of those columns are wanted in the result anyway.
count_columns = [
    "16_ELI_hh_count", "16_ELI_ppl_count",
    "16_VLI_hh_count", "16_VLI_ppl_count",
    "16_LI_hh_count", "16_LI_ppl_count",
    "16_MoI_hh_count", "16_MoI_ppl_count",
    "16_HI_hh_count", "16_HI_ppl_count",
]

# Result: one row per PUMA (index), one column per count, in the order listed above.
puma_counts_df = hh_df.groupby("PUMA")[count_columns].sum()
puma_counts_df

Unnamed: 0_level_0,16_ELI_hh_count,16_ELI_ppl_count,16_VLI_hh_count,16_VLI_ppl_count,16_LI_hh_count,16_LI_ppl_count,16_MoI_hh_count,16_MoI_ppl_count,16_HI_hh_count,16_HI_ppl_count
PUMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
101,9835.00,16874.00,5387.00,10283.00,6655.00,14414.00,6714.00,14505.00,25370.00,67245.00
102,19309.00,40228.00,8540.00,17615.00,9890.00,20778.00,12278.00,27405.00,25207.00,54149.00
103,4748.00,7642.00,3797.00,6482.00,3936.00,8783.00,6279.00,14216.00,34055.00,84439.00
104,12566.00,33425.00,7530.00,23486.00,6165.00,22226.00,6973.00,20124.00,4696.00,14221.00
105,8545.00,16888.00,7510.00,17554.00,10225.00,24688.00,12962.00,32742.00,27228.00,73907.00
106,5951.00,13077.00,5171.00,11023.00,8789.00,25177.00,9833.00,26885.00,19551.00,54832.00
107,7120.00,14746.00,5833.00,19446.00,7324.00,24112.00,9926.00,33755.00,15814.00,50959.00
108,3515.00,7796.00,3627.00,10664.00,5193.00,16527.00,7710.00,27501.00,23270.00,76270.00
109,4705.00,9044.00,3822.00,8125.00,7601.00,22733.00,11365.00,35586.00,35466.00,113279.00
110,6725.00,12798.00,4697.00,9184.00,6942.00,16655.00,12634.00,38417.00,50842.00,152470.00


In [19]:
# Write the per-PUMA household/person counts to a CSV file in the working directory
# (the PUMA index is written as the first column).
puma_counts_path = "2016_ppl_hh_puma_counts.csv"
puma_counts_df.to_csv(puma_counts_path)