# Zip code & Tract merging Notebook

Adapted from [Sarah Wang's notebook](https://github.com/jcweaver/broadband-capstone/blob/main/notebooks/census_broadband_merge.ipynb)

This notebook does a bit of data cleaning to pad all zipcode, county and tract IDs with 0s where needed.

Then this notebook calculates the weighted average of each Census statistic for each zipcode and appends these columns to the broadband dataset.

In [110]:
## imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import os

## Reading in data & some cleaning to pad with zeroes

In [111]:
# get broadband data
broadband_data = pd.read_csv("../data/merged_broadband.csv", index_col=0)
broadband_data.head()
# zipcode

Unnamed: 0,Zip,Population,WiredCount_2020,Fwcount_2020,AllProviderCount_2020,Wired25_3_2020,Wired100_3_2020,All25_3_2020,All100_3,TestCount,...,All25_3_2015,All100_3.1,Total_Enrolled_Households,ST,COUNTY NAME,COUNTY ID,BROADBAND USAGE,ERROR RANGE (MAE)(+/-),ERROR RANGE (95%)(+/-),MSD
0,29639,1742.0,3.0,0.0,8.0,3.0,3.0,5.0,3.0,163.0,...,3.0,3.0,21.0,SC,Abbeville,45001,0.948,0.034,0.11,0.002
1,29620,12934.0,6.0,0.0,11.0,5.0,3.0,7.0,3.0,2536.0,...,3.0,3.0,542.0,SC,Abbeville,45001,0.398,0.002,0.007,0.0
2,29659,,,,,,,,,,...,,,,SC,Abbeville,45001,0.206,0.152,0.608,0.043
3,29638,2944.0,6.0,1.0,13.0,4.0,4.0,6.0,4.0,272.0,...,2.0,2.0,68.0,SC,Abbeville,45001,0.369,0.01,0.031,-0.001
4,29628,2759.0,4.0,0.0,8.0,3.0,2.0,5.0,2.0,100.0,...,3.0,3.0,102.0,SC,Abbeville,45001,0.221,0.014,0.043,0.0


In [112]:
broadband_data.Zip.value_counts()[broadband_data.Zip.value_counts() > 1]

56160    2
99644    2
55003    2
99563    2
56446    2
        ..
56347    2
56528    2
55071    2
56389    2
99550    2
Name: Zip, Length: 82, dtype: int64

In [113]:
broadband_data[broadband_data.Zip=="56318"][['ST', 'COUNTY NAME', 'COUNTY ID',
       'BROADBAND USAGE', 'ERROR RANGE (MAE)(+/-)', 'ERROR RANGE (95%)(+/-)',
       'MSD', ]]

Unnamed: 0,ST,COUNTY NAME,COUNTY ID,BROADBAND USAGE,ERROR RANGE (MAE)(+/-),ERROR RANGE (95%)(+/-),MSD


In [114]:
broadband_data[broadband_data.Zip=="99550"][['ST', 'COUNTY NAME', 'COUNTY ID',
       'BROADBAND USAGE', 'ERROR RANGE (MAE)(+/-)', 'ERROR RANGE (95%)(+/-)',
       'MSD', ]]

Unnamed: 0,ST,COUNTY NAME,COUNTY ID,BROADBAND USAGE,ERROR RANGE (MAE)(+/-),ERROR RANGE (95%)(+/-),MSD


In [115]:
## Dropping the County ID column based on some EDA is was sometimes wrong
broadband_data = broadband_data.drop(columns=["COUNTY ID"])
broadband_data = broadband_data.drop_duplicates()

In [116]:
## Notice how ME states only 4 digits but zipcodes should be 5 digits
broadband_data[broadband_data.ST=="ME"]

Unnamed: 0,Zip,Population,WiredCount_2020,Fwcount_2020,AllProviderCount_2020,Wired25_3_2020,Wired100_3_2020,All25_3_2020,All100_3,TestCount,...,Wired100_3_2015,All25_3_2015,All100_3.1,Total_Enrolled_Households,ST,COUNTY NAME,BROADBAND USAGE,ERROR RANGE (MAE)(+/-),ERROR RANGE (95%)(+/-),MSD
719,4236,4350.0,2.0,2.0,10.0,2.0,1.0,6.0,2.0,1414.0,...,0.0,1.0,0.0,50.0,ME,Androscoggin,0.236,0.010,0.031,-0.001
720,4258,2617.0,2.0,1.0,9.0,1.0,1.0,4.0,1.0,24.0,...,0.0,1.0,0.0,26.0,ME,Androscoggin,0.170,0.014,0.043,0.000
721,4250,4228.0,3.0,0.0,9.0,2.0,2.0,4.0,2.0,1962.0,...,0.0,1.0,0.0,140.0,ME,Androscoggin,0.130,0.010,0.031,-0.001
722,4282,5734.0,2.0,0.0,8.0,2.0,1.0,4.0,1.0,2104.0,...,0.0,1.0,0.0,66.0,ME,Androscoggin,0.161,0.006,0.018,-0.001
723,4210,23045.0,5.0,2.0,13.0,3.0,3.0,7.0,4.0,25452.0,...,0.0,1.0,0.0,956.0,ME,Androscoggin,0.756,0.002,0.005,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32594,4049,3659.0,3.0,0.0,9.0,2.0,1.0,4.0,1.0,734.0,...,0.0,1.0,0.0,46.0,ME,York,0.149,0.010,0.031,-0.001
32595,4046,7496.0,4.0,0.0,10.0,4.0,3.0,6.0,3.0,3900.0,...,0.0,1.0,0.0,49.0,ME,York,0.647,0.004,0.013,0.000
32596,4048,2946.0,3.0,0.0,9.0,2.0,1.0,4.0,1.0,916.0,...,0.0,1.0,0.0,47.0,ME,York,0.343,0.010,0.031,-0.001
32597,3907,894.0,3.0,0.0,8.0,3.0,2.0,5.0,2.0,176.0,...,0.0,1.0,0.0,2.0,ME,York,0.536,0.020,0.061,0.000


In [117]:
## Helper function
def add_front_padding(x):
    zipcode = str(x)
    while len(zipcode) < 5:
        zipcode = "0"+zipcode
    return zipcode

In [118]:
## Add front padding for US zip codes which all should be 5 digits
broadband_data["Zip"] = broadband_data["Zip"].apply(add_front_padding)
zipcode = broadband_data.Zip.unique()

In [119]:
## Confirm ME states start with 0
broadband_data[broadband_data.ST=="ME"]

Unnamed: 0,Zip,Population,WiredCount_2020,Fwcount_2020,AllProviderCount_2020,Wired25_3_2020,Wired100_3_2020,All25_3_2020,All100_3,TestCount,...,Wired100_3_2015,All25_3_2015,All100_3.1,Total_Enrolled_Households,ST,COUNTY NAME,BROADBAND USAGE,ERROR RANGE (MAE)(+/-),ERROR RANGE (95%)(+/-),MSD
719,04236,4350.0,2.0,2.0,10.0,2.0,1.0,6.0,2.0,1414.0,...,0.0,1.0,0.0,50.0,ME,Androscoggin,0.236,0.010,0.031,-0.001
720,04258,2617.0,2.0,1.0,9.0,1.0,1.0,4.0,1.0,24.0,...,0.0,1.0,0.0,26.0,ME,Androscoggin,0.170,0.014,0.043,0.000
721,04250,4228.0,3.0,0.0,9.0,2.0,2.0,4.0,2.0,1962.0,...,0.0,1.0,0.0,140.0,ME,Androscoggin,0.130,0.010,0.031,-0.001
722,04282,5734.0,2.0,0.0,8.0,2.0,1.0,4.0,1.0,2104.0,...,0.0,1.0,0.0,66.0,ME,Androscoggin,0.161,0.006,0.018,-0.001
723,04210,23045.0,5.0,2.0,13.0,3.0,3.0,7.0,4.0,25452.0,...,0.0,1.0,0.0,956.0,ME,Androscoggin,0.756,0.002,0.005,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32594,04049,3659.0,3.0,0.0,9.0,2.0,1.0,4.0,1.0,734.0,...,0.0,1.0,0.0,46.0,ME,York,0.149,0.010,0.031,-0.001
32595,04046,7496.0,4.0,0.0,10.0,4.0,3.0,6.0,3.0,3900.0,...,0.0,1.0,0.0,49.0,ME,York,0.647,0.004,0.013,0.000
32596,04048,2946.0,3.0,0.0,9.0,2.0,1.0,4.0,1.0,916.0,...,0.0,1.0,0.0,47.0,ME,York,0.343,0.010,0.031,-0.001
32597,03907,894.0,3.0,0.0,8.0,3.0,2.0,5.0,2.0,176.0,...,0.0,1.0,0.0,2.0,ME,York,0.536,0.020,0.061,0.000


In [120]:
print(f'Total number of unique zipcode in broadband data: {len(zipcode)}')

Total number of unique zipcode in broadband data: 32653


In [121]:
#get census data
census_data = pd.read_csv("../data/relabeled_census.csv")
census_data.head()


Unnamed: 0,NAME,median_age_overall,median_age_male,median_age_female,state,county,tract,employment_rate,median_income,total_households,...,pct_internet_broadband_satellite,pct_internet_only_satellite,pct_internet_other,pct_internet_no_subscrp,pct_internet_none,pct_computer,pct_computer_with_dialup,pct_computer_with_broadband,pct_computer_no_internet,pct_no_computer
0,"Census Tract 11, Jefferson County, Alabama",39.0,42.5,38.1,1,73,1100,51.0,37030.0,1851.0,...,0.090222,0.009184,0.0,0.011345,0.248514,0.808212,0.0,0.74014,0.068071,0.191788
1,"Census Tract 14, Jefferson County, Alabama",44.3,40.5,49.1,1,73,1400,45.4,36066.0,816.0,...,0.04902,0.0,0.0,0.020833,0.254902,0.856618,0.0,0.710784,0.145833,0.143382
2,"Census Tract 20, Jefferson County, Alabama",34.0,31.0,36.4,1,73,2000,47.7,27159.0,1419.0,...,0.046512,0.0,0.0,0.0,0.454545,0.713178,0.0,0.545455,0.167724,0.286822
3,"Census Tract 38.02, Jefferson County, Alabama",35.8,31.7,37.3,1,73,3802,51.7,38721.0,1894.0,...,0.039599,0.0,0.0,0.063358,0.336325,0.857445,0.0,0.594509,0.262936,0.142555
4,"Census Tract 40, Jefferson County, Alabama",52.1,51.6,53.8,1,73,4000,36.9,18525.0,1429.0,...,0.045486,0.019594,0.0,0.051085,0.475157,0.630511,0.0,0.447866,0.182645,0.369489


In [122]:
print(f'Total number of rows in census data: {len(census_data.index)}')

Total number of rows in census data: 73056


In [123]:
## This dataset came from: https://mcdc.missouri.edu/applications/geocorr2018.html
#get zipcode, county name, and state for mapping
tract_data = pd.read_csv('../data/zip_conversation_data/all_states_zip_conversion.csv', converters={'zcta5' : lambda x: str(x)}, skiprows = [1])
tract_data = tract_data.rename(columns={"zcta5" : "Zip"})
tract_data


Unnamed: 0,Zip,county,tract,cntyname,zipname,intptlon,intptlat,pop10,afact,AFACT2
0,01001,25013,8132.05,Hampden MA,"Agawam Town, MA",-72.630818,42.051901,3775,0.225118,0.504611
1,01001,25013,8132.06,Hampden MA,"Agawam Town, MA",-72.638052,42.066801,297,0.017711,0.073478
2,01001,25013,8132.07,Hampden MA,"Agawam Town, MA",-72.636153,42.087216,4133,0.246467,0.808648
3,01001,25013,8132.08,Hampden MA,"Agawam Town, MA",-72.609240,42.056361,2918,0.174011,1.000000
4,01001,25013,8132.09,Hampden MA,"Agawam Town, MA",-72.612651,42.075544,5646,0.336693,1.000000
...,...,...,...,...,...,...,...,...,...,...
144119,99999,49045,1307.01,Tooele UT,99999,-112.225848,40.722768,18,0.003210,0.005396
144120,99999,49045,1307.03,Tooele UT,99999,-112.293643,40.034868,6,0.001070,0.000858
144121,99999,51001,9802.00,Accomack VA,99999,-75.511891,37.838676,5,0.000892,1.000000
144122,99999,56023,9782.00,Lincoln WY,99999,-110.665086,42.055432,6,0.001070,0.003009


In [124]:
print(f'Total number of unique zipcodes in mapping table: {len(tract_data.Zip.unique())}')


Total number of unique zipcodes in mapping table: 32846


In [125]:
## Census tracts are 6 digits long
## See: https://www.census.gov/programs-surveys/geography/guidance/geo-identifiers.html
## https://www2.census.gov/geo/pdfs/education/CensusTracts.pdf

def pad_tract(x):
    ## Use string formatting to ensure two decimal points at the end
    ## This will add 00s for appropriate tracts
    tract = "{:.2f}".format(x)
    
    ## Remove period
    tract = tract.replace(".", "")
    
    ##Add additional front padding if needed
    while len(tract) < 6:
        tract = "0" + tract
        
    return tract

In [126]:
pad_tract(18.00)

'001800'

In [127]:
## Before - See decimals & shorter length of some codes
tract_data["tract"]

0         8132.05
1         8132.06
2         8132.07
3         8132.08
4         8132.09
           ...   
144119    1307.01
144120    1307.03
144121    9802.00
144122    9782.00
144123      18.00
Name: tract, Length: 144124, dtype: float64

In [128]:
## After - see all 6 digits and no decimals
tract_data["tract"] = tract_data["tract"].apply(pad_tract)
tract_data["tract"]

0         813205
1         813206
2         813207
3         813208
4         813209
           ...  
144119    130701
144120    130703
144121    980200
144122    978200
144123    001800
Name: tract, Length: 144124, dtype: object

In [129]:
def pad_to_six(x):
    tract = str(x)
    while len(tract) < 6:
        tract = "0" + tract
    return tract

In [130]:
## Before - Notice 00s already at end for some but some are not yet 6 digits
census_data["tract"]

0        1100
1        1400
2        2000
3        3802
4        4000
         ... 
73051    3400
73052     100
73053    3502
73054    2701
73055    2800
Name: tract, Length: 73056, dtype: int64

In [131]:
## The Census data already removed the decimals and added 0s at the end where needed for tracts but
## the tracts need to be prepended with 0s to get to 6 digits
census_data["tract"] = census_data["tract"].apply(pad_to_six)

## Missing ZipCode EDA

In [132]:
## These zipcodes are in the broadband data but not in the mapping data
zips_not_in_mapping_data = broadband_data[~broadband_data.Zip.isin(tract_data.Zip)]
zips_not_in_mapping_data[["Zip", "ST", "COUNTY NAME", "Population"]]

## All of these have population of 0, except for Carson City which is NaN
## But this site: https://www.unitedstateszipcodes.org/89702/ says that Carson City zipcode has 0 population
## Let's drop these

Unnamed: 0,Zip,ST,COUNTY NAME,Population
903,22214,VA,Arlington,0.0
1727,76508,TX,Bell,0.0
4057,89702,NV,Carson City,
5575,73019,OK,Cleveland,0.0
7101,53792,WI,Dane,0.0
13764,80419,CO,Jefferson,0.0
14815,98174,WA,King,0.0
14828,98195,WA,King,0.0
21006,10110,NY,New York,0.0
25586,84144,UT,Salt Lake,0.0


In [133]:
## These zipcodes are in the mapping data but not in the broadband data
zips_not_in_broadband_data = tract_data[~tract_data.Zip.isin(broadband_data.Zip)]
zips_not_in_broadband_data[["Zip", "zipname", "pop10", "afact", "AFACT2"]]


Unnamed: 0,Zip,zipname,pop10,afact,AFACT2
102,01066,"North Hatfield, MA (PO Boxes)",64,1.000000,0.019518
1806,02584,"Nantucket, MA",10,1.000000,0.008591
2777,03754,"Guild, NH (PO Boxes)",86,1.000000,0.013217
3177,04271,"Paris, ME (PO Boxes)",67,1.000000,0.012927
3377,04570,"Squirrel Island, ME",2,1.000000,0.000722
...,...,...,...,...,...
144119,99999,99999,18,0.003210,0.005396
144120,99999,99999,6,0.001070,0.000858
144121,99999,99999,5,0.000892,1.000000
144122,99999,99999,6,0.001070,0.003009


## Joining the Data

We will join the data using a left join to take all keys from the tract set and leave out the few keys we saw above that are in the broadband set but not the tract set.

In [134]:
#merge tract data with broadband data to map a tract with every Zip
#merged_zips_with_tracts = tract_data.merge(broadband_data, how = 'left', on="Zip")
#merged_zips_with_tracts

In [135]:
## Notice how some county codes only have 4-digits, these should have a 0 pre-pended before merging
tract_data[tract_data.county<20000]

Unnamed: 0,Zip,county,tract,cntyname,zipname,intptlon,intptlat,pop10,afact,AFACT2
4186,06001,9003,460302,Hartford CT,"Avon, CT",-72.879362,41.763873,355,0.019309,0.088024
4187,06001,9003,462101,Hartford CT,"Avon, CT",-72.878749,41.786317,5833,0.317269,1.000000
4188,06001,9003,462102,Hartford CT,"Avon, CT",-72.906296,41.789819,4239,0.230568,1.000000
4189,06001,9003,462201,Hartford CT,"Avon, CT",-72.850445,41.798346,5256,0.285885,1.000000
4190,06001,9003,462202,Hartford CT,"Avon, CT",-72.815894,41.794739,2692,0.146424,1.000000
...,...,...,...,...,...,...,...,...,...,...
144019,99999,16079,960200,Shoshone ID,99999,-116.155171,47.886140,2,0.000357,0.000479
144020,99999,16083,001500,Twin Falls ID,99999,-114.730683,42.212968,2,0.000357,0.000749
144021,99999,16085,970100,Valley ID,99999,-116.045233,44.417043,1,0.000178,0.000381
144022,99999,17017,960300,Cass IL,99999,-90.346830,40.083678,2,0.000357,0.000607


In [136]:
def pad_county_code(x):
    county = str(x)
    while len(county) < 5:
        county = "0" + county
    return county

tract_data["county"] = tract_data.county.apply(pad_county_code)

## Confirm zeroes appropriately pre-pended
tract_data[tract_data["county"] == "09003"]

Unnamed: 0,Zip,county,tract,cntyname,zipname,intptlon,intptlat,pop10,afact,AFACT2
4186,06001,09003,460302,Hartford CT,"Avon, CT",-72.879362,41.763873,355,0.019309,0.088024
4187,06001,09003,462101,Hartford CT,"Avon, CT",-72.878749,41.786317,5833,0.317269,1.000000
4188,06001,09003,462102,Hartford CT,"Avon, CT",-72.906296,41.789819,4239,0.230568,1.000000
4189,06001,09003,462201,Hartford CT,"Avon, CT",-72.850445,41.798346,5256,0.285885,1.000000
4190,06001,09003,462202,Hartford CT,"Avon, CT",-72.815894,41.794739,2692,0.146424,1.000000
...,...,...,...,...,...,...,...,...,...,...
4862,06489,09003,430301,Hartford CT,"Southington, CT",-72.854174,41.566638,3293,0.102691,0.914976
4863,06489,09003,430302,Hartford CT,"Southington, CT",-72.872427,41.580643,1292,0.040291,0.468965
4864,06489,09003,430500,Hartford CT,"Southington, CT",-72.913091,41.601834,2580,0.080456,0.394073
4865,06489,09003,430601,Hartford CT,"Southington, CT",-72.890297,41.621677,5202,0.162223,1.000000


In [137]:
## Now I want to merge the Census data with this above dataset
## I'll need to merge on tract, and county code combination

## Notice how the county codes for the tract mapping dataset were 5 digits - This is a combination of a 2-digit
## state code pre-pended to a 3 digit county code. I need to modify the census data to match this format.
## I'll use the helper functions below


def pad_state(x):
    state = str(x)
    while len(state) < 2:
        state = "0" + state
    return state

def pad_county(x):
    county = str(x)
    while len(county) < 3:
        county = "0" + county
    return county


In [138]:
census_data[["county", "state", "tract"]]

Unnamed: 0,county,state,tract
0,73,1,001100
1,73,1,001400
2,73,1,002000
3,73,1,003802
4,73,1,004000
...,...,...,...
73051,7,50,003400
73052,7,50,000100
73053,7,50,003502
73054,7,50,002701


In [139]:
## Modifying the census dataset county code to be 5 digits

census_data["county"] = census_data["county"].apply(pad_county)
census_data["state"] = census_data["state"].apply(pad_state)

census_data[["county", "state", "tract"]]

Unnamed: 0,county,state,tract
0,073,01,001100
1,073,01,001400
2,073,01,002000
3,073,01,003802
4,073,01,004000
...,...,...,...
73051,007,50,003400
73052,007,50,000100
73053,007,50,003502
73054,007,50,002701


In [140]:
census_data["county"] = census_data["state"]+census_data["county"]
census_data[["county", "state", "tract"]]

Unnamed: 0,county,state,tract
0,01073,01,001100
1,01073,01,001400
2,01073,01,002000
3,01073,01,003802
4,01073,01,004000
...,...,...,...
73051,50007,50,003400
73052,50007,50,000100
73053,50007,50,003502
73054,50007,50,002701


In [141]:
## Now we can merge on tract & county codes

merged = tract_data.merge(census_data, how="left", on=["tract", "county"])
merged

Unnamed: 0,Zip,county,tract,cntyname,zipname,intptlon,intptlat,pop10,afact,AFACT2,...,pct_internet_broadband_satellite,pct_internet_only_satellite,pct_internet_other,pct_internet_no_subscrp,pct_internet_none,pct_computer,pct_computer_with_dialup,pct_computer_with_broadband,pct_computer_no_internet,pct_no_computer
0,01001,25013,813205,Hampden MA,"Agawam Town, MA",-72.630818,42.051901,3775,0.225118,0.504611,...,0.024736,0.004947,0.0,0.031003,0.091359,0.921834,0.000000,0.877639,0.044195,0.078166
1,01001,25013,813206,Hampden MA,"Agawam Town, MA",-72.638052,42.066801,297,0.017711,0.073478,...,0.006927,0.000000,0.0,0.000000,0.150504,0.916877,0.010705,0.827456,0.078715,0.083123
2,01001,25013,813207,Hampden MA,"Agawam Town, MA",-72.636153,42.087216,4133,0.246467,0.808648,...,0.030165,0.000000,0.0,0.073583,0.117002,0.897623,0.006399,0.803016,0.088208,0.102377
3,01001,25013,813208,Hampden MA,"Agawam Town, MA",-72.609240,42.056361,2918,0.174011,1.000000,...,0.005776,0.000000,0.0,0.000000,0.084158,0.919142,0.000000,0.891914,0.027228,0.080858
4,01001,25013,813209,Hampden MA,"Agawam Town, MA",-72.612651,42.075544,5646,0.336693,1.000000,...,0.031587,0.000000,0.0,0.053883,0.185433,0.826830,0.005946,0.742847,0.078038,0.173170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144119,99999,49045,130701,Tooele UT,99999,-112.225848,40.722768,18,0.003210,0.005396,...,,,,,,,,,,
144120,99999,49045,130703,Tooele UT,99999,-112.293643,40.034868,6,0.001070,0.000858,...,,,,,,,,,,
144121,99999,51001,980200,Accomack VA,99999,-75.511891,37.838676,5,0.000892,1.000000,...,,,,,,,,,,
144122,99999,56023,978200,Lincoln WY,99999,-110.665086,42.055432,6,0.001070,0.003009,...,,,,,,,,,,


## Aggregating by Zip Code

In [142]:
## Now that we've merged, our dataset contains multiple rows per zip, if there were multiple tracts per zip
## Let's aggregate on a zip to take the weighted average of the values we are evaluating


## From: https://www.statology.org/pandas-weighted-average/
def w_avg(df, values, weights):
    d = df[values]
    w = df[weights]
    return (d * w).sum() / w.sum()

In [143]:
full_zips = merged.groupby("Zip")
full_zips.count()

Unnamed: 0_level_0,county,tract,cntyname,zipname,intptlon,intptlat,pop10,afact,AFACT2,NAME,...,pct_internet_broadband_satellite,pct_internet_only_satellite,pct_internet_other,pct_internet_no_subscrp,pct_internet_none,pct_computer,pct_computer_with_dialup,pct_computer_with_broadband,pct_computer_no_internet,pct_no_computer
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01001,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
01002,9,9,9,9,9,9,9,9,9,9,...,8,8,8,8,8,8,8,8,8,8
01003,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
01005,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
01007,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99925,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
99926,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
99927,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
99929,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [144]:
employment_rate = full_zips.apply(w_avg, 'employment_rate', "afact")

employment_rate

Zip
01001    62.054017
01002    59.048759
01003    33.100000
01005    66.100000
01007    70.799047
           ...    
99925     0.000000
99926     0.000000
99927     0.000000
99929     0.000000
99999    32.759117
Length: 32846, dtype: float64

In [145]:
## Confirm the values map to the output above
test = broadband_data.merge(employment_rate.rename('employment_rate'), on="Zip", how="left")
test[test.Zip=="01001"]

Unnamed: 0,Zip,Population,WiredCount_2020,Fwcount_2020,AllProviderCount_2020,Wired25_3_2020,Wired100_3_2020,All25_3_2020,All100_3,TestCount,...,All25_3_2015,All100_3.1,Total_Enrolled_Households,ST,COUNTY NAME,BROADBAND USAGE,ERROR RANGE (MAE)(+/-),ERROR RANGE (95%)(+/-),MSD,employment_rate
11398,1001,16769.0,3.0,1.0,10.0,1.0,1.0,4.0,2.0,279210.0,...,1.0,1.0,252.0,MA,Hampden,0.808,0.002,0.007,0.0,62.054017


In [146]:
## Let's test to confirm with 01001

merged[merged.Zip == "01001"][["NAME", "Zip", "tract","employment_rate","afact" ]]

Unnamed: 0,NAME,Zip,tract,employment_rate,afact
0,"Census Tract 8132.05, Hampden County, Massachu...",1001,813205,65.7,0.225118
1,"Census Tract 8132.06, Hampden County, Massachu...",1001,813206,64.0,0.017711
2,"Census Tract 8132.07, Hampden County, Massachu...",1001,813207,64.4,0.246467
3,"Census Tract 8132.08, Hampden County, Massachu...",1001,813208,69.4,0.174011
4,"Census Tract 8132.09, Hampden County, Massachu...",1001,813209,54.0,0.336693


In [147]:
broadband_data.columns

Index(['Zip', 'Population', 'WiredCount_2020', 'Fwcount_2020',
       'AllProviderCount_2020', 'Wired25_3_2020', 'Wired100_3_2020',
       'All25_3_2020', 'All100_3', 'TestCount', 'AverageMbps',
       'FastestAverageMbps', '%Access to Terrestrial Broadband',
       'Lowest Priced Terrestrial Broadband Plan', 'WiredCount_2015',
       'Fwcount_2015', 'AllProviderCount_2015', 'Wired25_3_2015',
       'Wired100_3_2015', 'All25_3_2015', 'All100_3.1',
       'Total_Enrolled_Households', 'ST', 'COUNTY NAME', 'BROADBAND USAGE',
       'ERROR RANGE (MAE)(+/-)', 'ERROR RANGE (95%)(+/-)', 'MSD'],
      dtype='object')

In [148]:
## Now what we want to do is add a new data column to the broadband dataset for (almost) each column
## in our census dataset using the weighted average calculation above

cols_to_keep = ['median_age_overall', 'median_age_male', 'median_age_female', 'employment_rate', 'median_income',
       'total_households', 'ave_household_size', 'ave_family_size',
       'total_population', 'median_house_value', 'pct_white',
       'pct_hisp_latino', 'pct_black', 'pct_native', 'pct_asian', 'pct_hi_pi',
       'pct_other_race', 'pct_two+_race', 'pct_rent_burdened', 'poverty_rate',
       'pct_pop_bachelors+', 'pct_pop_hs+', 'pct_internet',
       'pct_internet_dial_up', 'pct_internet_broadband_any_type',
       'pct_internet_cellular', 'pct_only_cellular',
       'pct_internet_broadband_fiber', 'pct_internet_broadband_satellite',
       'pct_internet_only_satellite', 'pct_internet_other',
       'pct_internet_no_subscrp', 'pct_internet_none', 'pct_computer',
       'pct_computer_with_dialup', 'pct_computer_with_broadband',
       'pct_computer_no_internet', 'pct_no_computer']

for col in cols_to_keep:
    temp = full_zips.apply(w_avg, col, "afact")
    broadband_data = broadband_data.merge(temp.rename(col), on="Zip", how="left")

In [149]:
broadband_data.columns

Index(['Zip', 'Population', 'WiredCount_2020', 'Fwcount_2020',
       'AllProviderCount_2020', 'Wired25_3_2020', 'Wired100_3_2020',
       'All25_3_2020', 'All100_3', 'TestCount', 'AverageMbps',
       'FastestAverageMbps', '%Access to Terrestrial Broadband',
       'Lowest Priced Terrestrial Broadband Plan', 'WiredCount_2015',
       'Fwcount_2015', 'AllProviderCount_2015', 'Wired25_3_2015',
       'Wired100_3_2015', 'All25_3_2015', 'All100_3.1',
       'Total_Enrolled_Households', 'ST', 'COUNTY NAME', 'BROADBAND USAGE',
       'ERROR RANGE (MAE)(+/-)', 'ERROR RANGE (95%)(+/-)', 'MSD',
       'median_age_overall', 'median_age_male', 'median_age_female',
       'employment_rate', 'median_income', 'total_households',
       'ave_household_size', 'ave_family_size', 'total_population',
       'median_house_value', 'pct_white', 'pct_hisp_latino', 'pct_black',
       'pct_native', 'pct_asian', 'pct_hi_pi', 'pct_other_race',
       'pct_two+_race', 'pct_rent_burdened', 'poverty_rate',
      

In [150]:
# Drop unnecessary columns
# Population was from the Broadband Now data but we've pulled this from the Census in total_population
cols_to_drop = ["Population"]
output = broadband_data.drop(columns=cols_to_drop)
output

Unnamed: 0,Zip,WiredCount_2020,Fwcount_2020,AllProviderCount_2020,Wired25_3_2020,Wired100_3_2020,All25_3_2020,All100_3,TestCount,AverageMbps,...,pct_internet_broadband_satellite,pct_internet_only_satellite,pct_internet_other,pct_internet_no_subscrp,pct_internet_none,pct_computer,pct_computer_with_dialup,pct_computer_with_broadband,pct_computer_no_internet,pct_no_computer
0,29639,3.0,0.0,8.0,3.0,3.0,5.0,3.0,163.0,93.12,...,0.050319,0.021704,0.003752,0.057475,0.255626,0.824939,0.000748,0.672679,0.151512,0.175061
1,29620,6.0,0.0,11.0,5.0,3.0,7.0,3.0,2536.0,212.50,...,0.045589,0.004185,0.004542,0.047060,0.273381,0.807127,0.000006,0.658196,0.148926,0.192873
2,29659,,,,,,,,,,...,0.037938,0.000000,0.002147,0.129563,0.252684,0.852541,0.000000,0.617752,0.234789,0.147459
3,29638,6.0,1.0,13.0,4.0,4.0,6.0,4.0,272.0,82.79,...,0.048407,0.010415,0.001569,0.037764,0.243848,0.813197,0.004041,0.702475,0.106681,0.186803
4,29628,4.0,0.0,8.0,3.0,2.0,5.0,2.0,100.0,51.12,...,0.032459,0.000108,0.004426,0.042021,0.253247,0.782861,0.000000,0.678955,0.103906,0.217139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32648,78839,3.0,3.0,11.0,1.0,1.0,4.0,1.0,1392.0,48.94,...,0.030826,0.010743,0.000000,0.083742,0.367047,0.728907,0.007010,0.531195,0.190702,0.271093
32649,78872,,,,,,,,,,...,0.082642,0.043367,0.000000,0.074260,0.536452,0.748251,0.000000,0.388613,0.359637,0.251749
32650,57622,,,,,,,,,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
32651,57748,4.0,1.0,9.0,2.0,2.0,5.0,2.0,,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [151]:
## Testing it worked out

In [152]:
merged[merged.Zip=="02144"][["Zip", "pct_pop_hs+", "pct_internet", "median_age_overall", "afact"]]

Unnamed: 0,Zip,pct_pop_hs+,pct_internet,median_age_overall,afact
1347,2144,0.138547,0.863711,32.8,0.006027
1348,2144,0.115688,0.942276,31.1,0.233017
1349,2144,0.149563,0.880342,30.6,0.068185
1350,2144,0.136394,0.91607,22.1,0.115357
1351,2144,0.116524,0.783153,31.9,0.25315
1352,2144,0.088689,0.90067,31.4,0.076514
1353,2144,0.086156,0.924338,29.9,0.136076
1354,2144,0.066155,0.946048,29.8,0.106525
1355,2144,0.096631,0.878422,33.2,0.005148


In [153]:
output[output.Zip=="02144"][["Zip", "pct_pop_hs+", "pct_internet", "median_age_overall"]]

Unnamed: 0,Zip,pct_pop_hs+,pct_internet,median_age_overall
19252,2144,0.109277,0.888723,29.972451


## Merging with Rural/Urban Data

In [154]:
## merging of rural/urban data
rural = pd.read_excel("../data/zips_rural_urban.xlsx", sheet_name="Data", converters={'ZIP_CODE' : lambda x: str(x)})
rural = rural.rename(columns={"ZIP_CODE" : "Zip"})
rural

Unnamed: 0,Zip,STATE,ZIP_TYPE,RUCA1,RUCA2
0,00001,AK,Zip Code Area,10,10.0
1,00002,AK,Zip Code Area,10,10.0
2,00003,AK,Zip Code Area,10,10.0
3,00004,AK,Zip Code Area,10,10.0
4,00005,AK,Zip Code Area,10,10.0
...,...,...,...,...,...
41159,99926,AK,Zip Code Area,10,10.0
41160,99927,AK,Zip Code Area,10,10.0
41161,99928,AK,Post Office or large volume customer,4,4.0
41162,99929,AK,Zip Code Area,10,10.0


In [155]:
output = output.merge(rural, how="left", on="Zip")

In [156]:
output.to_csv("../data/weighted_merged_all.csv", index=False)