In [21]:
import pandas as pd
import numpy as np

# Overview of this Notebook

This notebook is meant to be run on the data output of running the Data Gathering notebook in the census scripts folder which produced a csv file of census variable columns (such as B01002_001E, B01002_002E, etc...) for each census tract in the US. 

The list of variables we pulled is here: https://docs.google.com/document/d/1eslJJ1DKmmNkJAtZbOdxHAbHCZ34L6cb2tkgMoExPwc/edit?usp=sharing

In [22]:
data = pd.read_csv("../data/census_data.csv")
data

Unnamed: 0,NAME,B01002_001E,B01002_002E,B01002_003E,B15003_001E,B15003_017E,B15003_018E,B15003_019E,B15003_020E,B15003_021E,...,DP05_0001E,DP04_0089E,DP05_0077PE,DP05_0071PE,DP05_0078PE,DP05_0079PE,DP05_0080PE,DP05_0081PE,DP05_0082PE,DP05_0083PE
0,"Census Tract 11, Jefferson County, Alabama",39.0,42.5,38.1,3285,552,201,269,534,362,...,4781,85500,1.6,0.4,96.8,1.2,0.0,0.0,0.0,0.0
1,"Census Tract 14, Jefferson County, Alabama",44.3,40.5,49.1,1274,300,15,57,399,63,...,1946,67800,0.0,6.8,91.7,0.0,0.0,0.0,1.1,0.4
2,"Census Tract 20, Jefferson County, Alabama",34.0,31.0,36.4,2802,720,134,267,571,199,...,4080,68400,27.7,7.5,63.8,0.0,0.0,0.0,0.0,0.9
3,"Census Tract 38.02, Jefferson County, Alabama",35.8,31.7,37.3,3464,1294,32,221,551,585,...,5291,64100,1.6,0.0,97.5,0.0,0.0,0.0,0.0,0.9
4,"Census Tract 40, Jefferson County, Alabama",52.1,51.6,53.8,1971,540,144,130,509,117,...,2533,67300,6.6,6.8,85.7,0.9,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73051,"Census Tract 34, Chittenden County, Vermont",48.1,44.8,49.3,5614,630,71,175,356,374,...,7717,386900,93.4,3.0,0.9,0.0,1.1,0.0,0.0,1.5
73052,"Census Tract 1, Chittenden County, Vermont",39.7,33.5,42.7,3094,776,162,99,405,247,...,4377,232500,75.6,3.4,9.9,0.0,8.8,0.0,0.0,2.3
73053,"Census Tract 35.02, Chittenden County, Vermont",41.6,40.2,44.0,3606,959,48,255,296,344,...,5135,331300,98.7,0.4,0.3,0.4,0.0,0.2,0.0,0.0
73054,"Census Tract 27.01, Chittenden County, Vermont",43.5,42.2,44.1,4161,543,68,81,394,396,...,5880,327200,85.0,2.4,3.8,0.5,3.1,0.0,0.0,5.2


## Rename Columns

In [23]:
## Renaming data columns
data = data.rename(columns = {"B01002_001E" : "median_age_overall", "B01002_002E": "median_age_male", "B01002_003E" : 
                      "median_age_female", "DP05_0001E" : "total_population", "DP04_0089E" : "median_house_value",
                      "S1901_C01_012E" : "median_income", "S2301_C03_001E" : "employment_rate",
                      "S1101_C01_002E": "ave_household_size", "S1101_C01_004E" : "ave_family_size", "S1101_C01_001E" :
                             "total_households"})

## Calculating Additional Data Columns

In [24]:
data["DP04_0142PE"]

0        30.9
1        47.6
2        50.1
3        38.1
4        40.9
         ... 
73051    49.3
73052    48.1
73053    32.5
73054    44.3
73055    24.6
Name: DP04_0142PE, Length: 73056, dtype: float64

In [25]:
def safe_division(x, y):
    try:
        return x / y
    except:
        #y would have been 0 so return NAN
        return np.nan

In [26]:
## Calculating Additional Data Columns
data["pct_rent_burdened"] = data.apply(lambda x:  x["DP04_0142PE"] + x["DP04_0141PE"], axis=1)
data["poverty_rate"] = data.apply(lambda x: safe_division(x["S1701_C02_001E"], x["S1701_C01_001E"]), axis=1)
data["pct_pop_bachelors+"] = data.apply(lambda x: safe_division((x["B15003_022E"] + x["B15003_023E"] + x["B15003_024E"] +
                                                 x["B15003_025E"]), x["B15003_001E"]), axis=1)
data["pct_pop_hs+"] = data.apply(lambda x: safe_division((x["pct_pop_bachelors+"] + x["B15003_018E"] + x["B15003_019E"] + 
                                            x["B15003_020E"] + x["B15003_021E"]), x["B15003_001E"]), axis=1)

## Internet Statistics
data["pct_internet"] = data.apply(lambda x: safe_division(x["B28002_002E"] , x["B28002_001E"]), axis=1)
data["pct_internet_dial_up"]= data.apply(lambda x: safe_division(x["B28002_003E"] , x["B28002_001E"]), axis=1)
data["pct_internet_broadband_any_type"]= data.apply(lambda x: safe_division(x["B28002_004E"] , x["B28002_001E"]), axis=1)
data["pct_internet_cellular"]= data.apply(lambda x: safe_division(x["B28002_005E"] , x["B28002_001E"]), axis=1)
data["pct_only_cellular"]= data.apply(lambda x: safe_division(x["B28002_006E"] , x["B28002_001E"]), axis=1)
data["pct_internet_broadband_fiber"]= data.apply(lambda x: safe_division(x["B28002_008E"] , x["B28002_001E"]), axis=1)
data["pct_internet_broadband_satellite"]= data.apply(lambda x: safe_division(x["B28002_009E"] , x["B28002_001E"]), axis=1)
data["pct_internet_only_satellite"]= data.apply(lambda x: safe_division(x["B28002_010E"] , x["B28002_001E"]), axis=1)
data["pct_internet_other"]= data.apply(lambda x: safe_division(x["B28002_011E"] , x["B28002_001E"]), axis=1)
data["pct_internet_no_subscrp"]= data.apply(lambda x: safe_division(x["B28002_012E"] , x["B28002_001E"]), axis=1)
data["pct_internet_none"]= data.apply(lambda x: safe_division(x["B28002_013E"] , x["B28002_001E"]), axis=1)


## Computer Statistics
data["pct_computer"] = data.apply(lambda x: safe_division(x["B28003_002E"] , x["B28003_001E"]), axis=1)
data["pct_computer_with_dialup"] = data.apply(lambda x: safe_division(x["B28003_003E"] , x["B28003_001E"]), axis=1)
data["pct_computer_with_broadband"] = data.apply(lambda x: safe_division(x["B28003_004E"] , x["B28003_001E"]), axis=1)
data["pct_computer_no_internet"] = data.apply(lambda x: safe_division(x["B28003_005E"] , x["B28003_001E"]), axis=1)
data["pct_no_computer"] = data.apply(lambda x: safe_division(x["B28003_006E"] , x["B28003_001E"]), axis=1)

In [27]:
## Race/Ethnicity Statistics To Rename

data = data.rename(columns = {"DP05_0077PE" : "pct_white", "DP05_0071PE" : "pct_hisp_latino", "DP05_0078PE" :
                             "pct_black", "DP05_0079PE" : "pct_native", "DP05_0080PE" : "pct_asian", "DP05_0081PE" :
                             "pct_hi_pi", "DP05_0082PE" : "pct_other_race", "DP05_0083PE" : "pct_two+_race"})

In [28]:
cols_to_drop = []
for col in data.columns:
    if col.startswith("DP"):
        cols_to_drop.append(col)
    elif col.startswith("B"):
        cols_to_drop.append(col)
    elif col.startswith("S"):
        cols_to_drop.append(col)
        
cols_to_drop

['B15003_001E',
 'B15003_017E',
 'B15003_018E',
 'B15003_019E',
 'B15003_020E',
 'B15003_021E',
 'B15003_022E',
 'B15003_023E',
 'B15003_024E',
 'B15003_025E',
 'B28002_001E',
 'B28002_002E',
 'B28002_003E',
 'B28002_004E',
 'B28002_005E',
 'B28002_006E',
 'B28002_007E',
 'B28002_008E',
 'B28002_009E',
 'B28002_010E',
 'B28002_011E',
 'B28002_012E',
 'B28002_013E',
 'B28003_001E',
 'B28003_002E',
 'B28003_003E',
 'B28003_004E',
 'B28003_005E',
 'B28003_006E',
 'S1701_C02_001E',
 'S1701_C01_001E',
 'DP04_0142PE',
 'DP04_0141PE']

In [30]:
new_data = data.drop(columns=cols_to_drop)
new_data.columns

Index(['NAME', 'median_age_overall', 'median_age_male', 'median_age_female',
       'state', 'county', 'tract', 'employment_rate', 'median_income',
       'total_households', 'ave_household_size', 'ave_family_size',
       'total_population', 'median_house_value', 'pct_white',
       'pct_hisp_latino', 'pct_black', 'pct_native', 'pct_asian', 'pct_hi_pi',
       'pct_other_race', 'pct_two+_race', 'pct_rent_burdened', 'poverty_rate',
       'pct_pop_bachelors+', 'pct_pop_hs+', 'pct_internet',
       'pct_internet_dial_up', 'pct_internet_broadband_any_type',
       'pct_internet_cellular', 'pct_only_cellular',
       'pct_internet_broadband_fiber', 'pct_internet_broadband_satellite',
       'pct_internet_only_satellite', 'pct_internet_other',
       'pct_internet_no_subscrp', 'pct_internet_none', 'pct_computer',
       'pct_computer_with_dialup', 'pct_computer_with_broadband',
       'pct_computer_no_internet', 'pct_no_computer'],
      dtype='object')

In [31]:
new_data.to_csv("relabeled_census.csv",index=False)