In [8]:
import requests
import pandas as pd

In [5]:
# Build base URL: <host>/<year>/<dataset>
HOST = "https://api.census.gov/data"
year = "2012"
dataset = "acs/acs5"
base_url = "/".join([HOST, year, dataset])

# Specify requested variables
# B01001_001E = Total population (estimate)
# B03002_003E = Nonhispanic White population (estimate)
# B03002_004E = Nonhispanic Black population (estimate)
get_vars = ["NAME", "B01001_001E", "B03002_003E", "B03002_004E"]

# Create dictionary of predicates (sent as the URL query string)
predicates = {}
predicates["get"] = ",".join(get_vars)
# Requested geography: "*" asks for every metro/micro statistical area
predicates["for"] = \
            "metropolitan statistical area/micropolitan statistical area:*"

# A timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error instead of letting a
# later cell fail while parsing an error page as data.
r = requests.get(base_url, params=predicates, timeout=30)
r.raise_for_status()

[['NAME', 'B01001_001E', 'B03002_003E', 'B03002_004E', 'metropolitan statistical area/micropolitan statistical area'], ['San Francisco-Oakland-Fremont, CA Metro Area', '4348880', '1848583', '346719', '41860'], ['San Germán-Cabo Rojo, PR Metro Area', '137194', '802', '14', '41900'], ['San Jose-Sunnyvale-Santa Clara, CA Metro Area', '1843860', '650273', '43126', '41940'], ['San Juan-Caguas-Guaynabo, PR Metro Area', '2473084', '17978', '2661', '41980']]


In [9]:
# Readable labels for the five response columns, in the order they come back
col_names = ["name", "pop", "white", "black", "msa"]

# The first element of the JSON payload is the header row, so it is skipped.
# Counts arrive as strings and are converted to int in the same step; the
# "msa" code column is deliberately left as a string.
msa = pd.DataFrame(r.json()[1:], columns=col_names).astype(
    {"pop": int, "white": int, "black": int}
)

In [12]:
# Look up a single area by its code; codes are stored as strings, so the
# comparison value must be the string "10100" rather than the number 10100
msa.loc[msa["msa"] == "10100"]

Unnamed: 0,name,pop,white,black,msa
178,"Aberdeen, SD Micro Area",40612,37776,259,10100


## Measuring Segregation
Explore racial segregation in America. Calculate the Index of Dissimilarity, an important measure of segregation. Learn about and use Metropolitan Statistical Areas, an important geography for urban research. Study segregation changes over time in Chicago.

#### Calculating D for One State
Compute the Index of Dissimilarity for the state of Georgia. The formula for the Index of Dissimilarity is:

$$D = \frac{1}{2}\sum{\left\lvert \frac{a}{A} - \frac{b}{B} \right\rvert}$$

In this case, Group A will be Whites, Group B will be Blacks. $ a $ and $ b $ represent the White and Black population of the small geography (tracts), while $ A $ and $ B $ represent the White and Black population of the larger, containing geography (e.g. Georgia, postal code = GA, FIPS code = 13).

In [39]:
# Define convenience variables to hold column names
w = "white"
b = "black"

# Extract Georgia tracts (state FIPS code "13").
# NOTE: `tracts` has to be created by an earlier cell; on a fresh kernel this
# cell raises NameError until that cell has been run.
ga_tracts = tracts[tracts["state"] == "13"]

# Print sums of Black and White residents of Georgia
print(ga_tracts[[w, b]].sum())

# Statewide totals: A and B in the formula
w_total = ga_tracts[w].sum()
b_total = ga_tracts[b].sum()

# D = 1/2 * sum(|a/A - b/B|) over tracts, computed with vectorized pandas
# methods instead of looping over the Series with the Python built-ins.
# skipna=False keeps the built-in sum's behavior of yielding NaN when any
# tract share is missing, rather than silently dropping that tract.
D = 0.5 * (ga_tracts[w] / w_total - ga_tracts[b] / b_total).abs().sum(skipna=False)

print("Dissimilarity (Georgia):", round(D, 3))

NameError: name 'tracts' is not defined

#### Calculating D in a Loop
Is Georgia's Index of Dissimilarity of 0.544 high or low? Let's compare it to Illinois (FIPS = 17), home of Chicago.

In this exercise we will use a loop to calculate $D$ for all states, then compare Georgia and Illinois.

In [None]:
# Get list of state FIPS Codes
states = list(tracts["state"].unique())

# Collector: state FIPS code -> Index of Dissimilarity.
# Uses the column-name variables `w` and `b` defined in an earlier cell.
state_D = {}
for state in states:
    # Keep only the tracts that belong to this state
    state_tracts = tracts[tracts["state"] == state]

    # Statewide totals (A and B in the formula)
    w_tot = state_tracts[w].sum()
    b_tot = state_tracts[b].sum()

    # Vectorized |a/A - b/B| summed over the state's tracts; skipna=False
    # leaves D as NaN when a share is undefined (e.g. a zero state total)
    # instead of silently dropping those tracts from the sum.
    state_D[state] = 0.5 * (
        state_tracts[w] / w_tot - state_tracts[b] / b_tot
    ).abs().sum(skipna=False)

# Print D for Georgia (FIPS = 13) and Illinois (FIPS = 17)
print("Georgia D =", round(state_D["13"], 3))
print("Illinois D =", round(state_D["17"], 3))

#### Calculating D Using Grouping in Pandas
Performing a calculation over subsets of a DataFrame is so common that `pandas` gives us an alternative to doing it in a loop, the `groupby` method. In the sample code, `groupby` is used first to group tracts by state, i.e. those rows having the same value in the `"state"` column. The `sum()` method is applied by group to the columns.

In [None]:
# Total White and Black residents per state; the result is indexed by state
sums_by_state = tracts.groupby("state")[[w, b]].sum()
print(sums_by_state.head(2))

# Re-running this cell must not stack a second pair of "_sum" columns, so any
# left over from a previous run are removed first (missing ones are ignored)
columns_to_add = ["white_sum", "black_sum"]
tracts = tracts.drop(columns=columns_to_add, errors="ignore")

# Attach each state's totals to its tracts. The tract-level columns keep their
# names and the state-level columns of the same name get the "_sum" suffix.
tracts = tracts.merge(
    sums_by_state,
    left_on="state",
    right_index=True,
    suffixes=("", "_sum"),
)
print(tracts.head(2))