## API for Getting Population Data From American Community Survey ##

### First, import the modules needed:

#### 1. pandas (for dataframe)
#### 2. requests (for working with the API)

### Next, we need to open and read the .txt file containing the API Key:
https://api.census.gov/data/key_signup.html; input the fields required and save the key as a .txt file

In [1]:
## Resources: 
## https://medium.com/@mcmanus_data_works/using-the-u-s-census-bureau-api-with-python-5c30ad34dbd7
## https://api.census.gov/data/2019/acs/acs1/variables.html

## Import the modules
import pandas as pd 
import requests

## Read the value of the key stored in the .txt file
key = open('C:/Users/kwaim/OneDrive/Documents/CCHE_Files/api_keys/acskey.txt', "r") ## modify the directory of your key here 
api_key = key.read()
key.close()

In [2]:
## View the key value
print(api_key)

9567f4de9dbb64aed07d3219b856b14dd519bce7


### Then, define the functions needed for the API data pulling tasks:

In [3]:
# This function pulls down this table: https://api.census.gov/data/2019/acs/acs1/variables.html
# This table is basically a lookup for the variable names we need 

def get_variable_table_df(year):
    variable_table_url = f'https://api.census.gov/data/{year}/acs/acs1/variables.html'
    v_table = pd.read_html(variable_table_url)
    variable_df = pd.DataFrame(v_table[0])
    variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True) # replace !! with spaces

    return variable_df

In [4]:
# Find the indices for the variables we want in the the table. 

## Function to return indices of Variables of interest in the variable table

def get_male_by_age_race_index(variable_table):
    indices = []

    ## Tuple of Racial Groups needed for querying; unwanted ones are commented out
    racial_groups = (
        #"WHITE ALONE", 
        "BLACK OR AFRICAN AMERICAN ALONE",
        #"AMERICAN INDIAN AND ALASKA NATIVE ALONE", 
        #"ASIAN ALONE",
        #"NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE",
        #"SOME OTHER RACE ALONE", 
        #"TWO OR MORE RACES", 
        "WHITE ALONE, NOT HISPANIC OR LATINO",
        "HISPANIC OR LATINO")
    
    for race in racial_groups: 

        ## Used for querying against the "CONCEPT" variable 
        
        # total pop size of each race by age 
        query = "SEX BY AGE (" + race + ")"
        condition_start = "Label == 'Estimate Total Male' and Concept == '" + query + "'"
        condition_end = "Label == 'Estimate Total Male 85 years and over' and Concept == '" + query + "'"

        index_start = variable_table.query(condition_start).index[0]
        index_end = variable_table.query(condition_end).index[0]

        indices.extend((index_start, index_end))
        
    return indices


In [5]:
## Function to get the names of the variables of interest;
## Using argument of the variable table, and the indices of the variables of interest

def get_variable_names(variable_table, indices):
    
    ### Placeholder list to store the variables
    total_male_by_age_variables = []
    
    ### Note that it is limited to 50 variables for each API Call
    
    ### Here, we make a step of 2 in the for loop, because we want all the indices BETWEEN each of the 
    ### 2-pairs of element in the indices list
    for i in range(0,len(indices),2):
        
        temp = list((variable_table.iloc[indices[i]: indices[i+1], 0].values))
        total_male_by_age_variables += temp
        
    ## Returning a string of the names of variables joined by a comma    
    return ','.join(total_male_by_age_variables)

In [6]:
# To query you have to look up a specific url -- this function combines all of the information for us. 

def get_query_url(year, variables):
    # API Reference: https://www.census.gov/data/developers/guidance/api-user-guide.Example_API_Queries.html
    # Data Dictionary: https://api.census.gov/data.html
    host = 'https://api.census.gov/data'
    year = f'/{year}'
    dataset_acronym = '/acs/acs1'
    g = '?get='
    
    ## Metropolitan Statistical Area: 16980 is the code for Chicago MSA 
    location = f'&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:16980'
    
    ## api_key object from the first cell in this notebook 
    usr_key = f"&key={api_key}"

    query_url = f"{host}{year}{dataset_acronym}{g}{variables}{location}{usr_key}"

    return query_url

In [7]:
## Get the response data from the URL generated earlier

def get_query_text(query_url):
    response = requests.get(query_url)
    return response.text

In [8]:
## Get the values in the specified columns in the requested data

def get_col_name(variable_df, indices, col_name):
    
    col_values = []
    
    for i in range(0, len(indices),2):
        temp = [i.replace("!!", " ").replace(":", "") for i in variable_df.iloc[indices[i]:indices[i+1]][col_name].values]
        col_values += temp
    
    return col_values

### Example

The following demonstrate an example that pulls the data for 2018 for the population of male in Chicago Metropolitan Area by the following three races: 
* White (Non-Hispanic) 
* Black 
* Hispanic Non-Black   

In [9]:
## Get the variable table for 2018
table_2018 = get_variable_table_df(2018)

In [10]:
## Get the index
index_2018 = get_male_by_age_race_index(table_2018)

In [11]:
## Get the Variable Names Based on the Index and the Variable Table
var_names_2018 = get_variable_names(table_2018, index_2018)

## Preview the Variable Names
print(var_names_2018)

B01001B_002E,B01001B_003E,B01001B_004E,B01001B_005E,B01001B_006E,B01001B_007E,B01001B_008E,B01001B_009E,B01001B_010E,B01001B_011E,B01001B_012E,B01001B_013E,B01001B_014E,B01001B_015E,B01001H_002E,B01001H_003E,B01001H_004E,B01001H_005E,B01001H_006E,B01001H_007E,B01001H_008E,B01001H_009E,B01001H_010E,B01001H_011E,B01001H_012E,B01001H_013E,B01001H_014E,B01001H_015E,B01001I_002E,B01001I_003E,B01001I_004E,B01001I_005E,B01001I_006E,B01001I_007E,B01001I_008E,B01001I_009E,B01001I_010E,B01001I_011E,B01001I_012E,B01001I_013E,B01001I_014E,B01001I_015E


In [12]:
## Call the get_query_url function to get the URL for passing along to the API call function
url_2018 = get_query_url("2018", "NAME," + var_names_2018)

In [13]:
# Preview the URL obtained
print(url_2018)

https://api.census.gov/data/2018/acs/acs1?get=NAME,B01001B_002E,B01001B_003E,B01001B_004E,B01001B_005E,B01001B_006E,B01001B_007E,B01001B_008E,B01001B_009E,B01001B_010E,B01001B_011E,B01001B_012E,B01001B_013E,B01001B_014E,B01001B_015E,B01001H_002E,B01001H_003E,B01001H_004E,B01001H_005E,B01001H_006E,B01001H_007E,B01001H_008E,B01001H_009E,B01001H_010E,B01001H_011E,B01001H_012E,B01001H_013E,B01001H_014E,B01001H_015E,B01001I_002E,B01001I_003E,B01001I_004E,B01001I_005E,B01001I_006E,B01001I_007E,B01001I_008E,B01001I_009E,B01001I_010E,B01001I_011E,B01001I_012E,B01001I_013E,B01001I_014E,B01001I_015E&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:16980&key=9567f4de9dbb64aed07d3219b856b14dd519bce7


In [14]:
## Call the get_query_text function to obtain the names of the variables
response_text_2018 = get_query_text(url_2018)

In [15]:
## Preview the names of the variables
print(response_text_2018)

[["NAME","B01001B_002E","B01001B_003E","B01001B_004E","B01001B_005E","B01001B_006E","B01001B_007E","B01001B_008E","B01001B_009E","B01001B_010E","B01001B_011E","B01001B_012E","B01001B_013E","B01001B_014E","B01001B_015E","B01001H_002E","B01001H_003E","B01001H_004E","B01001H_005E","B01001H_006E","B01001H_007E","B01001H_008E","B01001H_009E","B01001H_010E","B01001H_011E","B01001H_012E","B01001H_013E","B01001H_014E","B01001H_015E","B01001I_002E","B01001I_003E","B01001I_004E","B01001I_005E","B01001I_006E","B01001I_007E","B01001I_008E","B01001I_009E","B01001I_010E","B01001I_011E","B01001I_012E","B01001I_013E","B01001I_014E","B01001I_015E","metropolitan statistical area/micropolitan statistical area"],
["Chicago-Naperville-Elgin, IL-IN-WI Metro Area","715237","49331","49492","55081","33180","22438","57281","59844","45276","85107","88182","88072","50869","24561","2450317","125306","122357","134917","84237","54567","141211","169636","172577","313163","343771","376506","251648","115897","1084521",

In [16]:
## Split the response by new lines ('\n')
response_2018_split = response_text_2018.split('\n')

## Then slice the 2nd element of the response object to obtain the values we need
response_value_2018 = response_2018_split[1].split(',')[:-1]

In [17]:
## Change the formatting of the response values to interger
response_value_2018_Series = pd.Series(response_value_2018[2:]).apply(lambda x:int(x.replace("[","").replace('"','')))

In [18]:
## Get the names of the LABEL and CONCEPT in the dataset we requested
label_2018 = get_col_name(table_2018, index_2018, "Label")
concept_2018 = get_col_name(table_2018, index_2018, "Concept")

In [19]:
## Convert LABEL and CONCEPT into pandas series
label_Series_2018 = pd.Series(label_2018)
concept_Series_2018 = pd.Series(concept_2018)

In [20]:
## Extract the text string of the MSA value (i.e. Chicago-Naperville-Elgin)
MSA_value = response_value_2018[0][2:]

## Create a dataframe
df_chicago = pd.DataFrame(data=dict(Value=response_value_2018_Series, Concept=concept_Series_2018, MSA=MSA_value))
df_chicago.index = label_Series_2018

In [21]:
## Display the final data frame output
display(df_chicago)

Unnamed: 0,Value,Concept,MSA
Estimate Total Male,715237,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male Under 5 years,49331,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 5 to 9 years,49492,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 10 to 14 years,55081,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 15 to 17 years,33180,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 18 and 19 years,22438,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 20 to 24 years,57281,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 25 to 29 years,59844,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 30 to 34 years,45276,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin
Estimate Total Male 35 to 44 years,85107,SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE),Chicago-Naperville-Elgin


In [22]:
## Create a timestamp
from datetime import datetime
time_stamp = datetime.now().strftime(format='%B_%d_%Y_%H%M')

## Save the output table to a .csv file with timestamp in the file name
df_chicago.to_csv(f"df_chicago_{time_stamp}.csv")