## US Census Median Income API Access 
**Intended Utility**
>This databrick is designed to be utilized as part of a pipeline.<br>

>US Census data is not updated frequently, so this tool only<br>
>allows access attempts every 6 months. Changing this behavior<br>
>requires overriding the code or removing the file history so that an exception is thrown.

**Configuration:**
> These cells are responsible for configuring the primary aspects of the databrick.<br>

**Config Part 1:** Import Libraries

In [0]:
import requests
import pyspark.pandas as ps
from pyspark.sql.functions import lit
from pyspark.sql.functions import array, col, explode, lit, struct, concat_ws, StringType, split
from pyspark.sql import DataFrame
from typing import Iterable 
import datetime as dt
import time

**Config Part 2:** Create I/O widgets for pipeline.

In [0]:
# Declare the text widgets through which the pipeline sets the file I/O
# information. Each widget is created with an empty default and an empty label.
for widget_name in ("input", "output", "filename"):
    dbutils.widgets.text(widget_name, "", "")

# Read each widget back; the values are not stored here.
dbutils.widgets.get("input")
dbutils.widgets.get("output")
dbutils.widgets.get("filename")

**Config Part 3:** Designate Mount Points

In [0]:
def mount_storage(mount_goal):
    """Mount an Azure storage container (abfss) into DBFS and return the mount point.

    Args:
        mount_goal: Mapping with the keys ``'account'`` (storage account name),
            ``'container'`` (container name) and ``'mount'`` (DBFS mount point).

    Returns:
        The mount point string taken from ``mount_goal['mount']``.

    Raises:
        KeyError: If ``mount_goal`` is missing one of the required keys.
        Exception: Whatever ``dbutils.fs.mount`` raises when mounting fails.
    """
    storage_account = mount_goal['account']
    storage_container = mount_goal['container']
    mount_point = mount_goal['mount']

    # NOTE(review): these service-principal credentials are hard-coded in the
    # notebook. They should be rotated and read from a Databricks secret scope
    # (dbutils.secrets.get) instead; left as-is here because no scope/key names
    # are defined anywhere in this notebook.
    client_secret = "B4g8Q~1VyZJa5WszLHwdEQNq4YIaHmT4DevRBcwI"
    client_id = "2ca50102-5717-4373-b796-39d06568588d"

    configs = {
        "fs.azure.account.auth.type": "OAuth",
        "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id": client_id,
        "fs.azure.account.oauth2.client.secret": client_secret,
        "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
        "fs.azure.createRemoteFileSystemDuringInitialization": "true",
    }

    # Best-effort cleanup: a failed unmount (e.g. nothing is mounted there yet)
    # must not block the mount below. Only ordinary exceptions are ignored so
    # that KeyboardInterrupt / SystemExit still propagate.
    try:
        dbutils.fs.unmount(mount_point)
    except Exception:
        pass

    dbutils.fs.mount(
        source=f"abfss://{storage_container}@{storage_account}.dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs,
    )

    return mount_point
    
in_path = getArgument("input").split("/")
out_path = getArgument("output").split("/")

# Each widget must look like "<storage account>/<container>[/...]". Fail with a
# clear message instead of an IndexError when a widget was left empty.
for widget_name, path_parts in (("input", in_path), ("output", out_path)):
    if len(path_parts) < 2 or not path_parts[0] or not path_parts[1]:
        raise ValueError(
            f"Widget '{widget_name}' must be '<storage account>/<container>', "
            f"got: {path_parts}"
        )

storage_info = {
    'read': {'account': in_path[0], 'container': in_path[1], 'mount': "/mnt/arctic_analysts_income_scraper_read"},
    'write': {'account': out_path[0], 'container': out_path[1], 'mount': "/mnt/arctic_analysts_income_scraper_write"},
    #'check': {'account': out_path[0], 'container': out_path[1], 'mount': "/mnt/arctic_analysts_income_scraper_check"}
}

try:
    read_path = mount_storage(storage_info['read'])
    write_path = mount_storage(storage_info['write'])
    #check_path = mount_storage(storage_info['check'])
except Exception as E:
    # Exceptions are not subscriptable, so the message has to be formatted as a
    # string. Re-raise afterwards: without both mounts the rest of the notebook
    # cannot run (read_path / write_path would be undefined).
    print(f'Failed to mount storage: {E}')
    raise

print(f'Read Path: {read_path}\nWrite Path: {write_path}')

**Data Access Validation**
>These cells check whether an API call is warranted.<br>

**Data Check Part 1:** Get the list of available files from the source.

In [0]:
#Validation
# need to get the list of years from the database, and then scrape only future years

def read_from_database(table):
    """Load one ``dbo`` table from the capstone SQL Server database over JDBC.

    Args:
        table: Table name without a schema; ``dbo.`` is prefixed here.

    Returns:
        The DataFrame returned by the Spark JDBC reader for that table.
    """
    # NOTE(review): the login below is hard-coded in the notebook; consider
    # moving it to a secret store.
    server = "gen10-data-fundamentals-22-02-sql-server.database.windows.net"
    database = "arctic_analysts_capstone"
    jdbc_url = f"jdbc:sqlserver://{server}:1433;databaseName={database};"
    qualified_table = f"dbo.{table}"

    # Configure the reader one option at a time, then load.
    reader = spark.read.format("jdbc")
    reader = reader.option("url", jdbc_url)
    reader = reader.option("dbtable", qualified_table)
    reader = reader.option("user", "arctic_analysts")
    reader = reader.option("password", "ThisPassw0rd!")
    reader = reader.option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    return reader.load()

# Get current year
current_year = dt.datetime.today().year

# Work out which years to request from the API: every year after the most
# recent one already in the database, up to (but excluding) the current year.
try:
    years = read_from_database('year')
    median_income = read_from_database('median_income')

    income_years = years.join(median_income, (years.YearID == median_income.YearID))
    loaded_years = [row[0] for row in income_years.select('Year').collect()]
    # max() raises on an empty list, which sends us to the full-scrape fallback.
    most_recent_year = max(loaded_years)
    allowed_attempts = list(range(most_recent_year + 1, current_year))
    rule = 'scrape_new'
except Exception as E:
    # If the lookup fails for any reason, assume the data is missing and
    # repopulate everything; report why so the fallback is not silent.
    print(f'Could not determine the years already loaded ({E}); scraping all years.')
    rule = 'scrape_all'
    allowed_attempts = list(range(2005, current_year))
print(allowed_attempts)

## API Access
>We will access the US Census API by first getting the list of variables<br>
>for the table that we are interested in for each year. The reason for this step<br>
>is that the variables can change over the years, and we want to make sure that we have<br>
>the correct variable for each year.<br>

>We are then creating a list of endpoints of the variables that we retrieve from the variable<br>
>table, and passing that list into the api call.

In [0]:
# Census table groups whose variables are requested from the API.
groups = ['B19049']


def get_var_dict(acs_variables):
    """Map each selected ACS variable name to a readable column label.

    Args:
        acs_variables: Table of ACS variables with at least the columns
            ``Group``, ``Name`` and ``Label``.

    Returns:
        Dict of ``{variable name: cleaned label + '_inflation_adjusted_in_data_year'}``
        for every row whose ``Group`` is listed in ``groups``.
    """
    # Filter once and reuse the result for both columns.
    selected = acs_variables[acs_variables.Group.isin(groups)]
    add_on = '_inflation_adjusted_in_data_year'

    var_dict = {}
    for name, label in zip(selected.Name.tolist(), selected.Label.tolist()):
        label = label.replace('Estimate!!', '').replace(":", "").replace("Total!!", "").replace("$", "")

        # Labels normally read "<measure>!!<category>" and the category is
        # kept. If stripping the prefixes left no "!!" separator, the whole
        # remaining text is the category (previously this raised IndexError).
        parts = label.split('!!')
        category = parts[1] if len(parts) > 1 else parts[0]
        var_dict[name] = category + add_on

    return var_dict

In [0]:
def get_median_income_data(year):
    """Fetch one year of county-level ACS 1-year data for the tables in ``groups``.

    Args:
        year: ACS data year to request.

    Returns:
        A Spark DataFrame with one row per county, the API columns renamed via
        ``get_var_dict`` where a mapping exists, and a ``YEAR`` column set to
        ``year``.

    Raises:
        requests.HTTPError: If the Census API answers with an error status.
        requests.Timeout: If the Census API does not answer within the timeout.
    """
    # Variable names can change between years, so they are looked up per year.
    acs_variables = ps.read_html(f'https://api.census.gov/data/{year}/acs/acs1/variables.html')[0]

    # Comma-separated list of variable names passed to the API call.
    selected_endpoints = acs_variables[(acs_variables.Group.isin(groups))].Name.tolist()
    endpoints = ",".join(selected_endpoints)

    # Variable name -> readable label.
    var_dict = get_var_dict(acs_variables)

    geography = 'county'
    acs_api = f'https://api.census.gov/data/{year}/acs/acs1?get=NAME,{endpoints}&for={geography}:*'

    # Bound the wait and surface HTTP errors explicitly instead of failing
    # later with a confusing JSON decode error.
    response = requests.get(acs_api, timeout=60)
    response.raise_for_status()
    data = response.json()

    # The first element of the payload is the header row. Read it straight
    # from the payload rather than collecting the "first" row back out of
    # Spark, which costs a job and relies on row order.
    header = data[0]
    cleaned_column_names = [var_dict.get(column_name, column_name) for column_name in header]

    # The header row is still loaded into the frame (as in the original flow)
    # and dropped by the NAME filter below.
    api_response_df = spark.createDataFrame(data)
    new_df = api_response_df.toDF(*cleaned_column_names)

    # Filter out the header row and assign the year to a column.
    new_df = new_df.where(new_df.NAME != 'NAME')
    new_df = new_df.withColumn("YEAR", lit(year))

    return new_df


def melt(
    df: DataFrame,
    id_vars: Iterable[str], value_vars: Iterable[str],
    var_name: str="AgeGroup",
    value_name: str="MedianIncome") -> DataFrame:
    """Convert a DataFrame from wide to long form.

    Args:
        df: Wide DataFrame.
        id_vars: Columns kept unchanged on every output row.
        value_vars: Columns to unpivot; each becomes one output row per input row.
        var_name: Name of the output column holding the source column name.
        value_name: Name of the output column holding the source column value.

    Returns:
        A DataFrame with the ``id_vars`` columns followed by ``var_name`` and
        ``value_name``.
    """
    # One struct per value column: (column name, column value).
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))

    # Explode the array so each struct lands on its own row.
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    # list() so that any iterable of id columns (tuple, generator, ...) works,
    # as the annotation promises; plain "+" only worked for lists.
    cols = list(id_vars) + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]

    return _tmp.select(*cols)

  
def clean_data(df):
    """Rename, reshape and filter the raw API frame into its final long form.

    Args:
        df: Wide DataFrame whose columns are exactly the keys of the rename
            map below (any other column name raises ``KeyError``).

    Returns:
        A long-form DataFrame with a combined ``FIPS`` column, restricted to
        rows whose FIPS starts with ``34``, and ``County`` cut at the first comma.
    """
    rename_map = {
        'Householder 25 to 44 years_inflation_adjusted_in_data_year': '25-44',
        'Householder 45 to 64 years_inflation_adjusted_in_data_year':  '45-64',
        'Householder 65 years and over_inflation_adjusted_in_data_year': '65-plus',
        'Householder under 25 years_inflation_adjusted_in_data_year': 'under-25',
        'NAME': 'County',
        'Total_inflation_adjusted_in_data_year': 'overall',
        'YEAR': 'Year',
        'county': 'CountyFips',
        'state': 'StateFips'
    }

    # Apply the short column names in the frame's existing column order.
    renamed_df = df.toDF(*[rename_map[name] for name in df.columns])

    # Wide -> long: one row per (county, year, age group).
    id_columns = ['Year', 'County', 'CountyFips', 'StateFips']
    age_columns = ['25-44', '45-64', '65-plus', 'under-25', 'overall']
    long_df = melt(renamed_df, id_vars=id_columns, value_vars=age_columns)

    # Merge state and county FIPS into a single leading FIPS column.
    fips = concat_ws('', long_df.StateFips, long_df.CountyFips).alias('FIPS')
    kept_columns = [name for name in long_df.columns if name not in ('StateFips', 'CountyFips')]
    result = long_df.select(fips, *kept_columns)

    # Keep New Jersey only (FIPS prefix 34).
    result = result.where(col("FIPS").like('34%'))

    # Keep only the county name: the text before the first comma.
    county_name = split(col("County"), ",").getItem(0)
    return result.withColumn("County", county_name)

In [0]:
def save_data_to_blob(df, method):
    """Write ``df`` as a single JSON file next to the staging directory.

    Args:
        df: Spark DataFrame to persist.
        method: Spark write mode passed to ``.mode()`` (callers in this
            notebook use ``'overwrite'`` and ``'append'``).

    The data is written to a staging directory under ``write_path``; the one
    ``.json`` part file is then moved to ``<staging directory>.json`` and the
    staging directory is deleted.
    """
    # NOTE(review): the staging directory is removed after every call and the
    # move always targets the same .json path, so 'append' probably does not
    # accumulate earlier output — confirm this is intended.
    staging_dir = write_path + '/median_income_data_from_databrick'
    target_file = staging_dir + ".json"

    # Collapse to one partition so Spark emits a single part file.
    writer = df.repartition(1).write.format("com.databricks.spark.json")
    writer = writer.mode(method).option("header", "True")
    writer.json(staging_dir)

    # Pick the JSON part file out of the Spark output directory.
    part_files = [entry.path for entry in dbutils.fs.ls(staging_dir) if entry.path.endswith(".json")]
    first_part = part_files[0]

    # Promote the part file and clean up the remaining Spark bookkeeping files.
    dbutils.fs.mv(first_part, target_file)
    dbutils.fs.rm(staging_dir, recurse = True)

## API Call Controller
> This code block controls the API calls.

In [0]:
# Fetch every allowed year and stack the results into one DataFrame.
# Individual failures are tolerated; the loop stops after more than 5 of them.
errors = 0
primary_df = None
for i, year in enumerate(allowed_attempts):
    print(f'Gathering {i+1} of {len(allowed_attempts)} | {year}')
    try:
        year_df = get_median_income_data(year)
        # Seed on the first year that SUCCEEDS rather than on index 0;
        # otherwise a failure on the first year leaves primary_df as None
        # and every later union fails too.
        if primary_df is None:
            primary_df = year_df
        else:
            primary_df = primary_df.unionByName(year_df)

        print(primary_df.count())
    except Exception as E:
        errors += 1
        print(E)
        if errors > 5:
            print("Exiting due to high number of exceptions.")
            break
        

# Clean the data    


## Full Clean
> This code block sends the data to a cleaning function if necessary.

In [0]:
# Clean and persist whatever was gathered. primary_df is None when there were
# no years to fetch or every fetch failed, so check for that before counting.
if primary_df is None:
    print('No new data.' if rule == 'scrape_all' else 'Nothing to update.')
elif rule == 'scrape_all':
    if primary_df.count() > 0:
        finished_df = clean_data(primary_df)
        # NOTE(review): the original had commented-out create_history() /
        # update_history() calls here; history tracking is not wired up.
        save_data_to_blob(finished_df, 'overwrite')
    else:
        print('No new data.')
else:
    try:
        if primary_df.count() > 0:
            finished_df = clean_data(primary_df)
            save_data_to_blob(finished_df, 'append')
            print('Blob Updates')
    except Exception as E:
        # Incremental updates are best-effort: report the problem and carry on.
        print(E)
        print("Nothing to update.")


In [0]:
# If we want to read the data from the blob
#income_df = spark.read.options(inferSchema='True', header='True').json(read_path + "/" + getArgument('filename') + ".json")