## US Census Data Building Permit Scraper
**Intended Utility**
>This databrick is designed to be utilized as part of a pipeline.<br><br>
>The scrape can be run at any time to check for updates. The databrick compares the files<br>
>available at the source against the list of previously scraped files stored in the container;<br>
>any file that is not yet on that list is processed and the database is updated.<br><br>
>If there are no new files then the databrick will take no action.

**Configuration:**
> These cells are responsible for configuring the primary aspects of the databrick.<br>

**Config Part 1:** Import Libraries

In [0]:
import requests
import datetime as dt
from operator import add
from functools import reduce
from bs4 import BeautifulSoup
from pyspark import SparkFiles
from pyspark.sql.functions import *
from pyspark.sql.types import DateType
from pyspark.sql.types import StringType
from pyspark.sql.functions import to_date
from pyspark.sql.functions import concat_ws, trim

**Config Part 2:** Create I/O widgets for pipeline.

In [0]:
# Register the text widgets through which the pipeline passes the file I/O
# settings, reading each one back right after it is created.
for widget_name in ("input", "output", "filename"):
    dbutils.widgets.text(widget_name, "", "")
    dbutils.widgets.get(widget_name)

**Config Part 3:** Designate Mount Points

In [0]:
def mount_storage(mount_goal):
    """Mount an Azure storage container (abfss://) and return its mount point.

    Args:
        mount_goal: dict with the keys 'account' (storage account name),
            'container' (container name) and 'mount' (mount point path).

    Returns:
        The mount point string taken from ``mount_goal['mount']``.
    """
    storageAccount = mount_goal['account']
    storageContainer = mount_goal['container']
    # SECURITY NOTE(review): the client secret and client id are hard-coded in
    # the notebook. They should be rotated and read from a secret store (for
    # example a Databricks secret scope) instead of being kept in source.
    clientSecret = "B4g8Q~1VyZJa5WszLHwdEQNq4YIaHmT4DevRBcwI"
    clientid = "2ca50102-5717-4373-b796-39d06568588d"
    mount_point = mount_goal['mount']

    configs = {
        "fs.azure.account.auth.type": "OAuth",
        "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id": clientid,
        "fs.azure.account.oauth2.client.secret": clientSecret,
        "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
        "fs.azure.createRemoteFileSystemDuringInitialization": "true",
    }

    # Remove any existing mount at this path before mounting again. A failure
    # here (presumably "nothing mounted yet") is ignored on purpose, but only
    # ordinary exceptions are swallowed so an interrupt still stops the cell.
    try:
        dbutils.fs.unmount(mount_point)
    except Exception:
        pass

    dbutils.fs.mount(
        source="abfss://" + storageContainer + "@" + storageAccount + ".dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs,
    )

    return mount_point
    
# The "input" and "output" widget values are split on "/": the first segment
# is used as the storage account and the second as the container.
in_path = getArgument("input").split("/")
out_path = getArgument("output").split("/")

# 'write' and 'check' point at the same account/container but are mounted
# under separate mount points.
storage_info = {
    'read': {'account': in_path[0], 'container': in_path[1], 'mount': "/mnt/arctic_analysts_bps_scraper_read"},
    'write': {'account': out_path[0], 'container': out_path[1], 'mount': "/mnt/arctic_analysts_bps_scraper_write"},
    'check': {'account': out_path[0], 'container': out_path[1], 'mount': "/mnt/arctic_analysts_bps_scraper_check"}
}

try:
    read_path = mount_storage(storage_info['read'])
    write_path = mount_storage(storage_info['write'])
    check_path = mount_storage(storage_info['check'])
except Exception as E:
    # An exception object cannot be sliced; trim the message text instead.
    print(str(E)[:-50])
    # Re-raise: the paths below are undefined when a mount fails, so carrying
    # on would only surface later as a confusing NameError.
    raise

print(f'Read Path: {read_path}\nWrite Path: {write_path}')

**Database Data Check**
>These cells check if there are any new files available.<br>

**Data Check Part 1:** Get the list of available files from the source.

In [0]:
primary_link = 'https://www2.census.gov/econ/bps/County/'
source = 'https://www2.census.gov'

# Fail fast on a hung connection or an HTTP error instead of parsing an error
# page into an empty list of links.
page = requests.get(primary_link, timeout=60)
page.raise_for_status()
raw_html = page.text

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(raw_html, 'html.parser')

links = soup.find_all('a')
data_links = []
for link in links:
    # Anchors without an href return None and are skipped.
    href = link.get('href')
    # 'c' indicates data tabulated by month; the other option is 'y', which
    # is year-to-date summed data from the beginning of the year.
    if href and "c.txt" in href:
        data_links.append(primary_link + href)

**Data Check Part 2:** Check if there are any new files available.
> This code block loads the previously saved scrape history and checks whether any of the links<br>
>gathered in the previous step are missing from it.

In [0]:
# Compare the scraped links with the stored scrape history. Any failure while
# loading or comparing the history falls back to scraping every link.
try:
    history_file = check_path + '/scrape-history/bps_in_database.json'
    previously_gathered = spark.read.options(inferSchema='True', header='True').json(history_file)
    previously_gathered_list = (
        previously_gathered.select('InDatabase')
        .rdd.flatMap(lambda row: row)
        .collect()
    )
    new_links = [url for url in data_links if url not in previously_gathered_list]
    if new_links:
        new_data = 'update'
        print('There is new data available.')
    else:
        new_data = 'none'
        print('Data is up to date.')

except Exception as err:
    new_data = 'all'
    new_links = data_links
    print(err)
    print('\nThere seems to be no previously gathered data, or you deleted the list of previously collected files.')
    print('An alternative to this check method would be to parse the date information from the links and see if the date is in the database.')


**New Data Gathering** <br>
**Functions**
>These are functions that manage how to scrape the data and what to do with it.

In [0]:
def parse_BPS(link):
    """Download one BPS text file and return it as a Spark frame.

    The first data row is merged into the header to form the final column
    names, and that row is then filtered out of the result.
    """
    # The last path segment of the link is the file name.
    file_name = link.rsplit("/", 1)[-1]

    # Ship the remote file to the driver, then read it from its local path.
    spark.sparkContext.addFile(link)
    local_uri = "file://" + SparkFiles.get(file_name)
    raw = spark.read.csv(local_uri, header=True, inferSchema=True)

    # The first row holds the second half of each column label.
    label_row = raw.take(1)[0].asDict()
    merged_names = [(name + label).lower() for name, label in label_row.items()]
    renamed = raw.toDF(*merged_names)

    # Remove the label row itself from the data.
    return renamed.where(renamed.surveydate != 'Date')

def process_new_data():
    """Parse every link in ``new_links`` and combine the results.

    Links are parsed one at a time with ``parse_BPS`` and unioned by column
    name into a single frame. Links that fail are collected and reported, and
    the loop stops once more than three failures have occurred.

    Returns:
        A tuple ``(response, checked_links)``: the combined Spark frame and
        the list of links that were parsed successfully.

    Raises:
        RuntimeError: if no link could be parsed, so there is no frame to
            return.
    """
    # Count any errors
    errors = 0

    # Links that were parsed and merged successfully.
    checked_links = []

    # Links that raised while being parsed or merged.
    error_links = []

    # Combined frame; stays None until the first link parses, so a failure on
    # the first link no longer breaks every link after it.
    response = None

    total = len(new_links)
    for i, link in enumerate(new_links):
        # Report progress against the links actually being processed.
        print(f"Processing {i+1} of {total}")
        try:
            parsed = parse_BPS(link)
            if response is None:
                response = parsed
            else:
                response = response.unionByName(parsed)

            # Only recorded once the parse and the union both succeeded.
            checked_links.append(link)

        except Exception as E:
            error_links.append(link)
            print(E)
            errors += 1
            if errors > 3:
                print('More than 3 failures have occurred. Exiting Loop')
                break

        # Running row count of the combined frame (skipped until one exists).
        if response is not None:
            print(response.count())

    if error_links:
        print('These are the links that caused errors.')
        print(error_links)

    if response is None:
        raise RuntimeError('None of the new links could be parsed; there is no data to return.')
    return response, checked_links

def update_links(df, write_mode):
    """Write the scrape-history frame to a single JSON file.

    The frame is written as one partition into a temporary directory; the
    resulting part file is moved to ``<check_path>/scrape-history/
    bps_in_database.json`` and the temporary directory is removed.

    Args:
        df: Spark DataFrame holding the scrape history.
        write_mode: Spark save mode used for the temporary directory write.

    Raises:
        FileNotFoundError: if the write produced no ``.json`` part file.
    """
    final_path = check_path + '/scrape-history/bps_in_database'

    # NOTE(review): write_mode applies to the temporary directory, which this
    # function deletes at the end of every call. 'append' therefore presumably
    # never merges with an existing bps_in_database.json -- confirm whether
    # the history file is meant to accumulate across runs.
    # .json() selects the JSON writer itself, and JSON output has no header.
    df.repartition(1).write.mode(write_mode).json(final_path)

    # Locate the part file Spark produced inside the directory.
    json_files = [x.path for x in dbutils.fs.ls(final_path) if x.path.endswith(".json")]
    if not json_files:
        raise FileNotFoundError(f"No .json part file was written to {final_path}")

    # Move file out of directory into main blob and delete junk files
    dbutils.fs.mv(json_files[0], final_path + ".json")
    dbutils.fs.rm(final_path, recurse=True)
    print("Updated the file containing scrape history.")
    
def save_new_data(df, write_mode=None):
    """Write the cleaned permit data to a single JSON file.

    The frame is written as one partition into a temporary directory; the
    resulting part file is moved to ``<write_path>/
    county_level_building_permits_from_databrick.json`` and the temporary
    directory is removed.

    Args:
        df: Spark DataFrame to save.
        write_mode: Spark save mode for the temporary directory write. When
            omitted, the notebook-level ``write_mode`` variable is used, which
            is what this function always relied on.

    Raises:
        NameError: if ``write_mode`` is neither passed nor defined globally.
        FileNotFoundError: if the write produced no ``.json`` part file.
    """
    if write_mode is None:
        # Existing callers pass only the frame; keep reading the notebook
        # global for them.
        try:
            write_mode = globals()['write_mode']
        except KeyError:
            raise NameError("write_mode was not passed and is not defined globally") from None

    # Define write path
    final_path = write_path + '/county_level_building_permits_from_databrick'

    # NOTE(review): write_mode applies to the temporary directory, which this
    # function deletes at the end of every call. 'append' therefore presumably
    # never merges with a previously saved .json file -- confirm the intent.
    # .json() selects the JSON writer itself, and JSON output has no header.
    df.repartition(1).write.mode(write_mode).json(final_path)

    # Locate the part file Spark produced inside the directory.
    json_files = [x.path for x in dbutils.fs.ls(final_path) if x.path.endswith(".json")]
    if not json_files:
        raise FileNotFoundError(f"No .json part file was written to {final_path}")

    # Move file out of directory into main blob and delete junk files
    dbutils.fs.mv(json_files[0], final_path + ".json")
    dbutils.fs.rm(final_path, recurse=True)

In [0]:
# Dropping the Value and Reported from the dataset, keeping imputed data

def data_shower(df):
    """Reduce the combined BPS frame to New Jersey monthly permit totals.

    Args:
        df: Spark DataFrame using the merged, lower-cased column names built
            by ``parse_BPS`` (for example ``surveydate``, ``fips1state``).

    Returns:
        Spark DataFrame with the columns FIPS, County, Date, Year, Month,
        NewUnits and NewBuildings, limited to FIPS codes starting with "34".

    Raises:
        KeyError: if a column is left after the drop step that has no entry
            in the rename table.
    """
    # '_c2' matches the unnamed source columns such as _c21 and _c24, which
    # held the reported numbers; only the imputed figures are kept.
    drop_markers = ('value', 'rep', '_c2', 'division', 'region', '_c18')
    cols_to_drop = [name for name in df.columns if any(marker in name for marker in drop_markers)]
    building_permit_df = df.drop(*cols_to_drop)

    new_columns_names = {
        '1-unitunits': '1_Unit',
        '2-unitsunits': '2_Unit',
        '3-4 unitsunits': '3-4_Units',
        '5+ unitsunits': '5_plus_Units',
        '_c12bldgs': '3-4_UnitBuilding',
        '_c15bldgs': '5_plus_UnitBuilding',
        '_c6bldgs': '1_UnitBuilding',
        '_c9bldgs': '2_UnitBuilding',
        'countyname': 'County',
        'fips1state': 'StateFips',
        'fips2county': 'CountyFips',
        'surveydate': 'Date'
    }

    # Fail with a readable message when the source layout has changed.
    unexpected = [name for name in building_permit_df.columns if name not in new_columns_names]
    if unexpected:
        raise KeyError(f"Unexpected columns in BPS data: {unexpected}")

    new_columns = [new_columns_names[name] for name in building_permit_df.columns]
    new_df = building_permit_df.toDF(*new_columns)

    # Parse the compact year-month value into a real date column.
    fixed_date = new_df.withColumn('new_Date', to_date(col('Date'), "yyyyMM"))

    month_dict = {
        '01': 'Jan',
        '02': 'Feb',
        '03': 'Mar',
        '04': 'Apr',
        '05': 'May',
        '06': 'Jun',
        '07': 'Jul',
        '08': 'Aug',
        '09': 'Sep',
        '10': 'Oct',
        '11': 'Nov',
        '12': 'Dec'
    }

    # Split the year and month from the date and create new columns for those.
    # The lookup is strict: a month value missing from month_dict (including a
    # null from an unparsed date) raises KeyError inside the UDF.
    month_convert = udf(lambda x: month_dict[x])
    year_month_split = fixed_date.withColumn('Year', split(fixed_date.new_Date, "-")[0]) \
                                 .withColumn('Month', split(fixed_date.new_Date, "-")[1])

    # Convert the month number to its abbreviation, then replace the raw date
    # column with the parsed one.
    year_month_split = year_month_split.withColumn('Month', month_convert(col("Month")))
    fixed_date = year_month_split.drop("Date").withColumnRenamed("new_Date", "Date")

    # Combine State and County Fips into single FIPS
    other_columns = [name for name in fixed_date.columns if name not in ['StateFips', 'CountyFips']]
    finalized_frame = fixed_date.select(
        concat_ws('', fixed_date.StateFips, fixed_date.CountyFips).alias('FIPS'),
        *other_columns
    )

    # Filter to only New Jersey records
    finalized_frame = finalized_frame.where(col("FIPS").like('34%'))
    finalized_frame = finalized_frame.withColumn('County', trim(col('County')))

    unit_columns = ['1_Unit', '2_Unit', '3-4_Units', '5_plus_Units']
    building_columns = ['3-4_UnitBuilding', '5_plus_UnitBuilding', '1_UnitBuilding', '2_UnitBuilding']

    # Replace nulls with zero before summing. fill(0) only touches numeric
    # columns, so the count columns are also filled with '0' in case they were
    # read as strings; columns of the other type are ignored by each fill.
    zero_filled = finalized_frame.na.fill(0).na.fill('0', subset=unit_columns + building_columns)

    reduced_df = zero_filled.withColumn(
        "NewUnits",
        col("1_Unit") + col('2_Unit') + col('3-4_Units') + col('5_plus_Units')
    )
    reduced_df = reduced_df.withColumn(
        "NewBuildings",
        col("3-4_UnitBuilding") + col("5_plus_UnitBuilding") + col("1_UnitBuilding") + col("2_UnitBuilding")
    )

    # Drop unnecessary columns.
    cols_to_keep = ['FIPS', 'County', 'Year', 'Month', 'NewUnits', 'NewBuildings', 'Date']
    cols_to_drop = [name for name in reduced_df.columns if name not in cols_to_keep]
    reduced_df = reduced_df.drop(*cols_to_drop)
    return reduced_df

**Might remove this code block because we aren't adding to the database in this step any more**

In [0]:

# def add_to_database(df):
#     database = "arctic_analysts_capstone"
#     table = "dbo.building_permits"
#     user = "arctic_analysts"
#     password  = "ThisPassw0rd!"
#     server = "gen10-data-fundamentals-22-02-sql-server.database.windows.net"

#     # WRITE <--- dataframe to database
#     df.write.format("jdbc") \
#       .option("url", f"jdbc:sqlserver://{server}:1433;databaseName={database};") \
#       .mode("append") \
#       .option("dbtable", table) \
#       .option("user", user) \
#       .option("password", password) \
#       .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
#       .save()
#     print('Successfully added the data to the dataframe.')


## Data Scrape Controller<br>

> This cell controls whether to scrape or not, manages the results of the scrape, and saves any new links to the scrape history.

In [0]:
# Skip everything when the validation step found nothing new to scrape.
if new_data == 'none':
    print('Data is up to date.')
else:
    print('Processing New Links')
    # Scrape and combine the new files.
    response, checked_links = process_new_data()

    # Overwrite when everything is being scraped, append otherwise.
    write_mode = 'overwrite' if new_data == 'all' else 'append'

    # Build the frame of successfully scraped links for the history file.
    new_links_frame = spark.createDataFrame(checked_links, StringType())
    new_links = new_links_frame.withColumnRenamed('value', 'InDatabase')

    print("Cleaning...")
    # Clean the data, then keep one row per date and location; some source
    # files were duplicated and produced repeated rows.
    cleansed = data_shower(response).dropDuplicates(['Date', 'FIPS'])

    print("Saving...")
    # Persist the cleaned data. The database insert is intentionally not
    # performed in this brick.
    save_new_data(cleansed)

    # Record the newly scraped links in the scrape history.
    update_links(new_links, write_mode)
print("Finished.")

In [0]:
#building_permit_df = spark.read.options(inferSchema='True', header='True').json(read_path + "/" + getArgument('filename') + ".json")