## Zillow Median Home Price Scraper
**Intended Utility**
>This databrick is designed to be utilized as part of a pipeline.<br><br>
>The data scrape gets a csv file from the publicly accessible Zillow site<br>
>and then parses the data into a DataFrame, where it can be cleaned and prepared for storage.<br>

>The scrape can be run once a month, or as needed to check for new data. <br>
>The scraper will check the database and note the most recent data, and then<br>
>parse the scrape and check if there is any data that is newer than what is in the database.<br>

>If there is newer data, then the new data will be cleaned and appended to the container before being added to the database.

**Configuration:**
> These cells are responsible for configuring the primary aspects of the databrick.<br>

**Config Part 1:** Import Libraries

In [0]:
#import os.path #<-Not sure if we actually need this one.

import json
import random
import requests
import datetime as dt
from time import sleep
from typing import Iterable 
from pyspark.sql import DataFrame
from pyspark.sql.functions import array, col, explode, lit, struct, split, concat_ws, when

**Config Part 2:** Create I/O widgets for pipeline.

In [0]:
# These widgets allow the pipeline to set the file I/O information.
# Each widget is created with an empty string for both of its remaining
# arguments, then read back once; the value returned by get() is not stored.

# "input": read again further down with getArgument("input") and split on "/"
# to obtain the storage account and container for the read mount.
dbutils.widgets.text("input", "","") 
dbutils.widgets.get("input")
 
# "output": read again further down with getArgument("output") and split on
# "/" to obtain the storage account and container for the write mount.
dbutils.widgets.text("output", "","") 
dbutils.widgets.get("output")
 
# "filename": NOTE(review) - no visible cell reads this value; confirm whether
# it is still needed by the pipeline.
dbutils.widgets.text("filename", "","") 
dbutils.widgets.get("filename")

**Config Part 3:** Designate Mount Points

In [0]:
# Mount the storage containers: one to read the existing file from and one to write the new data to

def mount_storage(mount_goal):
    """Mount an Azure storage container (abfss) on DBFS and return its mount point.

    Args:
        mount_goal: Mapping with the keys ``'account'`` (storage account
            name), ``'container'`` (container name) and ``'mount'`` (DBFS
            path at which the container is mounted).

    Returns:
        The mount point, i.e. ``mount_goal['mount']``.

    Raises:
        KeyError: If ``mount_goal`` lacks one of the required keys.
        Exception: Whatever ``dbutils.fs.mount`` raises when mounting fails.
    """
    storageAccount = mount_goal['account']
    storageContainer = mount_goal['container']
    mount_point = mount_goal['mount']

    # SECURITY NOTE(review): the service-principal credentials below are
    # committed in plain text. They should be rotated and loaded from a
    # Databricks secret scope instead; the scope/key names are not visible
    # from this notebook, so the literals are left in place for now.
    clientSecret = "B4g8Q~1VyZJa5WszLHwdEQNq4YIaHmT4DevRBcwI"
    clientid = "2ca50102-5717-4373-b796-39d06568588d"

    # OAuth client-credentials settings for the ABFS driver.
    configs = {
        "fs.azure.account.auth.type": "OAuth",
        "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id": clientid,
        "fs.azure.account.oauth2.client.secret": clientSecret,
        "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
        "fs.azure.createRemoteFileSystemDuringInitialization": "true",
    }

    # Best effort: drop any existing mount at this path so the mount below
    # starts clean. Failure here (e.g. nothing is mounted yet) is expected
    # and ignored, but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        dbutils.fs.unmount(mount_point)
    except Exception:
        pass

    dbutils.fs.mount(
        source="abfss://" + storageContainer + "@" + storageAccount + ".dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs,
    )

    return mount_point
    
# The "input" and "output" arguments are split on "/"; element 0 is used as
# the storage account name and element 1 as the container name.
in_path = getArgument("input").split("/")
out_path = getArgument("output").split("/")

storage_info = {
    'read': {'account': in_path[0], 'container': in_path[1], 'mount': "/mnt/arctic_analysts_zillow_scraper_read"},
    'write': {'account': out_path[0], 'container': out_path[1], 'mount': "/mnt/arctic_analysts_zillow_scraper_write"}
}

try:
    read_path = mount_storage(storage_info['read'])
    write_path = mount_storage(storage_info['write'])
except Exception as E:
    # An exception object cannot be sliced, so trim its message text instead
    # (the original E[:-50] raised TypeError inside this handler).
    print(str(E)[:-50])
    # Without both mounts the rest of the notebook cannot run; re-raise so the
    # real failure is reported instead of a NameError on the print below.
    raise

print(f'Read Path: {read_path}\nWrite Path: {write_path}')

**Database Data Check**
>These cells check if there is any new data.<br>

**Data Check Part 1:** Check the most recent data in the data lake.

In [0]:
# Check if there is an associated file in the data lake.
# NOTE(review): this path is hard-coded and does not use the `read_path`
# mount created earlier - confirm that this is the intended location.
try:
    house_prices = spark.read.json('/mnt/arctic_analyst_path/county_level_house_prices_from_databrick.json')
    file_found = True
except Exception as E:
    print(E)
    print('There does not appear to be a file associated with this dataset.')
    print('A new file will be created.')
    file_found = False

# Defaults for the cases where no usable date is available: no file, an empty
# file, or a file whose Date values are all null. A scrape is allowed then.
most_recent_date = None
scrape_permitted = True

# If there is a file, check it and get the most recent date.
if file_found:
    # first() returns None for an empty frame instead of raising IndexError
    # the way collect()[0] does.
    latest_row = house_prices.sort(house_prices.Date.desc()).select("Date").first()

    if latest_row is None or latest_row['Date'] is None:
        print('The existing file contains no dated rows.')
        print('A new scrape will be run.')
    else:
        todays_date = dt.datetime.today().date()
        most_recent_date = dt.datetime.strptime(latest_row['Date'], '%Y-%m-%d').date()
        days_since = (todays_date - most_recent_date).days

        # If it has been more than 30 days since the last scrape, get the file again.
        if days_since > 30:
            print('More than 30 days have elapsed since the last scrape.')
            print('A new scrape will be run.')
        else:
            print(f'Not enough time has elapsed. You must wait for {30 - (days_since)} more days to scrape again.')
            scrape_permitted = False

In [0]:
# Functions
# Scraping data Zillow csv file download and creating a dataframe from it
# Melting from wide form to long form
# Cleaning the data - rename columns, month values, and combining County and State Fips into FIPS
# Updating the container 

def scrape_zillow(url=None):
    """Download the Zillow county-level CSV and load it into a Spark DataFrame.

    Args:
        url: Address of the CSV to download. Defaults to the Zillow
            county-level file this notebook has always used.

    Returns:
        A Spark DataFrame with one column per CSV header field and one row
        per CSV record. Every value is kept as a string.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the download has no header row or no data rows.
    """
    # Local imports keep this cell self-contained.
    import csv
    import io

    if url is None:
        url = "https://files.zillowstatic.com/research/public_csvs/zhvi/County_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv?t=1652280577"

    # Get the file data from the url. The timeout keeps a stalled connection
    # from hanging the job, and raise_for_status stops an HTML error page
    # from being parsed as if it were data.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parse with the csv module rather than str.split(','): quoted fields may
    # themselves contain commas, which a plain split would break apart and so
    # shift every later column of that row.
    text = response.content.decode('utf-8')
    reader = csv.reader(io.StringIO(text, newline=''))

    columns = next(reader, None)
    # Skip blank lines; they carry no values and would not match the header.
    values_list = [row for row in reader if row]

    if not columns or not values_list:
        raise ValueError('The Zillow download did not contain a header row and data rows.')

    # Convert prepared data to dataframe
    df = spark.createDataFrame(values_list, columns)
    return df

def filter_dates():
    """Placeholder for the incremental date filter; may be removed after testing.

    At present this only prints the module-level ``most_recent_date``.
    """
    # Planned, not yet implemented: keep only the scraped data that is not
    # already in the database, applied before the melt step of the cleaning
    # process. Sketch of the idea:
    #
    #     dates = ("2013-01-01",  "2015-07-01")
    #     date_from, date_to = [to_date(lit(s)).cast(TimestampType()) for s in dates]
    #     sf.where((sf.my_col > date_from) & (sf.my_col < date_to))
    print(most_recent_date)

def melt(
    df_temp: DataFrame,
    id_vars: Iterable[str], value_vars: Iterable[str],
    var_name: str="Date",
    value_name: str="MedianHousePrice") -> DataFrame:
    """Convert a wide DataFrame to long form (one row per id/value-column pair).

    Args:
        df_temp: The wide-form DataFrame.
        id_vars: Names of the columns to carry through unchanged.
        value_vars: Names of the columns to unpivot. Each one becomes a row
            whose ``var_name`` field holds the column name and whose
            ``value_name`` field holds that column's value.
        var_name: Name of the output column holding the former column names.
        value_name: Name of the output column holding the values.

    Returns:
        A DataFrame with the ``id_vars`` columns followed by ``var_name`` and
        ``value_name``.
    """
    # Materialise once so any iterable (list, tuple, generator) is accepted,
    # as the annotation promises; `id_vars + [...]` only worked for lists.
    id_columns = list(id_vars)

    # One (name, value) struct per value column, gathered into a single array.
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))

    # explode() emits one output row per array element.
    _tmp = df_temp.withColumn("_vars_and_vals", explode(_vars_and_vals))

    # Pull the two struct fields back out as ordinary columns.
    melted_columns = [
        col("_vars_and_vals")[x].alias(x) for x in (var_name, value_name)]

    return _tmp.select(*id_columns, *melted_columns)

def clean_scrape(df):
    """Reshape the scraped Zillow table into long form with Year and Month columns.

    Args:
        df: Wide-form Spark DataFrame as produced by ``scrape_zillow``.

    Returns:
        A long-form DataFrame with the columns County, State, StateCodeFIPS,
        CountyFIPS, Date, MedianHousePrice, Year and Month. Year and Month are
        the first and second "-"-separated parts of Date, with Month mapped
        from its two-digit number to a three-letter abbreviation (null when
        the number is not 01-12).
    """
    # Rename and drop columns
    new_df = (
        df.withColumnRenamed('RegionName', 'County')
        .drop('RegionID', 'SizeRank', 'RegionType', 'Metro', 'StateName')
        .withColumnRenamed('MunicipalCodeFIPS', 'CountyFIPS')
    )

    # Everything that is not an identifier column is treated as a value column.
    keep_columns = ['County', 'State', 'StateCodeFIPS', 'CountyFIPS']
    value_columns = [name for name in new_df.columns if name not in keep_columns]

    # Convert from wide form to long form
    melted_frame = melt(
        new_df, id_vars=keep_columns, value_vars=value_columns
    )

    month_dict = {
        '01': 'Jan',
        '02': 'Feb',
        '03': 'Mar',
        '04': 'Apr',
        '05': 'May',
        '06': 'Jun',
        '07': 'Jul',
        '08': 'Aug',
        '09': 'Sep',
        '10': 'Oct',
        '11': 'Nov',
        '12': 'Dec',
    }

    # Convert date information into month and year.
    date_parts = split(col('Date'), '-')
    month_number = date_parts[1]

    # Build the number -> abbreviation lookup as a native when() chain. The
    # original used `udf`, which this notebook never imports, and a Python
    # UDF would also ship every row through the Python worker.
    month_name = None
    for number, abbreviation in month_dict.items():
        is_month = month_number == number
        if month_name is None:
            month_name = when(is_month, abbreviation)
        else:
            month_name = month_name.when(is_month, abbreviation)

    cleaned_df = melted_frame.withColumn('Year', date_parts[0]) \
                             .withColumn('Month', month_name)

    # NOTE(review): the original went on to build a combined 'FIPS' column and
    # a New Jersey ("34%") filter, but it did so from the global
    # `house_prices` frame (undefined when no previous file exists, so the
    # first run crashed with NameError) and then discarded the result. That
    # dead step is removed so the function returns the same frame as before.
    # If the combined-FIPS / New Jersey-only output is wanted, it has to be
    # derived from `cleaned_df` and returned - confirm with the data consumers.
    return cleaned_df

def update_blob(df, method):
    """Write `df` as one JSON file next to the write mount's output directory.

    Args:
        df: Spark DataFrame to store.
        method: Save mode handed to the DataFrame writer.

    The frame is written into a directory under ``write_path``; the ".json"
    file produced there is then moved to "<directory>.json" and the directory
    is deleted.
    """
    target_dir = write_path + '/county_level_house_prices_from_databrick'

    # A single partition so that the write produces one data file.
    writer = df.repartition(1).write.format("com.databricks.spark.json")
    writer = writer.mode(method).option("header", "True")
    writer.json(target_dir)

    # Find the data file among everything that was written to the directory.
    written_paths = [entry.path for entry in dbutils.fs.ls(target_dir)]
    json_file = [path for path in written_paths if path.endswith(".json")][0]

    # Promote the data file to its final name, then remove the directory
    # together with the remaining files in it.
    dbutils.fs.mv(json_file, target_dir + ".json")
    dbutils.fs.rm(target_dir, recurse = True)

In [0]:
# Main cell: scrape, clean, and store the data when the date check allows it.

if not scrape_permitted:
    print('No data updated.')
else:
    df = scrape_zillow()
    cleaned_zillow = clean_scrape(df)

    # An 'append' mode for the case where a file already exists has not been
    # fully implemented (a filter function is in progress, see filter_dates),
    # so the blob is always overwritten with the new data.
    method = 'overwrite'
    update_blob(cleaned_zillow, method)
    print(" Data Update Successfull")

County,CountyFIPS,Date,MedianHousePrice,Month,State,StateCodeFIPS,Year
Los Angeles County,37,2000-01-31,223502.0,Jan,CA,6,2000
Los Angeles County,37,2000-02-29,223603.0,Feb,CA,6,2000
Los Angeles County,37,2000-03-31,224605.0,Mar,CA,6,2000
Los Angeles County,37,2000-04-30,226408.0,Apr,CA,6,2000
Los Angeles County,37,2000-05-31,228664.0,May,CA,6,2000
Los Angeles County,37,2000-06-30,230737.0,Jun,CA,6,2000
Los Angeles County,37,2000-07-31,232883.0,Jul,CA,6,2000
Los Angeles County,37,2000-08-31,235316.0,Aug,CA,6,2000
Los Angeles County,37,2000-09-30,237500.0,Sep,CA,6,2000
Los Angeles County,37,2000-10-31,239245.0,Oct,CA,6,2000
