## Mortgage Rates Scraper
**Intended Utility**
>This databrick is designed to be utilized as part of a pipeline.<br><br>
>The databrick can be run once a month to check for any updates. It checks the stored data<br>
>and, if the most recent data point is more than 60 days old (or no data file is found), it scrapes again. If the <br>
>most recent data point is not older than 60 days, the databrick takes no action.<br><br>
>If a scrape is conducted, then the databrick will clean and transform the data before sending it<br>
>to a storage container and then updating the database.

**Helpful Links**
>https://dc.urbanturf.com/articles/blog/first-timer_primer_interest_rates_and_mortgage_points/6745

**Configuration:**
> These cells are responsible for configuring the primary aspects of the databrick.<br>

**Config Part 1:** Import Libraries

In [0]:
import io
import requests
import datetime as dt
import pyspark.pandas as ps
from typing import Iterable 
from bs4 import BeautifulSoup
from pyspark.sql import DataFrame
from pyspark.sql.functions import split
from pyspark.sql.functions import array, col, explode, lit, struct, when


**Config Part 2:** Create I/O widgets for pipeline.

In [0]:
# The pipeline passes its file I/O settings through three text widgets:
# "input", "output" and "filename".  Each widget is created with an empty
# default value and an empty label, then read back immediately.
for _widget_name in ("input", "output", "filename"):
    dbutils.widgets.text(_widget_name, "", "")
    dbutils.widgets.get(_widget_name)

**Config Part 3:** Designate Mount Points

In [0]:
def mount_storage(mount_goal):
    """Mount an Azure storage container (abfss) in DBFS and return the mount point.

    Any mount already present at the requested mount point is removed first,
    so the container is always mounted fresh with the configuration below.

    Args:
        mount_goal: Mapping with the keys 'account' (storage account name),
            'container' (container name) and 'mount' (DBFS mount point).

    Returns:
        The mount point string taken from ``mount_goal['mount']``.
    """
    storageAccount = mount_goal['account']
    storageContainer = mount_goal['container']
    mount_point = mount_goal['mount']

    # SECURITY(review): these service-principal credentials are committed in
    # plain text.  They should be rotated and read from a Databricks secret
    # scope (dbutils.secrets.get) instead of living in the notebook source.
    clientSecret = "B4g8Q~1VyZJa5WszLHwdEQNq4YIaHmT4DevRBcwI"
    clientid = "2ca50102-5717-4373-b796-39d06568588d"

    # OAuth client-credentials settings handed to the ABFS driver.
    configs = {
        "fs.azure.account.auth.type": "OAuth",
        "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id": clientid,
        "fs.azure.account.oauth2.client.secret": clientSecret,
        "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
        "fs.azure.createRemoteFileSystemDuringInitialization": "true",
    }

    # Best-effort unmount: a failure here normally just means nothing was
    # mounted yet.  Catch Exception (not a bare except) so KeyboardInterrupt
    # and SystemExit still propagate.
    try:
        dbutils.fs.unmount(mount_point)
    except Exception:
        pass

    dbutils.fs.mount(
        source="abfss://" + storageContainer + "@" + storageAccount + ".dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs)

    return mount_point
    
# The "input" and "output" widgets are expected to hold
# "<storage account>/<container>"; the first two path segments are used.
in_path = getArgument("input").split("/")
out_path = getArgument("output").split("/")

# Fail early with a readable message instead of an IndexError further down.
for _widget, _parts in (("input", in_path), ("output", out_path)):
    if len(_parts) < 2 or not _parts[0] or not _parts[1]:
        raise ValueError(
            f'The "{_widget}" widget must look like "<account>/<container>", '
            f'got {"/".join(_parts)!r}.')

storage_info = {
    'read': {'account': in_path[0], 'container': in_path[1], 'mount': "/mnt/arctic_analysts_mortgage_info_scraper_read"},
    'write': {'account': out_path[0], 'container': out_path[1], 'mount': "/mnt/arctic_analysts_mortgage_info_scraper_write"}
}

try:
    read_path = mount_storage(storage_info['read'])
    write_path = mount_storage(storage_info['write'])
except Exception as E:
    # Later cells need both mount points, so report the failure and stop here
    # rather than continuing with read_path / write_path undefined.
    print(f'Failed to mount storage: {E}')
    raise

print(f'Read Path: {read_path}\nWrite Path: {write_path}')

**Database Data Check**
>These cells check if there is any new data.<br>

**Data Check Part 1:** Check the most recent data in the data lake.

In [0]:
# `udf` is not among the notebook's explicit imports; import it here so this
# cell does not depend on the runtime pre-defining it.
from pyspark.sql.functions import udf

# Minimum age, in days, of the newest stored data point before a new scrape
# is allowed.  This is set to 60 temporarily, but might be dropped to 45.
SCRAPE_INTERVAL_DAYS = 60

# Check if we already have mortgage rate data.
try:
    filename = getArgument("filename")
    mortgage_rates = spark.read.json(read_path + '/' + filename + '.json')
    file_found = True
except Exception as E:
    print(E)
    print('There does not appear to be a file associated with this dataset.')
    print('A new file will be created.')
    file_found = False

# Scrape by default; only an existing, sufficiently recent data set blocks it.
scrape_permitted = True

# Check if enough days have elapsed to justify another scrape.
if file_found:
    todays_date = dt.datetime.today().date()

    # Turns a [Month, Year] pair (e.g. ['JANUARY', '2021']) into the ISO date
    # of the first day of that month, so dates sort correctly as strings.
    calculate_date = udf(
        lambda x: dt.datetime.strptime(x[1] + "-" + x[0] + '-' + '01', '%Y-%B-%d').strftime('%Y-%m-%d'))

    # Get the last date in the current data, ignoring the yearly summary rows
    # and months that have no points value yet.
    computed_date = mortgage_rates.where(col("Month") != 'ANNUAL AVERAGE').withColumn("Date", calculate_date(array('Month', 'Year')))
    computed_date = computed_date.filter(computed_date.AveragePoints.isNotNull())
    newest_rows = computed_date.sort(computed_date.Date.desc()).select("Date").limit(1).collect()

    if not newest_rows:
        # The file exists but holds no usable rows; treat it like a missing file
        # instead of failing on an empty result.
        print('The existing file contains no dated mortgage rate rows.')
        print('A new scrape will be run.')
    else:
        most_recent_date = dt.datetime.strptime(newest_rows[0]['Date'], '%Y-%m-%d').date()

        # Check if more than SCRAPE_INTERVAL_DAYS days have elapsed since the last data.
        days_since = (todays_date - most_recent_date).days
        if days_since > SCRAPE_INTERVAL_DAYS:
            print(f'More than {SCRAPE_INTERVAL_DAYS} days have elapsed since the last scrape.')
            print('A new scrape will be run.')
        else:
            # The test above is strictly "greater than", hence the +1.
            days_left = SCRAPE_INTERVAL_DAYS - days_since + 1
            print(f'Not enough time has elapsed. You must wait for {days_left} more days to scrape again.')
            scrape_permitted = False

## Data Scraper
> These cells are responsible for controlling the scrape behavior.<br>

**Scrape Part 1:** Define the scraping, reshaping and saving helpers.

In [0]:
# Upper-case labels accepted as the first field of a scraped table row:
# the twelve month names followed by the yearly summary label.
month_list = (
    'JANUARY FEBRUARY MARCH APRIL MAY JUNE '
    'JULY AUGUST SEPTEMBER OCTOBER NOVEMBER DECEMBER'
).split() + ['ANNUAL AVERAGE']

def get_header(table):
    """Build the value-column names for a scraped table from its first row.

    The text fragments of the table's first <tr> are scanned in order.  Every
    fragment containing the digit '1' or '2' (the check used to pick out the
    year labels) contributes two names: '<fragment>_Rate' and '<fragment>_Pts'.
    All other fragments are ignored.
    """
    first_row_cells = table.find('tr').get_text("|").split("|")
    return [
        cell + suffix
        for cell in first_row_cells
        if '1' in cell or '2' in cell
        for suffix in ('_Rate', '_Pts')
    ]

def get_rows(table):
    """Return the data rows of a scraped table as lists of strings.

    The text of every <tr> is upper-cased and stripped, non-breaking spaces
    are removed, and the result is split into fields at line breaks.  Only
    rows whose first field is one of the labels in month_list are kept.
    """
    def _fields(tr):
        # Line breaks become the field separator before splitting.
        normalised = tr.text.upper().strip()
        return normalised.replace('\n', "|").replace('\xa0', "").split("|")

    candidates = (_fields(tr) for tr in table.find_all('tr'))
    return [fields for fields in candidates if fields[0] in month_list]

def melt(
    df: DataFrame,
    id_vars: Iterable[str], value_vars: Iterable[str],
    var_name: str="YearAttribute",
    value_name: str="MortgageRateValue") -> DataFrame:
    """Convert a DataFrame from wide form to long form.

    Each column named in value_vars becomes one output row per input row,
    holding the column's name in var_name and its value in value_name.

    Args:
        df: The wide DataFrame to reshape.
        id_vars: Names of the columns to carry through unchanged.
        value_vars: Names of the columns to unpivot.
        var_name: Name of the output column holding the source column name.
        value_name: Name of the output column holding the value.

    Returns:
        A DataFrame with the id_vars columns followed by var_name and
        value_name.
    """
    # Materialise id_vars so any iterable (tuple, generator, ...) can be
    # concatenated with the generated columns below, not just a list.
    id_cols = list(id_vars)

    # One (name, value) struct per value column, gathered into an array ...
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))

    # ... which explode() then turns into one row per struct.
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    cols = id_cols + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]

    return _tmp.select(*cols)

def scrape_mortgage_rates():
    """Scrape the Freddie Mac 'pmms30' page into a long-form Spark DataFrame.

    Every table on the page except the last two is parsed, the tables are
    joined on their month column, and the result is reshaped so that each row
    describes one month of one year.

    Returns:
        A DataFrame with the columns Month, AverageRate, AveragePoints and
        Year.  Values are kept as scraped text, blank cells become null, and
        the 'ANNUAL AVERAGE' rows are removed.

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out, or the server answers with an error status.
        ValueError: If no tables could be parsed from the page.
    """
    url = 'https://www.freddiemac.com/pmms/pmms30'

    # Get the data from the website.  Without a timeout a stalled connection
    # would block the pipeline indefinitely, and without the status check an
    # error page would be parsed as if it were data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    raw_data = response.text

    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; consider pinning it once the intended parser is confirmed.
    soup = BeautifulSoup(raw_data)

    # Get all tables on the page
    tables = soup.find_all('table')

    # Build one DataFrame per table, skipping the last two tables on the page.
    dataframes = []
    for table_count, table in enumerate(tables[:-2]):
        new_header = get_header(table)

        # The first table's month column gets a distinct name so the month
        # columns of the other tables can be dropped after each join.
        new_header.insert(0, 'Primary_Month' if table_count == 0 else 'Month')

        table_rows = get_rows(table)
        dataframes.append(spark.createDataFrame(table_rows, new_header))

    if not dataframes:
        raise ValueError(f'No mortgage rate tables were found at {url}.')

    # Combine all dataframes to one wide dataframe keyed on the month.
    final_frame = dataframes[0]
    for dataframe in dataframes[1:]:
        final_frame = final_frame.join(dataframe, final_frame["Primary_Month"] == dataframe["Month"], "outer")
        final_frame = final_frame.drop('Month')

    # Convert from wide form to long form.
    melted_frame = melt(
        final_frame,
        id_vars=['Primary_Month'],
        value_vars=[name for name in final_frame.columns if name != 'Primary_Month'])

    # Column names look like '<year>_<attribute>'; split them into two columns.
    year_attribute = split(melted_frame['YearAttribute'], '_')
    split_df = melted_frame.withColumn('Year', year_attribute.getItem(0)) \
                           .withColumn('Attribute', year_attribute.getItem(1))

    # Drop the YearAttribute column and rename the month column for continuity
    split_df = split_df.drop(col("YearAttribute"))
    renamed_df = split_df.withColumnRenamed('Primary_Month', 'PrimaryMonth')

    # Replace all the blanks in MortgageRateValue with None
    final_df = renamed_df.withColumn(
        "MortgageRateValue",
        when(col("MortgageRateValue") == "", None).otherwise(col("MortgageRateValue")))

    # Split the dataframe into two, one with points, and one with rate
    attribute_frame = final_df.where(final_df.Attribute == 'Pts') \
                              .withColumnRenamed("MortgageRateValue", "AveragePoints") \
                              .drop('Attribute')

    # The rate side renames its join keys so they do not clash after the join.
    rate_frame = final_df.where(final_df.Attribute == 'Rate') \
                         .withColumnRenamed("MortgageRateValue", "AverageRate") \
                         .withColumnRenamed("Year", 'drop_year') \
                         .withColumnRenamed('PrimaryMonth', 'Month') \
                         .drop('Attribute')

    # Rejoin so each (year, month) row carries both rate and points, then drop
    # the duplicated key columns.
    finished_df = rate_frame.join(
        attribute_frame,
        (rate_frame.drop_year == attribute_frame.Year) & (rate_frame.Month == attribute_frame.PrimaryMonth),
        how='inner')
    finished_df = finished_df.drop('drop_year').drop('PrimaryMonth')

    # Filter out the annual averages
    finished_df = finished_df.where(col("Month") != 'ANNUAL AVERAGE')

    return finished_df

def save_scrape_to_blob(df):
    """Write the scraped DataFrame as a single JSON file under write_path.

    Spark writes a directory of part files, so the frame is reduced to one
    partition and written to '<write_path>/mortgage_data_from_databrick'.
    The '.json' part file found there is then moved to
    '<write_path>/mortgage_data_from_databrick.json' and the directory with
    the remaining files is deleted.
    """
    target_dir = write_path + '/mortgage_data_from_databrick'

    # One partition, so Spark produces a single part file.
    writer = df.repartition(1).write.format("com.databricks.spark.json")
    writer = writer.mode('overwrite').option("header", "True")
    writer.json(target_dir)

    # Pick out the JSON part file among the files Spark wrote.
    part_files = [
        entry.path
        for entry in dbutils.fs.ls(target_dir)
        if entry.path.endswith(".json")
    ]
    json_file = part_files[0]

    # Move it next to the directory, then remove the directory and leftovers.
    dbutils.fs.mv(json_file, target_dir + ".json")
    dbutils.fs.rm(target_dir, recurse = True)

In [0]:
# Run the scrape and persist its result only when the data check set
# scrape_permitted; otherwise this cell does nothing.
if scrape_permitted:
    finished_df = scrape_mortgage_rates()
    save_scrape_to_blob(finished_df)

In [0]:
# If we want to read the data from the blob
#mortgage_df = spark.read.options(inferSchema='True', header='True').json(read_path + "/" + getArgument('filename') + ".json")