In [43]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from sqlalchemy import create_engine
import pandas as pd
import json
import requests
import io
import calendar

# 1. Transform, Format and Clean Data. 

# 2. Seperate into dimensions and facts

# 3. Save the data into the warehouse

In [44]:
# Read the JSON config file
config_file_path = 'config.json'
with open(config_file_path, 'r') as config_file:
    config = json.load(config_file) 

# Azure connection string
CONNECTION_STRING = config['AZURE_CONNECTION_STRING']
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)

# Database connection
DATABASE = config['DW_CONNECTION_STRING']
engine = create_engine(DATABASE)
schema = config['SCHEMA']

In [45]:
def get_blob_list(container_name):
    container_client = blob_service_client.get_container_client(container_name)
    blob_list = container_client.list_blobs()
    return blob_list

In [46]:
def get_azure_blob_data(container_name, blob):
    container_client = blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(blob.name)
    stream = blob_client.download_blob()
    blob_content = b""
    for chunk in stream.chunks():
        blob_content += chunk
    return blob_content

In [47]:
def download_file(url):
    response = requests.get(url)
    return io.BytesIO(response.content)

# Retrieving the data from Azure Containers

In [48]:
"""def get_ppp_loan_data():
    container_name = 'pppdata'
    blob_list = get_blob_list(container_name)
    df_list = []  # Initialize df_list outside the loop

    print(f"Downloading data from {container_name} container\n")
    for blob in blob_list:
        print(f"Downloading:\t{blob.name}")
        blob_data = get_azure_blob_data(container_name, blob)
        print(f"Downloaded {blob.name} successfully\n")
        data = io.BytesIO(blob_data)
        print(f"Reading:\t{blob.name}")
        df_chunks = pd.read_csv(data, chunksize=100000)  # Adjust the chunksize as per your memory capacity
        for chunk in df_chunks:
            df_list.append(chunk)
        print(f"Read {blob.name} successfully\n\n")
    
    if df_list:  # Check if df_list is not empty
        df = pd.concat(df_list)
        print(f"PPP consolidated successfully")
        return df
    else:
        print("No data downloaded.")
        return None"""

'def get_ppp_loan_data():\n    container_name = \'pppdata\'\n    blob_list = get_blob_list(container_name)\n    df_list = []  # Initialize df_list outside the loop\n\n    print(f"Downloading data from {container_name} container\n")\n    for blob in blob_list:\n        print(f"Downloading:\t{blob.name}")\n        blob_data = get_azure_blob_data(container_name, blob)\n        print(f"Downloaded {blob.name} successfully\n")\n        data = io.BytesIO(blob_data)\n        print(f"Reading:\t{blob.name}")\n        df_chunks = pd.read_csv(data, chunksize=100000)  # Adjust the chunksize as per your memory capacity\n        for chunk in df_chunks:\n            df_list.append(chunk)\n        print(f"Read {blob.name} successfully\n\n")\n    \n    if df_list:  # Check if df_list is not empty\n        df = pd.concat(df_list)\n        print(f"PPP consolidated successfully")\n        return df\n    else:\n        print("No data downloaded.")\n        return None'

In [49]:
# Testing only one file
def get_ppp_loan_data():
    container_name = 'pppdata'
    blob_list = get_blob_list(container_name)
    
    for blob in blob_list:
        if "public_150k_plus" in blob.name:
            print(f"Downloading {blob.name}")
            blob_data = get_azure_blob_data(container_name, blob)
            print(f"Downloaded {blob.name} successfully")
            data = io.BytesIO(blob_data)
            print(f"Reading {blob.name}")
            df_chunks = pd.read_csv(data, chunksize=100000)  # Adjust the chunksize as per your memory capacity
            df_list = []
            for chunk in df_chunks:
                df_list.append(chunk)
            df = pd.concat(df_list)
            return df

In [50]:
def get_naics_data():
    container_name = 'naicsdata'
    blob_list = get_blob_list(container_name)

    for blob in blob_list:
        blob_data = get_azure_blob_data(container_name, blob)
        data = io.BytesIO(blob_data)
        df = pd.read_csv(data)
        return df
    

In [51]:
def get_gdp_data():
    container_name = 'gdpdata'
    blob_list = get_blob_list(container_name)

    for blob in blob_list:
        blob_data = get_azure_blob_data(container_name, blob)
        data = io.BytesIO(blob_data)
        df = pd.read_csv(data)
        return df

# Reformating, and Cleaning the data

In [52]:
def reformat_naics_data():
    df_naics = get_naics_data()
    df_naics.rename(columns={
        'Code': 'naics_code',
        'Title': 'naics_title',
        'Description': 'description'
    }, inplace=True)
    # Remove all the rows where naics_code is not a number
    # The naics_code column has some generic values like "31-33" which are not valid NAICS codes
    df_naics = df_naics[df_naics['naics_code'].str.isnumeric()]

    # Remove T from naics_title
    df_naics['naics_title'] = df_naics['naics_title'].str.replace('T', '')

    # Convert the data types
    df_naics['naics_code'] = df_naics['naics_code'].astype(int)
    df_naics['naics_title'] = df_naics['naics_title'].astype(pd.StringDtype("pyarrow"))
    df_naics['description'] = df_naics['description'].astype(pd.StringDtype("pyarrow"))
    
    return df_naics

In [53]:
def reformat_gdp_data():
    df_gdp  = get_gdp_data()
    #Drop all the records where 2017, 2018, 2019, 2020, 2021, 2022 = "(NA)" 
    df_gdp = df_gdp[df_gdp['2017'] != "(NA)"]
    df_gdp = df_gdp[df_gdp['2020'] != "(NA)"]

    # Pivot the data in GDP data
    selected_columns = ['GeoFIPS', 'GeoName', 'Region', 'Description', '2017', '2018', '2019', '2020', '2021', '2022']
    df_gdp = df_gdp[selected_columns]
    pivot_data = df_gdp.melt(id_vars=["GeoFIPS", "GeoName", "Region", "Description"],
                                    value_vars=["2017", "2018", "2019", "2020", "2021", "2022"],
                                    var_name="date_id",
                                    value_name="Value")
    pivot_data = pivot_data.pivot_table(index=["GeoFIPS", "GeoName", "Region", "date_id"], columns="Description", values="Value", aggfunc='first').reset_index()
    pivot_data = pivot_data.sort_values(by=["GeoFIPS", "date_id"])
    pivot_data.rename(columns={
        "Chain-type quantity indexes for real GDP ": "chain_type_index_gdp",
        "Current-dollar GDP (thousands of current dollars) ": "current_dollar_gdp",
        "Real GDP (thousands of chained 2017 dollars) ": "real_gdp",
        "GeoFIPS": "geofips",
        "GeoName": "geo_name",
        "Description": "Index",
        "date_id": "year_id",
        "Region": "region"
    }, inplace=True)
    pivot_data['facts_gdp_id'] = range(1, len(pivot_data) + 1)
    final_data = pivot_data.drop(columns='Description', errors='ignore')
    final_data = pivot_data[['facts_gdp_id', 'geofips', 'geo_name', 'region', 'year_id', 'chain_type_index_gdp',
                         'current_dollar_gdp', 'real_gdp']]
    df_gdp = final_data

    # Remove the quation marks from geofips
    df_gdp['geofips'] = df_gdp['geofips'].str.replace('"', '')
    
    # Change the YearID to match the format in the Date Dimension
    df_gdp['year_id'] = pd.to_datetime(df_gdp['year_id'], format='%Y').dt.strftime('%Y%m%d%H')
    
    # Change the data types of the columns
    df_gdp['year_id'] = df_gdp['year_id'].astype(int)
    df_gdp['geofips'] = df_gdp['geofips'].astype(int)
    df_gdp['geo_name'] = df_gdp['geo_name'].astype(pd.StringDtype("pyarrow"))
    df_gdp['region'] = df_gdp['region'].astype(pd.StringDtype("pyarrow"))    
    df_gdp['chain_type_index_gdp'] = df_gdp['chain_type_index_gdp'].astype(float)
    df_gdp['current_dollar_gdp'] = df_gdp['current_dollar_gdp'].astype(float)
    df_gdp['real_gdp'] = df_gdp['real_gdp'].astype(float)


    return df_gdp

    
    

In [54]:
def reformat_ppp_loan_data():
    df_ppp = get_ppp_loan_data()


    # Delete the columns that are not required
    df_ppp.drop(columns=[
        'UTILITIES_PROCEED',
        'PAYROLL_PROCEED',
        'MORTGAGE_INTEREST_PROCEED',
        'RENT_PROCEED',
        'REFINANCE_EIDL_PROCEED',
        'HEALTH_CARE_PROCEED',
        'DEBT_INTEREST_PROCEED',
        'RuralUrbanIndicator',
        'HubzoneIndicator',
        'LMIIndicator',
        'ProjectCity',
        'ProjectZip',
        'CD'
    ], inplace=True)
    # Rename the columns to match the SQL table
    df_ppp.rename(columns={
        'LoanNumber': 'loan_number',
        'DateApproved': 'date_approved_id',
        'SBAOfficeCode': 'sba_office_code',
        'ProcessingMethod': 'processing_method',
        'BorrowerName': 'borrower_name',
        'BorrowerAddress': 'borrower_address',
        'BorrowerCity': 'borrower_city',
        'BorrowerState': 'borrower_state',
        'BorrowerZip': 'borrower_zip',
        'LoanStatusDate': 'loan_status_date_id',
        'LoanStatus': 'loan_status',
        'Term': 'term_month',
        'SBAGuarantyPercentage': 'sba_guaranty_percentage',
        'InitialApprovalAmount': 'initial_approval_amount',
        'CurrentApprovalAmount': 'current_approval_amount',
        'UndisbursedAmount': 'undisbursed_amount',
        'FranchiseName': 'franchise_name',
        'ServicingLenderLocationID': 'servicing_lender_location_id',
        'ServicingLenderName': 'servicing_lender_name',
        'ServicingLenderAddress': 'servicing_lender_address',
        'ServicingLenderCity': 'servicing_lender_city',
        'ServicingLenderState': 'servicing_lender_state',
        'ServicingLenderZip': 'servicing_lender_zip',
        'BusinessAgeDescription': 'business_age_description',
        'ProjectState': 'project_state',
        'ProjectCountyName': 'project_county_name',
        'Race': 'race',
        'Ethnicity': 'ethnicity',
        'Gender': 'gender',
        'BusinessType': 'business_type',
        'OriginatingLenderLocationID': 'originating_lender_location_id',
        'OriginatingLender': 'originating_lender',
        'OriginatingLenderCity': 'originating_lender_city',
        'OriginatingLenderState': 'originating_lender_state',
        'Veteran': 'veteran',
        'NonProfit': 'nonprofit',
        'ForgivenessAmount': 'forgiveness_amount',
        'ForgivenessDate': 'forgiveness_date_id',
        'JobsReported': 'jobs_reported',
        'NAICSCode': 'naics_code'
    }, inplace=True)

    # Droping all the empty rows
    # Drop all the rows where Borrower State is empty
    df_ppp = df_ppp.dropna(subset=['borrower_state'])

    # Drop all the rows where naics_code is empty
    df_ppp = df_ppp.dropna(subset=['naics_code'])

    # Drop all the rows where dates are empty
    df_ppp = df_ppp.dropna(subset=['date_approved_id', 'loan_status_date_id', 'forgiveness_date_id'])

    # Drop all the rows where jobs reported is empty
    df_ppp = df_ppp.dropna(subset=['jobs_reported'])

    # Drop all the rows where business type is empty
    df_ppp = df_ppp.dropna(subset=['business_type'])

    # Drop all the rows where business age description is empty
    df_ppp = df_ppp.dropna(subset=['business_age_description'])
    # or where the value is Unanswered
    df_ppp = df_ppp[df_ppp['business_age_description'] != 'Unanswered']

    

    # Change the Date columns to match the format in the Date Dimension
    df_ppp['forgiveness_date_id'] = pd.to_datetime(df_ppp['forgiveness_date_id']).dt.strftime('%Y%m%d%H')
    df_ppp['date_approved_id'] = pd.to_datetime(df_ppp['date_approved_id']).dt.strftime('%Y%m%d%H')
    df_ppp['loan_status_date_id'] = pd.to_datetime(df_ppp['loan_status_date_id']).dt.strftime('%Y%m%d%H')
    
    # Change nonprofit to boolean
    df_ppp['nonprofit'] = df_ppp['nonprofit'].map({'Y': True})
    df_ppp['nonprofit'] = df_ppp['nonprofit'].fillna(False)

    # Change veteran to boolean
    df_ppp['veteran'] = df_ppp['veteran'].map({'veteran': True, 'Non-veteran': False, 'Unanswered':None})

    # Sentence case the string columns
    df_ppp['borrower_address'] = df_ppp['borrower_address'].str.title()
    df_ppp['borrower_city'] = df_ppp['borrower_city'].str.title()
    df_ppp['originating_lender_city'] = df_ppp['originating_lender_city'].str.title()
    df_ppp['servicing_lender_city'] = df_ppp['servicing_lender_city'].str.title()
    df_ppp['project_county_name'] = df_ppp['project_county_name'].str.title()
    
    #df_ppp['loan_number'] = df_ppp['loan_number'].astype(int)
    df_ppp['date_approved_id'] = df_ppp['date_approved_id'].astype(int)
    df_ppp['sba_office_code'] = df_ppp['sba_office_code'].astype(int)
    df_ppp['processing_method'] = df_ppp['processing_method'].astype(pd.StringDtype("pyarrow"))
    df_ppp['borrower_name'] = df_ppp['borrower_name'].astype(pd.StringDtype("pyarrow"))
    df_ppp['borrower_address'] = df_ppp['borrower_address'].astype(pd.StringDtype("pyarrow"))
    df_ppp['borrower_city'] = df_ppp['borrower_city'].astype(pd.StringDtype("pyarrow"))
    df_ppp['borrower_state'] = df_ppp['borrower_state'].astype(pd.StringDtype("pyarrow"))
    df_ppp['borrower_zip'] = df_ppp['borrower_zip'].astype(pd.StringDtype("pyarrow"))
    df_ppp['loan_status_date_id'] = df_ppp['loan_status_date_id'].astype(int)
    df_ppp['loan_status'] = df_ppp['loan_status'].astype(pd.StringDtype("pyarrow"))
    df_ppp['term_month'] = df_ppp['term_month'].astype(int)
    df_ppp['sba_guaranty_percentage'] = df_ppp['sba_guaranty_percentage'].astype(float)
    df_ppp['initial_approval_amount'] = df_ppp['initial_approval_amount'].astype(float)
    df_ppp['current_approval_amount'] = df_ppp['current_approval_amount'].astype(float)
    df_ppp['undisbursed_amount'] = df_ppp['undisbursed_amount'].astype(float)
    df_ppp['franchise_name'] = df_ppp['franchise_name'].astype(pd.StringDtype("pyarrow"))
    df_ppp['servicing_lender_location_id'] = df_ppp['servicing_lender_location_id'].astype(int)
    df_ppp['servicing_lender_name'] = df_ppp['servicing_lender_name'].astype(pd.StringDtype("pyarrow"))
    df_ppp['servicing_lender_address'] = df_ppp['servicing_lender_address'].astype(pd.StringDtype("pyarrow"))
    df_ppp['servicing_lender_city'] = df_ppp['servicing_lender_city'].astype(pd.StringDtype("pyarrow"))
    df_ppp['servicing_lender_state'] = df_ppp['servicing_lender_state'].astype(pd.StringDtype("pyarrow"))
    df_ppp['servicing_lender_zip'] = df_ppp['servicing_lender_zip'].astype(pd.StringDtype("pyarrow"))
    df_ppp['business_age_description'] = df_ppp['business_age_description'].astype(pd.StringDtype("pyarrow"))
    df_ppp['project_state'] = df_ppp['project_state'].astype(pd.StringDtype("pyarrow"))
    df_ppp['project_county_name'] = df_ppp['project_county_name'].astype(pd.StringDtype("pyarrow"))
    df_ppp['race'] = df_ppp['race'].astype(pd.StringDtype("pyarrow"))
    df_ppp['ethnicity'] = df_ppp['ethnicity'].astype(pd.StringDtype("pyarrow"))
    df_ppp['gender'] = df_ppp['gender'].astype(pd.StringDtype("pyarrow"))
    df_ppp['business_type'] = df_ppp['business_type'].astype(pd.StringDtype("pyarrow"))
    df_ppp['originating_lender_location_id'] = df_ppp['originating_lender_location_id'].astype(int)
    df_ppp['originating_lender'] = df_ppp['originating_lender'].astype(pd.StringDtype("pyarrow"))
    df_ppp['originating_lender_city'] = df_ppp['originating_lender_city'].astype(pd.StringDtype("pyarrow"))
    df_ppp['originating_lender_state'] = df_ppp['originating_lender_state'].astype(pd.StringDtype("pyarrow"))
    df_ppp['veteran'] = df_ppp['veteran'].astype(bool)
    df_ppp['nonprofit'] = df_ppp['nonprofit'].astype(bool)
    df_ppp['forgiveness_amount'] = df_ppp['forgiveness_amount'].astype(float)
    df_ppp['forgiveness_date_id'] = df_ppp['forgiveness_date_id'].astype(int)
    df_ppp['jobs_reported'] = df_ppp['jobs_reported'].astype(int)
    df_ppp['naics_code'] = df_ppp['naics_code'].astype(int)

    # Create a FACTS_PPP_ID 
    df_ppp['facts_ppp_id'] = range(1, len(df_ppp) + 1)

    return df_ppp

# Build the Dimensions and Facts Tables

## NAICS Data

In [55]:
clean_naics_data = reformat_naics_data()
# Create the dimensions
dim_naics = clean_naics_data # Completed

# Reset the index
dim_naics.reset_index(drop=True, inplace=True)
dim_naics.head()

Unnamed: 0,naics_code,naics_title,description
0,11,"Agriculture, Forestry, Fishing and Hunting","The Sector as a Whole The Agriculture, Forest..."
1,111,Crop Production,Industries in the Crop Production subsector gr...
2,1111,Oilseed and Grain Farming,This industry group comprises establishments p...
3,11111,Soybean Farming,See industry description for 111110.
4,111110,Soybean Farming,This industry comprises establishments primari...


In [56]:
# Load the data into the database
print("Loading dim_naics data into the database...")
#dim_naics.to_sql('dim_naics', engine, schema=schema, if_exists='append', index=False)
print("dim_naics data loaded successfully\n")

Loading dim_naics data into the database...
dim_naics data loaded successfully



## GDP Data

In [57]:
clean_gdp_data = reformat_gdp_data()
dim_geography = clean_gdp_data[['geofips', 'geo_name', 'region']].drop_duplicates()
dim_geography = dim_geography.reset_index(drop=True)

# Remove the * from the geo_name
dim_geography['geo_name'] = dim_geography['geo_name'].str.replace('*', '')

#Remove the County, Parish, Borough, Census Area, Municipality, City and Borough, (Independent City) from the geo_name
dim_geography['geo_name'] = dim_geography['geo_name'].str.replace(' City and Borough', '')
dim_geography['geo_name'] = dim_geography['geo_name'].str.replace(' Borough', '')
dim_geography['geo_name'] = dim_geography['geo_name'].str.replace(' Census Area', '')
dim_geography['geo_name'] = dim_geography['geo_name'].str.replace(' Municipality', '')
dim_geography['geo_name'] = dim_geography['geo_name'].str.replace(' (Independent City)', '')

# Special cases. Ex: Augusta, Staunton + Waynesboro, VA -> Augusta, VA
dim_geography['geo_name'] = dim_geography['geo_name'].str.replace(r'(.+),.+,', r'\1,')

# Remove any other records with + in the geo_name
dim_geography = dim_geography[~dim_geography['geo_name'].str.contains('\+')]

dim_geography.head(100)

Description,geofips,geo_name,region
0,0,United States,
1,1000,Alabama,5
2,1001,"Autauga, AL",5
3,1003,"Baldwin, AL",5
4,1005,"Barbour, AL",5
...,...,...,...
95,2275,"Wrangell, AK",8
96,2282,"Yakutat, AK",8
97,2290,"Yukon-Koyukuk, AK",8
98,4000,Arizona,6


In [58]:
# Split the geo_name into project_state and project_county_name
dim_geography['project_state'] = dim_geography['geo_name'].str.split(',').str[1].str.strip()
dim_geography['project_county_name'] = dim_geography['geo_name'].str.split(',').str[0].str.strip()

# Temporarily set geofips to string
dim_geography['geofips'] = dim_geography['geofips'].astype(str)
dim_geography['geo_name'] = dim_geography['geo_name'].astype(str)

# Set the project_state and project_county_name for the United States
dim_geography.loc[dim_geography['geofips'] == '0', 'project_state'] = 'All States'
dim_geography.loc[dim_geography['geofips'] == '0', 'project_county_name'] = 'All Counties'

# Set the project_state and project_county_name for the States
dim_geography.loc[dim_geography['geofips'].str.endswith('000'), 'project_state'] = dim_geography['geo_name']
dim_geography.loc[dim_geography['geofips'].str.endswith('000'), 'project_county_name'] = 'All Counties'

# Set the data types
dim_geography['geofips'] = dim_geography['geofips'].astype(int)
dim_geography['geo_name'] = dim_geography['geo_name'].astype(pd.StringDtype("pyarrow"))
dim_geography['region'] = dim_geography['region'].astype(pd.StringDtype("pyarrow"))
dim_geography['project_state'] = dim_geography['project_state'].astype(pd.StringDtype("pyarrow"))
dim_geography['project_county_name'] = dim_geography['project_county_name'].astype(pd.StringDtype("pyarrow"))
dim_geography.head(100)

Description,geofips,geo_name,region,project_state,project_county_name
0,0,United States,,All States,All Counties
1,1000,Alabama,5,Alabama,All Counties
2,1001,"Autauga, AL",5,AL,Autauga
3,1003,"Baldwin, AL",5,AL,Baldwin
4,1005,"Barbour, AL",5,AL,Barbour
...,...,...,...,...,...
95,2275,"Wrangell, AK",8,AK,Wrangell
96,2282,"Yakutat, AK",8,AK,Yakutat
97,2290,"Yukon-Koyukuk, AK",8,AK,Yukon-Koyukuk
98,4000,Arizona,6,Arizona,All Counties


In [59]:
# Create the Fact Table
facts_gdp = clean_gdp_data[['facts_gdp_id', 'year_id', 'real_gdp', 'chain_type_index_gdp', 'current_dollar_gdp', 'geofips']]
# Reset the index
facts_gdp.reset_index(drop=True, inplace=True)
# Re order the columns
facts_gdp = facts_gdp[['facts_gdp_id', 'geofips', 'year_id', 'chain_type_index_gdp', 'current_dollar_gdp', 'real_gdp']]
facts_gdp.head(5)

Description,facts_gdp_id,geofips,year_id,chain_type_index_gdp,current_dollar_gdp,real_gdp
0,1,0,2017010100,100.0,19612100000.0,19612100000.0
1,2,0,2018010100,102.967,20656520000.0,20193900000.0
2,3,0,2019010100,105.507,21521400000.0,20692090000.0
3,4,0,2020010100,103.171,21322950000.0,20234070000.0
4,5,0,2021010100,109.156,23594030000.0,21407690000.0


In [60]:
# Load the data into the database
print("Loading dim_geography data into the database...")
#dim_geography.to_sql('dim_geography', engine, schema=schema, if_exists='append', index=False)
print("dim_geography data loaded successfully\n")

print("Loading facts_gdp data into the database...")
#facts_gdp.to_sql('facts_gdp', engine, schema=schema, if_exists='append', index=False)
print("facts_gdp data loaded successfully\n")

Loading dim_geography data into the database...
dim_geography data loaded successfully

Loading facts_gdp data into the database...
facts_gdp data loaded successfully



## Date Dimension
Start date: 2017-01-01 00:00:00 

2017 is the minimum year in the GDP data

End date: 2023-10-1 00:00:00 

October 2023 is the maximum date in the PPP data

In [61]:
def week_of_month(dt):
    year = dt.year
    month = dt.month
    day = dt.day

    cal = calendar.monthcalendar(year, month)
    week_number = (day - 1) // 7 + 1
    return week_number

start_date = pd.to_datetime('2017-01-01 00:00:00') #2017 is the start date in the GDP data
end_date = pd.to_datetime('2023-10-01 00:00:00') # 2023 is the end date in the PPP data

# Create a DataFrame for the date dimension
dim_date = pd.DataFrame({'date': pd.date_range(start_date, end_date, freq='H')})

# Extract attributes
dim_date['year_number'] = dim_date['date'].dt.year
dim_date['quarter_number'] = dim_date['date'].dt.quarter #quarter_number
dim_date['month_number'] = dim_date['date'].dt.month
dim_date['month_name'] = dim_date['date'].dt.strftime('%B')
dim_date['day_number'] = dim_date['date'].dt.day #day_number
dim_date['day_name'] = dim_date['date'].dt.strftime('%A') #day_name
dim_date['hour_number'] = dim_date['date'].dt.hour #hour_number
dim_date['date_iso_format'] = dim_date['date'].apply(lambda x: x.isoformat())
dim_date['date_id'] = dim_date['date'].dt.strftime('%Y%m%d%H')

# Add week of the month and week of the year
dim_date['week_of_month'] = dim_date['date'].apply(week_of_month) #week_of_month
dim_date['week_of_year'] = dim_date['date'].dt.strftime('%U') #week_of_year

new_order = ['date_id', 'date_iso_format','year_number','quarter_number','month_number','day_number','hour_number','month_name','day_name','week_of_year','week_of_month']
dim_date = dim_date[new_order]

dim_date.head(5)

Unnamed: 0,date_id,date_iso_format,year_number,quarter_number,month_number,day_number,hour_number,month_name,day_name,week_of_year,week_of_month
0,2017010100,2017-01-01T00:00:00,2017,1,1,1,0,January,Sunday,1,1
1,2017010101,2017-01-01T01:00:00,2017,1,1,1,1,January,Sunday,1,1
2,2017010102,2017-01-01T02:00:00,2017,1,1,1,2,January,Sunday,1,1
3,2017010103,2017-01-01T03:00:00,2017,1,1,1,3,January,Sunday,1,1
4,2017010104,2017-01-01T04:00:00,2017,1,1,1,4,January,Sunday,1,1


In [62]:
print("Loading dim_date data into the database...")
#dim_date.to_sql('dim_date', engine, schema=schema, if_exists='append', index=False)
print("dim_date data loaded successfully\n")

Loading dim_date data into the database...
dim_date data loaded successfully



## PPP Data

In [63]:
clean_ppp_data = reformat_ppp_loan_data()

Downloading public_150k_plus_230930.csv
Downloaded public_150k_plus_230930.csv successfully
Reading public_150k_plus_230930.csv


In [64]:
dim_loan_status = pd.DataFrame({'loan_status': ['Paid in Full', 'Charged Off']})
dim_loan_status["loan_status_id"] = range(1, len(dim_loan_status) + 1)
# Change the column order
dim_loan_status = dim_loan_status[['loan_status_id', 'loan_status']]
# Reset the index
dim_loan_status = dim_loan_status.reset_index(drop=True)
# Merge the clean_ppp_data with the dim_loan_status to get the loan_status_id
clean_ppp_data = clean_ppp_data.merge(dim_loan_status[['loan_status', 'loan_status_id']], on='loan_status', how='left', suffixes=('', '_dim_loan_status'))
dim_loan_status.head()

Unnamed: 0,loan_status_id,loan_status
0,1,Paid in Full
1,2,Charged Off


In [65]:
dim_processing_method = pd.DataFrame({'processing_method': ['PPP', 'PPS']})
dim_processing_method["processing_method_id"] = range(1, len(dim_processing_method) + 1)
# Change the column order
dim_processing_method = dim_processing_method[['processing_method_id', 'processing_method']]
# Reset the index
dim_processing_method = dim_processing_method.reset_index(drop=True)
# Merge the clean_ppp_data with the dim_processing_method to get the processing_method_id
clean_ppp_data = clean_ppp_data.merge(dim_processing_method[['processing_method', 'processing_method_id']], on='processing_method', how='left', suffixes=('', '_dim_processing_method'))
dim_processing_method.head()

Unnamed: 0,processing_method_id,processing_method
0,1,PPP
1,2,PPS


In [66]:
dim_business_type = clean_ppp_data[['business_type']].drop_duplicates()
dim_business_type["business_type_id"] = range(1, len(dim_business_type) + 1)

# Change the column order
dim_business_type = dim_business_type[['business_type_id', 'business_type']]

# Reset the index
dim_business_type = dim_business_type.reset_index(drop=True)

# Merge the clean_ppp_data with the dim_business_type to get the business_type_id
clean_ppp_data = clean_ppp_data.merge(dim_business_type[['business_type', 'business_type_id']], on='business_type', how='left', suffixes=('', '_dim_business_type'))
dim_business_type.head()

Unnamed: 0,business_type_id,business_type
0,1,Limited Liability Company(LLC)
1,2,Non-Profit Organization
2,3,501(c)3 – Non Profit
3,4,Corporation
4,5,Cooperative


In [67]:
dim_sba_office = clean_ppp_data[['sba_office_code']].drop_duplicates()
dim_sba_office = dim_sba_office.reset_index(drop=True)
dim_sba_office.head()

Unnamed: 0,sba_office_code
0,1084
1,459
2,470
3,405
4,669


In [69]:
dim_originating_lender = clean_ppp_data[['originating_lender_location_id', 'originating_lender', 'originating_lender_city', 'originating_lender_state']].drop_duplicates()
dim_originating_lender["originating_lender_id"] = range(1, len(dim_originating_lender) + 1)
# Change column order
dim_originating_lender = dim_originating_lender[['originating_lender_id', 'originating_lender_location_id', 'originating_lender', 'originating_lender_city', 'originating_lender_state']]
# Reset the index
dim_originating_lender = dim_originating_lender.reset_index(drop=True)

# Merge the clean_ppp_data with the dim_originating_lender to get the originating_lender_id
clean_ppp_data = clean_ppp_data.merge(dim_originating_lender[['originating_lender_location_id', 'originating_lender_id']], on='originating_lender_location_id', how='left', suffixes=('', '_dim_originating_lender'))

dim_originating_lender.head()

Unnamed: 0,originating_lender_id,originating_lender_location_id,originating_lender,originating_lender_city,originating_lender_state
0,1,116975,Northrim Bank,Anchorage,AK
1,2,89628,"National Cooperative Bank, National Association",Hillsboro,OH
2,3,3386,First National Bank Alaska,Anchorage,AK
3,4,119918,East West Bank,Pasadena,CA
4,5,225134,Truist Bank,Charlotte,NC


In [70]:
dim_borrower = clean_ppp_data[['borrower_name', 'borrower_address', 'borrower_city', 'borrower_state', 'borrower_zip', 'race', 'ethnicity', 'gender', 'veteran', 'franchise_name', 'nonprofit', 'jobs_reported']].drop_duplicates()
dim_borrower["borrower_id"] = range(1, len(dim_borrower) + 1)
# Change the column order
dim_borrower = dim_borrower[['borrower_id', 'borrower_name', 'borrower_address', 'borrower_city','borrower_state', 'borrower_zip', 'race', 'ethnicity', 'gender', 'veteran', 'franchise_name', 'nonprofit', 'jobs_reported']]

# Reset the index
dim_borrower = dim_borrower.reset_index(drop=True)

# Merge the clean_ppp_data with the dim_borrower to get the borrower_id
clean_ppp_data = clean_ppp_data.merge(dim_borrower[['borrower_name', 'borrower_address', 'borrower_city', 'borrower_state', 'borrower_zip', 'borrower_id']], on=['borrower_name', 'borrower_address', 'borrower_city', 'borrower_state', 'borrower_zip'], how='left', suffixes=('', '_dim_borrower'))
dim_borrower.head()

Unnamed: 0,borrower_id,borrower_name,borrower_address,borrower_city,borrower_state,borrower_zip,race,ethnicity,gender,veteran,franchise_name,nonprofit,jobs_reported
0,1,"KAKIVIK ASSET MANAGEMENT, LLC",5015 Business Park Blvd,Anchorage,AK,99503-7146,Unanswered,Unknown/NotStated,Unanswered,False,,False,385
1,2,"ARCTIC SLOPE NATIVE ASSOCIATION, LTD.",7000 Uula St,Barrow,AK,99723,Unanswered,Unknown/NotStated,Unanswered,False,,True,295
2,3,HOPE COMMUNITY RESOURCES INC.,540 W Intl Airport Rd,Anchorage,AK,99518-1105,Unanswered,Unknown/NotStated,Unanswered,False,,True,500
3,4,SOUTH PENINSULA HOSPITAL INC,4300 Bartlett Street,Homer,AK,99603,Unanswered,Unknown/NotStated,Unanswered,False,,False,439
4,5,"COPPER RIVER SEAFOODS, INC.",1118 5Th Ave,Anchorage,AK,99501-2759,Unanswered,Unknown/NotStated,Male Owned,True,,False,303


In [71]:
dim_servicing_lender = clean_ppp_data[['servicing_lender_location_id', 'servicing_lender_name', 'servicing_lender_address', 'servicing_lender_city', 'servicing_lender_state', 'servicing_lender_zip']].drop_duplicates()
dim_servicing_lender["servicing_lender_id"] = range(1, len(dim_servicing_lender) + 1)

# Reset the index
dim_servicing_lender = dim_servicing_lender.reset_index(drop=True)

# Change the column order
dim_servicing_lender = dim_servicing_lender[['servicing_lender_id', 'servicing_lender_location_id', 'servicing_lender_name', 'servicing_lender_address', 'servicing_lender_city', 'servicing_lender_state', 'servicing_lender_zip']]

# Merge the clean_ppp_data with the dim_servicing_lender to get the servicing_lender_id
clean_ppp_data = clean_ppp_data.merge(dim_servicing_lender[['servicing_lender_location_id', 'servicing_lender_id']], on='servicing_lender_location_id', how='left', suffixes=('', '_dim_servicing_lender'))
dim_servicing_lender.head()

Unnamed: 0,servicing_lender_id,servicing_lender_location_id,servicing_lender_name,servicing_lender_address,servicing_lender_city,servicing_lender_state,servicing_lender_zip
0,1,116975,Northrim Bank,3111 'C' St,Anchorage,AK,99503
1,2,89628,"National Cooperative Bank, National Association",139 S High St,Hillsboro,OH,45133-1442
2,3,3386,First National Bank Alaska,101 W 36th Ave,Anchorage,AK,99503-5904
3,4,119918,East West Bank,"135 N Los Robles Ave, 7th Fl",Pasadena,CA,91101-4525
4,5,225134,Truist Bank,214 N Tryon St,Charlotte,NC,28202-1078


In [72]:
dim_term = clean_ppp_data[['term_month']].drop_duplicates()
dim_term = dim_term.sort_values(by='term_month')
dim_term["term_id"] = range(1, len(dim_term) + 1)
dim_term = dim_term[['term_id', 'term_month']]
dim_term = dim_term.reset_index(drop=True)

clean_ppp_data = clean_ppp_data.merge(dim_term[['term_month', 'term_id']], on='term_month', how='left', suffixes=('', '_dim_term'))

dim_term.head()

Unnamed: 0,term_id,term_month
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [73]:
dim_business_age = clean_ppp_data[['business_age_description']].drop_duplicates()
dim_business_age["business_age_id"] = range(1, len(dim_business_age) + 1)
dim_business_age = dim_business_age[['business_age_id', 'business_age_description']]
dim_business_age = dim_business_age.reset_index(drop=True)

clean_ppp_data = clean_ppp_data.merge(dim_business_age[['business_age_description', 'business_age_id']], on='business_age_description', how='left', suffixes=('', '_dim_business_age'))

dim_business_age.head()

Unnamed: 0,business_age_id,business_age_description
0,1,Existing or more than 2 years old
1,2,New Business or 2 years or less
2,3,Change of Ownership
3,4,"Startup, Loan Funds will Open Business"


In [68]:
# In the clean_ppp_data, create the GEONAME column using the project_state and project_county_name
# Make project_state and project_county_name as string
clean_ppp_data['project_state'] = clean_ppp_data['project_state'].astype(str)
clean_ppp_data['project_county_name'] = clean_ppp_data['project_county_name'].astype(str)
clean_ppp_data['geo_name'] = clean_ppp_data['project_county_name'] + ', ' + clean_ppp_data['project_state']

# Merge the clean_ppp_data with the dim_geography to get the geofips
clean_ppp_data = clean_ppp_data.merge(dim_geography[['geo_name', 'geofips']], on='geo_name', how='left', suffixes=('', '_dim_geography'))
# Delete the records that have no geofips in the clean_ppp_data
clean_ppp_data = clean_ppp_data.dropna(subset=['geofips'])

# Set the data types of the columns
clean_ppp_data['geofips'] = clean_ppp_data['geofips'].astype(int)
clean_ppp_data['project_state'] = clean_ppp_data['project_state'].astype(pd.StringDtype("pyarrow"))
clean_ppp_data['project_county_name'] = clean_ppp_data['project_county_name'].astype(pd.StringDtype("pyarrow"))
clean_ppp_data['geo_name'] = clean_ppp_data['geo_name'].astype(pd.StringDtype("pyarrow"))

In [74]:
fact_ppp = clean_ppp_data[['facts_ppp_id', 'loan_number', 'naics_code', 'geofips', 'date_approved_id', 'loan_status_date_id', 'forgiveness_date_id', 'borrower_id', 'originating_lender_id', 'servicing_lender_id', 'term_id', 'loan_status_id', 'processing_method_id', 'sba_office_code', 'business_age_id', 'business_type_id', 'sba_guaranty_percentage', 'initial_approval_amount', 'current_approval_amount', 'undisbursed_amount', 'forgiveness_amount']]
fact_ppp = fact_ppp.reset_index(drop=True)
fact_ppp.tail()

Unnamed: 0,facts_ppp_id,loan_number,naics_code,geofips,date_approved_id,loan_status_date_id,forgiveness_date_id,borrower_id,originating_lender_id,servicing_lender_id,...,loan_status_id,processing_method_id,sba_office_code,business_age_id,business_type_id,sba_guaranty_percentage,initial_approval_amount,current_approval_amount,undisbursed_amount,forgiveness_amount
868818,878865,4395967002,621210,56025,2020040300,2021011300,2020120900,823939,1428,1319,...,1,1,897,1,4,100.0,150000.0,150000.0,0.0,151037.5
868819,878866,6985647108,624410,56037,2020041400,2020120800,2020110300,823940,1535,1414,...,1,1,897,1,12,100.0,150000.0,150000.0,0.0,150789.04
868820,878867,7996438405,238210,56005,2021021200,2021091500,2021081700,823941,658,613,...,1,2,897,1,6,100.0,150000.0,150000.0,0.0,150743.84
868821,878868,9054647103,621610,56033,2020041500,2022030800,2022022300,823942,839,786,...,1,1,897,2,4,100.0,150000.0,150000.0,0.0,152820.83
868822,878869,9184687004,722511,56039,2020040900,2021102200,2021092200,823943,32,32,...,1,1,897,1,6,100.0,150000.0,150000.0,0.0,152162.5
