In [81]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from sqlalchemy import create_engine
import pandas as pd
import json
import requests
import io
import calendar

# 1. Transform, Format and Clean Data. 

# 2. Separate into dimensions and facts

# 3. Save the data frames as CSV  

# 4. Load Data into the Data Warehouse

In [82]:
# Read the JSON config file.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default text encoding.
config_file_path = 'config.json'
with open(config_file_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Azure connection string (read from the config file, never hard-coded here)
CONNECTION_STRING = config['AZURE_CONNECTION_STRING']
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)

# Database connection string for the data warehouse
DATABASE = config['DW_CONNECTION_STRING']

In [83]:
def get_blob_list(container_name):
    """List the blobs held in an Azure storage container.

    Args:
        container_name: Name of the container to list.

    Returns:
        Whatever ``ContainerClient.list_blobs()`` returns for that container
        (an iterable of blob entries).
    """
    return blob_service_client.get_container_client(container_name).list_blobs()

In [84]:
def get_azure_blob_data(container_name, blob):
    """Download a single blob and return its complete content.

    Args:
        container_name: Name of the container that holds the blob.
        blob: Object exposing a ``name`` attribute that identifies the blob,
            e.g. an entry yielded by ``get_blob_list``.

    Returns:
        bytes: The blob content, assembled from the downloaded chunks.
    """
    container_client = blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(blob.name)
    stream = blob_client.download_blob()
    # Join once at the end: extending an immutable bytes object with += inside
    # the loop copies everything downloaded so far on every chunk.
    return b"".join(stream.chunks())

In [85]:
def download_file(url, timeout=60):
    """Fetch a URL and return the response body as an in-memory binary stream.

    Args:
        url: Address of the file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        io.BytesIO: Stream positioned at the start of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never handed back as if it were the requested file.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return io.BytesIO(response.content)

# Extracting the data from Azure Containers

In [86]:
def get_ppp_loan_data(nrows=3000):
    """Load PPP loan records from the first matching blob in the 'pppdata' container.

    Only blobs whose name contains "public_150k_plus" are considered; the first
    one found is downloaded and parsed as CSV.

    Args:
        nrows: Maximum number of data rows to read. The default keeps the
            3000-row sample this notebook has been developed against; pass
            ``None`` to read the whole file.

    Returns:
        pandas.DataFrame: The parsed rows with the column names of the CSV.

    Raises:
        FileNotFoundError: If the container holds no "public_150k_plus" blob.
            Previously the function fell off the end and returned ``None``,
            which only failed later in the cleaning step.
    """
    container_name = 'pppdata'

    for blob in get_blob_list(container_name):
        if "public_150k_plus" not in blob.name:
            continue

        print(f"Downloading {blob.name}")
        blob_data = get_azure_blob_data(container_name, blob)
        print(f"Downloaded {blob.name} successfully")

        print(f"Reading {blob.name}")
        return pd.read_csv(io.BytesIO(blob_data), nrows=nrows)

    raise FileNotFoundError(
        f"No blob containing 'public_150k_plus' found in container '{container_name}'"
    )

In [87]:
def get_naics_data():
    """Read the first blob of the 'naicsdata' container as a CSV table.

    Returns:
        pandas.DataFrame parsed from the first blob listed, or ``None`` when
        the container lists no blobs.
    """
    source_container = 'naicsdata'

    # Only the first listed blob is ever used.
    first_blob = next(iter(get_blob_list(source_container)), None)
    if first_blob is None:
        return None

    raw_bytes = get_azure_blob_data(source_container, first_blob)
    return pd.read_csv(io.BytesIO(raw_bytes))
    

In [88]:
def get_gdp_data():
    """Read the first blob of the 'gdpdata' container as a CSV table.

    Returns:
        pandas.DataFrame parsed from the first blob listed, or ``None`` when
        the container lists no blobs.
    """
    source_container = 'gdpdata'

    # Only the first listed blob is ever used.
    first_blob = next(iter(get_blob_list(source_container)), None)
    if first_blob is None:
        return None

    csv_buffer = io.BytesIO(get_azure_blob_data(source_container, first_blob))
    return pd.read_csv(csv_buffer)

# Reformatting and Cleaning the Data

## SQL Schema
DIM_BORROWER

BORROWER_ID (integer, primary key)
BORROWER_NAME (varchar(200))
BORROWER_ADDRESS (varchar(200))
BORROWER_CITY (varchar(100))
BORROWER_STATE (varchar(100))
BORROWER_ZIP (varchar(100))
RACE (varchar(100))
ETHNICITY (varchar(100))
GENDER (varchar(100))
VETERAN (boolean)
FRANCHISE_NAME (varchar(200))
NONPROFIT (boolean)
JOBS_REPORTED (integer)

DIM_BUSINESS_AGE

BUSINESS_AGE_ID (integer, primary key)
BUSINESS_AGE_DESCRIPTION (varchar(200))

DIM_BUSINESS_TYPE

BUSINESS_TYPE_ID (integer, primary key)
BUSINESS_TYPE (varchar(200))

DIM_DATE

DATE_ID (integer, primary key)
YEAR_NUMBER (integer)
MONTH_NUMBER (integer)
QUARTER_NUMBER (integer)
DAY_NUMBER (integer)
HOUR_NUMBER (integer)
ISHOLIDAY (boolean)
DAY_NAME (varchar(100))
MONTH_NAME (varchar(100))
WEEK_OF_MONTH (integer)
WEEK_OF_YEAR (integer)

DIM_GEOGRAPHY

GEOFIPS (integer, primary key)
GEO_NAME (varchar(100))
REGION (varchar(50))
PROJECT_COUNTY_NAME (varchar(200))
PROJECT_STATE (varchar(100))

DIM_LOAN_STATUS

LOAN_STATUS_ID (integer, primary key)
LOAN_STATUS (varchar(100))

DIM_NAICS

NAICS_CODE (integer, primary key)
NAICS_TITLE (varchar(200))
DESCRIPTION (text)

DIM_ORIGINATING_LENDER

ORIGINATING_LENDER_ID (integer, primary key)
ORIGINATING_LENDER_LOCATION_ID (integer)
ORIGINATING_LENDER (varchar(200))
ORIGINATING_LENDER_CITY (varchar(200))
ORIGINATING_LENDER_STATE (varchar(100))

DIM_PROCESSING_METHOD

PROCESSING_METHOD_ID (integer, primary key)
PROCESSING_METHOD (varchar(100))

DIM_SBA_OFFICE

SBA_OFFICE_CODE (integer, primary key)

DIM_SERVICING_LENDER

SERVICING_LENDER_ID (integer, primary key)
SERVICING_LENDER_LOCATION_ID (integer)
SERVICING_LENDER_NAME (varchar(200))
SERVICING_LENDER_ADDRESS (varchar(200))
SERVICING_LENDER_CITY (varchar(100))
SERVICING_LENDER_STATE (varchar(200))
SERVICING_LENDER_ZIP (integer)

DIM_TERM

TERM_ID (integer, primary key)
TERM_MONTH (integer)

FACTS_GDP

FACTS_GDP_ID (integer, primary key)
YEAR_ID (integer, foreign key)
REAL_GDP (number)
CHAIN_TYPE_INDEX_GDP (number)
CURRENT_DOLLAR_GDP (number)
GEOFIPS (integer, foreign key)

FACTS_PPP

FACTS_PPP_ID (integer, primary key)
LOAN_NUMBER (integer)
NAICS_CODE (integer, foreign key)
GEOFIPS (integer, foreign key)
DATE_APPROVED_ID (integer, foreign key)
LOAN_STATUS_DATE_ID (integer, foreign key)
FORGIVENESS_DATE_ID (integer, foreign key)
BORROWER_ID (integer, foreign key)
ORIGINATING_LENDER_ID (integer, foreign key)
SERVICING_LENDER_ID (integer, foreign key)
TERM_ID (integer, foreign key)
LOAN_STATUS_ID (integer, foreign key)
PROCESSING_METHOD_ID (integer, foreign key)
SBA_OFFICE_CODE (integer, foreign key)
BUSINESS_AGE_ID (integer, foreign key)
BUSINESS_TYPE_ID (integer, foreign key)
SBA_GUARANTY_PERCENTAGE (number)
INITIAL_APPROVAL_AMOUNT (number)
CURRENT_APPROVAL_AMOUNT (number)
UNDISBURSED_AMOUNT (number)
FORGIVENESS_AMOUNT (number)

In [89]:
def reformat_naics_data():
    """Clean the raw NAICS table so it matches the DIM_NAICS layout.

    Renames the source columns to NAICS_CODE / NAICS_TITLE / DESCRIPTION, keeps
    only rows whose code is purely numeric, and casts the columns to their
    final dtypes.

    Returns:
        pandas.DataFrame: Columns NAICS_CODE (int), NAICS_TITLE and DESCRIPTION
        (pyarrow-backed strings), plus any other columns the source carried.
    """
    df_naics = get_naics_data().rename(columns={
        'Code': 'NAICS_CODE',
        'Title': 'NAICS_TITLE',
        'Description': 'DESCRIPTION'
    })

    # The NAICS_CODE column has some generic values like "31-33" which are not
    # valid NAICS codes. Comparing on the string form keeps the mask strictly
    # boolean: a missing code becomes the text "nan" and is filtered out
    # instead of producing an NA entry that cannot be used for indexing.
    codes = df_naics['NAICS_CODE'].astype(str)
    is_valid_code = codes.str.isnumeric()

    # .copy() detaches the filtered rows from the original frame so the column
    # assignments below do not raise SettingWithCopyWarning.
    df_naics = df_naics[is_valid_code].copy()

    string_dtype = pd.StringDtype("pyarrow")
    df_naics['NAICS_CODE'] = codes[is_valid_code].astype(int)
    df_naics['NAICS_TITLE'] = df_naics['NAICS_TITLE'].astype(string_dtype)
    df_naics['DESCRIPTION'] = df_naics['DESCRIPTION'].astype(string_dtype)

    return df_naics

In [90]:
# Run the NAICS cleaning step and display the dtype of each resulting column
naics = reformat_naics_data()
naics.dtypes

NAICS_CODE               int32
NAICS_TITLE    string[pyarrow]
DESCRIPTION    string[pyarrow]
dtype: object

In [91]:
def reformat_gdp_data():
    """Reshape the raw GDP table into one typed row per geography and year.

    The source holds one row per geography and GDP measure ("Description")
    with one column per year (2017-2022). This function unpivots the year
    columns, spreads the three GDP measures into their own columns, assigns a
    sequential FACTS_GDP_ID, and keeps only rows whose GEOFIPS is numeric.

    Returns:
        pandas.DataFrame: Columns FACTS_GDP_ID, GEOFIPS (int), GEO_NAME, Region,
        YEAR_ID (pyarrow-backed strings), CHAIN_TYPE_INDEX_GDP,
        CURRENT_DOLLAR_GDP and REAL_GDP (float).
    """
    id_columns = ['GeoFIPS', 'GeoName', 'Region', 'Description']
    year_columns = ['2017', '2018', '2019', '2020', '2021', '2022']

    df_gdp = get_gdp_data()[id_columns + year_columns]

    # Wide -> long: one row per geography, measure and year
    pivot_data = df_gdp.melt(id_vars=id_columns,
                             value_vars=year_columns,
                             var_name="date_id",
                             value_name="Value")

    # Long -> one column per GDP measure
    pivot_data = pivot_data.pivot_table(index=["GeoFIPS", "GeoName", "Region", "date_id"],
                                        columns="Description",
                                        values="Value",
                                        aggfunc='first').reset_index()
    # pivot_table leaves "Description" behind as the name of the columns axis;
    # clear it so it does not show up as a stray label on the result.
    pivot_data.columns.name = None

    pivot_data = pivot_data.sort_values(by=["GeoFIPS", "date_id"]).rename(columns={
        "Chain-type quantity indexes for real GDP ": "CHAIN_TYPE_INDEX_GDP",
        "Current-dollar GDP (thousands of current dollars) ": "CURRENT_DOLLAR_GDP",
        "Real GDP (thousands of chained 2017 dollars) ": "REAL_GDP",
        "GeoFIPS": "GEOFIPS",
        "GeoName": "GEO_NAME",
        "date_id": "YEAR_ID"
    })
    pivot_data['FACTS_GDP_ID'] = range(1, len(pivot_data) + 1)

    # .copy() so the assignments below act on an independent frame rather than
    # on a slice of pivot_data (avoids SettingWithCopyWarning).
    df_gdp = pivot_data[['FACTS_GDP_ID', 'GEOFIPS', 'GEO_NAME', 'Region', 'YEAR_ID',
                         'CHAIN_TYPE_INDEX_GDP', 'CURRENT_DOLLAR_GDP', 'REAL_GDP']].copy()

    # Remove the quotation marks (and any padding around them) from GEOFIPS.
    # Working on the string form keeps the numeric mask strictly boolean.
    geofips = df_gdp['GEOFIPS'].astype(str).str.replace('"', '', regex=False).str.strip()

    # Remove the rows where GEOFIPS is not a number
    is_numeric = geofips.str.isnumeric()
    df_gdp = df_gdp[is_numeric].copy()
    df_gdp['GEOFIPS'] = geofips[is_numeric].astype(int)

    string_dtype = pd.StringDtype("pyarrow")
    return df_gdp.astype({
        'GEO_NAME': string_dtype,
        'Region': string_dtype,
        'YEAR_ID': string_dtype,
        'CHAIN_TYPE_INDEX_GDP': float,
        'CURRENT_DOLLAR_GDP': float,
        'REAL_GDP': float,
    })

    
    

In [92]:
# Run the GDP cleaning step and display the dtype of each resulting column
gdp = reformat_gdp_data()
gdp.dtypes

Description
FACTS_GDP_ID                      int64
GEOFIPS                           int32
GEO_NAME                string[pyarrow]
Region                  string[pyarrow]
YEAR_ID                 string[pyarrow]
CHAIN_TYPE_INDEX_GDP            float64
CURRENT_DOLLAR_GDP              float64
REAL_GDP                        float64
dtype: object

In [93]:
def reformat_ppp_loan_data():
    """Clean the raw PPP loan table and rename its columns to the warehouse names.

    Steps: drop unused columns, rename the rest, drop rows missing the borrower
    state, the NAICS code or any of the three dates, turn the dates into
    YYYYMMDDHH integer keys, convert the NonProfit / Veteran flags to booleans,
    title-case address and city columns, and cast every column to its final
    dtype.

    Returns:
        pandas.DataFrame: The cleaned loans. VETERAN uses the nullable
        ``boolean`` dtype so that "Unanswered" (or any unexpected value) stays
        missing; a plain ``bool`` cast would silently turn those entries into
        True or False. LOAN_NUMBER and JOBS_REPORTED keep the dtype inferred
        when the CSV was read.
    """
    unused_columns = [
        'UTILITIES_PROCEED',
        'PAYROLL_PROCEED',
        'MORTGAGE_INTEREST_PROCEED',
        'RENT_PROCEED',
        'REFINANCE_EIDL_PROCEED',
        'HEALTH_CARE_PROCEED',
        'DEBT_INTEREST_PROCEED',
        'RuralUrbanIndicator',
        'HubzoneIndicator',
        'LMIIndicator',
        'ProjectState',
        'ProjectZip',
        'CD'
    ]
    # Source column name -> SQL column name
    column_names = {
        'LoanNumber': 'LOAN_NUMBER',
        'DateApproved': 'DATE_APPROVED_ID',
        'SBAOfficeCode': 'SBA_OFFICE_CODE',
        'ProcessingMethod': 'PROCESSING_METHOD',
        'BorrowerName': 'BORROWER_NAME',
        'BorrowerAddress': 'BORROWER_ADDRESS',
        'BorrowerCity': 'BORROWER_CITY',
        'BorrowerState': 'BORROWER_STATE',
        'BorrowerZip': 'BORROWER_ZIP',
        'LoanStatusDate': 'LOAN_STATUS_DATE_ID',
        'LoanStatus': 'LOAN_STATUS',
        'Term': 'TERM_MONTH',
        'SBAGuarantyPercentage': 'SBA_GUARANTY_PERCENTAGE',
        'InitialApprovalAmount': 'INITIAL_APPROVAL_AMOUNT',
        'CurrentApprovalAmount': 'CURRENT_APPROVAL_AMOUNT',
        'UndisbursedAmount': 'UNDISBURSED_AMOUNT',
        'FranchiseName': 'FRANCHISE_NAME',
        'ServicingLenderLocationID': 'SERVICING_LENDER_LOCATION_ID',
        'ServicingLenderName': 'SERVICING_LENDER_NAME',
        'ServicingLenderAddress': 'SERVICING_LENDER_ADDRESS',
        'ServicingLenderCity': 'SERVICING_LENDER_CITY',
        'ServicingLenderState': 'SERVICING_LENDER_STATE',
        'ServicingLenderZip': 'SERVICING_LENDER_ZIP',
        'BusinessAgeDescription': 'BUSINESS_AGE_DESCRIPTION',
        'ProjectCity': 'PROJECT_CITY',
        'ProjectCountyName': 'PROJECT_COUNTY_NAME',
        'Race': 'RACE',
        'Ethnicity': 'ETHNICITY',
        'Gender': 'GENDER',
        'BusinessType': 'BUSINESS_TYPE',
        'OriginatingLenderLocationID': 'ORIGINATING_LENDER_LOCATION_ID',
        'OriginatingLender': 'ORIGINATING_LENDER',
        'OriginatingLenderCity': 'ORIGINATING_LENDER_CITY',
        'OriginatingLenderState': 'ORIGINATING_LENDER_STATE',
        'Veteran': 'VETERAN',
        'NonProfit': 'NONPROFIT',
        'ForgivenessAmount': 'FORGIVENESS_AMOUNT',
        'ForgivenessDate': 'FORGIVENESS_DATE_ID',
        'JobsReported': 'JOBS_REPORTED',
        'NAICSCode': 'NAICS_CODE'
    }
    date_columns = ['DATE_APPROVED_ID', 'LOAN_STATUS_DATE_ID', 'FORGIVENESS_DATE_ID']
    title_case_columns = [
        'BORROWER_ADDRESS',
        'BORROWER_CITY',
        'ORIGINATING_LENDER_CITY',
        'SERVICING_LENDER_CITY',
        'PROJECT_CITY',
    ]
    # NOTE(review): TERM_MONTH is cast to string here although the schema lists
    # it as an integer - confirm which one the warehouse load expects.
    string_columns = [
        'PROCESSING_METHOD',
        'BORROWER_NAME',
        'BORROWER_ADDRESS',
        'BORROWER_CITY',
        'BORROWER_STATE',
        'BORROWER_ZIP',
        'LOAN_STATUS',
        'TERM_MONTH',
        'FRANCHISE_NAME',
        'SERVICING_LENDER_NAME',
        'SERVICING_LENDER_ADDRESS',
        'SERVICING_LENDER_CITY',
        'SERVICING_LENDER_STATE',
        'SERVICING_LENDER_ZIP',
        'BUSINESS_AGE_DESCRIPTION',
        'PROJECT_CITY',
        'PROJECT_COUNTY_NAME',
        'RACE',
        'ETHNICITY',
        'GENDER',
        'BUSINESS_TYPE',
        'ORIGINATING_LENDER',
        'ORIGINATING_LENDER_CITY',
        'ORIGINATING_LENDER_STATE',
    ]
    integer_columns = [
        'SBA_OFFICE_CODE',
        'SERVICING_LENDER_LOCATION_ID',
        'ORIGINATING_LENDER_LOCATION_ID',
        'NAICS_CODE',
    ]
    float_columns = [
        'SBA_GUARANTY_PERCENTAGE',
        'INITIAL_APPROVAL_AMOUNT',
        'CURRENT_APPROVAL_AMOUNT',
        'UNDISBURSED_AMOUNT',
        'FORGIVENESS_AMOUNT',
    ]

    # Drop unused columns, rename to the SQL names, and discard rows missing
    # the borrower state, the NAICS code or any of the dates.
    df_ppp = (
        get_ppp_loan_data()
        .drop(columns=unused_columns)
        .rename(columns=column_names)
        .dropna(subset=['BORROWER_STATE', 'NAICS_CODE'] + date_columns)
        .copy()
    )

    # Change the date columns to integer keys in the YYYYMMDDHH format
    for column in date_columns:
        df_ppp[column] = pd.to_datetime(df_ppp[column]).dt.strftime('%Y%m%d%H').astype(int)

    # NonProfit is True only for 'Y'; anything else, including missing, is False
    df_ppp['NONPROFIT'] = df_ppp['NONPROFIT'].eq('Y')

    # Veteran becomes a nullable boolean: values outside the mapping (such as
    # 'Unanswered') stay missing instead of being forced to True/False.
    df_ppp['VETERAN'] = (
        df_ppp['VETERAN']
        .map({'Veteran': True, 'Non-Veteran': False})
        .astype('boolean')
    )

    # Title-case the address and city columns
    for column in title_case_columns:
        df_ppp[column] = df_ppp[column].str.title()

    # Cast the remaining columns in a single pass
    string_dtype = pd.StringDtype("pyarrow")
    dtypes = {column: string_dtype for column in string_columns}
    dtypes.update({column: int for column in integer_columns})
    dtypes.update({column: float for column in float_columns})

    return df_ppp.astype(dtypes)

In [94]:
ppp = reformat_ppp_loan_data()
# Show the dtype of every column in the cleaned PPP frame
ppp.dtypes

Downloading public_150k_plus_230930.csv
Downloaded public_150k_plus_230930.csv successfully
Reading public_150k_plus_230930.csv


LOAN_NUMBER                                 int64
DATE_APPROVED_ID                            int32
SBA_OFFICE_CODE                             int32
PROCESSING_METHOD                 string[pyarrow]
BORROWER_NAME                     string[pyarrow]
BORROWER_ADDRESS                  string[pyarrow]
BORROWER_CITY                     string[pyarrow]
BORROWER_STATE                    string[pyarrow]
BORROWER_ZIP                      string[pyarrow]
LOAN_STATUS_DATE_ID                         int32
LOAN_STATUS                       string[pyarrow]
TERM_MONTH                        string[pyarrow]
SBA_GUARANTY_PERCENTAGE                   float64
INITIAL_APPROVAL_AMOUNT                   float64
CURRENT_APPROVAL_AMOUNT                   float64
UNDISBURSED_AMOUNT                        float64
FRANCHISE_NAME                    string[pyarrow]
SERVICING_LENDER_LOCATION_ID                int32
SERVICING_LENDER_NAME             string[pyarrow]
SERVICING_LENDER_ADDRESS          string[pyarrow]


# Build the Dimensions and Facts Tables

In [95]:
clean_ppp_data = reformat_ppp_loan_data()
clean_naics_data = reformat_naics_data()
clean_gdp_data = reformat_gdp_data()


# Create the dimensions
# Reuse the frame cleaned above rather than downloading and cleaning the NAICS
# file a second time; .copy() keeps the dimension independent of it.
dim_naics = clean_naics_data.copy() # Completed
# SBA_OFFICE_CODE is the primary key, so keep one row per distinct code.
dim_sba_office = clean_ppp_data[['SBA_OFFICE_CODE']].drop_duplicates().reset_index(drop=True) # Completed

# Placeholders for dimensions that still have to be populated.
# dim_geography uses the column names listed in the SQL schema (GEO_NAME, REGION).
dim_geography = pd.DataFrame(columns=['GEOFIPS', 'GEO_NAME', 'REGION', 'PROJECT_COUNTY_NAME', 'PROJECT_STATE'])
dim_date = pd.DataFrame(columns=['DATE_ID'])
dim_originating_lender = pd.DataFrame(columns=['ORIGINATING_LENDER_ID'])
dim_borrower = pd.DataFrame(columns=['BORROWER_ID'])
dim_servicing_lender = pd.DataFrame(columns=['SERVICING_LENDER_ID'])

# These tables will need to be built with .factorize() method
dim_loan_status = pd.DataFrame(columns=['LOAN_STATUS_ID'])
dim_business_type = pd.DataFrame(columns=['BUSINESS_TYPE_ID'])
dim_processing_method = pd.DataFrame(columns=['PROCESSING_METHOD_ID'])
dim_term = pd.DataFrame(columns=['TERM_ID'])
dim_business_age = pd.DataFrame(columns=['BUSINESS_AGE_ID'])

# Create the fact table
facts_ppp_loan = pd.DataFrame(columns=['FACTS_PPP_ID'])
facts_gdp = pd.DataFrame(columns=['FACTS_GDP_ID'])

Downloading public_150k_plus_230930.csv
Downloaded public_150k_plus_230930.csv successfully
Reading public_150k_plus_230930.csv


### Date Dimension
Start date: 2017-01-01 00:00:00 

2017 is the minimum year in the GDP data

End date: 2023-10-01 00:00:00

October 2023 is the maximum date in the PPP data

In [96]:
def week_of_month(dt):
    """Return the 1-based week of the month for a date.

    Weeks are counted in fixed seven-day blocks starting on the 1st: days 1-7
    are week 1, days 8-14 week 2, and so on up to week 5 for days 29-31. The
    weekday on which the month starts plays no role.

    Args:
        dt: Any object with an integer ``day`` attribute (``datetime.date``,
            ``datetime.datetime``, ``pandas.Timestamp``).

    Returns:
        int: Week number between 1 and 5.
    """
    # The month calendar that used to be built here was never read, so it is
    # no longer computed for every row.
    return (dt.day - 1) // 7 + 1

start_date = pd.to_datetime('2017-01-01 00:00:00')
end_date = pd.to_datetime('2023-10-01 00:00:00')

# Create a DataFrame for the date dimension: one row per hour, both ends included.
# The step is given as a Timedelta because the 'H' frequency alias is
# deprecated in recent pandas releases.
date_dimension = pd.DataFrame({'date': pd.date_range(start_date, end_date, freq=pd.Timedelta(hours=1))})

# Extract attributes
date_dimension['year_number'] = date_dimension['date'].dt.year
date_dimension['quarter_number'] = date_dimension['date'].dt.quarter
date_dimension['month_number'] = date_dimension['date'].dt.month
date_dimension['monthName'] = date_dimension['date'].dt.strftime('%B')
date_dimension['daynumber'] = date_dimension['date'].dt.day
date_dimension['dayName'] = date_dimension['date'].dt.strftime('%A')
date_dimension['hour_number'] = date_dimension['date'].dt.hour
date_dimension['date_iso_format'] = date_dimension['date'].dt.strftime('%Y-%m-%dT%H:%M:%S')
# Stored as an integer (YYYYMMDDHH) so it has the same type as the integer date
# keys on the PPP loans and as DATE_ID in the schema.
date_dimension['date_id'] = date_dimension['date'].dt.strftime('%Y%m%d%H').astype('int64')

# Add week of the month and week of the year
date_dimension['weekofMonth'] = date_dimension['date'].apply(week_of_month)
# %U yields a zero-padded string ("00".."53", weeks starting on Sunday);
# convert it so the column is numeric like WEEK_OF_YEAR in the schema.
date_dimension['weekofYear'] = date_dimension['date'].dt.strftime('%U').astype('int64')

new_order = ['date_id', 'date_iso_format','year_number','quarter_number','month_number','daynumber','hour_number','monthName','dayName','weekofYear','weekofMonth']
date_dimension = date_dimension[new_order]

date_dimension.head(10)

Unnamed: 0,date_id,date_iso_format,year_number,quarter_number,month_number,daynumber,hour_number,monthName,dayName,weekofYear,weekofMonth
0,2017010100,2017-01-01T00:00:00,2017,1,1,1,0,January,Sunday,1,1
1,2017010101,2017-01-01T01:00:00,2017,1,1,1,1,January,Sunday,1,1
2,2017010102,2017-01-01T02:00:00,2017,1,1,1,2,January,Sunday,1,1
3,2017010103,2017-01-01T03:00:00,2017,1,1,1,3,January,Sunday,1,1
4,2017010104,2017-01-01T04:00:00,2017,1,1,1,4,January,Sunday,1,1
5,2017010105,2017-01-01T05:00:00,2017,1,1,1,5,January,Sunday,1,1
6,2017010106,2017-01-01T06:00:00,2017,1,1,1,6,January,Sunday,1,1
7,2017010107,2017-01-01T07:00:00,2017,1,1,1,7,January,Sunday,1,1
8,2017010108,2017-01-01T08:00:00,2017,1,1,1,8,January,Sunday,1,1
9,2017010109,2017-01-01T09:00:00,2017,1,1,1,9,January,Sunday,1,1


# Transform the Data 

In [97]:
# Transfer originating lender data to dim_originating_lender
dim_originating_lender = clean_ppp_data[['ORIGINATING_LENDER_LOCATION_ID', 'ORIGINATING_LENDER', 'ORIGINATING_LENDER_CITY', 'ORIGINATING_LENDER_STATE']]


dim_borrower = clean_ppp_data[['BORROWER_NAME', 'BORROWER_ADDRESS', 'BORROWER_CITY', 'BORROWER_STATE', 'BORROWER_ZIP', 'RACE', 'ETHNICITY', 'FRANCHISE_NAME', 'GENDER', 'VETERAN', 'NONPROFIT', 'JOBS_REPORTED']]

# Transfer servicing lender data to dim_servicing_lender
dim_servicing_lender = clean_ppp_data[['SERVICING_LENDER_LOCATION_ID', 'SERVICING_LENDER_NAME', 'SERVICING_LENDER_ADDRESS', 'SERVICING_LENDER_CITY', 'SERVICING_LENDER_STATE', 'SERVICING_LENDER_ZIP']]

# Save the data frames as CSV  

# Load Data into the Data Warehouse

# Main

In [98]:
# Placeholder entry point: nothing is executed from here yet.
if __name__ == "__main__":
    pass