In [223]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from sqlalchemy import create_engine
import pandas as pd
import json
import requests
import io

Extract Data from Azure

Transform, format, and clean the data so that it matches the SQL schema

Save the data frames as CSV and Load Data into the Data Warehouse

In [224]:
# Read the JSON config file.
# It must provide the 'AZURE_CONNECTION_STRING' and 'DW_CONNECTION_STRING'
# keys that are looked up below.
config_file_path = 'config.json'
with open(config_file_path, 'r') as config_file:
    config = json.load(config_file) 

# Azure connection string. The resulting client is a module-level object that
# the blob helper functions in the following cells use directly.
CONNECTION_STRING = config['AZURE_CONNECTION_STRING']
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)

# Database connection string for the data warehouse. It is only stored here;
# none of the visible cells opens a connection with it.
DATABASE = config['DW_CONNECTION_STRING']


In [225]:
def upload_to_azure(data, blob_name, container_name):
    """Upload an in-memory buffer to Azure Blob Storage.

    Args:
        data: Buffer exposing ``getvalue()`` (for example ``io.BytesIO`` or
            ``io.StringIO``); its complete contents are uploaded.
        blob_name: Name of the destination blob.
        container_name: Name of the destination container.
    """
    # The blob client comes straight from the service client, so no separate
    # container client has to be created first.
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    # overwrite=True is passed so an existing blob with the same name is
    # replaced when the pipeline is re-run.
    blob_client.upload_blob(data.getvalue(), overwrite=True)

In [226]:
def get_blob_list(container_name):
    """Return the blob listing of ``container_name``.

    The value is whatever ``list_blobs()`` returns for the container client
    obtained from the shared ``blob_service_client``.
    """
    return blob_service_client.get_container_client(container_name).list_blobs()

In [227]:
def get_azure_blob_data(container_name, blob):
    """Download a single blob and return the result of ``readall()``.

    Args:
        container_name: Container that holds the blob.
        blob: Listing entry for the blob; only its ``name`` attribute is read.
    """
    downloader = (
        blob_service_client
        .get_container_client(container_name)
        .get_blob_client(blob.name)
        .download_blob()
    )
    return downloader.readall()

In [228]:
def download_file(url, timeout=60):
    """Fetch ``url`` over HTTP and return the response body as an in-memory buffer.

    Args:
        url: Address of the file to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            ``None`` disables the timeout.

    Returns:
        io.BytesIO: Buffer holding the raw response bytes.

    Raises:
        requests.exceptions.RequestException: On connection problems, on a
            timeout, or (as ``HTTPError``) when the server answers with an
            error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors; otherwise the body of an error page would be
    # handed to the caller as if it were the requested file.
    response.raise_for_status()
    return io.BytesIO(response.content)

# Data Dictionary for PPP Data
Field Name,Field Description,Type,Format,Field Length,Max Value
LoanNumber,Loan Number (unique identifier),INT,,,
DateApproved,Loan Funded Date,Date,MM/DD/YYYY,,
SBAOfficeCode,SBA Origination Office Code,VARCHAR,4 #'s,4,
ProcessingMethod,Loan Delivery Method (PPP for first draw; PPS for second draw),VARCHAR,"PPP 
or PPS",3,
BorrowerName,Borrower Name,NVARCHAR ,,,
BorrowerAddress,Borrower Street Address,VARCHAR,,,
BorrowerCity,Borrower City,VARCHAR,,,
BorrowerState,Borrower State,VARCHAR,Two letter abbreviations for states,2,
BorrowerZip,Borrower Zip Code,VARCHAR,,,
LoanStatusDate,"Loan Status Date: Loan Status Date is  blank when the loan is disbursed but not Paid In Full or Charged Off",Date,MM/DD/YYYY,,
LoanStatus,"Loan Status Description: Loan Status is replaced by 'Exemption 4' when the loan is disbursed but not Paid in Full or Charged Off",Text,"Exemption 4
Paid in Full
or Charged Off",,
Term,Loan Maturity in Months,INT,,,
SBAGuarantyPercentage,SBA Guaranty Percentage,INT,,,100
InitialApprovalAmount,Loan Approval Amount(at origination),Decimal,,,
CurrentApprovalAmount,Loan Approval Amount (current),Decimal,,,
UndisbursedAmount,Undisbursed Amount,Decimal,,,
FranchiseName,Franchise Name,VARCHAR,,,
ServicingLenderLocationID,Lender Location ID (unique identifier),INT,,,
ServicingLenderName,Servicing Lender Name,VARCHAR,,,
ServicingLenderAddress,Servicing Lender Street Address,VARCHAR,,,
ServicingLenderCity,Servicing Lender City,VARCHAR,,,
ServicingLenderState,Servicing Lender State,VARCHAR,Two letter abbreviations for states,2,
ServicingLenderZip,Servicing Lender Zip Code,VARCHAR,,,
RuralUrbanIndicator,Rural or Urban Indicator (R/U),CHAR,U or R,1,
HubzoneIndicator,Hubzone Indicator (Y/N),CHAR,N or Y,1,
LMIIndicator,LMI Indicator (Y/N),CHAR,N or Y,1,
BusinessAgeDescription,Business Age Description,VARCHAR,"Existing or more than 2 years old
New Business or 2 years or less
or Unanswered",,
ProjectCity,Project City,VARCHAR,,,
ProjectCountyName,Project County Name,VARCHAR,,,
ProjectState,Project State,VARCHAR,Two letter abbreviations for states,2,
ProjectZip,Project Zip Code,VARCHAR,,,
CD,Project Congressional District,VARCHAR,XY-# (ex. NY-15),,
JobsReported,Number of Employees,INT,,,500
NAICSCode,NAICS 6 digit code,INT,6 #'s,6,
Race,Borrower Race Description,VARCHAR,,,
Ethnicity,Borrower Ethnicity Description,VARCHAR,,,
UTILITIES_PROCEED,Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.,VARCHAR,"it's either ""Not Stated"" or a decimal representing $",,
PAYROLL_PROCEED,,Decimal,,,
MORTGAGE_INTEREST_PROCEED,,Decimal,,,
RENT_PROCEED,,Decimal,,,
REFINANCE_EIDL_PROCEED,,Decimal,,,
HEALTH_CARE_PROCEED,,Decimal,,,
DEBT_INTEREST_PROCEED,,Decimal,,,
BusinessType,Business Type Description,VARCHAR,About 22 different business types,,
OriginatingLenderLocationID,Originating Lender ID (unique identifier),INT,,,
OriginatingLender,Originating Lender Name,VARCHAR,,,
OriginatingLenderCity,Originating Lender City,VARCHAR,,,
OriginatingLenderState,Originating Lender State,VARCHAR,Two letter abbreviations for states,2,
Gender,Gender Indicator,VARCHAR,"Male Owned
Female Owned
or Unanswered",,
Veteran,Veteran Indicator,VARCHAR,"Veteran
Non-Veteran
or Unanswered",,
NonProfit,'Yes' if Business Type = Non-Profit Organization or Non-Profit Childcare Center or 501(c) Non Profit,CHAR,"Y
or NULL(empty)",1,
ForgivenessAmount,Forgiveness Amount,Decimal,,,
ForgivenessDate,Forgiveness Paid Date,Date,MM/DD/YYYY,,

# Data Dictionary for NAICS Data
Field Name,Field Description,Type,Format,Field Length
Code,"NAICS Code: NAICS is an industry classification system that groups establishments into industries based on
the similarity of their production processes. ",INT,,
Title,Industry Name,NVARCHAR ,,
Description,Industry description,Text,, 

# Data Dictionary for GDP Data
Field Name,Field Description,Type,Format,Field Length
GeoFIPS,"The Census Bureau has published FIPS codes. 
FIPS codes are assigned alphabetically by geographic name for states, counties, core based statistical areas, places, county subdivisions, consolidated cities and all types of American Indian, Alaska Native, and Native Hawaiian (AIANNH) areas. 
Lists of geographic FIPS codes in census products can be found on the ANSI/FIPS Codes page.",VARCHAR,"5 #'s in "" "" quotation marks (""#####"")",7
GeoName,Location name of the corresponding GeoFIPS,VARCHAR,"County, State",
Region,U.S. FIPS Region Codes (four regions) created by the Census Bureau,INT,# 1 through 4,1
LineCode,Represents the three choices in the description ,INT,"1, 2, or 3",1
TableName,Table source of data: CAGDP1 (GDP summary by county and MSA),VARCHAR,CAGDP1,
IndustryClassification,"The value entered for every row is ""...""",VARCHAR,,
Description,"Three choices: 
Real GDP (thousands of chained 2017 dollars), 
Chain-type quantity indexes for real GDP, 
or Current-dollar GDP (thousands of current dollars) ",VARCHAR,,
Unit,"Three choices: 
Thousands of chained 2017 dollars, 
Quantity index, 
or Thousands of dollars",VARCHAR,,
2017,Data input for the year 2017 based on the description and unit column,Decimal,,
2018,Data input for the year 2018 based on the description and unit column,Decimal,,
2019,Data input for the year 2019 based on the description and unit column,Decimal,,
2020,Data input for the year 2020 based on the description and unit column,Decimal,,
2021,Data input for the year 2021 based on the description and unit column,Decimal,,
2022,Data input for the year 2022 based on the description and unit column,Decimal,,

# Dimensional Model
dimensions,facts_ppp,facts_gdp,dim_geography,dim_naics,dim_date,dim_originating_lender,dim_borrower,dim_business_type,dim_business_age,dim_term,dim_loan_status,dim_processing_method,dim_sba_office,dim_servicing_lender
dim_geography,facts_id,facts_gdp_id,geofips,naics_code,date_id,originating_lender_id,borrower_id,business_type_id,business_age_id,term_id,loan_status_id,processing_method_id,sba_office_code,servicing_lender_id
dim_naics,naics_code,year_id,geoname,title,year_number,originating_lender_location_id,borrower_name,business_type,business_age_description,term_month,loan_status,processing_method,,servicing_lender_location_id
dim_date,date_approved_id,Real GDP,region,description,month_number,originating_lender,borrower_address,,,,,,,servicing_lender_name
dim_originating_lender,loan_status_date_id,Chain-type quantity indexes for real GDP ,project_county_name,,quarter_number,originating_lender_city,borrower_city,,,,,,,servicing_lender_address
dim_borrower,forgiveness_date_id,Current-dollar GDP,project_state,,day_number,originating_lender_state,borrower_state,,,,,,,servicing_lender_city
dim_term,originating_lender_id,geofips,,,hour_number,,borrower_zip,,,,,,,servicing_lender_state
dim_loan_status,borrower_id,,,,isholiday,,race,,,,,,,servicing_lender_zip
dim_processing_method,loan_number,,,,day_name,,ethnicity,,,,,,,
dim_sba_office,servicing_lender_id,,,,month_name,,franchise_name,,,,,,,
dim_servicing_lender,project_id,,,,week_of_month,,gender,,,,,,,
,term_id,,,,week_of_year,,veteran,,,,,,,
,loan_status_id,,,,,,nonprofit,,,,,,,
,processing_method_id,,,,,,jobs_reported,,,,,,,
,sba_office_code,,,,,,,,,,,,,
,sba_guaranty_percentage,,,,,,,,,,,,,
,initial_approval_amount,,,,,,,,,,,,,
,current_approval_amount,,,,,,,,,,,,,
,undisbursed_amount,,,,,,,,,,,,,
,forgiveness_amount,,,,,,,,,,,,,
,geofips_id,,,,,,,,,,,,,

# SQL Schema
CREATE DATABASE PPP_LOAN;

-- A PUBLIC schema normally already exists in a newly created database
-- (Snowflake and PostgreSQL both create it automatically), so a plain
-- CREATE SCHEMA would abort the script here. IF NOT EXISTS turns this into a
-- no-op in that case.
CREATE SCHEMA IF NOT EXISTS PUBLIC;

-- Borrower dimension: one row per BORROWER_ID, referenced by FACTS_PPP.BORROWER_ID.
-- NOTE(review): VETERAN is boolean, but the PPP data dictionary lists three source
-- values (Veteran / Non-Veteran / Unanswered) - confirm how "Unanswered" is stored.
CREATE  TABLE PUBLIC.DIM_BORROWER ( 
	BORROWER_ID          integer NOT NULL   ,
	BORROWER_NAME        varchar(200)    ,
	BORROWER_ADDRESS     varchar(200)    ,
	BORROWER_CITY        varchar(100)    ,
	BORROWER_STATE       varchar(100)    ,
	BORROWER_ZIP         varchar(100)    ,
	RACE                 varchar(100)    ,
	ETHNICITY            varchar(100)    ,
	GENDER               varchar(100)    ,
	VETERAN              boolean    ,
	FRANCHISE_NAME       varchar(200)    ,
	NONPROFIT            boolean    ,
	JOBS_REPORTED        integer    ,
	CONSTRAINT PK_DIM_BORROWER PRIMARY KEY ( BORROWER_ID )
 );

-- Business-age dimension: lookup of BUSINESS_AGE_DESCRIPTION values keyed by
-- BUSINESS_AGE_ID; referenced by FACTS_PPP.BUSINESS_AGE_ID.
CREATE  TABLE PUBLIC.DIM_BUSINESS_AGE ( 
	BUSINESS_AGE_ID      integer NOT NULL   ,
	BUSINESS_AGE_DESCRIPTION varchar(200)    ,
	CONSTRAINT PK_DIM_BUSINESS_AGE PRIMARY KEY ( BUSINESS_AGE_ID )
 );

-- Business-type dimension: lookup of BUSINESS_TYPE values keyed by
-- BUSINESS_TYPE_ID; referenced by FACTS_PPP.BUSINESS_TYPE_ID.
CREATE  TABLE PUBLIC.DIM_BUSINESS_TYPE ( 
	BUSINESS_TYPE_ID     integer NOT NULL   ,
	BUSINESS_TYPE        varchar(200)    ,
	CONSTRAINT PK_DIM_BUSINESS_TYPE PRIMARY KEY ( BUSINESS_TYPE_ID )
 );

-- Date dimension: calendar attributes keyed by DATE_ID. Referenced by FACTS_PPP
-- (DATE_APPROVED_ID, LOAN_STATUS_DATE_ID, FORGIVENESS_DATE_ID) and by
-- FACTS_GDP (YEAR_ID).
CREATE  TABLE PUBLIC.DIM_DATE ( 
	DATE_ID              integer NOT NULL   ,
	YEAR_NUMBER          integer    ,
	MONTH_NUMBER         integer    ,
	QUARTER_NUMBER       integer    ,
	DAY_NUMBER           integer    ,
	HOUR_NUMBER          integer    ,
	ISHOLIDAY            boolean    ,
	DAY_NAME             varchar(100)    ,
	MONTH_NAME           varchar(100)    ,
	WEEK_OF_MONTH        integer    ,
	WEEK_OF_YEAR         integer    ,
	CONSTRAINT PK_DIM_DATE PRIMARY KEY ( DATE_ID )
 );

-- Geography dimension keyed by GEOFIPS; referenced by FACTS_GDP.GEOFIPS and
-- FACTS_PPP.GEOFIPS.
-- NOTE(review): GEOFIPS is integer here, while the GDP data dictionary describes
-- GeoFIPS as a VARCHAR of five digits wrapped in quotes (e.g. "00000"). Confirm
-- the intended conversion; an integer cannot keep leading zeros.
CREATE  TABLE PUBLIC.DIM_GEOGRAPHY ( 
	GEOFIPS              integer NOT NULL   ,
	GEO_NAME             varchar(100)    ,
	REGION               varchar(50)    ,
	PROJECT_COUNTY_NAME  varchar(200)    ,
	PROJECT_STATE        varchar(100)    ,
	CONSTRAINT PK_DIM_GEOGRAPHY PRIMARY KEY ( GEOFIPS )
 );

-- Loan-status dimension: lookup of LOAN_STATUS values keyed by LOAN_STATUS_ID;
-- referenced by FACTS_PPP.LOAN_STATUS_ID. The PPP data dictionary lists the
-- source values Exemption 4, Paid in Full and Charged Off.
CREATE  TABLE PUBLIC.DIM_LOAN_STATUS ( 
	LOAN_STATUS_ID       integer NOT NULL   ,
	LOAN_STATUS          varchar(100)    ,
	CONSTRAINT PK_DIM_LOAN_STATUS PRIMARY KEY ( LOAN_STATUS_ID )
 );

-- NAICS industry dimension keyed by the NAICS code itself; referenced by
-- FACTS_PPP.NAICS_CODE.
CREATE  TABLE PUBLIC.DIM_NAICS ( 
	NAICS_CODE           integer NOT NULL   ,
	NAICS_TITLE          varchar(200)    ,
	DESCRIPTION          text    ,
	CONSTRAINT PK_NAICS_CODE PRIMARY KEY ( NAICS_CODE )
 );

-- Originating-lender dimension: one row per ORIGINATING_LENDER_ID, referenced by
-- FACTS_PPP.ORIGINATING_LENDER_ID. ORIGINATING_LENDER_LOCATION_ID is kept as a
-- plain attribute next to the key.
CREATE  TABLE PUBLIC.DIM_ORIGINATING_LENDER ( 
	ORIGINATING_LENDER_ID integer NOT NULL   ,
	ORIGINATING_LENDER_LOCATION_ID integer    ,
	ORIGINATING_LENDER   varchar(200)    ,
	ORIGINATING_LENDER_CITY varchar(200)    ,
	ORIGINATING_LENDER_STATE varchar(100)    ,
	CONSTRAINT PK_DIM_ORIGINATING_LENDER PRIMARY KEY ( ORIGINATING_LENDER_ID )
 );

-- Processing-method dimension: lookup keyed by PROCESSING_METHOD_ID; referenced
-- by FACTS_PPP.PROCESSING_METHOD_ID. The PPP data dictionary lists the source
-- values PPP (first draw) and PPS (second draw).
CREATE  TABLE PUBLIC.DIM_PROCESSING_METHOD ( 
	PROCESSING_METHOD_ID integer NOT NULL   ,
	PROCESSING_METHOD    varchar(100)    ,
	CONSTRAINT PK_DIM_PROCESSING_METHOD PRIMARY KEY ( PROCESSING_METHOD_ID )
 );

-- SBA-office dimension: holds only the office code, which is also the key;
-- referenced by FACTS_PPP.SBA_OFFICE_CODE.
-- NOTE(review): the PPP data dictionary types SBAOfficeCode as a 4-character
-- VARCHAR - confirm that storing it as integer is intended.
CREATE  TABLE PUBLIC.DIM_SBA_OFFICE ( 
	SBA_OFFICE_CODE      integer NOT NULL   ,
	CONSTRAINT PK_DIM_SBA_OFFICE PRIMARY KEY ( SBA_OFFICE_CODE )
 );

-- Servicing-lender dimension: one row per SERVICING_LENDER_ID, referenced by
-- FACTS_PPP.SERVICING_LENDER_ID.
CREATE  TABLE PUBLIC.DIM_SERVICING_LENDER ( 
	SERVICING_LENDER_ID  integer NOT NULL   ,
	SERVICING_LENDER_LOCATION_ID integer    ,
	SERVICING_LENDER_NAME varchar(200)    ,
	SERVICING_LENDER_ADDRESS varchar(200)    ,
	SERVICING_LENDER_CITY varchar(100)    ,
	SERVICING_LENDER_STATE varchar(200)    ,
	-- ZIP codes are text, matching BORROWER_ZIP and the data dictionary (VARCHAR):
	-- an integer column drops leading zeros and cannot hold ZIP+4 values such as
	-- 12345-6789.
	SERVICING_LENDER_ZIP varchar(100)    ,
	CONSTRAINT PK_DIM_SERVICING_LENDER PRIMARY KEY ( SERVICING_LENDER_ID )
 );

-- Loan-term dimension: TERM_MONTH (loan maturity in months per the PPP data
-- dictionary) keyed by TERM_ID; referenced by FACTS_PPP.TERM_ID.
CREATE  TABLE PUBLIC.DIM_TERM ( 
	TERM_ID              integer NOT NULL   ,
	TERM_MONTH           integer    ,
	CONSTRAINT PK_DIM_TERM PRIMARY KEY ( TERM_ID )
 );

-- NOTE(review): ENTITY declares no columns and nothing else in this script
-- refers to it. It looks like a leftover placeholder from the modelling tool;
-- many SQL engines reject an empty column list - confirm whether it should be
-- removed.
CREATE  TABLE PUBLIC.ENTITY ( 
 );

-- GDP fact table: one row per FACTS_GDP_ID carrying the three GDP measures for a
-- geography (GEOFIPS) and a year (YEAR_ID). Both columns are declared as foreign
-- keys (to DIM_GEOGRAPHY and DIM_DATE) by the ALTER TABLE statements further down.
CREATE  TABLE PUBLIC.FACTS_GDP ( 
	FACTS_GDP_ID         integer NOT NULL   ,
	YEAR_ID              integer NOT NULL   ,
	REAL_GDP             number    ,
	CHAIN_TYPE_INDEX_GDP number    ,
	CURRENT_DOLLAR_GDP   number    ,
	GEOFIPS              integer NOT NULL   ,
	CONSTRAINT PK_FACTS_GDP PRIMARY KEY ( FACTS_GDP_ID )
 );

-- PPP loan fact table: one row per FACTS_PPP_ID. The *_ID / *_CODE columns point
-- at the dimension tables (foreign keys are added by the ALTER TABLE statements
-- further down); the trailing "number" columns are the loan measures.
-- NOTE(review): LOAN_STATUS_DATE_ID and FORGIVENESS_DATE_ID are NOT NULL, yet
-- the PPP data dictionary says the loan status date is blank for loans that are
-- not Paid in Full or Charged Off - confirm that DIM_DATE carries a placeholder
-- row for missing dates.
CREATE  TABLE PUBLIC.FACTS_PPP ( 
	FACTS_PPP_ID         integer NOT NULL   ,
	LOAN_NUMBER          integer    ,
	NAICS_CODE           integer NOT NULL   ,
	GEOFIPS              integer NOT NULL   ,
	DATE_APPROVED_ID     integer NOT NULL   ,
	LOAN_STATUS_DATE_ID  integer NOT NULL   ,
	FORGIVENESS_DATE_ID  integer NOT NULL   ,
	BORROWER_ID          integer NOT NULL   ,
	ORIGINATING_LENDER_ID integer NOT NULL   ,
	SERVICING_LENDER_ID  integer NOT NULL   ,
	TERM_ID              integer NOT NULL   ,
	LOAN_STATUS_ID       integer NOT NULL   ,
	PROCESSING_METHOD_ID integer NOT NULL   ,
	SBA_OFFICE_CODE      integer NOT NULL   ,
	BUSINESS_AGE_ID      integer NOT NULL   ,
	BUSINESS_TYPE_ID     integer NOT NULL   ,
	SBA_GUARANTY_PERCENTAGE number    ,
	INITIAL_APPROVAL_AMOUNT number    ,
	CURRENT_APPROVAL_AMOUNT number    ,
	UNDISBURSED_AMOUNT   number    ,
	FORGIVENESS_AMOUNT   number    ,
	CONSTRAINT PK_FACTS_PPP PRIMARY KEY ( FACTS_PPP_ID )
 );

-- Foreign keys are added after every table has been created, so the CREATE
-- TABLE statements do not depend on each other's order. All of them use
-- NO ACTION for both deletes and updates, i.e. nothing cascades.

-- FACTS_GDP -> geography and date dimensions.
ALTER TABLE PUBLIC.FACTS_GDP ADD CONSTRAINT FK_FACTS_GDP_DIM_GEOGRAPHY FOREIGN KEY ( GEOFIPS ) REFERENCES PUBLIC.DIM_GEOGRAPHY( GEOFIPS ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_GDP ADD CONSTRAINT FK_FACTS_GDP_DIM_DATE FOREIGN KEY ( YEAR_ID ) REFERENCES PUBLIC.DIM_DATE( DATE_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

-- FACTS_PPP -> its dimensions. DIM_DATE is referenced three times (approval,
-- loan-status and forgiveness dates), hence the _0 / _1 constraint suffixes.
ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_NAICS FOREIGN KEY ( NAICS_CODE ) REFERENCES PUBLIC.DIM_NAICS( NAICS_CODE ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_GEOGRAPHY FOREIGN KEY ( GEOFIPS ) REFERENCES PUBLIC.DIM_GEOGRAPHY( GEOFIPS ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_DATE FOREIGN KEY ( DATE_APPROVED_ID ) REFERENCES PUBLIC.DIM_DATE( DATE_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_DATE_0 FOREIGN KEY ( LOAN_STATUS_DATE_ID ) REFERENCES PUBLIC.DIM_DATE( DATE_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_DATE_1 FOREIGN KEY ( FORGIVENESS_DATE_ID ) REFERENCES PUBLIC.DIM_DATE( DATE_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_BORROWER FOREIGN KEY ( BORROWER_ID ) REFERENCES PUBLIC.DIM_BORROWER( BORROWER_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_ORIGINATING_LENDER FOREIGN KEY ( ORIGINATING_LENDER_ID ) REFERENCES PUBLIC.DIM_ORIGINATING_LENDER( ORIGINATING_LENDER_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_SERVICING_LENDER FOREIGN KEY ( SERVICING_LENDER_ID ) REFERENCES PUBLIC.DIM_SERVICING_LENDER( SERVICING_LENDER_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_TERM FOREIGN KEY ( TERM_ID ) REFERENCES PUBLIC.DIM_TERM( TERM_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_LOAN_STATUS FOREIGN KEY ( LOAN_STATUS_ID ) REFERENCES PUBLIC.DIM_LOAN_STATUS( LOAN_STATUS_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_PROCESSING_METHOD FOREIGN KEY ( PROCESSING_METHOD_ID ) REFERENCES PUBLIC.DIM_PROCESSING_METHOD( PROCESSING_METHOD_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_SBA_OFFICE FOREIGN KEY ( SBA_OFFICE_CODE ) REFERENCES PUBLIC.DIM_SBA_OFFICE( SBA_OFFICE_CODE ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_BUSINESS_AGE FOREIGN KEY ( BUSINESS_AGE_ID ) REFERENCES PUBLIC.DIM_BUSINESS_AGE( BUSINESS_AGE_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;

ALTER TABLE PUBLIC.FACTS_PPP ADD CONSTRAINT FK_FACTS_PPP_DIM_BUSINESS_TYPE FOREIGN KEY ( BUSINESS_TYPE_ID ) REFERENCES PUBLIC.DIM_BUSINESS_TYPE( BUSINESS_TYPE_ID ) ON DELETE NO ACTION ON UPDATE NO ACTION;


In [229]:
def get_ppp_loan_data():
    """Load the PPP "public_150k_plus" loan file from the 'pppdata' container.

    The first blob in the listing whose name contains ``public_150k_plus`` is
    downloaded and parsed as CSV; later matches are ignored.

    Returns:
        pandas.DataFrame: The parsed loan records.

    Raises:
        FileNotFoundError: If no blob in the container matches.
    """
    container_name = 'pppdata'

    for blob in get_blob_list(container_name):
        if "public_150k_plus" not in blob.name:
            continue
        print(f"Downloading {blob.name}")
        blob_data = get_azure_blob_data(container_name, blob)
        print(f"Downloaded {blob.name} successfully")
        print(f"Reading {blob.name}")
        # Parse in chunks and stitch them together; adjust the chunksize to the
        # memory available.
        df_chunks = pd.read_csv(io.BytesIO(blob_data), chunksize=100000)
        return pd.concat(df_chunks)

    # Falling off the loop used to return None implicitly, which only showed up
    # later as an AttributeError inside the transform step.
    raise FileNotFoundError(
        f"No blob containing 'public_150k_plus' found in container '{container_name}'"
    )

In [230]:
def get_naics_data():
    """Load the NAICS reference table from the 'naicsdata' container.

    Only the first blob returned by the listing is read; it is parsed as CSV.

    Returns:
        pandas.DataFrame: The parsed NAICS table.

    Raises:
        FileNotFoundError: If the container holds no blobs.
    """
    container_name = 'naicsdata'

    # The listing is consumed as a plain iterable; only its first entry is needed.
    first_blob = next(iter(get_blob_list(container_name)), None)
    if first_blob is None:
        # Previously this case returned None implicitly and failed later on.
        raise FileNotFoundError(f"No blobs found in container '{container_name}'")

    blob_data = get_azure_blob_data(container_name, first_blob)
    return pd.read_csv(io.BytesIO(blob_data))
    

In [231]:
def get_gdp_data():
    """Load the GDP table from the 'gdpdata' container.

    Only the first blob returned by the listing is read; it is parsed as CSV.

    Returns:
        pandas.DataFrame: The parsed GDP table.

    Raises:
        FileNotFoundError: If the container holds no blobs.
    """
    container_name = 'gdpdata'

    # The listing is consumed as a plain iterable; only its first entry is needed.
    first_blob = next(iter(get_blob_list(container_name)), None)
    if first_blob is None:
        # Previously this case returned None implicitly and failed later on.
        raise FileNotFoundError(f"No blobs found in container '{container_name}'")

    blob_data = get_azure_blob_data(container_name, first_blob)
    return pd.read_csv(io.BytesIO(blob_data))

In [232]:
def _clean_ppp_columns(df_ppp):
    """Return the PPP frame without the unused columns and with snake_case names."""
    unused_columns = [
        'UTILITIES_PROCEED',
        'PAYROLL_PROCEED',
        'MORTGAGE_INTEREST_PROCEED',
        'RENT_PROCEED',
        'REFINANCE_EIDL_PROCEED',
        'HEALTH_CARE_PROCEED',
        'DEBT_INTEREST_PROCEED',
        'RuralUrbanIndicator',
        'HubzoneIndicator',
        'LMIIndicator',
        'ProjectCity',
        'ProjectZip',
        'CD',
    ]
    # NOTE(review): source columns that are neither dropped above nor listed
    # here (e.g. DateApproved, LoanStatusDate, BusinessType, ProjectState,
    # OriginatingLender*) keep their original names - confirm that is intended.
    snake_case_names = {
        'LoanNumber': 'loan_number',
        'SBAOfficeCode': 'sba_office_code',
        'ProcessingMethod': 'processing_method',
        'BorrowerName': 'borrower_name',
        'BorrowerAddress': 'borrower_address',
        'BorrowerCity': 'borrower_city',
        'BorrowerState': 'borrower_state',
        'BorrowerZip': 'borrower_zip',
        'LoanStatus': 'loan_status',
        'Term': 'term_month',
        'SBAGuarantyPercentage': 'sba_guaranty_percentage',
        'InitialApprovalAmount': 'initial_approval_amount',
        'CurrentApprovalAmount': 'current_approval_amount',
        'UndisbursedAmount': 'undisbursed_amount',
        'FranchiseName': 'franchise_name',
        'ServicingLenderLocationID': 'servicing_lender_location_id',
        'ServicingLenderName': 'servicing_lender_name',
        'ServicingLenderAddress': 'servicing_lender_address',
        'ServicingLenderCity': 'servicing_lender_city',
        'ServicingLenderState': 'servicing_lender_state',
        'ServicingLenderZip': 'servicing_lender_zip',
        'Race': 'race',
        'Ethnicity': 'ethnicity',
        'Gender': 'gender',
        'Veteran': 'veteran',
        'NonProfit': 'nonprofit',
        'ForgivenessAmount': 'forgiveness_amount',
        'JobsReported': 'jobs_reported',
        'NAICSCode': 'naics_code',
    }
    # Chained, non-inplace calls leave the caller's frame untouched.
    return df_ppp.drop(columns=unused_columns).rename(columns=snake_case_names)


def _rename_naics_columns(df_naics):
    """Return the NAICS frame with its columns renamed to the warehouse names."""
    return df_naics.rename(columns={
        'Code': 'naics_code',
        'Title': 'naics_title',
        'Description': 'description',
    })


def _reshape_gdp(df_gdp):
    """Turn the wide GDP table into one row per (GeoFIPS, GeoName, year).

    Input columns used: GeoFIPS, GeoName, Description, 2017 ... 2022.
    Output columns: geo_fips, geo_name, date_id, chain_type_quantity_index,
    current_dollar_gdp, real_gdp, fact_id.
    """
    id_columns = ["GeoFIPS", "GeoName", "Description"]
    year_columns = ["2017", "2018", "2019", "2020", "2021", "2022"]
    measure_names = {
        "Chain-type quantity indexes for real GDP": "chain_type_quantity_index",
        "Current-dollar GDP (thousands of current dollars)": "current_dollar_gdp",
        "Real GDP (thousands of chained 2017 dollars)": "real_gdp",
    }

    gdp_long = (
        df_gdp[id_columns + year_columns]
        # The source labels carry stray surrounding whitespace; strip it so the
        # rename below does not depend on an exact trailing space.
        .assign(Description=lambda frame: frame["Description"].str.strip())
        # Years become rows ...
        .melt(id_vars=id_columns, value_vars=year_columns,
              var_name="date_id", value_name="Value")
    )
    gdp_wide = (
        gdp_long
        # ... and the three Description values become columns.
        .pivot_table(index=["GeoFIPS", "GeoName", "date_id"], columns="Description",
                     values="Value", aggfunc='first')
        .reset_index()
        .sort_values(by=["GeoFIPS", "date_id"])
        .rename(columns={**measure_names, "GeoFIPS": "geo_fips", "GeoName": "geo_name"})
    )
    # Sequential surrogate key, assigned in the sorted order.
    gdp_wide['fact_id'] = range(1, len(gdp_wide) + 1)
    return gdp_wide[['geo_fips', 'geo_name', 'date_id', 'chain_type_quantity_index',
                     'current_dollar_gdp', 'real_gdp', 'fact_id']]


def transform_data():
    """Fetch the PPP, NAICS and GDP tables and reshape them for the warehouse.

    Returns:
        tuple: ``(df_ppp, df_naics, df_gdp)`` where
            * ``df_ppp`` has the unused PPP columns dropped and the remaining
              mapped columns renamed to snake_case,
            * ``df_naics`` has its three columns renamed,
            * ``df_gdp`` has one row per (geo_fips, geo_name, date_id) with one
              column per GDP measure plus a sequential ``fact_id``.
    """
    # All three sources are fetched first, so a missing input fails before any
    # transformation work is done.
    df_ppp = get_ppp_loan_data()
    df_naics = get_naics_data()
    df_gdp = get_gdp_data()

    return (
        _clean_ppp_columns(df_ppp),
        _rename_naics_columns(df_naics),
        _reshape_gdp(df_gdp),
    )


    

In [233]:
# Run the extract + transform step once and keep the three resulting frames.
ppp, naics, gdp = transform_data()
# Preview the first rows of the reshaped GDP frame.
gdp.head()

Downloading public_150k_plus_230930.csv
Downloaded public_150k_plus_230930.csv successfully
Reading public_150k_plus_230930.csv


Description,geo_fips,geo_name,date_id,chain_type_quantity_index,current_dollar_gdp,real_gdp,fact_id
0,"""00000""",United States,2017,100.0,19612102000,19612102000,1
1,"""00000""",United States,2018,102.967,20656516000,20193896000,2
2,"""00000""",United States,2019,105.507,21521395000,20692087000,3
3,"""00000""",United States,2020,103.171,21322950000,20234074000,4
4,"""00000""",United States,2021,109.156,23594031000,21407692000,5


In [234]:
if __name__ == "__main__":
    # Placeholder entry point: it currently does nothing.
    # The calls below are disabled. No functions with these names are defined
    # in the visible cells, so they cannot simply be uncommented.
    #transform_gdp_data()
    #transform_naics_data()
    #transform_gdp_data()
    pass