### [SBA PPP Loans](https://data.sba.gov/dataset/ppp-foia)

In [None]:
##%load water.md
from IPython.display import Markdown, display

# Render the introductory notes at the top of the notebook.
# NOTE(review): the argument is "water.md" although this notebook covers
# SBA PPP loans -- confirm this is the intended notes file.
intro_notes = Markdown("water.md")
display(intro_notes)

## Set-Up

In [None]:
from os import environ
from urllib.parse import quote_plus

from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine

# Read the local .env file (located with find_dotenv, using the current
# working directory) and let its values override any existing variables.
_ = load_dotenv(find_dotenv(usecwd=True), override=True)

# Connection settings; each falls back to a local default when unset.
username     =  environ.get("POSTGRES_USERNAME", "postgres")
password     =  environ.get("POSTGRES_PASSWORD", "postgres")
ipaddress    =  environ.get("POSTGRES_IPADDRESS", "localhost")
port         =  environ.get("POSTGRES_PORT", "5432")
dbname       =  environ.get("POSTGRES_DBNAME", "MA")

# Establish the database connection used by the Transform queries and Loads.
# The credentials are percent-encoded so that characters such as '@', ':' or
# '/' in a password are not misread as URL delimiters. Values in .env must
# therefore be stored as plain (not already percent-encoded) text.
cnx = create_engine(
    f'postgresql://{quote_plus(username)}:{quote_plus(password)}'
    f'@{ipaddress}:{port}/{dbname}'
)

In [None]:
import pandas as pd
import numpy as np

## Extract

In [None]:

# sba_data_dict_url = 'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/aab8e9f9-36d1-42e1-b3ba-e59c79f1d7f0/download/ppp-data-dictionary.xlsx'

# sba_data_dict = pd.read_excel(sba_data_dict_url)
# sba_data_dict


# link.format(num='2',id=links['2'])

# ppp=pd.read_csv(link.format(num=num,id=links[num]))

In [None]:
import requests

# Resource for the 150k-plus file, kept for reference (not downloaded here):
#    '0':'2b55e11d-7e75-4bbb-b526-69a06c0c4731/download/public_150k_plus_230101.csv',

# URL template for the twelve "up to 150k" CSV files; {id} is the resource id
# and {num} the file number.
link = 'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/{id}/download/public_up_to_150k_{num}_230101.csv'
links = {
    '1': '5f700a26-02f9-4d97-94a3-e3c2c43871eb',
    '2': 'b785dfac-7d99-4bc0-9ab2-e87fe855174e',
    '3': '6899d4ff-7f2a-4455-a18f-592118e8e052',
    '4': 'a100fcb3-7708-4683-aa63-e5a594264e21',
    '5': 'f4f85ef0-6279-4e81-baac-eefbbc3ebc2d',
    '6': '03bab509-ad0f-4dbd-88f1-99599dbd3dfc',
    '7': '2cea4fbe-2fb5-4307-8d00-5c7203d333f7',
    '8': '35375b26-8bd5-4868-b89d-ab02ccbf2b43',
    '9': '2f6e4ccd-0311-43dc-b721-8bc07f586fa2',
    '10':'d2a0b6cd-414a-44af-9c0d-55259e5ebf20',
    '11':'262eb7fc-e074-45ca-a977-f6d8d223e1b3',
    '12':'dd54d47b-63e9-41c4-ae13-8e12c8ca4ea1',
}

# Collect the per-file slices and concatenate once after the loop instead of
# re-copying the growing frames on every iteration.
arlington_parts = []
ma_parts = []
arlington_total = 0
ma_total = 0
for num, resource_id in links.items():
    with requests.get(link.format(num=num, id=resource_id), stream=True) as r:
        # Fail on an HTTP error rather than parsing an error page as CSV.
        r.raise_for_status()
        # Let urllib3 undo any Content-Encoding (e.g. gzip) on the raw stream.
        r.raw.decode_content = True
        ppp = pd.read_csv(r.raw)

    in_ma = ppp.BorrowerState == 'MA'
    arlington_chunk = ppp[in_ma & (ppp.BorrowerCity == 'Arlington')]
    ma_chunk = ppp[in_ma]

    arlington_parts.append(arlington_chunk)
    ma_parts.append(ma_chunk)

    # Progress: file number and running row counts (Arlington, MA).
    arlington_total += len(arlington_chunk)
    ma_total += len(ma_chunk)
    print(num, arlington_total, ma_total)

Arlington = pd.concat(arlington_parts)
MA = pd.concat(ma_parts)


## Transform

In [None]:
#ppp = pd.read_csv(links['1'])
import requests

# The "bigloans" frames must come from the 150k-plus file. Reusing `ppp` here
# would pick up whatever the extract loop loaded last (an up-to-150k file),
# duplicating rows that are already in `Arlington` / `MA`.
# The resource id below is the one recorded in the Extract cell's comment.
bigloans_url = 'https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/2b55e11d-7e75-4bbb-b526-69a06c0c4731/download/public_150k_plus_230101.csv'
with requests.get(bigloans_url, stream=True) as r:
    # Fail on an HTTP error rather than parsing an error page as CSV.
    r.raise_for_status()
    # Let urllib3 undo any Content-Encoding (e.g. gzip) on the raw stream.
    r.raw.decode_content = True
    ppp_bigloans = pd.read_csv(r.raw)

in_ma = ppp_bigloans.BorrowerState == 'MA'
Arlington_bigloans = ppp_bigloans[in_ma & (ppp_bigloans.BorrowerCity == 'Arlington')].copy()
MA_bigloans = ppp_bigloans[in_ma].copy()
           

#.iloc[-1]
# Columns carried forward from the raw PPP files.
cols = ['LoanNumber', 'DateApproved','BorrowerName', 'BorrowerAddress','InitialApprovalAmount',
       'CurrentApprovalAmount', 'UndisbursedAmount', 'FranchiseName',
        'BusinessAgeDescription','JobsReported',
       'NAICSCode', 'Race', 'Ethnicity', 'UTILITIES_PROCEED',
       'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
       'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
       'DEBT_INTEREST_PROCEED', 'BusinessType','Gender', 'Veteran', 'NonProfit', 'ForgivenessAmount',
       'ForgivenessDate']

# Stack the Arlington rows from the small-loan and big-loan selections.
combo = pd.concat([Arlington[cols],Arlington_bigloans[cols]])

# Missing text flags become empty strings.
for col in ['FranchiseName','NonProfit']:
    combo[col] = combo[col].fillna('')

# Parse the date columns and keep plain `date` objects.
for col in ['DateApproved','ForgivenessDate']:
    combo[col]=pd.to_datetime(combo[col]).dt.date#dt.strftime('%Y-%m-%d')

# Amount / count columns stored as integers; missing values count as 0.
# ForgivenessAmount is not in this list, so its dtype is left unchanged.
int_cols = ['InitialApprovalAmount',
       'CurrentApprovalAmount', 'UndisbursedAmount','JobsReported',
       'UTILITIES_PROCEED',
       'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
       'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
       'DEBT_INTEREST_PROCEED']
for col in int_cols:
    # fillna replaces the earlier replace() mapping, whose `pd.isnull` key
    # was a function object and could never match a cell value.
    combo[col] = combo[col].fillna(0).astype(int)

# Represent a missing forgiveness date as None rather than NaT.
combo.loc[pd.isnull(combo.ForgivenessDate),'ForgivenessDate']=None

# if 'BorrowerName' in cols:
#     cols.remove('BorrowerName')

# One output row per BorrowerName. By default every column's values are
# collected into a list; the overrides below sum the amounts and keep the
# last value of the descriptive fields.
agg_dict = {col: list for col in cols}

# Note: this rebinds `int_cols` to a different list than the one used for the
# integer casts (no JobsReported here, ForgivenessAmount added).
int_cols = ['InitialApprovalAmount',
       'CurrentApprovalAmount', 'UndisbursedAmount',
       'UTILITIES_PROCEED',
       'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED', 'RENT_PROCEED',
       'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
       'DEBT_INTEREST_PROCEED','ForgivenessAmount']

for col in int_cols:
    # Use the named aggregation: passing the builtin `sum` is a deprecated
    # alias for it in recent pandas releases.
    agg_dict[col] = 'sum'

for col in ['BusinessAgeDescription','NAICSCode','BorrowerAddress','Race', 'JobsReported',
            'Ethnicity','Gender', 'Veteran','NonProfit','BusinessType','FranchiseName']:
    agg_dict[col]='last'  ##37 BorrowAddresses changes, adding Apt# and prefix spelling

# The aggregated BorrowerName list column is dropped; reset_index() restores
# BorrowerName from the group keys as a regular column.
ppp = combo.groupby('BorrowerName').agg(agg_dict).drop('BorrowerName',axis=1).reset_index()

# Regex -> replacement pairs applied to the upper-cased BorrowerAddress.
# Every pattern is anchored with `$`, so only the end of the address is
# rewritten. Raw strings keep the `\.` / `\,` escapes valid Python.
streetPrefixes = {
    ' STREET$':' ST',
    ' TURNPIKE$':' TPKE',
    ' N/A$':'',
    r' AVE\.\,$' :' AVE',
    r' AVE\.$'  :' AVE',
    ' AVENUE$':' AVE',
    ' CIRCLE$':' CIR',
    ' DRIVE$' :' DR',
    ' EXTENSION$' :' EXT',
    ' LANE$' :' LN',
    ' PARK$' :' PK',
    ' PLACE$' :' PL',
    ' RAOD$' :' RD',
    ' ROAD$' :' RD',
    r' RD\.$' :' RD',
    r' ST\.$' :' ST',
}
ppp['address']=ppp['BorrowerAddress'].str.upper().replace(streetPrefixes,regex=True)

# One address is entered without a space and with a misspelling; set it by hand.
# na=False keeps the mask boolean when an address is missing.
mask = ppp['address'].str.contains('927MASACHUSETTS', regex=False, na=False)
ppp.loc[mask,'address']='927 MASS AVE'

# Plain-text substitution; regex=False makes that explicit.
ppp['address'] = ppp['address'].str.replace('MASSACHUSETTS','MASS', regex=False)

# Start with an empty pid for every borrower; '' marks "not yet assigned".
ppp['pid'] = ''
# Hand-maintained overrides. Each key is BorrowerAddress and BorrowerName
# joined by a TAB character (the first key spells it as \t, the others contain
# a literal tab); each value is the pid string to store for that borrower.
exceptions = {
    '366 Massachusetts Ave\tABC FAMILY CHIROPRACTIC AND WELLNESS INC':'009.B-0001-0103.0',
    '259 Massachusetts Avenue	ANGEL SEIBRING, PH.D.':'029.A-0003-0013.A',
    '1064 Massachusetts Ave	ARLINGTON SMILES PC':'128.A-0001-1064.1',
    "180 Massachusetts Ave	BRIZUELA'S ATHLETICS INC.":'003.A-0001-0007.0',
    '120 Decatur Street	EASY TUCK BEDDING LLC':'037.A-0005-0001.0',
    '180 Massachusetts Ave	ISAIAH BRIZUELA':'003.A-0001-0007.0',
    '259 Massachusetts Ave	JOANNE GREENFIELD':'029.A-0003-0013.B',
    '22 Mill St	JOSEPH FAHEY':'052.A-0001-0408.0',
    '990 Massachusetts Ave	MASTER THE TIME CONCIERGE LLC':'127.A-0002-0336.0',
    '40 Brattle St	MEHMET SAHIN':'056.A-0002-0004.A',
    '11 Howard St	MENGMENG DING':'148.A-0005-0011.1',
    '259 Massachusetts Ave	MIKKIE MITTELHOLZER HARVEY':'029.A-0003-0013.C',
    '366 Massachusetts Avenue	RENEE OUTLAND':'009.B-0001-0102.0',
    '366 Massachusetts Ave	ROBERT MULHERN':'009.B-0001-0101.0',
    '130 Broadway	SARA DETRICK SCOTT, PSYD':'030.A-0003-0001.1',
    '34 Hamilton road	SELINA MAITREYA CONSULTING':'020.A-0004-0504.0',
    '19 Prentiss Rd	SUNSHINE NURSERY SCHOOL':'054.A-0001-0003.2',
    '366 Massachusetts Ave	TINA BONNEY':'009.B-0001-0304.0',
    '366 Massachusetts Ave	UNIVERSAL BROKERAGE REAL ESTATE LLC':'009.B-0001-0104.0',
}
# Apply each override to the rows whose "address<TAB>name" equals the key.
# The match uses the original BorrowerAddress, not the normalised `address`.
for key in exceptions.keys():
    mask = (ppp['BorrowerAddress'] + '\t' + ppp['BorrowerName']) == key
    if mask.any():
        ppp.loc[mask,'pid'] = exceptions[key]
# Ad-hoc check (disabled): list the rows covered by an override.
#ppp[((ppp['BorrowerAddress'] + '\t' + ppp['BorrowerName']).isin(list(exceptions.keys())))]

# The three bare expressions below are evaluated and their results discarded;
# none of them is the last statement of the cell, so nothing is displayed.
# The counts in the comments are the author's recorded observations.

##90 with duplicate addresses
len(ppp[ppp['BorrowerAddress'].duplicated()])

##84 different companies (some name changes) at same address
len(ppp[ppp.BorrowerAddress.duplicated(keep=False)].sort_values(['BorrowerAddress']))

# Number of distinct normalised addresses.
len(ppp.address.unique())

# Parcel rows for fiscal year 2023, read through the shared engine.
parcels = pd.read_sql_query('select * from property.patriot where fy=2023',cnx)

ppp_columns = list(ppp.columns)
lacks_pid = ppp.pid == ''

# Left-join borrowers that still have no pid onto parcels by normalised
# address; the indicator column records which rows found a parcel.
ddd = ppp[lacks_pid].merge(
    parcels,
    how='left',
    left_on='address',
    right_on='location',
    indicator='matched',
)
found_parcel = ddd.matched == 'both'

# Matched rows take the parcel value as their pid, then return to the ppp layout.
matches = (
    ddd.loc[found_parcel, ['parcel'] + ppp_columns]
    .assign(pid=lambda frame: frame['parcel'])
    .drop(columns='parcel')
    [ppp_columns]
)

# Borrowers that already carried a pid come first, then the address matches.
matches = pd.concat([ppp[~lacks_pid], matches])

# What is left found no parcel by address.
ddd = ddd.loc[~found_parcel].drop(columns='matched')[ppp_columns]
# ddd[['BorrowerAddress','BorrowerName','address']].to_csv('ppp_address_unmatched.tsv',sep='\t',index=False)

# Manually curated matches, joined back on address + name.
additional = pd.read_csv('ppp_address_matched.tsv', sep='\t')

additional_matches = ddd.drop(columns='pid').merge(
    additional,
    how='left',
    on=['BorrowerAddress', 'BorrowerName'],
)

# NOTE(review): this counts rows of the TSV (`additional`), not of
# `additional_matches` -- confirm that is the intended invariant.
assert len(matches) + len(additional) == len(ppp)

ppp_norm = pd.concat([matches, additional_matches]).reset_index(drop=True)

## Some borrowers reported outdated NAICS codes (the original note names
## Henry Bear, Cookie Time, fuel delivery and auto repair); each old code on
## the left is rewritten to the code on the right.
naics_changes = dict([
    (453998.0, 459120),
    (454310.0, 457210),
    (423720.0, 457210),
    (445299.0, 445298),
    (447190.0, 811111),
    (453110.0, 459310),
    (511210.0, 513210),
    # 445120.0 is listed by the author without a replacement.
])

ppp_norm['NAICSCode'] = ppp_norm['NAICSCode'].replace(to_replace=naics_changes)


## Load

In [None]:
from sqlalchemy import text

# DDL for the target table: drop and recreate infrastructure.ppp, then index
# it on pid. LoanNumber, DateApproved and ForgivenessDate are array columns
# because the Transform step aggregates them into lists per borrower.
# NOTE(review): "Race" is CHAR(35) while the neighbouring text columns are
# VARCHAR -- confirm the fixed-width type is intended.
table_create_query = \
"""
        DROP TABLE IF EXISTS infrastructure.ppp;
        CREATE TABLE infrastructure.ppp (
            "BorrowerName" VARCHAR(60),
            "LoanNumber" BIGINT ARRAY,
            "DateApproved" DATE ARRAY,
            "BorrowerAddress" VARCHAR(45),
            "InitialApprovalAmount" INTEGER ,
            "CurrentApprovalAmount" INTEGER ,
            "UndisbursedAmount" INTEGER ,
            "FranchiseName" VARCHAR(45),
            "BusinessAgeDescription" VARCHAR(35),
            "JobsReported" SMALLINT,
            "NAICSCode" INTEGER,
            "Race" CHAR(35),
            "Ethnicity" VARCHAR(30),
            "UTILITIES_PROCEED" INTEGER ,
            "PAYROLL_PROCEED" INTEGER ,
            "MORTGAGE_INTEREST_PROCEED" INTEGER ,
            "RENT_PROCEED" INTEGER ,
            "REFINANCE_EIDL_PROCEED" INTEGER ,
            "HEALTH_CARE_PROCEED" INTEGER ,
            "DEBT_INTEREST_PROCEED" INTEGER ,
            "BusinessType" VARCHAR(35) ,
            "Gender" VARCHAR(15) ,
            "Veteran" VARCHAR(15) ,
            "NonProfit" CHAR(1) ,
            "ForgivenessAmount" INTEGER,
            "ForgivenessDate" DATE ARRAY,
            "address" VARCHAR(45) ,
            "pid" VARCHAR(20)
        );

    CREATE INDEX ppp_idx
        ON infrastructure.ppp(pid);
    """

# Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0.
# Run the DDL on a connection inside a transaction that commits when the
# block exits without error.
with cnx.begin() as connection:
    connection.execute(text(table_create_query))


In [None]:
# Append the normalised borrowers to infrastructure.ppp without writing the
# DataFrame index as a column.
ppp_norm.to_sql(
    name='ppp',
    con=cnx,
    schema='infrastructure',
    if_exists='append',
    index=False,
)

In [None]:
# Same column selection as in the Transform step, restated so this cell
# carries its own definition of `cols`.
cols = [
    'LoanNumber', 'DateApproved', 'BorrowerName', 'BorrowerAddress',
    'InitialApprovalAmount', 'CurrentApprovalAmount', 'UndisbursedAmount',
    'FranchiseName', 'BusinessAgeDescription', 'JobsReported',
    'NAICSCode', 'Race', 'Ethnicity',
    'UTILITIES_PROCEED', 'PAYROLL_PROCEED', 'MORTGAGE_INTEREST_PROCEED',
    'RENT_PROCEED', 'REFINANCE_EIDL_PROCEED', 'HEALTH_CARE_PROCEED',
    'DEBT_INTEREST_PROCEED',
    'BusinessType', 'Gender', 'Veteran', 'NonProfit',
    'ForgivenessAmount', 'ForgivenessDate',
]

# Statewide rows: the MA selection followed by the MA big-loans selection.
all_MA = pd.concat([frame[cols] for frame in (MA, MA_bigloans)])

In [None]:
# Write the statewide loans as a tab-separated file, without the index.
all_MA.to_csv(
    path_or_buf='MA_ppp_loans.tsv',
    sep='\t',
    index=False,
)

In [None]:
from sqlalchemy import text

# Download the 2022 six-digit NAICS code list, drop the first data row, keep
# the first two columns and name them key / value.
naics = pd.read_excel('https://www.census.gov/naics/2022NAICS/6-digit_2022_Codes.xlsx')
naics = naics.iloc[1:,0:2].reset_index(drop=True)
naics.columns = ['key','value']
# Bracket indexing is used for the column assignment instead of attribute access.
naics['key'] = naics['key'].astype(int)

# DDL for the lookup table: drop and recreate common.naics.
# NOTE(review): "key" is already the PRIMARY KEY, so the extra naics_idx on
# the same column looks redundant -- confirm before removing.
table_create_query = \
"""
        DROP TABLE IF EXISTS common.naics;
        CREATE TABLE common.naics (
            "key" INTEGER,
            "value" VARCHAR(255),
            PRIMARY KEY("key")
        );

    CREATE INDEX naics_idx
        ON common.naics(key);
    """

# Engine.execute() with a raw string no longer exists in SQLAlchemy 2.0.
# Run the DDL on a connection inside a transaction that commits when the
# block exits without error.
with cnx.begin() as connection:
    connection.execute(text(table_create_query))

naics.to_sql('naics',schema='common',con=cnx,if_exists='append',index=False)