# PPP FOIA Data Cleaning

Data files can be downloaded from https://data.sba.gov/dataset/ppp-foia

In [2]:
import json
import os
import pandas as pd

## Combine & Load Data
The full data set contains over 10M items. Use nrows to reduce loaded data for faster processing during exploration if needed.

In [1]:
nrows = None # Integer number of rows to load from each file. Use None to read in all rows.
data_dir = '../data/PPP-FOIA/'

In [3]:
# Running this with all data uses ~9 Gb of RAM
df = None
low_memory = False # Allows checking entire file to decide dtypes (removes warning)
filenames = os.listdir(data_dir)
filenames = [f for f in filenames if f.endswith('.csv')]
for filename in filenames:
    if df is None:
        df = pd.read_csv(data_dir + filename, nrows=nrows, low_memory=low_memory)
    else:
        df = pd.concat([df, pd.read_csv(data_dir + filename, nrows=nrows, low_memory=low_memory)])


In [5]:
df[['OriginatingLenderLocationID', 'OriginatingLender']].head()

Unnamed: 0,OriginatingLenderLocationID,OriginatingLender
0,19248.0,Synovus Bank
1,19248.0,Synovus Bank
2,9551.0,"Bank of America, National Association"
3,9551.0,"Bank of America, National Association"
4,57328.0,The Huntington National Bank


## Column Descriptions:

In [4]:
column_names_df = pd.read_excel(f'{data_dir}ppp-data-dictionary.xlsx')
column_names_df = column_names_df.rename(columns=lambda x: x.replace(' ', ''))
column_names_df.head()

Unnamed: 0,FieldName,FieldDescription
0,LoanNumber,Loan Number (unique identifier)
1,DateApproved,Loan Funded Date
2,SBAOfficeCode,SBA Origination Office Code
3,ProcessingMethod,Loan Delivery Method (PPP for first draw; PPS ...
4,BorrowerName,Borrower Name


## Dat Type Consistency

In [9]:
df['BorrowerAddress'].isna.sum()

AttributeError: 'function' object has no attribute 'sum'

## Basis Exploration

In [15]:
loan_count = len(df.LoanNumber.unique())
loan_count

11460475

In [16]:
lender_count = len(df.OriginatingLender.unique())
lender_count

4676

Count the number of loans each institution made

In [17]:
loan_counts = df.groupby('OriginatingLender').LoanNumber.count().sort_values(ascending=False).head()
loan_counts.head()

OriginatingLender
Bank of America, National Association        491035
Cross River Bank                             478866
Prestamos CDFI, LLC                          444781
JPMorgan Chase Bank, National Association    438565
Harvest Small Business Finance, LLC          408173
Name: LoanNumber, dtype: int64

In [18]:
missing_NAICS = df.NAICSCode.isna().sum()
print(f'There are {missing_NAICS} out of {loan_count} loans missing the NAICS Code ({100*(missing_NAICS/loan_count):.2f}%)')

There are 132325 out of 11460475 loans missing the NAICS Code (1.15%)


## Handle Missing Data

## Check for Duplicates

In [19]:
df.duplicated().sum()

0

## Consider Outliers

## Transform Datatypes

dtypes can be applied when loading data. The following definition constains all columns, but requires field updates.

In [20]:
# TODO: Select correct dtypes and apply datetime transform as needed. Once ready, use these transforms to create/load the combined file.
dtype = {
    'LoanNumber': int,	                    # Loan Number (unique identifier)
    'DateApproved': str,	        # Loan Funded Date
    'SBAOfficeCode': str,	                # SBA Origination Office Code
    'ProcessingMethod': str,	            # Loan Delivery Method (PPP for first draw; PPS for second draw)
    'BorrowerName': str,	                # Borrower Name
    'BorrowerAddress': str,	                # Borrower Street Address
    'BorrowerCity': str,	                # Borrower City
    'BorrowerState': str,	                # Borrower State
    'BorrowerZip': str,	                    # Borrower Zip Code
    'LoanStatusDate': str,   	    # Loan Status Date - Loan Status Date is  blank when the loan is disbursed but not Paid In Full or Charged Off
    'LoanStatus': int,	                    # Loan Status Description - Loan Status is replaced by 'Exemption 4' when the loan is disbursed but not Paid in Full or Charged Off
    'Term': str,	                        # Loan Maturity in Months
    'SBAGuarantyPercentage': str,	        # SBA Guaranty Percentage
    'InitialApprovalAmount': str,	        # Loan Approval Amount(at origination)
    'CurrentApprovalAmount': str,	        # Loan Approval Amount (current)
    'UndisbursedAmount': str,	            # Undisbursed Amount
    'FranchiseName': str,	                # Franchise Name
    'ServicingLenderLocationID': str,	    # Lender Location ID (unique identifier)
    'ServicingLenderName': str,	            # Servicing Lender Name
    'ServicingLenderAddress': str,	        # Servicing Lender Street Address
    'ServicingLenderCity': str,     	    # Servicing Lender City
    'ServicingLenderState': str,	        # Servicing Lender State
    'ServicingLenderZip': str,	            # Servicing Lender Zip Code
    'RuralUrbanIndicator': str,	            # Rural or Urban Indicator (R/U)
    'HubzoneIndicator': str,	            # Hubzone Indicator (Y/N)
    'LMIIndicator': str,	                # LMI Indicator (Y/N)
    'BusinessAgeDescription': str,	        # Business Age Description
    'ProjectCity': str,	                    # Project City
    'ProjectCountyName': str,	            # Project County Name
    'ProjectState': str,	                # Project State
    'ProjectZip': str,	                    # Project Zip Code
    'CD': str,	                            # Project Congressional District
    'JobsReported': str,	                # Number of Employees
    'NAICSCode': str,	                    # NAICS 6 digit code
    'Race': str,	                        # Borrower Race Description
    'Ethnicity': str,               	    # Borrower Ethnicity Description
    'UTILITIES_PROCEED': str,	            # Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.  
    'PAYROLL_PROCEED': str,	                # Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.  
    'MORTGAGE_INTEREST_PROCEED': str,	    # Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.  
    'RENT_PROCEED': str,	                # Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.  
    'REFINANCE_EIDL_PROCEED': str,	        # Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.  
    'HEALTH_CARE_PROCEED': str,	            # Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.  
    'DEBT_INTEREST_PROCEED': str,	        # Note: Proceed data is lender reported at origination.  On the PPP application the proceeds fields were check boxes.  
    'BusinessType': str,	                # Business Type Description
    'OriginatingLenderLocationID': str,	    # Originating Lender ID (unique identifier)
    'OriginatingLender': str,	            # Originating Lender Name
    'OriginatingLenderCity': str,	        # Originating Lender City
    'OriginatingLenderState': str,	        # Originating Lender State
    'Gender': str,	                        # Gender Indicator
    'Veteran': str,	                        # Veteran Indicator
    'NonProfit': str,	                    # 'Yes' if Business Type = Non-Profit Organization or Non-Profit Childcare Center or 501(c) Non Profit
    'ForgivenessAmount': str,	            # Forgiveness Amount
    'ForgivenessDate': str,	                # Forgiveness Paid Date
}

## Validate Data (Check for Errors)