In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import csv
import os 

In [2]:
# Initialize the master DataFrame
master_df = pd.read_csv("ZIPdf.csv")
master_df.rename(columns={"Unnamed: 0" : "ZIP"}, inplace=True)
master_df["ZIP"] = master_df["ZIP"].astype(str).str.zfill(5)
master_df.set_index("ZIP", inplace=True)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,CBP Emp,Total Emp Bus,Min Share,Min Share Excl B,Black Share,Hisp Share,Asian Share,Native Share,...,Gini,UR,%BachOrMore,STCOUNTYFP,W_Avg Emp Min Share,W_Avg Emp Black Share,W_Avg Emp White Share,W_Avg Emp Asian Share,W_Avg Emp Hisp Share,W_Avg Emp Non-Hisp Share
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00501,,5.0,49.0,5.0,,,,,,,...,,,,['36103'],0.149040,,0.859492,0.083086,0.056364,0.892998
00601,,,,,99.785408,99.779608,0.005800,99.669412,0.000000,0.000000,...,44.90,34.986667,14.562560,"['72001', '72113']",,,,,,
00602,,,,,99.432726,99.406094,0.026633,99.280920,0.026633,0.007990,...,49.26,11.262690,21.817471,"['72003', '72005']",,,,,,
00603,,,,,98.827403,98.666774,0.160630,98.377640,0.098386,0.010039,...,57.02,20.851813,24.267387,"['72005', '72071']",,,,,,
00606,,,,,99.480934,99.460970,0.019964,99.301258,0.039928,0.000000,...,43.73,12.011372,10.361217,"['72093', '72153']",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99252,,,,,,,,,,,...,,,,['53063'],0.061404,,0.852339,0.029630,0.020760,0.864522
99256,,,,,,,,,,,...,,,,['53063'],0.061404,,0.852339,0.029630,0.020760,0.864522
99529,,,,,,,,,,,...,,,,['02020'],0.135079,0.015303,0.751177,0.072248,0.027810,0.826074
99599,,,,,,,,,,,...,,,,['02020'],0.135079,0.015303,0.751177,0.072248,0.027810,0.826074


# PPP ZIP Data

In [3]:
# List of file names
base = "C:\\Users\\hthog\\Desktop\\PPP Loan Data\\"
file_df = pd.read_csv(base + "PPP Files.csv")
file_df["Completed"] = 0
file_df

Unnamed: 0,File,Completed
0,public_150k_plus_230630.csv,0
1,public_up_to_150k_1_230630.csv,0
2,public_up_to_150k_2_230630.csv,0
3,public_up_to_150k_3_230630.csv,0
4,public_up_to_150k_4_230630.csv,0
5,public_up_to_150k_5_230630.csv,0
6,public_up_to_150k_6_230630.csv,0
7,public_up_to_150k_7_230630.csv,0
8,public_up_to_150k_8_230630.csv,0
9,public_up_to_150k_9_230630.csv,0


In [4]:
def assign_value_based_on_date(date):
    if date < pd.Timestamp('2020-04-17'):
        return '_1'
    elif date < pd.Timestamp('2020-08-09'):
        return '_2'
    else:
        return '_3'
traunches = ["_1", "_2", "_3"]

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
master_df["Number of Loans_1"] = 0
master_df["Total Loans Amount_1"] = 0

master_df["Number of Loans_2"] = 0
master_df["Total Loans Amount_2"] = 0

master_df["Number of Loans_3"] = 0
master_df["Total Loans Amount_3"] = 0

In [7]:
for file in file_df[file_df["Completed"] == 0]["File"]:
    file_name = base + file
    # Check if file exists
    if not os.path.exists(file_name):
        print(f"File {file_name} does not exist. Skipping...")
        continue

    print(f"Processing {file}...")
    
    # Read file row by row since cannot fit in DF
    chunk_size = 640000
    for chunk in pd.read_csv(file_name, chunksize=chunk_size, usecols=['NonProfit', 
                                                                        'BusinessType', 
                                                                        'NAICSCode', 
                                                                        'BorrowerZip', 
                                                                        'DateApproved', 
                                                                        'CurrentApprovalAmount']):
        # Filter chunk to only needed observations
        #chunk = chunk[chunk["NonProfit"] != "Y"]
        chunk = chunk[~chunk["BusinessType"].isin(["Independent Contractors",
                                                   "Self-Employed Individuals"])]
        chunk = chunk[~chunk["NAICSCode"].astype(str).str.startswith(("482",
                                                                      "491",
                                                                      "525110",
                                                                      "525120",
                                                                      "525190",
                                                                      "525920",
                                                                      "541120",
                                                                      "814",
                                                                      "92"))]
        chunk.dropna(subset=["BorrowerZip"], inplace=True)

        # Edit chunk to include necessary info
        chunk["DateApproved"] = pd.to_datetime(chunk["DateApproved"])
        chunk["Traunch"] = chunk["DateApproved"].apply(assign_value_based_on_date)
        chunk["BorrowerZip"] = chunk["BorrowerZip"].astype(str).str[:5]
            
        # Aggregate Data
        grouped = chunk.groupby(['BorrowerZip', 'Traunch']).agg({
                'Traunch': 'size'})
        
        # Edit chunk for amount
        chunk = chunk[~chunk["NAICSCode"].astype(str).str.startswith(("111", "112"))]
        
        # Aggregate for amount
        amount_grouped = chunk.groupby(['BorrowerZip', 'Traunch']).agg({
                'CurrentApprovalAmount': 'sum'})

        # Update master_df
        for (zip_, traunch), row in grouped.iterrows():
            if zip_ in master_df.index:
                master_df.loc[zip_, f"Number of Loans{traunch}"] += row['Traunch']  # Update the number of loans
            else:
                continue

        for (zip_, traunch), row in amount_grouped.iterrows():
            if zip_ in master_df.index:
                master_df.loc[zip_, f"Total Loans Amount{traunch}"] += row['CurrentApprovalAmount']
            else:
                continue
                        
    #file_df.loc[file_df["File"] == file, "Completed"] = 1
    
    print(f"Updated master DataFrame with data from {file}.")

Processing public_150k_plus_230630.csv...
Updated master DataFrame with data from public_150k_plus_230630.csv.
Processing public_up_to_150k_1_230630.csv...
Updated master DataFrame with data from public_up_to_150k_1_230630.csv.
Processing public_up_to_150k_2_230630.csv...
Updated master DataFrame with data from public_up_to_150k_2_230630.csv.
Processing public_up_to_150k_3_230630.csv...
Updated master DataFrame with data from public_up_to_150k_3_230630.csv.
Processing public_up_to_150k_4_230630.csv...
Updated master DataFrame with data from public_up_to_150k_4_230630.csv.
Processing public_up_to_150k_5_230630.csv...
Updated master DataFrame with data from public_up_to_150k_5_230630.csv.
Processing public_up_to_150k_6_230630.csv...
Updated master DataFrame with data from public_up_to_150k_6_230630.csv.
Processing public_up_to_150k_7_230630.csv...
Updated master DataFrame with data from public_up_to_150k_7_230630.csv.
Processing public_up_to_150k_8_230630.csv...
Updated master DataFrame 

In [8]:
master_df["Total Loans"] = master_df["Number of Loans_1"].add(master_df["Number of Loans_2"], fill_value = 0).add(master_df["Number of Loans_3"], fill_value = 0)
master_df["Total Loan Amount"] = master_df["Total Loans Amount_1"].add(master_df["Total Loans Amount_2"], fill_value = 0).add(master_df["Total Loans Amount_3"], fill_value = 0)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,CBP Emp,Total Emp Bus,Min Share,Min Share Excl B,Black Share,Hisp Share,Asian Share,Native Share,...,W_Avg Emp Hisp Share,W_Avg Emp Non-Hisp Share,Number of Loans_1,Total Loans Amount_1,Number of Loans_2,Total Loans Amount_2,Number of Loans_3,Total Loans Amount_3,Total Loans,Total Loan Amount
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00501,,5.0,49.0,5.0,,,,,,,...,0.056364,0.892998,0,0.00,0,0.00,0,0.00,0,0.00
00601,,,,,99.785408,99.779608,0.005800,99.669412,0.000000,0.000000,...,,,2,328260.00,87,1177718.14,80,1160503.32,169,2666481.46
00602,,,,,99.432726,99.406094,0.026633,99.280920,0.026633,0.007990,...,,,20,2229895.86,321,5833086.32,335,6080007.15,676,14142989.33
00603,,,,,98.827403,98.666774,0.160630,98.377640,0.098386,0.010039,...,,,23,5091700.00,442,10733965.15,417,12158515.71,882,27984180.86
00606,,,,,99.480934,99.460970,0.019964,99.301258,0.039928,0.000000,...,,,1,4800.00,15,131967.00,15,169580.00,31,306347.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99252,,,,,,,,,,,...,0.020760,0.864522,0,0.00,0,0.00,0,0.00,0,0.00
99256,,,,,,,,,,,...,0.020760,0.864522,0,0.00,0,0.00,0,0.00,0,0.00
99529,,,,,,,,,,,...,0.027810,0.826074,0,0.00,0,0.00,0,0.00,0,0.00
99599,,,,,,,,,,,...,0.027810,0.826074,0,0.00,0,0.00,0,0.00,0,0.00


In [9]:
master_df[["Number of Loans_1",
           "Total Loans Amount_1",
           "Number of Loans_2",
           "Total Loans Amount_2",
           "Number of Loans_3",
           "Total Loans Amount_3",
           "Total Loans",
           "Total Loan Amount"]] = master_df[["Number of Loans_1",
           "Total Loans Amount_1",
           "Number of Loans_2",
           "Total Loans Amount_2",
           "Number of Loans_3",
           "Total Loans Amount_3",
           "Total Loans",
           "Total Loan Amount"]].fillna(0)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,CBP Emp,Total Emp Bus,Min Share,Min Share Excl B,Black Share,Hisp Share,Asian Share,Native Share,...,W_Avg Emp Hisp Share,W_Avg Emp Non-Hisp Share,Number of Loans_1,Total Loans Amount_1,Number of Loans_2,Total Loans Amount_2,Number of Loans_3,Total Loans Amount_3,Total Loans,Total Loan Amount
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00501,,5.0,49.0,5.0,,,,,,,...,0.056364,0.892998,0,0.00,0,0.00,0,0.00,0,0.00
00601,,,,,99.785408,99.779608,0.005800,99.669412,0.000000,0.000000,...,,,2,328260.00,87,1177718.14,80,1160503.32,169,2666481.46
00602,,,,,99.432726,99.406094,0.026633,99.280920,0.026633,0.007990,...,,,20,2229895.86,321,5833086.32,335,6080007.15,676,14142989.33
00603,,,,,98.827403,98.666774,0.160630,98.377640,0.098386,0.010039,...,,,23,5091700.00,442,10733965.15,417,12158515.71,882,27984180.86
00606,,,,,99.480934,99.460970,0.019964,99.301258,0.039928,0.000000,...,,,1,4800.00,15,131967.00,15,169580.00,31,306347.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99252,,,,,,,,,,,...,0.020760,0.864522,0,0.00,0,0.00,0,0.00,0,0.00
99256,,,,,,,,,,,...,0.020760,0.864522,0,0.00,0,0.00,0,0.00,0,0.00
99529,,,,,,,,,,,...,0.027810,0.826074,0,0.00,0,0.00,0,0.00,0,0.00
99599,,,,,,,,,,,...,0.027810,0.826074,0,0.00,0,0.00,0,0.00,0,0.00


In [10]:
master_df["Loans per Emp Bus"] = master_df["Total Loans"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp"] = master_df["Total Loan Amount"] / master_df["CBP Emp"]
master_df["Loans per Emp Bus_1"] = master_df["Number of Loans_1"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp_1"] = master_df["Total Loans Amount_1"] / master_df["CBP Emp"]
master_df["Loans per Emp Bus_2"] = master_df["Number of Loans_2"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp_2"] = master_df["Total Loans Amount_2"] / master_df["CBP Emp"]
master_df["Loans per Emp Bus_3"] = master_df["Number of Loans_3"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp_3"] = master_df["Total Loans Amount_3"] / master_df["CBP Emp"]
master_df.replace([np.inf, -np.inf], np.nan, inplace=True)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,CBP Emp,Total Emp Bus,Min Share,Min Share Excl B,Black Share,Hisp Share,Asian Share,Native Share,...,Total Loans,Total Loan Amount,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00501,,5.0,49.0,5.0,,,,,,,...,0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00601,,,,,99.785408,99.779608,0.005800,99.669412,0.000000,0.000000,...,169,2666481.46,,,,,,,,
00602,,,,,99.432726,99.406094,0.026633,99.280920,0.026633,0.007990,...,676,14142989.33,,,,,,,,
00603,,,,,98.827403,98.666774,0.160630,98.377640,0.098386,0.010039,...,882,27984180.86,,,,,,,,
00606,,,,,99.480934,99.460970,0.019964,99.301258,0.039928,0.000000,...,31,306347.00,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99252,,,,,,,,,,,...,0,0.00,,,,,,,,
99256,,,,,,,,,,,...,0,0.00,,,,,,,,
99529,,,,,,,,,,,...,0,0.00,,,,,,,,
99599,,,,,,,,,,,...,0,0.00,,,,,,,,


# EIDL Advance ZIP Data

In [11]:
# List of file names
base = "C:\\Users\\hthog\\Desktop\\PPP Loan Data\\"
file_df = pd.read_csv(base + "EIDL Advance Files.csv")
file_df

Unnamed: 0,File,Completed
0,01 EIDL Advance through 111520.csv,1
1,02 EIDL Advance through 111520.csv,1
2,03 EIDL Advance through 111520.csv,1
3,04 EIDL Advance through 111520.csv,1
4,05 EIDL Advance through 111520.csv,1
5,06 EIDL Advance through 111520.csv,1
6,07 EIDL Advance through 111520.csv,1


In [12]:
# master_df[["Number of Advs EIDLA", "Total Advs Amount EIDLA"]] = 0

In [13]:
# for file in file_df[file_df["Completed"] == 0]["File"]:
#     file_name = base + file
#     # Check if file exists
#     if not os.path.exists(file_name):
#         print(f"File {file_name} does not exist. Skipping...")
#         continue

#     print(f"Processing {file}...")

#     chunk_size = 150000

#     for chunk in pd.read_csv(file_name, chunksize=chunk_size, usecols=['LEGALENTITYZIP5', 
#                                                                        'FEDERALACTIONOBLIGATION',
#                                                                        'ACTIONTYPE']):
#         chunk.dropna(subset=["LEGALENTITYZIP5"], inplace=True)
    
#         # Edit chunk to include necessary info
#         chunk["LEGALENTITYZIP5"] = chunk["LEGALENTITYZIP5"].astype(str).str.zfill(5)
#             # The data for advances contains instances where funds were returned by banks or rescinded by the 
#             # government after too much was awarded. These negative values are important to the sum of funding 
#             # recieved, but need to be accounted for when obtaining counts of advances 
#         chunk['Adv_Count'] = chunk['ACTIONTYPE'].apply(lambda x: 1 if x == "A" else 0)
    
#         # Aggregate Data
#         grouped = chunk.groupby(['LEGALENTITYZIP5']).agg({
#             'FEDERALACTIONOBLIGATION': 'sum',
#             'Adv_Count': 'sum'  # Sum of Adv_Count will give the adjusted count of loans
#         })
    
#         # Update master_df
#         for zip_, row in grouped.iterrows():
#             if zip_ in master_df.index:
#                 master_df.loc[zip_, "Number of Advs EIDLA"] += row['Adv_Count']  # Update the number of loans
#                 master_df.loc[zip_, "Total Advs Amount EIDLA"] += row['FEDERALACTIONOBLIGATION']
#             else:
#                 # Create a new row filled with NaNs but update the relevant columns
#                 new_row = pd.Series(name=zip_, dtype='float64')  # Create a Series with name set to the ZIP code
#                 new_row["Number of Advs EIDLA"] = row['Adv_Count']
#                 new_row["Total Advs Amount EIDLA"] = row['FEDERALACTIONOBLIGATION']
        
#                 # Append the new row to the master DataFrame
#                 master_df = master_df.append(new_row)
                        
#     file_df.loc[file_df["File"] == file, "Completed"] = 1
    
#     print(f"Updated master DataFrame with data from {file}.")
    
# master_df.to_csv('ZIPdf.csv')
# file_df.to_csv(base + "EIDL Advance Files.csv", index=False)

# EIDL Loan ZIP Data

In [14]:
# List of file names
base = "C:\\Users\\hthog\\Desktop\\PPP Loan Data\\"
file_df = pd.read_csv(base + "EIDL Loan Files.csv")
file_df

Unnamed: 0,File,Completed
0,DATAACT_EIDL_LOANS_20200401-20200609.csv,1
1,DATAACT_EIDL_LOANS_20200610-20200625.csv,1
2,DATAACT_EIDL_LOANS_20200626-20200723.csv,1
3,DATAACT_EIDL_LOANS_20200724-20201115.csv,1
4,DATAACT_EIDL_LOANS_DMCS2.0.csv,1


In [15]:
# master_df[["Number of Loans EIDLL", "Total Loan Amount EIDLL"]] = 0

In [16]:
# for file in file_df[file_df["Completed"] == 0]["File"]:
#     file_name = base + file
#     # Check if file exists
#     if not os.path.exists(file_name):
#         print(f"File {file_name} does not exist. Skipping...")
#         continue

#     print(f"Processing {file}...")
    
#     # Read file row by row since cannot fit in DF
#     chunk_size = 160000
#     for chunk in pd.read_csv(file_name, chunksize=chunk_size, usecols=['LEGALENTITYZIP5', 
#                                                                        'FACEVALUEOFDIRECTLOANORLOANGUARANTEE', 
#                                                                        'FAIN',
#                                                                        'ACTIONTYPE']):
#         chunk.dropna(subset=["LEGALENTITYZIP5"], inplace=True)

#         # Edit chunk to include necessary info
#         chunk["LEGALENTITYZIP5"] = chunk["LEGALENTITYZIP5"].astype(str).str.zfill(5)
#         chunk['Loan_Count'] = chunk['ACTIONTYPE'].apply(lambda x: 1 if x == "A" else 0)
        
#         # Aggregate Data
#         grouped = chunk.groupby(['LEGALENTITYZIP5']).agg({
#                 'FACEVALUEOFDIRECTLOANORLOANGUARANTEE': 'sum',
#                 'Loan_Count': 'size'  # Count of loans
#             })

#         # Update master_df
#         for zip_, row in grouped.iterrows():
#             if zip_ in master_df.index:
#                 master_df.loc[zip_, "Number of Loans EIDLL"] += row['Loan_Count']  # Update the number of loans
#                 master_df.loc[zip_, "Total Loan Amount EIDLL"] += row['FACEVALUEOFDIRECTLOANORLOANGUARANTEE']
#             else:
#                 # Create a new row filled with NaNs but update the relevant columns
#                 new_row = pd.Series(name=zip_, dtype='float64')  # Create a Series with name set to the ZIP code
#                 new_row["Number of Loans EIDLL"] = row['Loan_Count']
#                 new_row["Total Loan Amount EIDLL"] = row['FACEVALUEOFDIRECTLOANORLOANGUARANTEE']
        
#                 # Append the new row to the master DataFrame
#                 master_df = master_df.append(new_row)
#                 #print("Created new row for ZIP code: " + zip_)
                        
#     file_df.loc[file_df["File"] == file, "Completed"] = 1
    
#     print(f"Updated master DataFrame with data from {file}.")
    
#     # Ask for user confirmation to proceed to the next file
# #     user_input = input("Do you want to continue with the next file? (y/n): ")
# #     if user_input.lower() != 'y':
# #         print("Stopping the process.")
#         # Save the master DataFrame to disk
# master_df.to_csv('ZIPdf.csv')
# file_df.to_csv(base + "EIDL Loan Files.csv", index=False)
#         #break

# EIDL Post Nov-15 2020

In [17]:
# List of file names
base = "C:\\Users\\hthog\\Desktop\\PPP Loan Data\\"
file_df = pd.read_csv(base + "EIDL Post Nov 20.csv")
file_df

Unnamed: 0,File,Completed
0,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
1,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
2,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
3,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
4,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
5,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
6,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
7,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
8,All_Assistance_PrimeTransactions_2023-09-09_H2...,0
9,All_Assistance_PrimeTransactions_2023-09-09_H2...,0


In [18]:
# for file in file_df[file_df["Completed"] == 0]["File"]:
#     file_name = base + file
#     # Check if file exists
#     if not os.path.exists(file_name):
#         print(f"File {file_name} does not exist. Skipping...")
#         continue

#     print(f"Processing {file}...")
    
#     # Read file row by row since cannot fit in DF
#     chunk_size = 160000
#     for chunk in pd.read_csv(file_name, chunksize=chunk_size, usecols=['cfda_number', 
#                                                                       'action_type_code', 
#                                                                       'federal_action_obligation',
#                                                                       'face_value_of_loan',
#                                                                       'disaster_emergency_fund_codes_for_overall_award',
#                                                                       'recipient_zip_code']):
#         # Filter chunk to only needed observations
#         chunk["disaster_emergency_fund_codes_for_overall_award"] = chunk[
#             "disaster_emergency_fund_codes_for_overall_award"].astype(str).str[0]
#         chunk = chunk[chunk["disaster_emergency_fund_codes_for_overall_award"].isin(["L","M","N","O","P","U","V"])]
#         chunk.dropna(subset=["recipient_zip_code"], inplace=True)
#         chunk["recipient_zip_code"] = chunk["recipient_zip_code"].astype(str).str.zfill(5)
#         chunk["Count"] = chunk['action_type_code'].apply(lambda x: 1 if x == "A" else 0)
    
#         chunk_l = chunk[chunk["cfda_number"].astype(np.double) == 59.072]
#         chunk_a = chunk[chunk["cfda_number"].astype(np.double) == 59.008]

        
#         # Aggregate Data
#         grouped_l = chunk.groupby(['recipient_zip_code']).agg({
#                 'face_value_of_loan': 'sum',
#                 'Count': 'sum'  # Count of loans
#             })
#         grouped_a = chunk.groupby(['recipient_zip_code']).agg({
#                 'federal_action_obligation': 'sum',
#                 'Count': 'sum'  # Count of advs
#             })
        
#         combined_df = pd.merge(grouped_l, grouped_a, on='recipient_zip_code', how='outer', suffixes=('_loan', '_adv'))
#         combined_df.fillna(0, inplace=True)
        
#         # Update master_df
#         for zip_, row in combined_df.iterrows():
#             if zip_ in master_df.index:
#                 master_df.loc[zip_, "Number of Loans EIDLL"] += row['Count_loan']  # Update the number of loans
#                 master_df.loc[zip_, "Total Loan Amount EIDLL"] += row['face_value_of_loan']
#                 master_df.loc[zip_, "Number of Advs EIDLA"] += row['Count_adv']  # Update the number of advs
#                 master_df.loc[zip_, "Total Advs Amount EIDLA"] += row['federal_action_obligation']
#             else:
#                 # Create a new row filled with NaNs but update the relevant columns
#                 new_row = pd.Series(name=zip_, dtype='float64')  # Create a Series with name set to the ZIP code
#                 new_row["Number of Loans EIDLL"] = row['Count_loan']
#                 new_row["Total Loan Amount EIDLL"] = row['face_value_of_loan']
#                 new_row["Number of Advs EIDLA"] = row['Count_adv']
#                 new_row["Total Advs Amount EIDLA"] = row['federal_action_obligation']
        
#                 # Append the new row to the master DataFrame
#                 master_df = master_df.append(new_row)
#                 #print("Created new row for ZIP code: " + zip_)
                        
#     file_df[file_df["File"] == file]["Completed"] = 1
    
#     print(f"Updated master DataFrame with data from {file}.")

# master_df.to_csv('ZIPdf.csv')
# file_df.to_csv(base + "EIDL Post Nov 20.csv", index=False)

In [19]:
# master_df[["Number of Advs EIDLA",
#            "Total Advs Amount EIDLA"]] = master_df[["Number of Advs EIDLA",
#                                             "Total Advs Amount EIDLA"]].fillna(0)

In [20]:
# master_df[["Number of Loans EIDLL",
#            "Total Loan Amount EIDLL"]] = master_df[["Number of Loans EIDLL",
#                                             "Total Loan Amount EIDLL"]].fillna(0)

In [21]:
# master_df["EIDL Loans per Emp Bus"] = master_df["Number of Loans EIDLL"] / master_df["Total Emp Bus"]
# master_df["Avg EIDL Loan Amt per Emp"] = master_df["Total Loan Amount EIDLL"] / master_df["CBP Emp"]
# master_df.replace([np.inf, -np.inf], np.nan, inplace=True)
# master_df

In [22]:
# master_df["Advs per Emp Bus"] = master_df["Number of Advs EIDLA"] / master_df["Total Emp Bus"]
# master_df["Avg Adv Amt per Emp"] = master_df["Total Advs Amount EIDLA"] / master_df["CBP Emp"]
# master_df.replace([np.inf, -np.inf], np.nan, inplace=True)
# master_df

# Stats

In [23]:
master_df.describe()

Unnamed: 0,Num Farms,Num Estabs,CBP Emp,Total Emp Bus,Min Share,Min Share Excl B,Black Share,Hisp Share,Asian Share,Native Share,...,Total Loans,Total Loan Amount,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
count,32109.0,35052.0,35052.0,37322.0,33631.0,33631.0,33631.0,33631.0,33631.0,33631.0,...,39949.0,39949.0,37322.0,35049.0,37322.0,35049.0,37322.0,35049.0,37322.0,35049.0
mean,61.825968,227.137111,3638.92514,266.512513,26.309044,18.998565,7.310479,10.415032,2.316485,1.77479,...,243.526772,18850910.0,0.776629,8294.561,0.131573,3372.8202,0.230476,2059.097695,0.41458,2862.643379
std,88.443966,406.851597,8085.098199,414.475046,24.866075,20.157029,14.854133,16.399289,5.762396,9.231636,...,465.252828,45978850.0,0.670734,15932.49,0.132456,10198.406443,0.202272,7729.743281,0.53525,5627.543533
min,1.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.0,11.0,80.0,24.0,7.695221,6.483358,0.264276,1.863354,0.150038,0.082109,...,7.0,192444.0,0.411765,3810.393,0.047619,892.130435,0.096774,531.088235,0.170017,907.666667
50%,31.0,41.0,423.0,90.0,16.28866,10.917816,1.001669,4.099142,0.508504,0.220264,...,47.0,1567485.0,0.698738,6258.987,0.111111,2200.719967,0.190083,1216.216216,0.333333,1873.025907
75%,78.0,255.0,3367.0,326.0,37.407054,23.085094,6.340451,10.763758,1.719629,0.519481,...,247.0,14553950.0,1.0,9413.655,0.186548,3777.127536,0.327263,2309.789474,0.503063,3283.572659
max,1634.0,6893.0,177226.0,6897.0,100.0,100.0,100.0,100.0,100.0,100.0,...,8160.0,1713459000.0,20.0,1390754.0,6.0,822010.75,6.666667,966651.5,18.833333,424102.5


In [24]:
master_df[~master_df["Min Share"].isnull()].describe()

Unnamed: 0,Num Farms,Num Estabs,CBP Emp,Total Emp Bus,Min Share,Min Share Excl B,Black Share,Hisp Share,Asian Share,Native Share,...,Total Loans,Total Loan Amount,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
count,29574.0,30771.0,30771.0,32638.0,33631.0,33631.0,33631.0,33631.0,33631.0,33631.0,...,33631.0,33631.0,32638.0,30771.0,32638.0,30771.0,32638.0,30771.0,32638.0,30771.0
mean,66.360283,256.658022,4069.889571,302.106839,26.309044,18.998565,7.310479,10.415032,2.316485,1.77479,...,288.014391,22219830.0,0.82508,8046.689774,0.129916,3043.119599,0.233548,1881.201912,0.461616,3122.368264
std,90.643775,425.862046,8515.727201,431.575897,24.866075,20.157029,14.854133,16.399289,5.762396,9.231636,...,494.333977,49361140.0,0.676399,9379.915828,0.111998,5013.227339,0.186874,3078.100043,0.551725,5101.37844
min,1.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,15.0,95.0,40.0,7.695221,6.483358,0.264276,1.863354,0.150038,0.082109,...,18.0,356052.3,0.473684,4266.402184,0.054878,1124.720434,0.106667,620.007874,0.227686,1212.000299
50%,36.0,59.0,572.0,119.0,16.28866,10.917816,1.001669,4.099142,0.508504,0.220264,...,74.0,2673304.0,0.747057,6499.768458,0.112974,2307.865169,0.195942,1276.857923,0.366308,2089.612122
75%,84.0,320.0,4217.5,394.0,37.407054,23.085094,6.340451,10.763758,1.719629,0.519481,...,339.0,21509910.0,1.026115,9471.686101,0.184929,3784.581311,0.330045,2318.22047,0.533457,3508.27453
max,1634.0,6893.0,177226.0,6897.0,100.0,100.0,100.0,100.0,100.0,100.0,...,8160.0,1713459000.0,20.0,340050.64,3.0,331865.666667,5.0,174665.0,18.833333,236954.52


In [25]:
stats_df = master_df[~master_df["Min Share"].isnull()][["Min Share",
                                                       "Min Share Excl B",
                                                       "Black Share",
                                                       "White Share",
                                                       "Asian Share",
                                                       "Hisp Share",
                                                       "Native Share",
                                                       'Rural', 
                                                       'PC Inc', 
                                                       'Gini',
                                                       'UR', 
                                                       '%BachOrMore',
                                                       "W_Avg Emp Min Share",
                                                       "W_Avg Emp Black Share",
                                                       "W_Avg Emp White Share",
                                                       "W_Avg Emp Asian Share",
                                                       "W_Avg Emp Hisp Share",
                                                       "W_Avg Emp Non-Hisp Share",
                                                       "W_Avg Emp Min Share",
                                                       "W_Avg Emp Min Share",
                                                       "Total Pop",
                                                       "Loans per Emp Bus",
                                                       "Avg Loan Amt per Emp",
                                                       "Loans per Emp Bus_1",
                                                       "Avg Loan Amt per Emp_1",
                                                       "Loans per Emp Bus_2",
                                                       "Avg Loan Amt per Emp_2",
                                                       "Loans per Emp Bus_3",
                                                       "Avg Loan Amt per Emp_3"]]
stats_df.describe()

Unnamed: 0,Min Share,Min Share Excl B,Black Share,White Share,Asian Share,Hisp Share,Native Share,Rural,PC Inc,Gini,...,W_Avg Emp Min Share,Total Pop,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
count,33631.0,33631.0,33631.0,33631.0,33631.0,33631.0,33631.0,33631.0,32763.0,32405.0,...,15979.0,33631.0,32638.0,30771.0,32638.0,30771.0,32638.0,30771.0,32638.0,30771.0
mean,26.309044,18.998565,7.310479,73.690956,2.316485,10.415032,1.77479,64.4572,34022.11574,41.173309,...,0.143364,9952.918022,0.82508,8046.689774,0.129916,3043.119599,0.233548,1881.201912,0.461616,3122.368264
std,24.866075,20.157029,14.854133,24.866075,5.762396,16.399289,9.231636,44.097957,16923.415799,8.190941,...,0.103154,14935.801329,0.676399,9379.915828,0.111998,5013.227339,0.186874,3078.100043,0.551725,5101.37844
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,289.0,0.02,...,0.013952,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.695221,6.483358,0.264276,62.592946,0.150038,1.863354,0.082109,8.526885,24841.0,37.35,...,0.071605,664.5,0.473684,4266.402184,0.054878,1124.720434,0.106667,620.007874,0.227686,1212.000299
50%,16.28866,10.917816,1.001669,83.71134,0.508504,4.099142,0.220264,100.0,30756.0,41.57,...,0.11056,2681.0,0.747057,6499.768458,0.112974,2307.865169,0.195942,1276.857923,0.366308,2089.612122
75%,37.407054,23.085094,6.340451,92.304779,1.719629,10.763758,0.519481,100.0,38835.0,45.77,...,0.187418,13459.0,1.026115,9471.686101,0.184929,3784.581311,0.330045,2318.22047,0.533457,3508.27453
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,701688.0,82.76,...,0.679245,135256.0,20.0,340050.64,3.0,331865.666667,5.0,174665.0,18.833333,236954.52


In [26]:
excluded_prefixes = ('962',
'963',
'964',
'965',
'966',
'090',
'091',
'092',
'093',
'094',
'095',
'096',
'097',
'098',
'006',
'007',
'008',
'009')

stats_df = stats_df[~stats_df.index.str.startswith(excluded_prefixes)]
stats_df

Unnamed: 0_level_0,Min Share,Min Share Excl B,Black Share,White Share,Asian Share,Hisp Share,Native Share,Rural,PC Inc,Gini,...,W_Avg Emp Min Share,Total Pop,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01001,15.390956,13.312529,2.078427,84.609044,2.890956,6.771079,0.135422,0.965615,41144.0,42.84,...,0.104759,16984.0,1.002083,6340.613503,0.275000,3626.320899,0.341667,977.286006,0.385417,1737.006599
01002,36.718920,31.678641,5.040279,63.281080,15.712316,9.902751,0.166921,14.972059,38627.0,54.44,...,,27558.0,0.967410,5676.897990,0.186964,1868.964757,0.351630,1900.492046,0.428816,1907.441187
01003,39.568400,32.943484,6.624915,60.431600,20.176564,8.737644,0.105636,0.000000,4935.0,50.23,...,0.069946,13253.0,0.105263,1287.552301,0.000000,0.000000,0.052632,741.799163,0.052632,545.753138
01005,9.081633,8.142857,0.938776,90.918367,0.612245,3.122449,0.142857,100.000000,39783.0,33.77,...,0.094359,4900.0,0.597484,5408.815133,0.163522,2917.684114,0.182390,863.918274,0.251572,1627.212746
01007,13.311288,11.567140,1.744148,86.688712,2.379563,4.279323,0.038903,71.970434,43450.0,38.15,...,0.069946,15423.0,0.912052,7467.646516,0.211726,2763.742458,0.299674,2093.122252,0.400651,2610.781806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99923,10.606061,10.606061,0.000000,89.393939,0.000000,3.030303,0.000000,100.000000,,,...,,66.0,0.250000,496.363636,0.000000,0.000000,0.250000,496.363636,0.000000,0.000000
99925,62.627551,62.117347,0.510204,37.372449,1.020408,1.785714,44.005102,100.000000,30746.0,44.80,...,,784.0,0.700000,6172.925458,0.133333,1512.112231,0.233333,1935.419084,0.333333,2725.394143
99926,90.332907,90.076825,0.256082,9.667093,0.000000,2.944942,74.327785,100.000000,24979.0,38.18,...,,1562.0,1.083333,8023.447843,0.000000,0.000000,0.166667,4243.156863,0.916667,3780.290980
99927,14.285714,14.285714,0.000000,85.714286,0.000000,4.081633,0.000000,100.000000,,,...,,49.0,,,,,,,,


In [27]:
def weighted_average(df, values_col, weights_col):
    # Element-wise multiplication, ignoring NaNs
    weighted_values = df[values_col] * df[weights_col]
    
    # Sum of weighted values and weights, ignoring NaNs
    sum_weighted_values = weighted_values.sum(skipna=True)
    sum_weights = df[weights_col].sum(skipna=True)
    
    # Calculate weighted average, returning NaN if not computable
    if sum_weights == 0:
        return np.nan
    else:
        return sum_weighted_values / sum_weights

In [28]:
for column in stats_df.columns:
    if column != "Total Pop":
        print(column)
        sub_data = master_df[~master_df["Min Share"].isnull()][[column, "Total Pop"]]
        print(weighted_average(sub_data, column, "Total Pop"))

Min Share
42.723896750764816
Min Share Excl B
30.79049627686281
Black Share
11.933400473902005
White Share
57.276103249235184
Asian Share
5.861855861069847
Hisp Share
19.51655641718283
Native Share
0.6724129764822445
Rural
19.88455407593475
PC Inc
37512.35111359813
Gini
43.58688009048675
UR
5.499028274607905
%BachOrMore
33.40044956532447
W_Avg Emp Min Share
0.1442346015537462
W_Avg Emp Black Share
0.014944787172054962
W_Avg Emp White Share
0.7328978332593434
W_Avg Emp Asian Share
0.07553803104477251
W_Avg Emp Hisp Share
0.04269534723950568
W_Avg Emp Non-Hisp Share
0.7848242392247963
W_Avg Emp Min Share
0.1442346015537462
W_Avg Emp Min Share
0.1442346015537462
Loans per Emp Bus
1.1273316110724734
Avg Loan Amt per Emp
6653.865948357212
Loans per Emp Bus_1
0.15255700579754486
Avg Loan Amt per Emp_1
2467.589904711588
Loans per Emp Bus_2
0.34895046570969346
Avg Loan Amt per Emp_2
1779.186866846537
Loans per Emp Bus_3
0.6258241395652351
Avg Loan Amt per Emp_3
2407.089176799088


In [29]:
stats_df.to_csv("PPP_Agg.csv")