In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import csv
import os 

In [2]:
# Load the master per-ZIP table and index it by a 5-character ZIP string.
master_df = (
    pd.read_csv("ZIPdf.csv")
    # The ZIP column is renamed when it arrives under the name "Unnamed: 0".
    .rename(columns={"Unnamed: 0": "ZIP"})
    # Left-pad with zeros so every ZIP key is at least 5 characters long.
    .assign(ZIP=lambda frame: frame["ZIP"].astype(str).str.zfill(5))
    .set_index("ZIP")
)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,Total Loans Amount_2,Total Employees_2,Number of Loans_3,Total Loans Amount_3,Total Employees_3
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
00501,0.0,10.0,10.0,,,0,0,0,0,0,0,0,0,0
01001,11.0,475.0,486.0,0.087898,0.028075,0,0,0,0,0,0,0,0,0
01002,60.0,502.0,562.0,0.257683,0.058108,0,0,0,0,0,0,0,0,0
01003,3.0,15.0,18.0,0.311443,0.044960,0,0,0,0,0,0,0,0,0
01004,3.0,6.0,9.0,,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99926,1.0,9.0,10.0,0.890013,0.000000,0,0,0,0,0,0,0,0,0
99927,0.0,0.0,0.0,,,0,0,0,0,0,0,0,0,0
99928,1.0,4.0,5.0,,,0,0,0,0,0,0,0,0,0
99929,2.0,99.0,101.0,0.348346,0.003189,0,0,0,0,0,0,0,0,0


In [3]:
# Directory that holds Files.csv and the loan CSVs it lists.
# The original hard-coded location stays the default; set the PPP_DATA_DIR
# environment variable to run the notebook against a different directory.
_default_base = "C:\\Users\\hthog\\Desktop\\PPP Loan Data\\"
_env_base = os.environ.get("PPP_DATA_DIR")
# `base` is concatenated directly with file names (see below), so an override
# is normalised to end with a path separator.
base = os.path.join(_env_base, "") if _env_base else _default_base

# List of file names
file_df = pd.read_csv(base + "Files.csv")
# Drop Test File: removes the row whose index label is 0, i.e. the first row
# of Files.csv as read from disk.
file_df = file_df.drop(0)
file_df

Unnamed: 0,File,Completed
1,public_up_to_150k_1_230630.csv,0
2,public_up_to_150k_2_230630.csv,0
3,public_up_to_150k_3_230630.csv,0
4,public_up_to_150k_4_230630.csv,0
5,public_up_to_150k_5_230630.csv,0
6,public_up_to_150k_6_230630.csv,0
7,public_up_to_150k_7_230630.csv,0
8,public_up_to_150k_8_230630.csv,0
9,public_up_to_150k_9_230630.csv,0
10,public_up_to_150k_10_230630.csv,0


In [4]:
def assign_value_based_on_date(date):
    """Map an approval date to its tranche suffix.

    Returns '_1' for dates before 2020-04-17, '_2' for dates before
    2020-08-09, and '_3' for everything else (including values for which
    both comparisons are false, such as NaT).
    """
    cutoffs = (
        ('2020-04-17', '_1'),
        ('2020-08-09', '_2'),
    )
    for cutoff, suffix in cutoffs:
        if date < pd.Timestamp(cutoff):
            return suffix
    return '_3'


# Every suffix assign_value_based_on_date can return, in order.
traunches = [f"_{number}" for number in (1, 2, 3)]

In [5]:
import warnings

# Blanket suppression: every warning category is ignored from here on.
# NOTE(review): this also hides pandas diagnostics such as
# SettingWithCopyWarning, so real assignment problems will not be reported.
warnings.simplefilter("ignore")

In [6]:
# Aggregate every not-yet-completed loan file into master_df, one file at a
# time, checkpointing to disk after each file.
#
# Reads (defined in earlier cells): file_df, base, master_df, traunches,
# assign_value_based_on_date.
# Writes: master_df (in memory and ZIPdf.csv), the Completed flags (file_df and
# Files.csv), and "Problem Rows.txt" (ZIPs that are not in master_df's index).

# Business types that are excluded from the aggregation.
excluded_business_types = ["Sole Proprietorship",
                           "Independent Contractors",
                           "Self-Employed Individuals"]
# Only these columns are loaded from each loan file.
loan_columns = ['NonProfit',
                'BusinessType',
                'NAICSCode',
                'BorrowerZip',
                'DateApproved',
                'InitialApprovalAmount',
                'JobsReported']
chunk_size = 80000

# Dollar amounts are fractional, so hold them in float columns up front rather
# than relying on implicit int -> float upcasting during item assignment
# (deprecated in recent pandas releases).
amount_columns = [f"Total Loans Amount{suffix}" for suffix in traunches]
master_df[amount_columns] = master_df[amount_columns].astype(float)

for file in file_df[file_df["Completed"] == 0]["File"]:
    file_name = base + file
    # Check if file exists
    if not os.path.exists(file_name):
        print(f"File {file_name} does not exist. Skipping...")
        continue

    print(f"Processing {file}...")

    # Stream the file in chunks of `chunk_size` rows instead of loading it whole.
    with open("Problem Rows.txt", "a") as p:
        # BorrowerZip is forced to str: dtype inference happens per chunk, and a
        # chunk whose ZIPs are all numeric would otherwise lose leading zeros.
        for chunk in pd.read_csv(file_name,
                                 chunksize=chunk_size,
                                 usecols=loan_columns,
                                 dtype={"BorrowerZip": str}):
            # Filter chunk to only needed observations
            keep = (
                (chunk["NonProfit"] != "Y")
                & ~chunk["BusinessType"].isin(excluded_business_types)
                & ~chunk["NAICSCode"].isin([999990])
                & chunk["BorrowerZip"].notna()
            )
            # .copy() gives an independent frame, so the column assignments
            # below never write into a view of the raw chunk.
            chunk = chunk[keep].copy()

            # Edit chunk to include necessary info
            chunk["DateApproved"] = pd.to_datetime(chunk["DateApproved"])
            chunk["Traunch"] = chunk["DateApproved"].apply(assign_value_based_on_date)
            # Keep only the first five characters (drops any ZIP+4 extension).
            chunk["BorrowerZip"] = chunk["BorrowerZip"].astype(str).str[:5]

            # Aggregate Data: one row per (ZIP, tranche) in this chunk.
            # The loan count is the group size, taken on a non-key column so the
            # aggregation does not depend on the grouping key being selectable.
            grouped = chunk.groupby(['BorrowerZip', 'Traunch']).agg(
                loan_count=('InitialApprovalAmount', 'size'),
                loan_amount=('InitialApprovalAmount', 'sum'),
                jobs_reported=('JobsReported', 'sum'),
            )

            # Update master_df
            for (zip_, traunch), loan_count, loan_amount, jobs_reported in grouped.itertuples(name=None):
                if zip_ in master_df.index:
                    master_df.loc[zip_, f"Number of Loans{traunch}"] += loan_count
                    master_df.loc[zip_, f"Total Loans Amount{traunch}"] += loan_amount
                    master_df.loc[zip_, f"Total Employees{traunch}"] += jobs_reported
                else:
                    # ZIP is not a row of master_df: log it and move on.
                    p.write(str(zip_) + "\n")
                    print("Error with ZIP Code: " + str(zip_))

    # Mark the file as done. A single .loc call is required here: chained
    # indexing (file_df[mask]["Completed"] = 1) assigns to a temporary copy and
    # leaves file_df unchanged.
    file_df.loc[file_df["File"] == file, "Completed"] = 1

    # Checkpoint after every file so the totals and the Completed flags on disk
    # always agree, whether the run stops here or continues to the last file.
    master_df.to_csv('ZIPdf.csv')
    # Update the flags in the on-disk file list rather than overwriting it with
    # file_df: file_df has had its row 0 dropped after loading, and writing that
    # back would make the next load's drop(0) remove a real file instead.
    files_on_disk = pd.read_csv(base + "Files.csv")
    completed_names = file_df.loc[file_df["Completed"] == 1, "File"]
    files_on_disk.loc[files_on_disk["File"].isin(completed_names), "Completed"] = 1
    files_on_disk.to_csv(base + "Files.csv", index=False)

    print(f"Updated master DataFrame with data from {file}.")

    # Ask for user confirmation to proceed to the next file
    user_input = input("Do you want to continue with the next file? (y/n): ")
    if user_input.lower() != 'y':
        print("Stopping the process.")
        break

Processing public_up_to_150k_1_230630.csv...
Error with ZIP Code: 09724
Error with ZIP Code: 09724
Error with ZIP Code: 09803
Error with ZIP Code: 35015
Error with ZIP Code: 35086
Error with ZIP Code: 35182
Error with ZIP Code: 35220
Error with ZIP Code: 35261
Error with ZIP Code: 35295
Error with ZIP Code: 35607
Error with ZIP Code: 36118
Error with ZIP Code: 36135
Error with ZIP Code: 65404
Error with ZIP Code: 96929
Error with ZIP Code: 99697
Error with ZIP Code: 99697
Error with ZIP Code: 35246
Error with ZIP Code: 36062
Error with ZIP Code: 36640
Error with ZIP Code: 03542
Error with ZIP Code: 36508
Error with ZIP Code: 36640
Error with ZIP Code: 71748
Error with ZIP Code: 72260
Error with ZIP Code: 71748
Error with ZIP Code: 72369
Error with ZIP Code: 85106
Error with ZIP Code: 85220
Error with ZIP Code: 85294
Error with ZIP Code: 86026
Error with ZIP Code: 96799
Error with ZIP Code: 96799
Error with ZIP Code: 96799
Error with ZIP Code: 85220
Error with ZIP Code: 88537
Error with

Error with ZIP Code: 50427
Error with ZIP Code: 50427
Error with ZIP Code: 52504
Error with ZIP Code: 50592
Error with ZIP Code: 52149
Error with ZIP Code: 52409
Error with ZIP Code: 82813
Error with ZIP Code: 83865
Error with ZIP Code: 83865
Updated master DataFrame with data from public_up_to_150k_4_230630.csv.
Do you want to continue with the next file? (y/n): n
Stopping the process.


In [73]:
# master_df["Number of Loans_1"] = 0
# master_df["Total Loans Amount_1"] = 0
# master_df["Total Employees_1"] = 0

# master_df["Number of Loans_2"] = 0
# master_df["Total Loans Amount_2"] = 0
# master_df["Total Employees_2"] = 0

# master_df["Number of Loans_3"] = 0
# master_df["Total Loans Amount_3"] = 0
# master_df["Total Employees_3"] = 0

# master_df.to_csv("ZIPdf.csv")

In [7]:
# Spot-check: display every master_df column for the row indexed by ZIP "72704".
master_df.loc["72704"]

Num Farms               2.050000e+02
Num Estabs              5.060000e+02
Total Emp Bus           7.110000e+02
Min Share               1.583459e-01
Black Share             5.332326e-02
Number of Loans_1       1.180000e+02
Total Loans Amount_1    5.284223e+06
Total Employees_1       8.160000e+02
Number of Loans_2       7.800000e+01
Total Loans Amount_2    1.654617e+06
Total Employees_2       2.870000e+02
Number of Loans_3       9.500000e+01
Total Loans Amount_3    2.938545e+06
Total Employees_3       6.120000e+02
Name: 72704, dtype: float64