In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import csv
import os 

In [14]:
# Initialize the master DataFrame
master_df = pd.read_csv("ZIPdf.csv")
master_df.rename(columns={"Unnamed: 0" : "ZIP"}, inplace=True)
master_df["ZIP"] = master_df["ZIP"].astype(str).str.zfill(5)
master_df.set_index("ZIP", inplace=True)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,Total Loans Amount_2,Total Employees_2,Number of Loans_3,Total Loans Amount_3,Total Employees_3
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00501,,5.0,5.0,,,,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0
00601,,,,0.997854,0.000058,17242.0,2.0,328260.00,117.0,43.0,846924.57,335.0,36.0,786849.32,341.0
00602,,,,0.994327,0.000266,37548.0,19.0,2198895.86,601.0,162.0,4301386.61,1381.0,119.0,3907995.21,1131.0
00603,,,,0.988274,0.001606,49804.0,22.0,5084300.00,1266.0,239.0,7976491.54,2700.0,162.0,8897252.89,2606.0
00606,,,,0.994809,0.000200,5009.0,1.0,4800.00,3.0,7.0,109067.00,42.0,5.0,84750.00,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25727,,,,,,,,,,1.0,45000.00,7.0,,,
54894,,,,,,,,,,1.0,3075.00,1.0,,,
54969,,,,,,,,,,1.0,11100.00,6.0,,,
25773,,,,,,,,,,1.0,6200.00,1.0,,,


In [3]:
# List of file names
base = "C:\\Users\\hthog\\Desktop\\PPP Loan Data\\"
file_df = pd.read_csv(base + "Files.csv")
file_df

Unnamed: 0,File,Completed
0,public_150k_plus_230630.csv,0
1,public_up_to_150k_1_230630.csv,0
2,public_up_to_150k_2_230630.csv,0
3,public_up_to_150k_3_230630.csv,0
4,public_up_to_150k_4_230630.csv,0
5,public_up_to_150k_5_230630.csv,0
6,public_up_to_150k_6_230630.csv,0
7,public_up_to_150k_7_230630.csv,0
8,public_up_to_150k_8_230630.csv,0
9,public_up_to_150k_9_230630.csv,0


In [4]:
def assign_value_based_on_date(date):
    if date < pd.Timestamp('2020-04-17'):
        return '_1'
    elif date < pd.Timestamp('2020-08-09'):
        return '_2'
    else:
        return '_3'
traunches = ["_1", "_2", "_3"]

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
for file in file_df[file_df["Completed"] == 0]["File"]:
    file_name = base + file
    # Check if file exists
    if not os.path.exists(file_name):
        print(f"File {file_name} does not exist. Skipping...")
        continue

    print(f"Processing {file}...")
    
    # Read file row by row since cannot fit in DF
    with open("Problem Rows.txt", "a") as p:
        chunk_size = 80000
        for chunk in pd.read_csv(file_name, chunksize=chunk_size, usecols=['NonProfit', 
                                                                           'BusinessType', 
                                                                           'NAICSCode', 
                                                                           'BorrowerZip', 
                                                                           'DateApproved', 
                                                                           'InitialApprovalAmount', 
                                                                           'JobsReported']):
            # Filter chunk to only needed observations
            chunk = chunk[chunk["NonProfit"] != "Y"]
            chunk = chunk[~chunk["BusinessType"].isin(["Sole Proprietorship",
                                                      "Independent Contractors",
                                                      "Self-Employed Individuals"])]
            chunk = chunk[~chunk["NAICSCode"].isin([999990])]
            #chunk = chunk[chunk["NAICSCode"].notnull()]
            chunk.dropna(subset=["BorrowerZip"], inplace=True)

            # Edit chunk to include necessary info
            chunk["DateApproved"] = pd.to_datetime(chunk["DateApproved"])
            chunk["Traunch"] = chunk["DateApproved"].apply(assign_value_based_on_date)
            chunk["BorrowerZip"] = chunk["BorrowerZip"].astype(str).str[:5]
            
            # Aggregate Data
            grouped = chunk.groupby(['BorrowerZip', 'Traunch']).agg({
                'InitialApprovalAmount': 'sum',
                'JobsReported': 'sum',
                'Traunch': 'size'  # Count of loans
            })

            # Update master_df
            for (zip_, traunch), row in grouped.iterrows():
                if zip_ in master_df.index:
                    master_df.loc[zip_, f"Number of Loans{traunch}"] += row['Traunch']  # Update the number of loans
                    master_df.loc[zip_, f"Total Loans Amount{traunch}"] += row['InitialApprovalAmount']
                    master_df.loc[zip_, f"Total Employees{traunch}"] += row['JobsReported'] 
                else:
                    # Create a new row filled with NaNs but update the relevant columns
                    new_row = pd.Series(name=zip_, dtype='float64')  # Create a Series with name set to the ZIP code
                    new_row[f"Number of Loans{traunch}"] = row['Traunch']
                    new_row[f"Total Loans Amount{traunch}"] = row['InitialApprovalAmount']
                    new_row[f"Total Employees{traunch}"] = row['JobsReported']
        
                    # Append the new row to the master DataFrame
                    master_df = master_df.append(new_row)
                    print("Created new row for ZIP code: " + zip_)
                        
    file_df[file_df["File"] == file]["Completed"] = 1
    
    print(f"Updated master DataFrame with data from {file}.")
    
    # Ask for user confirmation to proceed to the next file
    user_input = input("Do you want to continue with the next file? (y/n): ")
    if user_input.lower() != 'y':
        print("Stopping the process.")
        # Save the master DataFrame to disk
        master_df.to_csv('ZIPdf.csv')
        file_df.to_csv(base + "Files.csv", index=False)
        break

Processing public_150k_plus_230630.csv...
Created new row for ZIP code: 35142
Created new row for ZIP code: 35478
Created new row for ZIP code: 36671
Created new row for ZIP code: 36807
Created new row for ZIP code: 36824
Created new row for ZIP code: 72260
Created new row for ZIP code: 85074
Created new row for ZIP code: 85190
Created new row for ZIP code: 88532
Created new row for ZIP code: 90223
Created new row for ZIP code: 91385
Created new row for ZIP code: 92100
Created new row for ZIP code: 92170
Created new row for ZIP code: 92599
Created new row for ZIP code: 92686
Created new row for ZIP code: 93590
Created new row for ZIP code: 94661
Created new row for ZIP code: 95103
Created new row for ZIP code: 95164
Created new row for ZIP code: 95440
Created new row for ZIP code: 96312
Created new row for ZIP code: 96799
Created new row for ZIP code: 02592
Created new row for ZIP code: 90251
Created new row for ZIP code: 90355
Created new row for ZIP code: 90400
Created new row for ZI

Created new row for ZIP code: 35246
Created new row for ZIP code: 36062
Created new row for ZIP code: 36640
Created new row for ZIP code: 03542
Created new row for ZIP code: 36508
Created new row for ZIP code: 85038
Created new row for ZIP code: 85106
Created new row for ZIP code: 85220
Created new row for ZIP code: 85294
Created new row for ZIP code: 86026
Created new row for ZIP code: 85178
Created new row for ZIP code: 85288
Created new row for ZIP code: 88537
Created new row for ZIP code: 85058
Created new row for ZIP code: 85242
Created new row for ZIP code: 94926
Created new row for ZIP code: 95026
Created new row for ZIP code: 98212
Created new row for ZIP code: 00092
Created new row for ZIP code: 90273
Created new row for ZIP code: 91603
Created new row for ZIP code: 91609
Created new row for ZIP code: 92815
Created new row for ZIP code: 93793
Created new row for ZIP code: 94883
Created new row for ZIP code: 95157
Created new row for ZIP code: 95214
Created new row for ZIP code

Created new row for ZIP code: 56203
Created new row for ZIP code: 65107
Created new row for ZIP code: 46713
Created new row for ZIP code: 46852
Created new row for ZIP code: 46854
Created new row for ZIP code: 47245
Created new row for ZIP code: 60744
Created new row for ZIP code: 46863
Created new row for ZIP code: 47730
Created new row for ZIP code: 66420
Created new row for ZIP code: 66667
Created new row for ZIP code: 66855
Created new row for ZIP code: 40224
Created new row for ZIP code: 40266
Created new row for ZIP code: 40524
Created new row for ZIP code: 40574
Created new row for ZIP code: 42288
Created new row for ZIP code: 40129
Updated master DataFrame with data from public_up_to_150k_5_230630.csv.
Do you want to continue with the next file? (y/n): y
Processing public_up_to_150k_6_230630.csv...
Created new row for ZIP code: 41561
Created new row for ZIP code: 70156
Created new row for ZIP code: 70172
Created new row for ZIP code: 70177
Created new row for ZIP code: 70186
Cr

Created new row for ZIP code: 19283
Created new row for ZIP code: 19327
Created new row for ZIP code: 19648
Created new row for ZIP code: 15549
Created new row for ZIP code: 19455
Created new row for ZIP code: 19484
Created new row for ZIP code: 00095
Created new row for ZIP code: 00098
Created new row for ZIP code: 00619
Created new row for ZIP code: 00626
Created new row for ZIP code: 00645
Created new row for ZIP code: 00668
Created new row for ZIP code: 00689
Created new row for ZIP code: 00721
Created new row for ZIP code: 00724
Created new row for ZIP code: 00742
Created new row for ZIP code: 00762
Created new row for ZIP code: 00916
Created new row for ZIP code: 00931
Created new row for ZIP code: 00935
Created new row for ZIP code: 00940
Created new row for ZIP code: 00989
Created new row for ZIP code: 19874
Created new row for ZIP code: 02980
Created new row for ZIP code: 29350
Created new row for ZIP code: 29476
Created new row for ZIP code: 29705
Created new row for ZIP code

In [7]:
# master_df["Number of Loans_1"] = 0
# master_df["Total Loans Amount_1"] = 0
# master_df["Total Employees_1"] = 0

# master_df["Number of Loans_2"] = 0
# master_df["Total Loans Amount_2"] = 0
# master_df["Total Employees_2"] = 0

# master_df["Number of Loans_3"] = 0
# master_df["Total Loans Amount_3"] = 0
# master_df["Total Employees_3"] = 0

# master_df.to_csv("ZIPdf.csv")

In [9]:
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,Total Loans Amount_2,Total Employees_2,Number of Loans_3,Total Loans Amount_3,Total Employees_3
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00501,,5.0,5.0,,,,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0
00601,,,,0.997854,0.000058,17242.0,2.0,328260.00,117.0,43.0,846924.57,335.0,36.0,786849.32,341.0
00602,,,,0.994327,0.000266,37548.0,19.0,2198895.86,601.0,162.0,4301386.61,1381.0,119.0,3907995.21,1131.0
00603,,,,0.988274,0.001606,49804.0,22.0,5084300.00,1266.0,239.0,7976491.54,2700.0,162.0,8897252.89,2606.0
00606,,,,0.994809,0.000200,5009.0,1.0,4800.00,3.0,7.0,109067.00,42.0,5.0,84750.00,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25727,,,,,,,,,,1.0,45000.00,7.0,,,
54894,,,,,,,,,,1.0,3075.00,1.0,,,
54969,,,,,,,,,,1.0,11100.00,6.0,,,
25773,,,,,,,,,,1.0,6200.00,1.0,,,


In [16]:
master_df["Total Loans"] = master_df["Number of Loans_1"].add(master_df["Number of Loans_2"], fill_value = 0).add(master_df["Number of Loans_3"], fill_value = 0)
master_df["Total Loan Amount"] = master_df["Total Loans Amount_1"].add(master_df["Total Loans Amount_2"], fill_value = 0).add(master_df["Total Loans Amount_3"], fill_value = 0)
master_df["Total Employees"] = master_df["Total Employees_1"].add(master_df["Total Employees_2"], fill_value = 0).add(master_df["Total Employees_3"], fill_value = 0)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,Total Loans Amount_2,Total Employees_2,Number of Loans_3,Total Loans Amount_3,Total Employees_3,Total Loans,Total Loan Amount,Total Employees
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
00501,,5.0,5.0,,,,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0
00601,,,,0.997854,0.000058,17242.0,2.0,328260.00,117.0,43.0,846924.57,335.0,36.0,786849.32,341.0,81.0,1962033.89,793.0
00602,,,,0.994327,0.000266,37548.0,19.0,2198895.86,601.0,162.0,4301386.61,1381.0,119.0,3907995.21,1131.0,300.0,10408277.68,3113.0
00603,,,,0.988274,0.001606,49804.0,22.0,5084300.00,1266.0,239.0,7976491.54,2700.0,162.0,8897252.89,2606.0,423.0,21958044.43,6572.0
00606,,,,0.994809,0.000200,5009.0,1.0,4800.00,3.0,7.0,109067.00,42.0,5.0,84750.00,28.0,13.0,198617.00,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25727,,,,,,,,,,1.0,45000.00,7.0,,,,1.0,45000.00,7.0
54894,,,,,,,,,,1.0,3075.00,1.0,,,,1.0,3075.00,1.0
54969,,,,,,,,,,1.0,11100.00,6.0,,,,1.0,11100.00,6.0
25773,,,,,,,,,,1.0,6200.00,1.0,,,,1.0,6200.00,1.0


In [17]:
master_df[["Number of Loans_1",
           "Total Loans Amount_1",
           "Total Employees_1",
           "Number of Loans_2",
           "Total Loans Amount_2",
           "Total Employees_2", 
           "Number of Loans_3",
           "Total Loans Amount_3",
           "Total Employees_3",
           "Total Loans",
           "Total Loan Amount",
           "Total Employees"]] = master_df[["Number of Loans_1",
           "Total Loans Amount_1",
           "Total Employees_1",
           "Number of Loans_2",
           "Total Loans Amount_2",
           "Total Employees_2", 
           "Number of Loans_3",
           "Total Loans Amount_3",
           "Total Employees_3",
           "Total Loans",
           "Total Loan Amount",
           "Total Employees"]].fillna(0)
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,Total Loans Amount_2,Total Employees_2,Number of Loans_3,Total Loans Amount_3,Total Employees_3,Total Loans,Total Loan Amount,Total Employees
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
00501,,5.0,5.0,,,,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.0
00601,,,,0.997854,0.000058,17242.0,2.0,328260.00,117.0,43.0,846924.57,335.0,36.0,786849.32,341.0,81.0,1962033.89,793.0
00602,,,,0.994327,0.000266,37548.0,19.0,2198895.86,601.0,162.0,4301386.61,1381.0,119.0,3907995.21,1131.0,300.0,10408277.68,3113.0
00603,,,,0.988274,0.001606,49804.0,22.0,5084300.00,1266.0,239.0,7976491.54,2700.0,162.0,8897252.89,2606.0,423.0,21958044.43,6572.0
00606,,,,0.994809,0.000200,5009.0,1.0,4800.00,3.0,7.0,109067.00,42.0,5.0,84750.00,28.0,13.0,198617.00,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25727,,,,,,,0.0,0.00,0.0,1.0,45000.00,7.0,0.0,0.00,0.0,1.0,45000.00,7.0
54894,,,,,,,0.0,0.00,0.0,1.0,3075.00,1.0,0.0,0.00,0.0,1.0,3075.00,1.0
54969,,,,,,,0.0,0.00,0.0,1.0,11100.00,6.0,0.0,0.00,0.0,1.0,11100.00,6.0
25773,,,,,,,0.0,0.00,0.0,1.0,6200.00,1.0,0.0,0.00,0.0,1.0,6200.00,1.0


In [24]:
master_df["Loans per Emp Bus"] = master_df["Total Loans"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp"] = master_df["Total Loan Amount"] / master_df["Total Employees"]
master_df["Loans per Emp Bus_1"] = master_df["Number of Loans_1"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp_1"] = master_df["Total Loans Amount_1"] / master_df["Total Employees_1"]
master_df["Loans per Emp Bus_2"] = master_df["Number of Loans_2"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp_2"] = master_df["Total Loans Amount_2"] / master_df["Total Employees_2"]
master_df["Loans per Emp Bus_3"] = master_df["Number of Loans_3"] / master_df["Total Emp Bus"]
master_df["Avg Loan Amt per Emp_3"] = master_df["Total Loans Amount_3"] / master_df["Total Employees_3"]
master_df

Unnamed: 0_level_0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,...,Total Loan Amount,Total Employees,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
ZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00501,,5.0,5.0,,,,0.0,0.00,0.0,0.0,...,0.00,0.0,0.0,,0.0,,0.0,,0.0,
00601,,,,0.997854,0.000058,17242.0,2.0,328260.00,117.0,43.0,...,1962033.89,793.0,,2474.191538,,2805.641026,,2528.133045,,2307.476012
00602,,,,0.994327,0.000266,37548.0,19.0,2198895.86,601.0,162.0,...,10408277.68,3113.0,,3343.487851,,3658.728552,,3114.689797,,3455.345013
00603,,,,0.988274,0.001606,49804.0,22.0,5084300.00,1266.0,239.0,...,21958044.43,6572.0,,3341.151009,,4016.034755,,2954.256126,,3414.141554
00606,,,,0.994809,0.000200,5009.0,1.0,4800.00,3.0,7.0,...,198617.00,73.0,,2720.780822,,1600.000000,,2596.833333,,3026.785714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25727,,,,,,,0.0,0.00,0.0,1.0,...,45000.00,7.0,,6428.571429,,,,6428.571429,,
54894,,,,,,,0.0,0.00,0.0,1.0,...,3075.00,1.0,,3075.000000,,,,3075.000000,,
54969,,,,,,,0.0,0.00,0.0,1.0,...,11100.00,6.0,,1850.000000,,,,1850.000000,,
25773,,,,,,,0.0,0.00,0.0,1.0,...,6200.00,1.0,,6200.000000,,,,6200.000000,,


In [27]:
master_df.describe()

Unnamed: 0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,...,Total Loan Amount,Total Employees,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
count,32109.0,35052.0,37322.0,33631.0,33631.0,33774.0,39080.0,39080.0,39080.0,39080.0,...,39080.0,39080.0,37322.0,35897.0,37322.0,31414.0,37322.0,33127.0,37322.0,31307.0
mean,61.825968,227.137111,266.512513,0.26309,0.073105,9910.777107,35.65998,7432815.0,752.02175,59.35934,...,16856750.0,1919.429913,0.418266,8142.283546,0.110698,8851.337773,0.156128,7191.182383,0.15144,8372.959462
std,88.443966,406.851597,414.475046,0.248661,0.148541,14918.152573,69.944027,18312010.0,1671.85998,131.749004,...,41752700.0,4303.162337,0.321156,3227.806878,0.119044,3836.289053,0.155452,3574.58423,0.140103,3364.110122
min,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,192.507509,0.0,227.1875,0.0,4.5,0.0,127.5
25%,9.0,11.0,24.0,0.076952,0.002643,650.0,1.0,35200.0,6.0,1.0,...,187511.5,27.0,0.183116,6376.921524,0.034483,6678.635133,0.048387,5242.556793,0.05,6515.208413
50%,31.0,41.0,90.0,0.162887,0.010017,2649.5,6.0,626955.9,80.0,7.0,...,1491269.0,196.0,0.361111,7849.674088,0.090909,8543.120599,0.117139,6738.177594,0.126214,8039.139041
75%,78.0,255.0,326.0,0.374071,0.063405,13359.0,36.0,5532691.0,632.0,49.0,...,12945670.0,1642.0,0.618755,9455.664336,0.159143,10532.484959,0.230987,8450.8961,0.227273,9687.694067
max,1634.0,6893.0,6897.0,1.0,1.0,135256.0,1052.0,453772200.0,31360.0,3213.0,...,1546740000.0,133008.0,8.333333,126056.157895,6.0,124859.212121,5.0,126056.157895,3.0,83810.854


In [30]:
master_df[~master_df["Min Share"].isnull()].describe()

Unnamed: 0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,...,Total Loan Amount,Total Employees,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
count,29574.0,30771.0,32638.0,33631.0,33631.0,33631.0,33631.0,33631.0,33631.0,33631.0,...,33631.0,33631.0,32638.0,31569.0,32638.0,28569.0,32638.0,29622.0,32638.0,29905.0
mean,66.360283,256.658022,302.106839,0.26309,0.073105,9952.918022,41.086438,8544134.0,864.73343,68.609259,...,19442070.0,2215.45464,0.432061,7960.956512,0.109256,8697.305605,0.156429,6976.860697,0.166376,8285.909574
std,90.643775,425.862046,431.575897,0.248661,0.148541,14935.801329,73.948968,19501680.0,1775.194796,139.8267,...,44458690.0,4569.227086,0.304393,2811.093011,0.098997,3505.675489,0.142527,3084.54346,0.135005,3144.868256
min,1.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,192.507509,0.0,227.1875,0.0,103.535714,0.0,160.409512
25%,11.0,15.0,40.0,0.076952,0.002643,664.5,2.0,78790.0,12.0,2.0,...,306615.2,45.0,0.2,6398.919778,0.041667,6669.841026,0.054054,5235.62408,0.072464,6536.668264
50%,36.0,59.0,119.0,0.162887,0.010017,2681.0,9.0,968159.5,122.0,11.0,...,2358688.0,309.0,0.381164,7776.779661,0.093878,8472.081879,0.12,6659.826134,0.142857,8015.768176
75%,84.0,320.0,394.0,0.374071,0.063405,13459.0,46.0,7523938.0,857.0,68.0,...,17975540.0,2251.0,0.634921,9256.451059,0.157895,10376.672727,0.232393,8233.828118,0.239337,9599.687073
max,1634.0,6893.0,6897.0,1.0,1.0,135256.0,1052.0,453772200.0,31360.0,3213.0,...,1546740000.0,133008.0,6.0,105225.0,3.0,124859.212121,4.0,71655.0,2.090909,69549.17


In [31]:
master_df[~master_df["Min Share"].isnull()].corr()

Unnamed: 0,Num Farms,Num Estabs,Total Emp Bus,Min Share,Black Share,Total Pop,Number of Loans_1,Total Loans Amount_1,Total Employees_1,Number of Loans_2,...,Total Loan Amount,Total Employees,Loans per Emp Bus,Avg Loan Amt per Emp,Loans per Emp Bus_1,Avg Loan Amt per Emp_1,Loans per Emp Bus_2,Avg Loan Amt per Emp_2,Loans per Emp Bus_3,Avg Loan Amt per Emp_3
Num Farms,1.0,0.033213,0.262592,-0.056359,-0.085379,0.118172,0.155906,0.024659,0.067007,-0.02339,...,-0.010106,0.009652,-0.19623,-0.031537,-0.043022,-0.065724,-0.212932,-0.030873,-0.191202,-0.051906
Num Estabs,0.033213,1.0,0.978664,0.330769,0.117174,0.791313,0.879963,0.853516,0.876976,0.942222,...,0.906746,0.946712,0.474171,0.141633,0.256108,0.190542,0.48196,0.068143,0.393934,0.078856
Total Emp Bus,0.262592,0.978664,1.0,0.29592,0.095456,0.792784,0.887309,0.833386,0.865264,0.907686,...,0.876283,0.91875,0.384219,0.119791,0.216539,0.1655,0.373471,0.056364,0.313229,0.057667
Min Share,-0.056359,0.330769,0.29592,1.0,0.585684,0.439911,0.17985,0.224123,0.231957,0.314709,...,0.268406,0.29668,0.245882,0.03926,-0.015853,0.099886,0.282918,0.00615,0.267329,0.05766
Black Share,-0.085379,0.117174,0.095456,0.585684,1.0,0.207398,0.084257,0.101219,0.110369,0.105154,...,0.099207,0.110992,0.216662,0.030694,0.04498,0.037267,0.163876,-0.016608,0.282513,0.049246
Total Pop,0.118172,0.791313,0.792784,0.439911,0.207398,1.0,0.6491,0.555152,0.617228,0.746869,...,0.601697,0.68376,0.449405,0.053664,0.173737,0.12328,0.45817,-0.002175,0.402168,0.011551
Number of Loans_1,0.155906,0.879963,0.887309,0.17985,0.084257,0.6491,1.0,0.880956,0.933887,0.754809,...,0.830982,0.863689,0.414195,0.136421,0.384236,0.148831,0.312903,0.067354,0.321788,0.061303
Total Loans Amount_1,0.024659,0.853516,0.833386,0.224123,0.101219,0.555152,0.880956,1.0,0.975304,0.7594,...,0.9542,0.92911,0.366828,0.206347,0.290478,0.23752,0.304487,0.111902,0.292627,0.116137
Total Employees_1,0.067007,0.876976,0.865264,0.231957,0.110369,0.617228,0.933887,0.975304,1.0,0.767225,...,0.920758,0.932145,0.390046,0.164658,0.325702,0.182281,0.315169,0.087827,0.30787,0.085585
Number of Loans_2,-0.02339,0.942222,0.907686,0.314709,0.105154,0.746869,0.754809,0.7594,0.767225,1.0,...,0.875483,0.916981,0.466658,0.108718,0.162345,0.175802,0.504399,0.056816,0.400618,0.072144


In [33]:
master_df.to_csv('ZIPdf.csv')