The goal of this notebook is to take each census variable used in my model and identify which macroeconomic variables can be used to project (grow) it forward for 2020-2022.

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# Load data

## BEA Data for Estimates
https://www.bea.gov/data/economic-accounts/regional

https://www.bea.gov/data/economic-accounts/national

### PCE by State

In [6]:
# State-level PCE table, read from the project's BEA data folder on GitHub.
pce_csv_url = 'https://raw.githubusercontent.com/jhancuch/sba-loan-credit-analysis/main/data/bea/PCE.csv'
consumer_spending_state = pd.read_csv(pce_csv_url)

In [7]:
# Check the column labels, non-null counts and dtypes of the PCE table.
consumer_spending_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 25 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   GeoName  51 non-null     object
 1   1997     51 non-null     int64 
 2   1998     51 non-null     int64 
 3   1999     51 non-null     int64 
 4   2000     51 non-null     int64 
 5   2001     51 non-null     int64 
 6   2002     51 non-null     int64 
 7   2003     51 non-null     int64 
 8   2004     51 non-null     int64 
 9   2005     51 non-null     int64 
 10  2006     51 non-null     int64 
 11  2007     51 non-null     int64 
 12  2008     51 non-null     int64 
 13  2009     51 non-null     int64 
 14  2010     51 non-null     int64 
 15  2011     51 non-null     int64 
 16  2012     51 non-null     int64 
 17  2013     51 non-null     int64 
 18  2014     51 non-null     int64 
 19  2015     51 non-null     int64 
 20  2016     51 non-null     int64 
 21  2017     51 non-null     int64 
 22  2018

In [10]:
# Tag every column with the 'pce_' prefix (this includes the GeoName column).
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not produce labels such as 'pce_pce_2019'.
consumer_spending_state = consumer_spending_state.rename(
    columns=lambda col: col if str(col).startswith('pce_') else f'pce_{col}'
)

### Employment by State

In [11]:
# State-level total employment table, read from the project's BEA data folder on GitHub.
employment_csv_url = 'https://raw.githubusercontent.com/jhancuch/sba-loan-credit-analysis/main/data/bea/total_employment_state.csv'
employment_state = pd.read_csv(employment_csv_url)

In [12]:
# Check the column labels, non-null counts and dtypes of the employment table.
employment_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 33 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   GeoName  51 non-null     object
 1   1990     51 non-null     int64 
 2   1991     51 non-null     int64 
 3   1992     51 non-null     int64 
 4   1993     51 non-null     int64 
 5   1994     51 non-null     int64 
 6   1995     51 non-null     int64 
 7   1996     51 non-null     int64 
 8   1997     51 non-null     int64 
 9   1998     51 non-null     int64 
 10  1999     51 non-null     int64 
 11  2000     51 non-null     int64 
 12  2001     51 non-null     int64 
 13  2002     51 non-null     int64 
 14  2003     51 non-null     int64 
 15  2004     51 non-null     int64 
 16  2005     51 non-null     int64 
 17  2006     51 non-null     int64 
 18  2007     51 non-null     int64 
 19  2008     51 non-null     int64 
 20  2009     51 non-null     int64 
 21  2010     51 non-null     int64 
 22  2011

In [13]:
# Tag every column with the 'emp_' prefix (this includes the GeoName column).
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not produce labels such as 'emp_emp_2019'.
employment_state = employment_state.rename(
    columns=lambda col: col if str(col).startswith('emp_') else f'emp_{col}'
)

### GDP by State

In [16]:
# State-level GDP table, read from the project's BEA data folder on GitHub.
gdp_csv_url = 'https://raw.githubusercontent.com/jhancuch/sba-loan-credit-analysis/main/data/bea/gdp_state.csv'
gdp = pd.read_csv(gdp_csv_url)

In [17]:
# Check the column labels, non-null counts and dtypes of the GDP table.
gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 18 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   GeoName  51 non-null     object 
 1   2005     51 non-null     float64
 2   2006     51 non-null     float64
 3   2007     51 non-null     float64
 4   2008     51 non-null     float64
 5   2009     51 non-null     float64
 6   2010     51 non-null     float64
 7   2011     51 non-null     float64
 8   2012     51 non-null     float64
 9   2013     51 non-null     float64
 10  2014     51 non-null     float64
 11  2015     51 non-null     float64
 12  2016     51 non-null     float64
 13  2017     51 non-null     float64
 14  2018     51 non-null     float64
 15  2019     51 non-null     float64
 16  2020     51 non-null     float64
 17  2021     51 non-null     float64
dtypes: float64(17), object(1)
memory usage: 7.3+ KB


In [19]:
# Tag every column with the 'gdp_' prefix (this includes the GeoName column).
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not produce labels such as 'gdp_gdp_2019'.
gdp = gdp.rename(
    columns=lambda col: col if str(col).startswith('gdp_') else f'gdp_{col}'
)

### Per Capita Income by State

In [20]:
# State-level per capita income table, read from the project's BEA data folder on GitHub.
per_capita_income_csv_url = 'https://raw.githubusercontent.com/jhancuch/sba-loan-credit-analysis/main/data/bea/per_capita_income.csv'
pi = pd.read_csv(per_capita_income_csv_url)

In [21]:
# Check the column labels, non-null counts and dtypes of the per capita income table.
pi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   GeoName  51 non-null     object 
 1   2010     51 non-null     float64
 2   2011     51 non-null     float64
 3   2012     51 non-null     float64
 4   2013     51 non-null     float64
 5   2014     51 non-null     float64
 6   2015     51 non-null     float64
 7   2016     51 non-null     float64
 8   2017     51 non-null     float64
 9   2018     51 non-null     float64
 10  2019     51 non-null     float64
 11  2020     51 non-null     float64
 12  2021     51 non-null     float64
dtypes: float64(12), object(1)
memory usage: 5.3+ KB


In [22]:
# Tag every column with the 'per_cap_income_' prefix (this includes the GeoName column).
# Columns that already carry the prefix are left untouched, so re-running this
# cell does not apply the prefix a second time.
pi = pi.rename(
    columns=lambda col: col if str(col).startswith('per_cap_income_') else f'per_cap_income_{col}'
)

## Census data

### Business patterns by ZIP

In [23]:
# ZIP-level business patterns totals (zbp20totals.txt), parsed with the Python engine.
# NOTE(review): the preview of this frame shows ZIPs such as 501 and 1001, i.e. without
# leading zeros — confirm whether downstream joins need `zip` as a zero-padded string.
bp_zip_url = 'https://raw.githubusercontent.com/jhancuch/sba-loan-credit-analysis/main/data/business_patterns_zip/zbp20totals.txt'
bp_zip = pd.read_csv(
    bp_zip_url,
    delimiter=',',
    encoding='unicode_escape',
    engine='python',
)

In [24]:
# Preview the ZIP-level business patterns frame.
bp_zip

Unnamed: 0,zip,name,emp_nf,emp,qp1_nf,qp1,ap_nf,ap,est,city,stabbr,cty_name
0,501,"HOLTSVILLE, NY",J,49,J,261,J,850,5,HOLTSVILLE,NY,SUFFOLK
1,1001,"AGAWAM, MA",G,7944,G,100626,G,409249,469,AGAWAM,MA,HAMPDEN
2,1002,"AMHERST, MA",H,7732,H,73903,H,294936,523,AMHERST,MA,HAMPSHIRE
3,1003,"AMHERST, MA",G,239,H,2515,H,9481,16,AMHERST,MA,HAMPSHIRE
4,1004,"AMHERST, MA",J,197,H,1439,H,5892,9,AMHERST,MA,HAMPSHIRE
...,...,...,...,...,...,...,...,...,...,...,...,...
35048,99926,"METLAKATLA, AK",G,51,H,442,H,1991,11,METLAKATLA,AK,PRINCE OF WALES-HYDER CENSUS A
35049,99928,"WARD COVE, AK",G,15,G,86,H,639,4,WARD COVE,AK,KETCHIKAN GATEWAY BOROUGH
35050,99929,"WRANGELL, AK",G,474,H,5029,H,21917,93,WRANGELL,AK,WRANGELL CITY AND BOROUGH
35051,99950,"KETCHIKAN, AK",J,43,J,151,H,803,12,KETCHIKAN,AK,KETCHIKAN GATEWAY BOROUGH


In [None]:
# NOTE(review): `bp_zip[[]]` selects an empty list of columns, so `bp_zip1` ends up with
# no columns at all. This looks like an unfinished column selection — TODO fill in the
# intended columns or drop the line.
bp_zip1 = bp_zip[[]]
# Give the `emp`, `est` and `ap` columns more descriptive names.
# NOTE(review): this is built from `bp_zip` (not `bp_zip1`) and is stored as `bp_zip_1`;
# confirm which of the two names is the intended one.
bp_zip_1 = bp_zip.rename(columns = {"emp": "num_employees", "est": "num_establishments", "ap": "annual_payroll_1000s"})

### Total Population

### Employment Status

### Nonemployer

### Manufacturing

### Retail Trade

In [25]:
# Retail trade sales table; header=3 (zero-based) tells pandas which line holds the column labels.
retail_sales_url = 'https://raw.githubusercontent.com/jhancuch/sba-loan-credit-analysis/main/data/retail_trade/sales.csv'
retail_trade = pd.read_csv(
    retail_sales_url,
    header=3,
    encoding='unicode_escape',
    engine='python',
)

In [32]:
# Carry forward positional rows 3 through 70 of the raw sales table.
retail_trade_1 = retail_trade.iloc[3:71, :]

# Keep the NAICS code plus the annual figures for 2010-2020.
retail_trade_2 = retail_trade_1[['NAICS Code', '2020', '2019r', '2018r', '2017r', '2016r',
                                 '2015r', '2014r', '2013r', '2012', '2011', '2010']]

# Normalise the labels: 'NAICS Code' -> 'naics', and drop the trailing 'r' on the
# 2013-2019 headers so every year column is a plain four-digit string.
retail_trade_3 = (
    retail_trade_2
    .rename(columns={'NAICS Code': 'naics',
                     **{f'{year}r': str(year) for year in range(2013, 2020)}})
    .reset_index(drop=True)
)

# Keep only the rows whose NAICS code is exactly three characters long.
# The vectorised length check replaces the row-by-row loop and treats a missing
# code as "no match" instead of raising on len(NaN).
is_three_digit = retail_trade_3['naics'].str.len() == 3
retail_trade_4 = retail_trade_3.loc[is_three_digit].reset_index(drop=True)

# Collapse each code to its first two digits, then strip the thousands separators
# and cast every column (the naics column included) to integers.
retail_trade_4 = retail_trade_4.assign(naics=retail_trade_4['naics'].str.slice(0, 2))
retail_trade_4 = retail_trade_4.apply(
    lambda col: col.str.replace(',', '', regex=False).astype(int)
)

# Sum the yearly figures within each two-digit NAICS code.
retail_trade_5 = (
    retail_trade_4
    .groupby(['naics'])[['2020', '2019', '2018', '2017', '2016', '2015',
                         '2014', '2013', '2012', '2011', '2010']]
    .sum()
    .reset_index()
)

In [46]:
# Add the year columns of the first two rows of retail_trade_5 together, then
# turn the resulting series into a two-column frame ('index' holds the year label).
first_row_totals = retail_trade_5.iloc[0, 1:]
second_row_totals = retail_trade_5.iloc[1, 1:]
combined_totals = first_row_totals + second_row_totals
retail_trade_6 = pd.DataFrame(combined_totals).reset_index()

In [53]:
# Spread the year labels across the columns.
# NOTE(review): no `index` is given, so each value stays on its own row and the result
# is a diagonal frame padded with NaN. If a single wide row of totals is wanted,
# consider transposing instead — confirm the intent.
retail_trade_6.pivot(columns='index', values=0)

index,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,,,,,,,,,,,5570393.0
1,,,,,,,,,,5402272.0,
2,,,,,,,,,5255425.0,,
3,,,,,,,,5040214.0,,,
4,,,,,,,4848096.0,,,,
5,,,,,,4726111.0,,,,,
6,,,,,4640651.0,,,,,,
7,,,,4459238.0,,,,,,,
8,,,4302229.0,,,,,,,,
9,,4102952.0,,,,,,,,,


In [51]:
# Inspect the column labels produced by reset_index() on the summed series.
retail_trade_6.columns

Index(['index', 0], dtype='object')

### Services

### Wholesale Trade

# Correlation Checks

## Retail Trade

In [29]:
# Place the 2019 retail figures next to 2019 PCE, aligned on the positional row index.
# NOTE(review): retail_trade_3 has one row per NAICS line while the PCE table has one
# row per GeoName (51 rows), so rows are paired by position only. The retail column is
# also still the unparsed string form. Confirm this is the comparison that is wanted
# before computing a correlation.
retail_sales_2019 = retail_trade_3['2019']
pce_2019 = consumer_spending_state['pce_2019']
retail_trade_corr = pd.concat([retail_sales_2019, pce_2019], axis=1)

In [30]:
# Preview the side-by-side 2019 retail and PCE columns.
retail_trade_corr

Unnamed: 0,2019,pce_2019
0,1237744,36217.0
1,1065549,51499.0
2,942836,39450.0
3,122713,34568.0
4,75407,48722.0
...,...,...
63,750640,
64,657089,
65,9075,
66,84476,
