In [1]:
# Importing relevant packages
from bs4 import BeautifulSoup
import urllib.request
import requests
import pandas as pd
import numpy as np

### 1. Import Zip Codes

In [2]:
# Load the zip codes to look up; the file's single column is renamed to 'Zip5'.
# NOTE(review): read_csv treats the first row as a header by default — confirm
# that Zip5.csv actually has a header row, otherwise the first zip code is lost.
zip5 = pd.read_csv('Zip5.csv')
zip5.columns=['Zip5']

In [3]:
# Sanity check: (number of zip codes, number of columns).
zip5.shape

(1954, 1)

### 2. Scrape Median HH Income 
sample URL: https://statisticalatlas.com/zip/30097/Household-Income

In [4]:
# Scrape the household-income figure for every zip code.
# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
cols = ['Zip5', 'MedanHHIncome']
income_rows = []

for zipcode in zip5.Zip5:
    url = "https://statisticalatlas.com/zip/{}/Household-Income".format(zipcode)
    # Without a timeout a single unresponsive request would stall the whole run.
    with requests.get(url, timeout=30) as r:
        soup = BeautifulSoup(r.text, 'lxml')
    # Only the 4th matching <text> element is kept.
    # NOTE(review): presumably this is the median label on the chart — confirm
    # against the page markup; some pages yield a '%' value here instead.
    table = soup.find_all('text', {"fill-opacity":"0.400"})[3:4]
    values = [zipcode]
    values.extend([row.text for row in table])
    # Pages without that element (e.g. unknown zip codes) get NaN so that every
    # row has exactly one value per column.
    values.extend([np.nan] * (len(cols) - len(values)))
    income_rows.append(values)

df = pd.DataFrame(income_rows, columns=cols)
df.head()

Unnamed: 0,Zip5,MedanHHIncome
0,30097,$96.9k
1,30318,$44.0k
2,30309,$78.3k
3,30363,$66.9k
4,30328,$80.8k


In [5]:
# Non-null counts per column; a gap between the two columns means some zip
# codes returned no income value.
df.count()

Zip5             1954
MedanHHIncome    1913
dtype: int64

- Drop rows where zip code is not found on the website
- Remove dollar sign, 'k', and '>' in >250k
- Remove rows whose scraped value contains '%' (the scraper picked up a percentage label instead of the income figure)
- Change column data type to float and rename

In [6]:
# Clean the scraped income strings (e.g. "$96.9k", ">$250k") into floats
# expressed in thousands of dollars.
# dropna() removes zip codes for which nothing was scraped; .copy() makes df1
# independent of df so the assignments below never touch the raw scrape.
df1 = df.dropna().copy()

# regex=False: '$' is a regex anchor, so these must be literal replacements
# regardless of the pandas version's default.
for symbol in ('$', 'k', '>'):
    df1['MedanHHIncome'] = df1['MedanHHIncome'].str.replace(symbol, '', regex=False)

# Discard rows where a percentage was scraped instead of an income figure.
# `~` is the boolean-negation operator for masks; .copy() avoids a
# SettingWithCopyWarning on the assignment that follows.
is_percentage = df1['MedanHHIncome'].str.contains('%', regex=False)
df1 = df1[~is_percentage].copy()

df1['MedanHHIncome'] = df1['MedanHHIncome'].astype(float)
# Rename by name rather than by position so the column order cannot matter.
df1 = df1.rename(columns={'MedanHHIncome': 'MedanHHIncome(000)'})

### 3. Scrape Marital Status Info
sample URL: https://statisticalatlas.com/zip/30097/Marital-Status

In [7]:
# Scrape the marital-status counts for every zip code.
# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
cols = ['Zip5', 'Never_Married_F', 'Never_Married_M', 'Married_F', 'Married_M',
        'Separated/Divorced_F', 'Separated/Divorced_M', 'Widowed_F', 'Widowed_M']
marital_rows = []

for zipcode in zip5.Zip5:
    url = "https://statisticalatlas.com/zip/{}/Marital-Status".format(zipcode)
    # Without a timeout a single unresponsive request would stall the whole run.
    with requests.get(url, timeout=30) as r:
        soup = BeautifulSoup(r.text, 'lxml')
    # The first 8 matching <text> elements are taken as the counts, in the
    # order of `cols`.
    # NOTE(review): this ordering is assumed from the page layout — confirm.
    table = soup.find_all('text', {"fill-opacity":"0.500"})[:8]
    values = [zipcode]
    values.extend([row.text for row in table])
    # Pages with fewer than 8 elements (e.g. unknown zip codes) are padded with
    # NaN so that every row has exactly one value per column.
    values.extend([np.nan] * (len(cols) - len(values)))
    marital_rows.append(values)

dfm = pd.DataFrame(marital_rows, columns=cols)
dfm.head()

Unnamed: 0,Zip5,Never_Married_F,Never_Married_M,Married_F,Married_M,Separated/Divorced_F,Separated/Divorced_M,Widowed_F,Widowed_M
0,30097,4507,4384,11.9k,12.2k,1578,909,844,177
1,30318,9604,14.2k,4882,5393,2518,1978,2190,1288
2,30309,5031,6226,3302,3595,1358,1163,301,150
3,30363,798,1097,405,433,126,68,0,0
4,30328,5246,4440,7420,7879,2526,1092,1238,243


- Drop rows where zip code is not found on the website
- Change format: 11.9k to 11900 
- Change column data type to integer
- Generate married % and male/female variables

In [8]:
def _to_count(text):
    """Convert a scraped count such as '1578' or '11.9k' to an integer."""
    if text[-1:] == 'k':
        # '11.9k' -> 11900; round() guards against float artefacts like 11899.999
        return round(float(text[:-1]) * 1000)
    return int(text)


# Drop zip codes with any missing count; .copy() keeps the raw scrape in dfm intact.
dfm1 = dfm.dropna().copy()
for col in dfm.columns[1:]:
    # Literal replacement of thousands separators, then numeric conversion.
    dfm1[col] = dfm1[col].str.replace(',', '', regex=False)
    dfm1[col] = dfm1[col].apply(_to_count).astype(int)

dfm1['male'] = dfm1[['Never_Married_M', 'Married_M', 'Separated/Divorced_M', 'Widowed_M']].sum(axis=1)
dfm1['female'] = dfm1[['Never_Married_F', 'Married_F', 'Separated/Divorced_F', 'Widowed_F']].sum(axis=1)
dfm1['population'] = dfm1[['male','female']].sum(axis=1)
dfm1['married'] = dfm1[['Married_F', 'Married_M']].sum(axis=1)
dfm1['married %'] = dfm1['married']/dfm1['population']*100
# A married share of exactly 0 is treated as missing. Assign the result back:
# an in-place replace on a selected column is a chained update and does not
# modify dfm1 under pandas Copy-on-Write.
dfm1['married %'] = dfm1['married %'].replace(0, np.nan)
dfm1['male/female'] = dfm1['male']/dfm1['female']

### 4. Merge and Export Income and Marriage data

In [9]:
# Outer-join the income table with the two derived marriage ratios so that a zip
# code present in only one of the tables is still kept. No `on=` is given, so
# the join uses the columns the two frames have in common.
marriage_cols = ['Zip5', 'married %', 'male/female']
dfmg = pd.merge(df1, dfm1[marriage_cols], how='outer')
dfmg.shape

(1925, 4)

In [10]:
# Summary statistics of the merged table, used to spot outliers and missing values.
dfmg.describe()

Unnamed: 0,Zip5,MedanHHIncome(000),married %,male/female
count,1925.0,1910.0,1922.0,1925.0
mean,50132.948052,54.527068,46.575208,0.995139
std,22473.225784,21.769816,11.448893,0.822831
min,27006.0,13.1,0.101877,0.394516
25%,31792.0,40.0,40.186142,0.887395
50%,33809.0,49.7,47.536718,0.935197
75%,77011.0,64.65,54.059453,0.991362
max,79938.0,250.0,90.712431,33.6


In [11]:
# Persist the merged income/marriage table; index=False leaves the row index out of the file.
dfmg.to_csv('income_marriage.csv', index=False)

### 5. Deposit

In [12]:
# Load the deposit workbook (one row per State/County, as shown in the preview of the next cell).
deposit = pd.read_excel('FDIC Deposit.xlsx')

In [13]:
# Show the dimensions and the first rows of the deposit table.
print(deposit.shape)
deposit.head()

(574, 3)


Unnamed: 0,State,County,Deposit (000s)
0,GA,Fulton,100332784
1,GA,Gwinnett,17717075
2,GA,Cobb,15632932
3,GA,DeKalb,12481873
4,GA,Muscogee,8394232


In [14]:
# Load the population workbook (one row per "<County> County, <State>" geography, per the preview below).
pop = pd.read_excel('Census Population.xlsx')

In [15]:
# Show the dimensions and the first rows of the population table.
print(pop.shape)
pop.head()

(580, 2)


Unnamed: 0,Geography,Population Estimate (as of July 1) - 2018
0,"Anderson County, Texas",58057
1,"Andrews County, Texas",18128
2,"Angelina County, Texas",87092
3,"Aransas County, Texas",23792
4,"Archer County, Texas",8786


Extract county and state from Geography.

In [16]:
# Split strings such as "Anderson County, Texas" into a two-letter state code
# and a bare county name so the table can be joined on (State, County).
state_abbrev = {'Texas':'TX',
                'Georgia':'GA',
                'North Carolina':'NC',
                'Florida':'FL'}
# Everything after ", " is the full state name; names missing from the mapping become NaN.
full_state_name = pop['Geography'].apply(lambda geo: geo.split(', ')[1])
pop['State'] = full_state_name.map(state_abbrev)
# Everything before " County" is the county name.
pop['County'] = pop['Geography'].apply(lambda geo: geo.split(' County')[0])
pop.columns = ['Geography', 'Population Est 2018', 'State', 'County']
pop.head()

Unnamed: 0,Geography,Population Est 2018,State,County
0,"Anderson County, Texas",58057,TX,Anderson
1,"Andrews County, Texas",18128,TX,Andrews
2,"Angelina County, Texas",87092,TX,Angelina
3,"Aransas County, Texas",23792,TX,Aransas
4,"Archer County, Texas",8786,TX,Archer


In [17]:
# Attach the population estimate to each deposit row. The merge uses the default
# inner join, so a (State, County) pair missing from either table is dropped —
# compare the printed shape with the input shapes to see how many were lost.
pop_cols = ['State', 'County', 'Population Est 2018']
mg = pd.merge(deposit, pop[pop_cols], on=['State','County'])
print(mg.shape)
print(mg.count())
mg.head()

(572, 4)
State                  572
County                 572
Deposit (000s)         572
Population Est 2018    572
dtype: int64


Unnamed: 0,State,County,Deposit (000s),Population Est 2018
0,GA,Fulton,100332784,1050114
1,GA,Gwinnett,17717075,927781
2,GA,Cobb,15632932,756865
3,GA,DeKalb,12481873,756558
4,GA,Muscogee,8394232,194160


Calculate per capita deposits (in thousands of dollars): each county's total deposits divided by its 2018 population estimate.

In [18]:
# Both inputs are per county; going by the column name the deposits are in
# thousands, so the ratio is thousands per resident.
deposits_000s = mg['Deposit (000s)']
residents = mg['Population Est 2018']
mg['Deposit (000s) Per Capita'] = deposits_000s / residents
mg.head()

Unnamed: 0,State,County,Deposit (000s),Population Est 2018,Deposit (000s) Per Capita
0,GA,Fulton,100332784,1050114,95.544659
1,GA,Gwinnett,17717075,927781,19.096182
2,GA,Cobb,15632932,756865,20.654849
3,GA,DeKalb,12481873,756558,16.498237
4,GA,Muscogee,8394232,194160,43.233581


In [19]:
# Summary statistics of the county table, used to spot outliers in the per-capita figure.
mg.describe()

Unnamed: 0,Deposit (000s),Population Est 2018,Deposit (000s) Per Capita
count,572.0,572.0,572.0
mean,3674213.0,123898.7,18.570632
std,17517880.0,343749.8,13.667547
min,7602.0,726.0,0.928432
25%,167930.5,11957.0,10.966622
50%,421448.5,27284.0,15.334716
75%,1248876.0,85954.75,21.66414
max,208660400.0,4698619.0,172.786415


In [21]:
# Persist the county-level per-capita table; index=False leaves the row index out of the file.
mg.to_csv('per_capita_deposit.csv', index=False)