In [None]:
from bs4 import BeautifulSoup
import urllib.request
import requests
import pandas as pd
import numpy as np

### 1. Import Zip Codes

In [None]:
# Load the ZIP-code list. The first line of the file is consumed as a header row
# (read_csv default), and the resulting single column is then renamed to 'Zip5'.
zip5 = pd.read_csv('Zip5.csv')
# Positional rename: raises if the file has anything other than exactly one column.
zip5.columns=['Zip5']

In [None]:
# Display (rows, columns) of the loaded ZIP-code frame.
zip5.shape

### 2. Scrape Median HH Income 
sample URL: https://statisticalatlas.com/zip/30097/Household-Income

In [None]:
# Scrape one value per ZIP code from the Household-Income page.
# Rows are collected in a plain list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and re-allocating the frame on
# every iteration is quadratic in the number of ZIP codes.
cols = ['Zip5', 'MedanHHIncome']
income_rows = []

for zipcode in zip5.Zip5:
    url = "https://statisticalatlas.com/zip/{}/Household-Income".format(zipcode)
    # A timeout keeps a single unresponsive request from hanging the whole loop.
    with requests.get(url, timeout=30) as r:
        soup = BeautifulSoup(r.text, 'lxml')
    # Keep only the 4th <text fill-opacity="0.400"> element. The slice is empty
    # (instead of raising) when the page has fewer than four such elements.
    table = soup.find_all('text', {"fill-opacity":"0.400"})[3:4]
    # No element found -> None, so the row can be removed later with dropna().
    income_rows.append([zipcode, table[0].text if table else None])

# dtype=object leaves every column untyped, including the Zip5 key.
# Passing `columns` here also works when no ZIP code produced a value.
df = pd.DataFrame(income_rows, columns=cols, dtype=object)
df.head()

In [None]:
# Non-missing values per column; a gap between the two counts is the number of
# ZIP codes for which no value was scraped.
df.count()

- Drop rows where zip code is not found on the website
- Remove dollar sign, 'k', and '>' in >250k
- Drop rows whose scraped value contains '%' (a percentage, not an income)
- Change column data type to float and rename

In [None]:
# Turn the scraped income strings into floats.
# dropna() returns a new frame, so `df` itself is left untouched.
df1 = df.dropna()

income = df1['MedanHHIncome']
# regex=False is required: on pandas < 2.0 the default was regex=True, where '$'
# is the end-of-string anchor, so the dollar sign was never removed and the
# float conversion below failed.
for symbol in ('$', 'k', '>'):
    income = income.str.replace(symbol, '', regex=False)
df1['MedanHHIncome'] = income

# Discard rows whose value contains '%'. `~` is the boolean negation of a mask;
# .copy() makes the filtered frame independent before it is modified.
has_percent = df1['MedanHHIncome'].str.contains('%', regex=False)
df1 = df1[~has_percent].copy()

df1['MedanHHIncome'] = df1['MedanHHIncome'].astype(float)
# The 'k' suffix was stripped without rescaling, hence the '(000)' in the name.
df1.columns = ['Zip5', 'MedanHHIncome(000)']

### 3. Scrape Marital Status Info
sample URL: https://statisticalatlas.com/zip/30097/Marital-Status

In [None]:
# Scrape eight values per ZIP code from the Marital-Status page.
# Rows are collected in a plain list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and re-allocating the frame on
# every iteration is quadratic in the number of ZIP codes.
cols = ['Zip5', 'Never_Married_F', 'Never_Married_M', 'Married_F', 'Married_M', 
        'Separated/Divorced_F', 'Separated/Divorced_M', 'Widowed_F', 'Widowed_M']
n_fields = len(cols) - 1  # scraped values expected per ZIP code (8)
marital_rows = []

for zipcode in zip5.Zip5:
    url = "https://statisticalatlas.com/zip/{}/Marital-Status".format(zipcode)
    # A timeout keeps a single unresponsive request from hanging the whole loop.
    with requests.get(url, timeout=30) as r:
        soup = BeautifulSoup(r.text, 'lxml')
    # First eight <text fill-opacity="0.500"> elements, in page order.
    table = soup.find_all('text', {"fill-opacity":"0.500"})[:n_fields]
    values = [row.text for row in table]
    # Pad short results with None so every row has the same width and
    # incomplete ZIP codes can be removed later with dropna().
    values.extend([None] * (n_fields - len(values)))
    marital_rows.append([zipcode] + values)

# dtype=object leaves every column untyped, including the Zip5 key.
# Passing `columns` here also works when no ZIP code produced a value.
dfm = pd.DataFrame(marital_rows, columns=cols, dtype=object)
dfm.head()

- Drop rows where zip code is not found on the website
- Change format: 11.9k to 11900 
- Change column data type to integer
- Generate married % and male/female variables

In [None]:
def _parse_count(text):
    """Convert a scraped count such as '11.9k' or '1,234' to an int."""
    text = text.replace(',', '')
    if text.endswith('k'):
        # round() absorbs float noise, e.g. 11.9 * 1000 == 11900.000000000002
        return round(float(text[:-1]) * 1000)
    return int(text)


# Keep only ZIP codes with a full set of scraped values; dropna() returns a new
# frame, so `dfm` itself is left untouched.
dfm1 = dfm.dropna()

# Every column after Zip5 holds a scraped count.
for col in dfm1.columns[1:]:
    dfm1[col] = dfm1[col].map(_parse_count).astype(int)

male_cols = ['Never_Married_M', 'Married_M', 'Separated/Divorced_M', 'Widowed_M']
female_cols = ['Never_Married_F', 'Married_F', 'Separated/Divorced_F', 'Widowed_F']

dfm1['male'] = dfm1[male_cols].sum(axis=1)
dfm1['female'] = dfm1[female_cols].sum(axis=1)
dfm1['population'] = dfm1[['male','female']].sum(axis=1)
dfm1['married'] = dfm1[['Married_F', 'Married_M']].sum(axis=1)

# A married share of exactly 0 is stored as missing. The result is assigned back
# explicitly: an inplace replace() on a selected column does not reliably modify
# the parent frame (and never does under pandas Copy-on-Write).
married_pct = dfm1['married']/dfm1['population']*100
dfm1['married %'] = married_pct.replace(0, np.nan)

# A zero female count makes the ratio undefined; use NaN rather than inf so the
# value is treated as missing in describe() and in the exported CSV.
dfm1['male/female'] = dfm1['male']/dfm1['female'].replace(0, np.nan)

### 4. Merge and Export Income and Marriage data

In [None]:
# Outer-join income and marital-status data on the ZIP code (their only shared
# column), keeping ZIP codes that appear in just one of the two tables.
marital_subset = dfm1[['Zip5', 'married %', 'male/female']]
dfmg = pd.merge(df1, marital_subset, how='outer', on='Zip5')
dfmg.shape

In [None]:
# Summary statistics of the merged frame's numeric columns.
dfmg.describe()

In [None]:
# Export the merged table; index=False leaves the row index out of the file.
dfmg.to_csv('income_marriage.csv', index=False)

### 5. Deposit

In [None]:
# Load the deposit workbook (read_excel reads the first sheet by default).
deposit = pd.read_excel('FDIC Deposit.xlsx')

In [None]:
# Print (rows, columns), then display the first rows as the cell output.
print(deposit.shape)
deposit.head()

In [None]:
# Load the population workbook (read_excel reads the first sheet by default).
pop = pd.read_excel('Census Population.xlsx')

In [None]:
# Print (rows, columns), then display the first rows as the cell output.
print(pop.shape)
pop.head()

Extract county and state from Geography.

In [None]:
# Full state name -> two-letter code. Only these four states are mapped; any
# other state name ends up as NaN in 'State'.
state_codes = {'Texas':'TX',
               'Georgia':'GA',
               'North Carolina':'NC',
               'Florida':'FL'}

# Vectorised .str operations propagate a missing or comma-less Geography value
# as NaN, where a row-wise lambda would raise on it.
geography = pop['Geography']
# State: the part after ', '. County: the part before ' County'.
pop['State'] = geography.str.split(', ').str[1].map(state_codes)
pop['County'] = geography.str.split(' County').str[0]

# Positional rename: assumes the workbook supplies exactly two columns
# (Geography, then the population estimate) ahead of the two added above.
pop.columns = ['Geography', 'Population Est 2018', 'State', 'County']
pop.head()

In [None]:
# Attach the population estimate to each deposit row by (State, County).
# merge defaults to an inner join, so rows without a match on both sides are dropped.
pop_by_county = pop[['State', 'County', 'Population Est 2018']]
mg = pd.merge(deposit, pop_by_county, on=['State','County'])
print(mg.shape)
print(mg.count())
mg.head()

Calculate deposits per capita: deposits (in 000s) divided by the 2018 population estimate.

In [None]:
# Per-capita deposits: deposit amount divided by the population estimate, row by row.
deposits_000s = mg['Deposit (000s)']
population_2018 = mg['Population Est 2018']
mg['Deposit (000s) Per Capita'] = deposits_000s / population_2018
mg.head()

In [None]:
# Summary statistics of the merged frame's numeric columns.
mg.describe()

In [None]:
# Export the per-capita table; index=False leaves the row index out of the file.
mg.to_csv('per_capita_deposit.csv', index=False)