In [118]:
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from time import sleep
import json
import os
import pandas as pd
from datetime import date

# SQLAlchemy
import sqlalchemy

In [2]:
# Selenium modules
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chrome_path = 'C:\WebDrivers\chromedriver.exe'

In [5]:
# Create an engine that can talk to the database
password = ''
engine = sqlalchemy.create_engine(f'postgresql://postgres:{password}@localhost/etl_team_06_db')

In [6]:
# Connect to database (Note: The package psychopg2 is required for Postgres to work with SQLAlchemy)
connection = engine.connect()

In [7]:
# Print the names of tables in my homework PostgreSQL database: etl_team_06_db.
print(engine.table_names())

[]


## Web Scraping for COVID Data: USAFacts.org

In [8]:
#USAFacts site
url = 'https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/'
state_covid_page = requests.get(url)

In [9]:
state_covid_page.status_code

200

In [10]:
state_covid_soup = BeautifulSoup(state_covid_page.content, 'lxml')
#print(state_covid_soup.prettify())

In [11]:
title = state_covid_soup.find('title')
print(title.text)

Coronavirus Locations: COVID-19 Map by County and State | USAFacts


In [12]:
# We want to get data from the table for each state.
all_tables = state_covid_soup.find_all('table')
# all_tables
len(all_tables)

1

In [14]:
# Extract the table.
the_table = all_tables[0]
# type(the_table)
#the_table

In [15]:
p = state_covid_soup.find('p', class_='jss26')
string_list= p.text.split()
month = string_list[2]
day = string_list[3].strip()
year = string_list[-1]
the_date = f'{year}-{month}-{day}'.split(',')
date = the_date[0]
date

'2020-Sep-01'

In [16]:
# Lets get the name of each state.
#states = [a.string for a in the_table.find_all('a')]
#len(states)

In [17]:
# Extract the row of data
date_list = []
state_list = []
confirmed_state_list = []
deaths_state_list = []

all_trs = [tr for tr in the_table.find_all('tr')]

for tr in all_trs[1:]:
    # Name of state
    state = tr.a.text
    state_list.append(state)
    
    # COVID cases and deaths
    all_tds = [td for td in tr.find_all('td')]
    confirmed = all_tds[0].text
    deaths = all_tds[1].text
    confirmed_state_list.append(confirmed)
    deaths_state_list.append(deaths)
    
    date_list.append(date)

In [18]:
# Create a dictionary with the information.
covid_state_dict = {}
covid_state_dict["state"] = state_list
covid_state_dict["confirmed"] = confirmed_state_list
covid_state_dict["deaths"] = deaths_state_list
covid_state_dict["date"] = date_list

## COVID: State Level

In [50]:
# Create the DataFrame.
covid_state_df = pd.DataFrame(data = covid_state_dict)
covid_state_df.head()

Unnamed: 0,state,confirmed,deaths,date
0,Alabama,87723,1580,2020-Sep-01
1,Alaska,2879,23,2020-Sep-01
2,Arizona,174010,3694,2020-Sep-01
3,Arkansas,42511,453,2020-Sep-01
4,California,500421,9224,2020-Sep-01


## Now scrap the url for each county in the US

In [128]:
# These are the url's for each state to link to the HTML for the county level data.
base_url = 'https://usafacts.org'
all_state_url = [base_url+a['href'] for a in the_table.find_all('a')]
#all_state_url

In [137]:
#all_state_url[0].split('/')[-1].capitalize()

'Alabama'

In [23]:
# Initialize the lists for the Pandas DateFrame.
date_county_list = []
county_list = []
the_state_list =[]
confirmed_county_list = []
deaths_county_list = []
confirmed_per_100k_list = []

# Loop through every county of each state.
for state_url in all_state_url:
    state_county_page = requests.get(state_url)
    state_county_soup = BeautifulSoup(state_county_page.content, 'lxml')
    
    # The date that data was extracted.
    string_county_list= state_county_soup.find('p').text.split()
    month_county = string_county_list[2]
    day_county = string_county_list[3].strip()
    year_county = string_county_list[-1]
    the_date_county = f'{year}-{month}-{day}'.split(',')
    date_county = the_date[0]
    
    # Parse the state_county_soup for table information.
    all_county_tables = state_county_soup.find_all('table')
    #len(all_county_tables)

    # Extract the table.
    the_county_table = all_county_tables[0]
    # type(the_table)

    all_county_trs = [tr for tr in the_county_table.tbody.find_all('tr')]

    # Extract the state name from the url.
    state_of_county = all_county_trs[0].a['href'].split('/')[-3]
    
    for county_tr in all_county_trs:
        # Name of each county.
        county = county_tr.a.text
        county_list.append(county)
        
        # County COVID cases, deaths and Confirmed Per 100k
        all_county_tds = [td for td in county_tr.find_all('td')]
        county_confirmed = all_county_tds[0].text
        county_deaths = all_county_tds[1].text
        county_confirmed_per_100k = all_county_tds[2].text
        confirmed_county_list.append(county_confirmed)
        deaths_county_list.append(county_deaths)
        confirmed_per_100k_list.append(county_confirmed_per_100k)

        the_state_list.append(state_of_county.capitalize())
        date_county_list.append(date_county)
    
    # Create a dictionary with the information.
    covid_county_dict = {}
    covid_county_dict["county"] = county_list
    covid_county_dict["state"] = the_state_list
    covid_county_dict["confirmed"] = confirmed_county_list
    covid_county_dict["deaths"] = deaths_county_list
    covid_county_dict["confirmed_per_100k"] = confirmed_per_100k_list
    covid_county_dict["date"] = date_county_list
    
    #break

## COVID: County Level

In [24]:
# Create the DataFrame.
covid_county_df = pd.DataFrame(data = covid_county_dict)
covid_county_df.head()

Unnamed: 0,county,state,confirmed,deaths,confirmed_per_100k,date
0,Autauga County,Alabama,1015,21,1816.7,2020-Sep-01
1,Baldwin County,Alabama,3101,22,1389.1,2020-Sep-01
2,Barbour County,Alabama,598,5,2422.4,2020-Sep-01
3,Bibb County,Alabama,363,2,1621.0,2020-Sep-01
4,Blount County,Alabama,767,3,1326.4,2020-Sep-01


In [25]:
covid_county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   county              3142 non-null   object
 1   state               3142 non-null   object
 2   confirmed           3142 non-null   object
 3   deaths              3142 non-null   object
 4   confirmed_per_100k  3142 non-null   object
 5   date                3142 non-null   object
dtypes: object(6)
memory usage: 147.4+ KB


## COVID: Race/Ethnicity Demographics
### The COVID Tracking Project:  The COVID Racial Data Tracker.

In [26]:
# Load the demographic data Race_Data _Entry _CRDT_utf_8 .csv file.
the_filepath = os.path.join(".", "Resources", "Race_Data _Entry _CRDT_utf_8 .csv")

# Read file and store into Pandas DataFrames
covid_race_df = pd.read_csv(the_filepath, encoding = 'utf-8')

# Visualize
covid_race_df.head()

Unnamed: 0,date,state,cases_total,cases_white,cases_black,cases_latinX,cases_asian,cases_AIAN,cases_NHPI,cases_multiracial,...,deaths_latinX,deaths_asian,deaths_AIAN,deaths_NHPI,deaths_multiracial,deaths_other,deaths_unknown,deaths_ethnicity_hispanic,deaths_ethnicity_non_hispanic,deaths_ethnicity_unknown
0,20200729,AK,2797.0,852.0,81.0,,83.0,366.0,103.0,68.0,...,,2.0,8.0,1.0,0.0,0.0,0.0,0.0,22.0,0.0
1,20200729,AL,83782.0,25915.0,23239.0,,309.0,,,,...,,4.0,,,,29.0,103.0,48.0,1267.0,223.0
2,20200729,AR,40968.0,20527.0,8673.0,,587.0,120.0,2103.0,,...,,6.0,2.0,31.0,,30.0,0.0,43.0,394.0,0.0
3,20200729,AS,,,,,,,,,...,,,,,,,,,,
4,20200729,AZ,168273.0,29227.0,3822.0,38887.0,1309.0,9631.0,,,...,946.0,42.0,430.0,,,57.0,503.0,946.0,2005.0,503.0


In [66]:
# Cases by race/ethnicity
cases_ethnicity_df = covid_race_df.filter(['date', 'state', 'cases_total', 'cases_white', 'cases_black',
       'cases_latinX', 'cases_asian', 'cases_AIAN', 'cases_NHPI',
       'cases_multiracial', 'cases_other', 'cases_unknown',
       'cases_ethnicity_hispanic', 'cases_ethnicity_nonHispanic',
       'cases_ethnicity_unknown'])
cases_ethnicity_df.head()

Unnamed: 0,date,state,cases_total,cases_white,cases_black,cases_latinX,cases_asian,cases_AIAN,cases_NHPI,cases_multiracial,cases_other,cases_unknown,cases_ethnicity_hispanic,cases_ethnicity_nonHispanic,cases_ethnicity_unknown
0,20200729,AK,2797.0,852.0,81.0,,83.0,366.0,103.0,68.0,48.0,1196.0,150.0,1215.0,1432.0
1,20200729,AL,83782.0,25915.0,23239.0,,309.0,,,,4527.0,29789.0,5832.0,40720.0,37228.0
2,20200729,AR,40968.0,20527.0,8673.0,,587.0,120.0,2103.0,,5456.0,3502.0,9493.0,31475.0,0.0
3,20200729,AS,,,,,,,,,,,,,
4,20200729,AZ,168273.0,29227.0,3822.0,38887.0,1309.0,9631.0,,,3978.0,81419.0,38887.0,47967.0,81419.0


In [28]:
# Deaths by race/ethnicity

deaths_ethnicity_df = covid_race_df.filter(['date', 'state', 'deaths_total', 'deaths_white',
       'deaths_black', 'deaths_latinX', 'deaths_asian', 'deaths_AIAN',
       'deaths_NHPI', 'deaths_multiracial', 'deaths_other', 'deaths_unknown',
       'deaths_ethnicity_hispanic', 'deaths_ethnicity_non_hispanic',
       'deaths_ethnicity_unknown'])
deaths_ethnicity_df

Unnamed: 0,date,state,deaths_total,deaths_white,deaths_black,deaths_latinX,deaths_asian,deaths_AIAN,deaths_NHPI,deaths_multiracial,deaths_other,deaths_unknown,deaths_ethnicity_hispanic,deaths_ethnicity_non_hispanic,deaths_ethnicity_unknown
0,20200729,AK,22.0,11.0,0.0,,2.0,8.0,1.0,0.0,0.0,0.0,0.0,22.0,0.0
1,20200729,AL,1538.0,779.0,623.0,,4.0,,,,29.0,103.0,48.0,1267.0,223.0
2,20200729,AR,434.0,255.0,113.0,,6.0,2.0,31.0,,30.0,0.0,43.0,394.0,0.0
3,20200729,AS,,,,,,,,,,,,,
4,20200729,AZ,3454.0,1372.0,104.0,946.0,42.0,430.0,,,57.0,503.0,946.0,2005.0,503.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,20200412,VT,,,,,,,,,,,,,
1788,20200412,WA,508.0,290.0,9.0,24.0,36.0,5.0,1.0,3.0,7.0,133.0,24.0,351.0,133.0
1789,20200412,WI,144.0,77.0,61.0,,4.0,1.0,,0.0,0.0,1.0,4.0,138.0,2.0
1790,20200412,WV,,,,,,,,,,,,,


## Using World Population Review API: JSON

In [29]:
# Set up the URL will be using to retrieve the infromation as JSON.
query_url = "https://worldpopulationreview.com/static/states/abbr-name-list.json"

# Get data from API: World Population Review
response = requests.get(query_url)
response_json = response.json()

In [30]:
len(response_json)

51

In [31]:
# Initialize the lists
state_name_list = []
abbr_list = []
abbr_dict = {}

for result in response_json:
    state = result['name']
    abbr = result['abbreviation']
    
    # Append the lists.
    state_name_list.append(state)
    abbr_list.append(abbr)
    
# Create a dictionary with the information.    
abbr_dict["state"] = state_name_list
abbr_dict["abbr"] = abbr_list

# Create the DataFrame.
abbreviation_df = pd.DataFrame(data = abbr_dict)
abbreviation_df.head()

Unnamed: 0,state,abbr
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


## United States Census Bureau

In [32]:
# Load the Census_Data_utf.csv file that was create Census Bureau.
filepath = os.path.join(".", "Resources", "Census_Data_utf.csv")

# Read file and store into Pandas DataFrames
census_data_df = pd.read_csv(filepath, encoding = 'utf-8')

# Visualize
census_data_df.head()

Unnamed: 0,FIPS,Geog_Level,State_Code,State_name,County_code,County_name,Lat_State,Lon_State,LAND_AREA,Med_HHD_Inc_ACS_14_18($),...,pct_Pov_Univ_ACS_14_18,pct_Prs_Blw_Pov_Lev_ACS_14_18,pct_One_Health_Ins_ACS_14_18,pct_No_Health_Ins_ACS_14_18,pct_NoHealthIns_65P_ACS_14_18,pct_Pop_NoCompDevic_ACS_14_18,pct_Pop_w_BroadComp_ACS_14_18,pct_HHD_NoCompDevic_ACS_14_18,pct_HHD_No_Internet_ACS_14_18,pct_HHD_w_Broadband_ACS_14_18
0,1000,State,1,Alabama,0,All counties,32.7794,-86.8287,50645.319,48486.0,...,97.48,17.49,69.93,9.82,0.41,11.71,77.81,16.61,22.4,56.41
1,1001,County,1,Alabama,1,Autauga County,,,594.437,58786.0,...,99.21,15.38,72.36,7.02,0.0,8.18,84.0,13.01,19.03,61.97
2,1003,County,1,Alabama,3,Baldwin County,,,1589.786,55962.0,...,98.47,10.57,69.54,10.03,0.41,7.21,83.03,11.43,16.83,57.24
3,1005,County,1,Alabama,5,Barbour County,,,884.875,34186.0,...,88.65,28.86,58.07,9.92,0.29,18.02,64.92,23.95,34.11,38.84
4,1007,County,1,Alabama,7,Bibb County,,,622.581,45340.0,...,90.86,13.99,65.9,7.19,0.0,17.2,72.83,23.73,29.24,33.42


In [33]:
# Drop some columns.
census_data_df.drop(["State_Code", "County_code", 'Lat_State', 'Lon_State'], axis = 1, inplace = True)
census_data_df.columns = ['fips', 'level', 'state', 'county', 'land_area',
       'med_HHD_Inc_ACS_14_18($)', 'aggregate_HH_INC_ACS_14_18($)',
       'tot_Pop_CEN_2010', 'tot_Pop_ACS_14_18', 'pop_65plus_ACS_14_18',
       'nh_Blk_alone_ACS_14_18', 'pov_Univ_ACS_14_18',
       'prs_Blw_Pov_Lev_ACS_14_18', 'one_Health_Ins_ACS_14_18',
       'no_Health_Ins_ACS_14_18', 'pct_Pov_Univ_ACS_14_18',
       'pct_Prs_Blw_Pov_Lev_ACS_14_18', 'pct_One_Health_Ins_ACS_14_18',
       'pct_No_Health_Ins_ACS_14_18', 'pct_NoHealthIns_65P_ACS_14_18',
       'pct_Pop_NoCompDevic_ACS_14_18', 'pct_Pop_w_BroadComp_ACS_14_18',
       'pct_HHD_NoCompDevic_ACS_14_18', 'pct_HHD_No_Internet_ACS_14_18',
       'pct_HHD_w_Broadband_ACS_14_18']
census_data_df.head()

Unnamed: 0,fips,level,state,county,land_area,med_HHD_Inc_ACS_14_18($),aggregate_HH_INC_ACS_14_18($),tot_Pop_CEN_2010,tot_Pop_ACS_14_18,pop_65plus_ACS_14_18,...,pct_Pov_Univ_ACS_14_18,pct_Prs_Blw_Pov_Lev_ACS_14_18,pct_One_Health_Ins_ACS_14_18,pct_No_Health_Ins_ACS_14_18,pct_NoHealthIns_65P_ACS_14_18,pct_Pop_NoCompDevic_ACS_14_18,pct_Pop_w_BroadComp_ACS_14_18,pct_HHD_NoCompDevic_ACS_14_18,pct_HHD_No_Internet_ACS_14_18,pct_HHD_w_Broadband_ACS_14_18
0,1000,State,Alabama,All counties,50645.319,48486.0,125091000000.0,4779736.0,4864680.0,783832.0,...,97.48,17.49,69.93,9.82,0.41,11.71,77.81,16.61,22.4,56.41
1,1001,County,Alabama,Autauga County,594.437,58786.0,1594492000.0,54571.0,55200.0,8050.0,...,99.21,15.38,72.36,7.02,0.0,8.18,84.0,13.01,19.03,61.97
2,1003,County,Alabama,Baldwin County,1589.786,55962.0,6070565000.0,182265.0,208107.0,40665.0,...,98.47,10.57,69.54,10.03,0.41,7.21,83.03,11.43,16.83,57.24
3,1005,County,Alabama,Barbour County,884.875,34186.0,440096500.0,27457.0,25782.0,4634.0,...,88.65,28.86,58.07,9.92,0.29,18.02,64.92,23.95,34.11,38.84
4,1007,County,Alabama,Bibb County,622.581,45340.0,400338900.0,22915.0,22527.0,3661.0,...,90.86,13.99,65.9,7.19,0.0,17.2,72.83,23.73,29.24,33.42


In [53]:
fips_df = census_data_df.filter(['fips', 'state', 'county', 'level'])
fips_df.head()

Unnamed: 0,fips,state,county,level
0,1000,Alabama,All counties,State
1,1001,Alabama,Autauga County,County
2,1003,Alabama,Baldwin County,County
3,1005,Alabama,Barbour County,County
4,1007,Alabama,Bibb County,County


In [56]:
fips_state_df = fips_df[fips_df['level'] == 'State'].reset_index()
fips_state_df.drop(['level' , 'county', 'index'], axis = 1, inplace = True)
fips_state_df.head()

Unnamed: 0,fips,state
0,1000,Alabama
1,2000,Alaska
2,4000,Arizona
3,5000,Arkansas
4,6000,California


## FIPS State

In [89]:
# Load the Census_Data_utf.csv file that was create Census Bureau.
state_filepath = os.path.join(".", "Resources", "state_centroid_utf_8.csv")

# Read file and store into Pandas DataFrames
fips_State = pd.read_csv(state_filepath, encoding = 'utf-8')

# Visualize
fips_State.head()

Unnamed: 0,state,fips_state,latitude,longitude
0,Alabama,1.0,32.7794,-86.8287
1,Alaska,2.0,64.0685,-152.2782
2,Arizona,4.0,34.2744,-111.6602
3,Arkansas,5.0,34.8938,-92.4426
4,California,6.0,37.1841,-119.4696


## FIPS County

In [36]:
# Load the Census_Data_utf.csv file that was create Census Bureau.
county_filepath = os.path.join(".", "Resources", "county_centroid_utf_8.csv")

# Read file and store into Pandas DataFrames
fips_county_df = pd.read_csv(county_filepath, encoding = 'utf-8')

# Visualize
fips_county_df.head()


Unnamed: 0,fips_county,state,county,latitude,longitude
0,1001,AL,Autauga,32.536382,-86.6445
1,1003,AL,Baldwin,30.659218,-87.7461
2,1005,AL,Barbour,31.87067,-85.4055
3,1007,AL,Bibb,33.015893,-87.1271
4,1009,AL,Blount,33.977448,-86.5672


In [37]:
fips_county_df.columns = ['fips', 'state', 'county', 'latitude', 'longitude']
fips_county_df.head()

Unnamed: 0,fips,state,county,latitude,longitude
0,1001,AL,Autauga,32.536382,-86.6445
1,1003,AL,Baldwin,30.659218,-87.7461
2,1005,AL,Barbour,31.87067,-85.4055
3,1007,AL,Bibb,33.015893,-87.1271
4,1009,AL,Blount,33.977448,-86.5672


In [39]:
fips_county_df.columns

Index(['fips', 'state', 'county', 'latitude', 'longitude'], dtype='object')

## Build the Database

In [40]:

table_name = 'state_county'
fips_df.to_sql(table_name, connection)

In [41]:
# Let's see if the table was created successful?
print(engine.table_names())

['state_county']


In [42]:
# close the connection to the database.
connection.close()

## census_data_state

In [107]:
census_data_state_df = census_data_df.filter(['fips', 'level', 'land_area', 'med_HHD_Inc_ACS_14_18($)', 'pct_No_Health_Ins_ACS_14_18', 'pct_HHD_No_Internet_ACS_14_18'])
census_data_state_df.head()

Unnamed: 0,fips,level,land_area,med_HHD_Inc_ACS_14_18($),pct_No_Health_Ins_ACS_14_18,pct_HHD_No_Internet_ACS_14_18
0,1000,State,50645.319,48486.0,9.82,22.4
1,1001,County,594.437,58786.0,7.02,19.03
2,1003,County,1589.786,55962.0,10.03,16.83
3,1005,County,884.875,34186.0,9.92,34.11
4,1007,County,622.581,45340.0,7.19,29.24


In [108]:
census_data_state_df.columns = ['fips', 'level', 'land_area', 'median_household_income', 'pct_no_health_Insurance', 'pct_households_no_internet']

In [111]:
census_data_state = census_data_state_df[census_data_state_df['level'] == 'State'].reset_index()
census_data_state.drop(['level' , 'index'], axis = 1, inplace = True)
census_data_state.head()

Unnamed: 0,fips,land_area,median_household_income,pct_no_health_Insurance,pct_households_no_internet
0,1000,50645.319,48486.0,9.82,22.4
1,2000,570640.952,76715.0,13.94,12.24
2,4000,113594.092,56213.0,10.77,14.7
3,5000,52035.483,45726.0,8.88,23.38
4,6000,155779.247,71228.0,8.38,12.24


In [120]:
# Save the Pandas DataFrame for each table in PostgreSQL.
census_data_state_path = os.path.join(".", "output_data", "census_data_state.csv")
census_data_state.to_csv(census_data_state_path, encoding="utf-8", index=False)

## covid_state

In [65]:
covid_state = fips_state_df.copy()
covid_state['confirmed'] = covid_state_df['confirmed']
covid_state['deaths'] = covid_state_df['deaths']
covid_state['as_of_date'] = date.today()
covid_state.head()

Unnamed: 0,fips,state,confirmed,deaths,as_of_date
0,1000,Alabama,87723,1580,2020-08-01
1,2000,Alaska,2879,23,2020-08-01
2,4000,Arizona,174010,3694,2020-08-01
3,5000,Arkansas,42511,453,2020-08-01
4,6000,California,500421,9224,2020-08-01


In [121]:
# Save the Pandas DataFrame for each table in PostgreSQL.
covid_state_path = os.path.join(".", "output_data", "covid_state.csv")
covid_state.to_csv(covid_state_path, encoding="utf-8", index=False)

## ethnicity_covid

In [94]:
ethnicity_covid_df = covid_race_df.copy()
ethnicity_covid_df['fips'] = covid_state['fips']
ethnicity_covid = ethnicity_covid_df.filter(['fips', 'cases_white', 'deaths_white', 'cases_black', 'deaths_black', 'cases_latinX', 'deaths_latinX', 'cases_asian', 'deaths_asian', 'cases_total', 'deaths_total'])

ethnicity_covid.head()

Unnamed: 0,fips,cases_white,deaths_white,cases_black,deaths_black,cases_latinX,deaths_latinX,cases_asian,deaths_asian,cases_total,deaths_total
0,1000.0,852.0,11.0,81.0,0.0,,,83.0,2.0,2797.0,22.0
1,2000.0,25915.0,779.0,23239.0,623.0,,,309.0,4.0,83782.0,1538.0
2,4000.0,20527.0,255.0,8673.0,113.0,,,587.0,6.0,40968.0,434.0
3,5000.0,,,,,,,,,,
4,6000.0,29227.0,1372.0,3822.0,104.0,38887.0,946.0,1309.0,42.0,168273.0,3454.0


In [122]:
# Save the Pandas DataFrame for each table in PostgreSQL.
ethnicity_covid_path = os.path.join(".", "output_data", "ethnicity_covid.csv")
ethnicity_covid.to_csv(ethnicity_covid_path, encoding="utf-8", index=False)

## state

In [97]:
state_df = fips_State.copy()
state_df['fips'] = covid_state['fips']
state_df['abbr'] = abbreviation_df['abbr']
state = state_df.filter(['fips', 'state', 'abbr', 'latitude', 'longitude'])
state.head()

Unnamed: 0,fips,state,abbr,latitude,longitude
0,1000,Alabama,AL,32.7794,-86.8287
1,2000,Alaska,AK,64.0685,-152.2782
2,4000,Arizona,AZ,34.2744,-111.6602
3,5000,Arkansas,AR,34.8938,-92.4426
4,6000,California,CA,37.1841,-119.4696


In [123]:
# Save the Pandas DataFrame for each table in PostgreSQL.
state_path = os.path.join(".", "output_data", "state.csv")
state.to_csv(state_path, encoding="utf-8", index=False)

## state_county

In [102]:
state_county_df = fips_county_df.copy()
state_county_df.columns = ['fips', 'abbr', 'county', 'latitude', 'longitude']
state_county = state_county_df.copy()
state_county.head()

Unnamed: 0,fips,abbr,county,latitude,longitude
0,1001,AL,Autauga,32.536382,-86.6445
1,1003,AL,Baldwin,30.659218,-87.7461
2,1005,AL,Barbour,31.87067,-85.4055
3,1007,AL,Bibb,33.015893,-87.1271
4,1009,AL,Blount,33.977448,-86.5672


In [124]:
# Save the Pandas DataFrame for each table in PostgreSQL.
state_county_path = os.path.join(".", "output_data", "state_county.csv")
state_county.to_csv(state_county_path, encoding="utf-8", index=False)

## census_data_county

In [112]:
census_data_county = census_data_state_df[census_data_state_df['level'] == 'County'].reset_index()
census_data_county.drop(['level' , 'index'], axis = 1, inplace = True)
census_data_county.head()

Unnamed: 0,fips,land_area,median_household_income,pct_no_health_Insurance,pct_households_no_internet
0,1001,594.437,58786.0,7.02,19.03
1,1003,1589.786,55962.0,10.03,16.83
2,1005,884.875,34186.0,9.92,34.11
3,1007,622.581,45340.0,7.19,29.24
4,1009,644.775,48695.0,10.93,27.35


In [125]:
# Save the Pandas DataFrame for each table in PostgreSQL.
census_data_county_path = os.path.join(".", "output_data", "census_data_county.csv")
census_data_county.to_csv(census_data_county_path, encoding="utf-8", index=False)

## covid_county

In [145]:
covid_county = covid_county_df.filter(['county', 'state', 'confirmed', 'deaths'])
covid_county['date'] = date.today()
covid_county.head()

Unnamed: 0,county,state,confirmed,deaths,date
0,Autauga County,Alabama,1015,21,2020-08-01
1,Baldwin County,Alabama,3101,22,2020-08-01
2,Barbour County,Alabama,598,5,2020-08-01
3,Bibb County,Alabama,363,2,2020-08-01
4,Blount County,Alabama,767,3,2020-08-01


In [126]:
# Save the Pandas DataFrame for each table in PostgreSQL.
covid_county_path = os.path.join(".", "output_data", "covid_county.csv")
covid_county.to_csv(covid_county_path, encoding="utf-8", index=False)

In [158]:
the_covid_county = covid_county.copy()
counties = [county for county in the_covid_county['county']]
i = 0
for county_name in counties:
    covid_county.loc[i, 'countynew'] = county_name.split()[0]
    i = i + 1
    
the_df = the_covid_county.filter(['countynew', 'state', 'confirmed', 'deaths', 'date'])
the_df.columns = ['county', 'state', 'confirmed', 'deaths', 'date']
the_df.head()

Unnamed: 0,county,state,confirmed,deaths,date
0,Autauga,Alabama,1015,21,2020-08-01
1,Baldwin,Alabama,3101,22,2020-08-01
2,Barbour,Alabama,598,5,2020-08-01
3,Bibb,Alabama,363,2,2020-08-01
4,Blount,Alabama,767,3,2020-08-01


In [143]:
covid_county_update = pd.merge(state_county, state, on="abbr", how="outer")
county = covid_county_update.filter(['fips_y', 'county', 'state'])
county.columns = ['fips', 'county', 'state']
county.head()

Unnamed: 0,fips,county,state
0,1000,Autauga,Alabama
1,1000,Baldwin,Alabama
2,1000,Barbour,Alabama
3,1000,Bibb,Alabama
4,1000,Blount,Alabama
