In [25]:
# -------------- Database Modeling, initial
# Goal: Exploratory Data Analysis

In [26]:
# Dependencies and Setup
import pandas as pd
import numpy as np 

In [27]:
# Load the five raw CSV files, one DataFrame per source.
# Every path is relative to the notebook's working directory.

# County statistics: 2016/2020 vote figures, COVID totals and population breakdown.
# NOTE(review): the dtypes output for this frame lists an "Unnamed: 0" column,
# presumably a row index written out with the CSV -- confirm before keeping it.
county_statistics_to_load = "Data/county_statistics.csv"
county_statistics_df = pd.read_csv(county_statistics_to_load)

# COVID-19 cases and deaths by date and county.
covid19_us_county_to_load = "Data/covid19_us_county.csv"
covid19_us_county_df = pd.read_csv(covid19_us_county_to_load)

# County demographics: population shares and age-band columns.
us_county_demographics_to_load = "Data/us_county_demographics.csv"
us_county_demographics_df = pd.read_csv(us_county_demographics_to_load)

# County population estimate plus centre point / geometry columns.
us_county_pop_and_shps_to_load = "Data/us_county_pop_and_shps.csv"
us_county_pop_and_shps_df = pd.read_csv(us_county_pop_and_shps_to_load)

# 2010 U.S. Religion Census county file.
# NOTE(review): its preview also starts with an "Unnamed: 0" column -- same caveat as above.
religion_census_to_load = "Data/U.S. Religion Census Religious Congregations and Membership Study, 2010 (County File).csv"
religion_census_df = pd.read_csv(religion_census_to_load)

In [28]:
# Reviewing individual datasets: uncomment one line at a time to preview that dataset's first rows

## county_statistics_df.head()
## covid19_us_county_df.head()
## us_county_demographics_df.head()
## us_county_pop_and_shps_df.head()
## religion_census_df.head()

In [29]:
# Review each dataset's columns and their dtypes, one dataset per cell.
# NOTE(review): .dtypes lists column types only; it does not report null counts.
# Use .info() or .isna().sum() if null values need to be reviewed as well.
county_statistics_df.dtypes

# Variables to keep (tentative):
# all columns, except those that come after the income column

Unnamed: 0                        int64
county                           object
state                            object
percentage16_Donald_Trump       float64
percentage16_Hillary_Clinton    float64
total_votes16                   float64
votes16_Donald_Trump            float64
votes16_Hillary_Clinton         float64
percentage20_Donald_Trump       float64
percentage20_Joe_Biden          float64
total_votes20                   float64
votes20_Donald_Trump            float64
votes20_Joe_Biden               float64
lat                             float64
long                            float64
cases                           float64
deaths                          float64
TotalPop                        float64
Men                             float64
Women                           float64
Hispanic                        float64
White                           float64
Black                           float64
Native                          float64
Asian                           float64


In [30]:
# Column names and dtypes of the COVID-19 county dataset.
covid19_us_county_df.dtypes

# Variables to keep (tentative):
# date, county, state, cases, deaths, new_day_cases, new_day_deaths

date                               object
county                             object
state                              object
fips                                int64
state_fips                          int64
county_fips                         int64
cases                               int64
deaths                              int64
new_day_cases                       int64
new_day_deaths                      int64
cases_per_capita_100k             float64
deaths_per_capita_100k            float64
new_day_cases_per_capita_100k     float64
new_day_deaths_per_capita_100k    float64
county_pop_2019_est                 int64
pop_per_sq_mile_2010              float64
dtype: object

In [31]:
# Column names and dtypes of the county demographics dataset.
us_county_demographics_df.dtypes

# Variables to keep (tentative):
# all columns except state_fips and county_fips

state_fips          int64
county_fips         int64
state              object
county             object
TOT_POP             int64
MALE_PERC         float64
FEMALE_PERC       float64
WHITE_POP_PERC    float64
BLACK_POP_PERC    float64
ASIAN_POP_PERC    float64
HISP_POP_PERC     float64
AGE_OTO4            int64
AGE_5TO14           int64
AGE_15TO24          int64
AGE_25TO34          int64
AGE_35TO44          int64
AGE_45TO54          int64
AGE_55TO64          int64
AGE_65TO74          int64
AGE_75TO84          int64
AGE_84PLUS          int64
dtype: object

In [32]:
# Column names and dtypes of the county population / shapes dataset.
us_county_pop_and_shps_df.dtypes

# Variables to keep (tentative):
# none -- tentatively dropping this dataset

state                   object
county                  object
fips                     int64
county_pop_2019_est    float64
county_center_lat      float64
county_center_lon      float64
center_point            object
county_geom             object
dtype: object

In [33]:
# Preview the first rows of the religion census dataset.
religion_census_df.head()

# Variables to keep (tentative):
# none -- tentatively dropping this dataset: without a key (codebook) for the
# abbreviated column names (e.g. TOTCNG, EVANADH) we cannot interpret or use the columns

Unnamed: 0,TOTCNG,TOTADH,TOTRATE,EVANCNG,EVANADH,EVANRATE,BPRTCNG,BPRTADH,BPRTRATE,MPRTCNG,...,ZOROCNG,ZOROADH,ZORORATE,FIPS,STCODE,STABBR,STNAME,CNTYCODE,CNTYNAME,POP2010
0,106.0,36938.0,676.878889,79.0,27503.0,503.99,13.0,2291.0,41.978889,12.0,...,0.0,3.0,0.05,1001,1,AL,Alabama,1,Autauga County,54571.0
1,271.0,96918.0,531.74,178.0,57986.0,318.138889,17.0,3130.0,17.17,48.0,...,,,,1003,1,AL,Alabama,3,Baldwin County,182265.0
2,89.0,15101.0,549.99,51.0,8793.0,320.25,21.0,3328.0,121.208889,14.0,...,,,,1005,1,AL,Alabama,5,Barbour County,27457.0
3,81.0,11430.0,498.8,63.0,10159.0,443.328889,8.0,966.0,42.158889,9.0,...,,,,1007,1,AL,Alabama,7,Bibb County,22915.0
4,156.0,37352.0,651.62,122.0,29223.0,509.8,2.0,58.0,1.01,29.0,...,,,,1009,1,AL,Alabama,9,Blount County,57322.0


In [None]:
# --------------- Tentative Next Step
# 1. Transformation 1 - Go through the county-name columns in the datasets and remove the extra word "County" (e.g. "Autauga County" -> "Autauga")

# 2. Sources - I need to know where the initial datasets are from to be able to speak to their credibility in our paper

# 3. Additional Sources
# Campaign Donations in 2016 or 2020 by zip code: https://www.opensecrets.org/elections-overview/totals-by-zip-code 
# Demographic Data: https://usa.ipums.org/usa-action/variables/group?id=demog
# Very Detailed CDC database, at request of CDC only: https://docs.google.com/forms/d/e/1FAIpQLSeM9Ih3L7qqI0i5qiaXA7T9iNlcwQOMP1zkNCCTmw5fi-nmMA/viewform

# 4. Transformation 2 - Set up a SQLAlchemy database connection, load the tables, and join them accordingly


# 5. Decide - Are we studying California or the US?
# Decision: the US

# 6. Transformation 3 - Concatenate county and state, lower-cased, into a single column. Then use that column as the unique identifier for joins.