In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
covid_df = pd.read_csv("data/all-states-history.csv")
covid_df.head()

Unnamed: 0,date,state,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,hospitalizedIncrease,...,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease
0,2021-02-20,AK,289.0,,0,,1243.0,1243.0,34.0,0,...,1629829.0,0,,,,,,0,1629829.0,0
1,2021-02-20,AL,9590.0,7525.0,17,2065.0,44767.0,44767.0,895.0,0,...,2253891.0,5436,,,114532.0,,2253891.0,5436,,0
2,2021-02-20,AR,5348.0,4298.0,12,1050.0,14526.0,14526.0,605.0,26,...,2600443.0,3060,,,,426611.0,,0,2600443.0,3060
3,2021-02-20,AS,0.0,,0,,,,,0,...,2140.0,0,,,,,,0,2140.0,0
4,2021-02-20,AZ,15480.0,13674.0,59,1806.0,56872.0,56872.0,1650.0,140,...,7396328.0,45153,432949.0,,,,3683111.0,14137,7396328.0,45153


In [3]:
# Keep relevant columns
covid_df = covid_df[["date","state","positiveIncrease","totalTestResultsIncrease"]]

# Check datatype and empty values
print(covid_df.dtypes)
print(covid_df.isna().sum())

# Check states
print(covid_df.groupby('state').nunique().count())
print(covid_df.groupby('state').nunique().index)

covid_df.head()

date                        object
state                       object
positiveIncrease             int64
totalTestResultsIncrease     int64
dtype: object
date                        0
state                       0
positiveIncrease            0
totalTestResultsIncrease    0
dtype: int64
date                        56
positiveIncrease            56
totalTestResultsIncrease    56
dtype: int64
Index(['AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'],
      dtype='object', name='state')


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease
0,2021-02-20,AK,0,0
1,2021-02-20,AL,774,5436
2,2021-02-20,AR,517,3060
3,2021-02-20,AS,0,0
4,2021-02-20,AZ,2047,45153


In [4]:
# Change datatype
covid_df['date'] = pd.to_datetime(covid_df['date'])
print(covid_df.dtypes)

# Keep only 50 states + DC
delete = covid_df[(covid_df['state']=="AS")|(covid_df['state']=="GU")|(covid_df['state']=="PR")|\
             (covid_df['state']=="VI")|(covid_df['state']=="MP")].index
covid_df = covid_df.drop(delete)
print(covid_df.groupby('state').nunique().count())

covid_df.head()

date                        datetime64[ns]
state                               object
positiveIncrease                     int64
totalTestResultsIncrease             int64
dtype: object
date                        51
positiveIncrease            51
totalTestResultsIncrease    51
dtype: int64


Unnamed: 0,date,state,positiveIncrease,totalTestResultsIncrease
0,2021-02-20,AK,0,0
1,2021-02-20,AL,774,5436
2,2021-02-20,AR,517,3060
4,2021-02-20,AZ,2047,45153
5,2021-02-20,CA,6668,192222


In [5]:
policy_df = pd.read_excel('data/COVID-19 US state policy database 2_17_2021.xlsx')
policy_df.head()

Unnamed: 0,STATE,POSTCODE,FIPS,STEMERG,CLSCHOOL,CLDAYCR,OPNCLDCR,CLNURSHM,STAYHOME,STAYHOMENOGP,...,MINWAGE2020,ALTMINWAGE2020,TIPPEDMINWAGE2020,SMALLBIZMINWAGE2020,PLANMINWAGE2021,PLANMINWAGE2022,PLANMINWAGE2023,PLANMINWAGE2024,PLANMINWAGE2025,PLANMINWAGE2026
0,State,State Abbreviation,FIPS Code,State of emergency,Date closed K-12 public schools,Closed day cares,Reopen day cares,Date banned visitors to nursing homes,Stay at home/ shelter in place,Stay at home order' issued but did not specifi...,...,2020 Minimum Wage,2020 Alternative Minimum Wage,2020 Minimum Wage for Tipped Workers,Different Minimum Wage for Smaller Businesses,[Planned] 2021 Minimum Wage,[Planned] 2022 Minimum Wage,[Planned] 2023 Minimum Wage,[Planned] 2024 Minimum Wage,[Planned] 2025 Minimum Wage,[Planned] 2026 Minimum Wage
1,category,,,state_of_emergency,physical_distance_closure,physical_distance_closure,Reopening,physical_distance_closure,shelter,shelter,...,minimum_wage,minimum_wage,minimum_wage,minimum_wage,minimum_wage,minimum_wage,minimum_wage,minimum_wage,minimum_wage,minimum_wage
2,type,note,note,start,start,start,end,start,start,start,...,quantity,quantity,quantity,attribute,quantity,quantity,quantity,quantity,quantity,quantity
3,unit,text,attribute,date,date,date,date,date,date,date,...,dollars,dollars,dollars,flag,dollars,dollars,dollars,dollars,dollars,dollars
4,Alabama,AL,1,2020-03-13 00:00:00,2020-03-20 00:00:00,2020-03-20 00:00:00,2020-05-23 00:00:00,2020-03-19 00:00:00,2020-04-04 00:00:00,0,...,.,.,2.13,0,.,.,.,.,.,.


In [6]:
# Keep relevant columns
policy_df = policy_df[['POSTCODE','STEMERG','STAYHOME','STAYHOMENOGP','END_STHM','CLBSNS','END_BSNS','FM_ALL', 'FM_ALL2',\
                       'FM_END','QR_ALLST','QR_END','POPDEN18','POP18']]

# Rename columns
rename_col = {'POSTCODE':'state',
              'STEMERG':'state_of_emergency',
              'STAYHOME':'stay_at_home', 
              'END_STHM':'stay_at_home_end',
              'CLBSNS':'business_closure',
              'END_BSNS':'business_closure_end', 
              'FM_ALL':'facemask_mandate',
              'FM_END':'facemask_mandate_end',
              'QR_ALLST':'quaratine_mandate',
              'QR_END':'quaratine_mandate_end',
              'POPDEN18':'pop_density',
              'POP18':'population'}
policy_df = policy_df.rename(columns=rename_col)

# Drop irrelevant rows
policy_df = policy_df.drop([0,1,2,3,55,56])
policy_df = policy_df.reset_index(drop=True)

# Check datatype, empty values, states
print(policy_df.dtypes)
print(policy_df.isna().sum())
print(covid_df.groupby('state').nunique().count())

policy_df.head()

state                    object
state_of_emergency       object
stay_at_home             object
STAYHOMENOGP             object
stay_at_home_end         object
business_closure         object
business_closure_end     object
facemask_mandate         object
FM_ALL2                  object
facemask_mandate_end     object
quaratine_mandate        object
quaratine_mandate_end    object
pop_density              object
population               object
dtype: object
state                    0
state_of_emergency       0
stay_at_home             0
STAYHOMENOGP             0
stay_at_home_end         0
business_closure         0
business_closure_end     0
facemask_mandate         0
FM_ALL2                  0
facemask_mandate_end     0
quaratine_mandate        0
quaratine_mandate_end    0
pop_density              0
population               0
dtype: int64
date                        51
positiveIncrease            51
totalTestResultsIncrease    51
dtype: int64


Unnamed: 0,state,state_of_emergency,stay_at_home,STAYHOMENOGP,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,FM_ALL2,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end,pop_density,population
0,AL,2020-03-13 00:00:00,2020-04-04 00:00:00,0,2020-04-30 00:00:00,2020-03-28 00:00:00,2020-04-30 00:00:00,2020-07-16 00:00:00,0,0,0,0,93.24,4887871
1,AK,2020-03-11 00:00:00,2020-03-28 00:00:00,0,2020-04-24 00:00:00,2020-03-24 00:00:00,2020-04-24 00:00:00,2020-04-24 00:00:00,0,2020-05-22 00:00:00,2020-03-25 00:00:00,2021-02-14 00:00:00,1.11,737438
2,AZ,2020-03-11 00:00:00,2020-03-31 00:00:00,0,2020-05-16 00:00:00,2020-03-31 00:00:00,2020-05-08 00:00:00,0,0,0,0,2020-05-12 00:00:00,62.91,7171646
3,AR,2020-03-11 00:00:00,0,0,0,2020-04-06 00:00:00,2020-05-04 00:00:00,2020-07-20 00:00:00,0,0,0,2020-06-15 00:00:00,56.67,3013825
4,CA,2020-03-04 00:00:00,2020-03-19 00:00:00,0,0,2020-03-19 00:00:00,2020-05-08 00:00:00,2020-06-18 00:00:00,0,0,0,0,241.65,39557045


In [7]:
# Change datatype
policy_df = policy_df.replace(0,np.nan)
dt_columns = ['state_of_emergency', 'stay_at_home', 'STAYHOMENOGP', 'stay_at_home_end',"business_closure",\
              'business_closure_end','facemask_mandate','FM_ALL2','facemask_mandate_end','quaratine_mandate',\
              'quaratine_mandate_end']
policy_df[dt_columns] = policy_df[dt_columns].apply(pd.to_datetime,args=False)
print(policy_df.dtypes)
print(policy_df.isna().sum())

policy_df.head()

state                            object
state_of_emergency       datetime64[ns]
stay_at_home             datetime64[ns]
STAYHOMENOGP             datetime64[ns]
stay_at_home_end         datetime64[ns]
business_closure         datetime64[ns]
business_closure_end     datetime64[ns]
facemask_mandate         datetime64[ns]
FM_ALL2                  datetime64[ns]
facemask_mandate_end     datetime64[ns]
quaratine_mandate        datetime64[ns]
quaratine_mandate_end    datetime64[ns]
pop_density                     float64
population                        int64
dtype: object
state                     0
state_of_emergency        0
stay_at_home             11
STAYHOMENOGP             46
stay_at_home_end         10
business_closure          1
business_closure_end      2
facemask_mandate         13
FM_ALL2                  49
facemask_mandate_end     47
quaratine_mandate        37
quaratine_mandate_end    36
pop_density               0
population                0
dtype: int64


Unnamed: 0,state,state_of_emergency,stay_at_home,STAYHOMENOGP,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,FM_ALL2,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end,pop_density,population
0,AL,2020-03-13,2020-04-04,NaT,2020-04-30,2020-03-28,2020-04-30,2020-07-16,NaT,NaT,NaT,NaT,93.24,4887871
1,AK,2020-03-11,2020-03-28,NaT,2020-04-24,2020-03-24,2020-04-24,2020-04-24,NaT,2020-05-22,2020-03-25,2021-02-14,1.11,737438
2,AZ,2020-03-11,2020-03-31,NaT,2020-05-16,2020-03-31,2020-05-08,NaT,NaT,NaT,NaT,2020-05-12,62.91,7171646
3,AR,2020-03-11,NaT,NaT,NaT,2020-04-06,2020-05-04,2020-07-20,NaT,NaT,NaT,2020-06-15,56.67,3013825
4,CA,2020-03-04,2020-03-19,NaT,NaT,2020-03-19,2020-05-08,2020-06-18,NaT,NaT,NaT,NaT,241.65,39557045


In [8]:
# Replace missing values with data from other relevant columns
policy_df['stay_at_home'] = np.where(np.isnat(policy_df['stay_at_home']),
                                     policy_df['STAYHOMENOGP'],policy_df['stay_at_home'])
policy_df['facemask_mandate'] = np.where(np.isnat(policy_df['facemask_mandate']),
                                     policy_df['FM_ALL2'],policy_df['facemask_mandate'])
policy_df = policy_df.drop(['STAYHOMENOGP', 'FM_ALL2'], axis=1)

print(policy_df.dtypes)
print(policy_df.isna().sum())

policy_df.head()


state                            object
state_of_emergency       datetime64[ns]
stay_at_home             datetime64[ns]
stay_at_home_end         datetime64[ns]
business_closure         datetime64[ns]
business_closure_end     datetime64[ns]
facemask_mandate         datetime64[ns]
facemask_mandate_end     datetime64[ns]
quaratine_mandate        datetime64[ns]
quaratine_mandate_end    datetime64[ns]
pop_density                     float64
population                        int64
dtype: object
state                     0
state_of_emergency        0
stay_at_home              7
stay_at_home_end         10
business_closure          1
business_closure_end      2
facemask_mandate         12
facemask_mandate_end     47
quaratine_mandate        37
quaratine_mandate_end    36
pop_density               0
population                0
dtype: int64


Unnamed: 0,state,state_of_emergency,stay_at_home,stay_at_home_end,business_closure,business_closure_end,facemask_mandate,facemask_mandate_end,quaratine_mandate,quaratine_mandate_end,pop_density,population
0,AL,2020-03-13,2020-04-04,2020-04-30,2020-03-28,2020-04-30,2020-07-16,NaT,NaT,NaT,93.24,4887871
1,AK,2020-03-11,2020-03-28,2020-04-24,2020-03-24,2020-04-24,2020-04-24,2020-05-22,2020-03-25,2021-02-14,1.11,737438
2,AZ,2020-03-11,2020-03-31,2020-05-16,2020-03-31,2020-05-08,NaT,NaT,NaT,2020-05-12,62.91,7171646
3,AR,2020-03-11,NaT,NaT,2020-04-06,2020-05-04,2020-07-20,NaT,NaT,2020-06-15,56.67,3013825
4,CA,2020-03-04,2020-03-19,NaT,2020-03-19,2020-05-08,2020-06-18,NaT,NaT,NaT,241.65,39557045
