# Confirmed Cases and Deaths Code
### Updated on 4/11/2022 to move redundant code to archived/casesAndDeaths.ipynb

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

## FIPS code

In [2]:
# Load the county FIPS lookup workbook and keep five of its columns in the
# order shown in the preview below: stfips, stname, ctyfips, ctyname, fips.
fips = pd.read_excel("~/Documents/ra/HPC/HPC_datahub/Raw data/fips.xlsx")
fips = fips.iloc[:, [1, 3, 2, 4, 5]]

# Cast column 4 (fips) and column 2 (ctyfips) to Python ints one row at a
# time, leaving rows whose fips is the 'unknown' placeholder untouched.
for row in range(len(fips)):
    if fips.iloc[row, 4] == 'unknown':
        continue
    fips.iloc[row, 4] = int(fips.iloc[row, 4])
    fips.iloc[row, 2] = int(fips.iloc[row, 2])

# Discard rows whose fips is 0 or 'unknown', renumber the remaining rows
# from zero, and keep at most the first 3142 of them.
fips = fips.drop(fips[fips.fips == 0].index)
fips = fips.drop(fips[fips.fips == "unknown"].index)
fips = fips.reset_index(drop=True).iloc[0:3142]

fips

Unnamed: 0,stfips,stname,ctyfips,ctyname,fips
0,1,Alabama,1,Autauga County,1001
1,1,Alabama,3,Baldwin County,1003
2,1,Alabama,5,Barbour County,1005
3,1,Alabama,7,Bibb County,1007
4,1,Alabama,9,Blount County,1009
...,...,...,...,...,...
3137,56,Wyoming,37,Sweetwater County,56037
3138,56,Wyoming,39,Teton County,56039
3139,56,Wyoming,41,Uinta County,56041
3140,56,Wyoming,43,Washakie County,56043


## Update data from CSSE

In [3]:
# Download the CSSE US confirmed-cases time series and drop the ten
# descriptive columns by position, leaving FIPS followed by the daily columns.
cases = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv")
cases = cases.drop(columns=cases.columns[[0, 1, 2, 3, 5, 6, 7, 8, 9, 10]])

# Build one rename map: FIPS -> fips, and every m/d/yy date header ->
# confirmed_cases_yyyymmdd.
case_labels = {"FIPS": "fips"}
for raw_label in cases.columns[1:]:
    case_labels[raw_label] = "confirmed_cases_" + datetime.strptime(raw_label, "%m/%d/%y").strftime('%Y%m%d')
cases = cases.rename(columns=case_labels)

# Left-join onto the FIPS table so every county in `fips` gets a row.
cases = fips.merge(cases, on="fips", how="left")

In [4]:
# Download the CSSE US deaths time series and drop the eleven descriptive
# columns by position, leaving FIPS followed by the daily columns.
deaths = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv")
deaths = deaths.drop(columns=deaths.columns[[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11]])

# Build one rename map: FIPS -> fips, and every m/d/yy date header ->
# deaths_yyyymmdd.
death_labels = {"FIPS": "fips"}
for raw_label in deaths.columns[1:]:
    death_labels[raw_label] = "deaths_" + datetime.strptime(raw_label, "%m/%d/%y").strftime('%Y%m%d')
deaths = deaths.rename(columns=death_labels)

# Left-join onto the FIPS table so every county in `fips` gets a row.
deaths = fips.merge(deaths, on="fips", how="left")

In [5]:
# Drop the first five columns of deaths (the identifier columns that came from
# the fips merge and are already present in cases), then place the two tables
# side by side.
# NOTE: deaths is overwritten with its own slice, so re-running this cell on
# its own strips five more columns each time.
deaths = deaths.iloc[:, 5:]
csse = pd.concat([cases, deaths], axis=1)

In [6]:
# 'fips' holds the combined state-county code, so label it 'scfips'.
csse = csse.rename(columns={'fips': 'scfips'})
csse

Unnamed: 0,stfips,stname,ctyfips,ctyname,scfips,confirmed_cases_20200122,confirmed_cases_20200123,confirmed_cases_20200124,confirmed_cases_20200125,confirmed_cases_20200126,...,deaths_20220401,deaths_20220402,deaths_20220403,deaths_20220404,deaths_20220405,deaths_20220406,deaths_20220407,deaths_20220408,deaths_20220409,deaths_20220410
0,1,Alabama,1,Autauga County,1001,0,0,0,0,0,...,211,211,211,213,213,213,213,213,213,213
1,1,Alabama,3,Baldwin County,1003,0,0,0,0,0,...,675,675,675,675,675,676,676,677,677,677
2,1,Alabama,5,Barbour County,1005,0,0,0,0,0,...,97,97,97,97,97,98,98,98,98,98
3,1,Alabama,7,Bibb County,1007,0,0,0,0,0,...,101,101,101,101,101,101,101,101,101,101
4,1,Alabama,9,Blount County,1009,0,0,0,0,0,...,236,237,237,237,237,237,239,239,239,239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56,Wyoming,37,Sweetwater County,56037,0,0,0,0,0,...,124,124,124,124,124,124,124,124,124,124
3138,56,Wyoming,39,Teton County,56039,0,0,0,0,0,...,16,16,16,16,16,16,16,16,16,16
3139,56,Wyoming,41,Uinta County,56041,0,0,0,0,0,...,39,39,39,39,39,39,39,39,39,39
3140,56,Wyoming,43,Washakie County,56043,0,0,0,0,0,...,43,43,43,43,43,43,43,43,43,43


## CDC county-level cumulative provisional deaths data

In [7]:
# Download the CDC county-level table as CSV straight from data.cdc.gov.
cdcDeaths = pd.read_csv('https://data.cdc.gov/api/views/kn79-hsxy/rows.csv?accessType=DOWNLOAD')

In [8]:
# Preview the raw CDC table before any renaming.
cdcDeaths

Unnamed: 0,Date as of,Start Date,End Date,State,County name,FIPS County Code,Urban Rural Code,Deaths involving COVID-19,Deaths from All Causes,Footnote
0,04/06/2022,01/01/2020,04/02/2022,AK,Aleutians East Borough,2013,Noncore,,16.0,One or more data cells have counts between 1-9...
1,04/06/2022,01/01/2020,04/02/2022,AK,Anchorage Municipality,2020,Medium metro,662.0,5686.0,
2,04/06/2022,01/01/2020,04/02/2022,AK,Bethel Census Area,2050,Noncore,32.0,263.0,
3,04/06/2022,01/01/2020,04/02/2022,AK,Denali Borough,2068,Noncore,,14.0,One or more data cells have counts between 1-9...
4,04/06/2022,01/01/2020,04/02/2022,AK,Dillingham Census Area,2070,Noncore,,75.0,One or more data cells have counts between 1-9...
...,...,...,...,...,...,...,...,...,...,...
3069,04/06/2022,01/01/2020,04/02/2022,WY,Sweetwater County,56037,Micropolitan,98.0,777.0,
3070,04/06/2022,01/01/2020,04/02/2022,WY,Teton County,56039,Micropolitan,18.0,208.0,
3071,04/06/2022,01/01/2020,04/02/2022,WY,Uinta County,56041,Micropolitan,20.0,328.0,
3072,04/06/2022,01/01/2020,04/02/2022,WY,Washakie County,56043,Noncore,27.0,184.0,


In [9]:
# Map the CDC column headers onto the short variable names used in this notebook.
cdc_column_names = {
    'State': 'stabbr',
    'County name': 'ctyname',
    'FIPS County Code': 'scfips',
    'Urban Rural Code': 'urbanrural',
    'Deaths involving COVID-19': 'death_covid',
    'Deaths from All Causes': 'death_all',
    'Start Date': 'death_startdate',
    'End Date': 'death_enddate',
}
cdcDeaths = cdcDeaths.rename(columns=cdc_column_names)

In [10]:
# Parse both period-boundary columns as dates and store them back as
# yyyymmdd strings.
for date_col in ('death_startdate', 'death_enddate'):
    parsed = pd.to_datetime(cdcDeaths[date_col])
    cdcDeaths[date_col] = parsed.dt.strftime('%Y%m%d')

In [11]:
# Extract the end date of the reporting period as a yyyymmdd string.
# Taking the first element of a set gives an arbitrary value whenever the
# column holds more than one distinct entry (or a missing one), so the latest
# non-missing end date is used instead. With a single distinct end date the
# result is unchanged.
endDate = cdcDeaths['death_enddate'].dropna().max()

# Add the end date to the names of the two death-count columns.
cdcDeaths.rename(columns = {'death_covid': 'death_covid_' + endDate, 'death_all': 'death_all_' + endDate},
                 inplace = True)

In [12]:
# Keep selected variables only.
# The two death-count column names are built from endDate rather than being
# hard-coded to one release date, so the cell keeps working when CDC publishes
# a newer end date.
cdcDeaths = cdcDeaths[['stabbr', 'scfips', 'urbanrural', 'death_startdate',
                       'death_covid_' + endDate, 'death_all_' + endDate]].copy()

In [13]:
# Preview the trimmed CDC table.
cdcDeaths

Unnamed: 0,stabbr,scfips,urbanrural,death_startdate,death_covid_20220402,death_all_20220402
0,AK,2013,Noncore,20200101,,16.0
1,AK,2020,Medium metro,20200101,662.0,5686.0
2,AK,2050,Noncore,20200101,32.0,263.0
3,AK,2068,Noncore,20200101,,14.0
4,AK,2070,Noncore,20200101,,75.0
...,...,...,...,...,...,...
3069,WY,56037,Micropolitan,20200101,98.0,777.0
3070,WY,56039,Micropolitan,20200101,18.0,208.0
3071,WY,56041,Micropolitan,20200101,20.0,328.0
3072,WY,56043,Noncore,20200101,27.0,184.0


## Merge data

In [14]:
# Give the lookup table the same 'scfips' key name that csse and cdcDeaths use.
fips = fips.rename(columns={'fips': 'scfips'})

In [15]:
# Outer-join the FIPS lookup with the CSSE table on all five identifier columns.
csse = pd.merge(
    fips,
    csse,
    how='outer',
    on=['stfips', 'stname', 'ctyfips', 'ctyname', 'scfips'],
)

In [16]:
# Attach the CDC columns to every CSSE row; rows without a CDC match are kept.
final = pd.merge(csse, cdcDeaths, how='left', on='scfips')

In [17]:
# Inspect the column order of the merged table before rearranging it.
final.columns

Index(['stfips', 'stname', 'ctyfips', 'ctyname', 'scfips',
       'confirmed_cases_20200122', 'confirmed_cases_20200123',
       'confirmed_cases_20200124', 'confirmed_cases_20200125',
       'confirmed_cases_20200126',
       ...
       'deaths_20220406', 'deaths_20220407', 'deaths_20220408',
       'deaths_20220409', 'deaths_20220410', 'stabbr', 'urbanrural',
       'death_startdate', 'death_covid_20220402', 'death_all_20220402'],
      dtype='object', length=1630)

In [18]:
# Put the identifier columns first (with stabbr and urbanrural moved up among
# them), followed by every remaining column in its existing order.
leading_cols = ['stfips', 'stabbr', 'stname', 'ctyfips', 'ctyname', 'scfips', 'urbanrural']
trailing_cols = []
for col_name in list(final.columns)[5:]:
    if col_name not in ['stabbr', 'urbanrural']:
        trailing_cols.append(col_name)
final = final[leading_cols + trailing_cols]

In [19]:
# Store the state and state-county codes as integers, then preview the result.
final = final.astype({'stfips': 'int', 'scfips': 'int'})
final

Unnamed: 0,stfips,stabbr,stname,ctyfips,ctyname,scfips,urbanrural,confirmed_cases_20200122,confirmed_cases_20200123,confirmed_cases_20200124,...,deaths_20220404,deaths_20220405,deaths_20220406,deaths_20220407,deaths_20220408,deaths_20220409,deaths_20220410,death_startdate,death_covid_20220402,death_all_20220402
0,1,AL,Alabama,1,Autauga County,1001,Medium metro,0,0,0,...,213,213,213,213,213,213,213,20200101,170.0,1242.0
1,1,AL,Alabama,3,Baldwin County,1003,Small metro,0,0,0,...,675,675,676,676,677,677,677,20200101,596.0,5605.0
2,1,AL,Alabama,5,Barbour County,1005,Noncore,0,0,0,...,97,97,98,98,98,98,98,20200101,75.0,648.0
3,1,AL,Alabama,7,Bibb County,1007,Large fringe metro,0,0,0,...,101,101,101,101,101,101,101,20200101,45.0,463.0
4,1,AL,Alabama,9,Blount County,1009,Large fringe metro,0,0,0,...,237,237,237,239,239,239,239,20200101,88.0,1174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56,WY,Wyoming,37,Sweetwater County,56037,Micropolitan,0,0,0,...,124,124,124,124,124,124,124,20200101,98.0,777.0
3138,56,WY,Wyoming,39,Teton County,56039,Micropolitan,0,0,0,...,16,16,16,16,16,16,16,20200101,18.0,208.0
3139,56,WY,Wyoming,41,Uinta County,56041,Micropolitan,0,0,0,...,39,39,39,39,39,39,39,20200101,20.0,328.0
3140,56,WY,Wyoming,43,Washakie County,56043,Noncore,0,0,0,...,43,43,43,43,43,43,43,20200101,27.0,184.0


In [20]:
# Write the merged county table to CSV, without the row index.
final.to_csv("../Pandemic/county_casesAndDeaths.csv", index = False)

## Update dictionary

In [21]:
from get_dict_function import get_dict 

In [22]:
# Load the existing county data dictionary.
cDict = pd.read_csv('../Pandemic/county_data_dictionary.csv')

In [23]:
# Work only on the dictionary rows that belong to the 'cases and deaths' field.
tDict = cDict.loc[cDict['field'] == 'cases and deaths']

In [24]:
# List the variable names currently recorded for this field.
tDict.variable_name

0                       stfips
1                       stabbr
2                       stname
3                      ctyfips
4                      ctyname
5                       scfips
6     confirmed_cases_yyyymmdd
7              deaths_yyyymmdd
8              death_startdate
9                   urbanrural
10          death_all_20220305
11        death_covid_20220305
Name: variable_name, dtype: object

In [25]:
# Place the urbanrural entry directly after scfips in the dictionary.
# Rows are located by variable_name rather than by fixed position, so the cell
# gives the same result whether or not the saved dictionary has already been
# reordered (the earlier positional version was only safe to run once).
# Selecting rows with iloc also avoids DataFrame.append, which pandas removed
# in version 2.0.
var_names = tDict['variable_name'].tolist()
urban_pos = var_names.index('urbanrural')
row_order = [pos for pos in range(len(var_names)) if pos != urban_pos]
insert_at = [var_names[pos] for pos in row_order].index('scfips') + 1
row_order.insert(insert_at, urban_pos)

# Renumber the rows 0..n-1 so that label-based lookups by row number keep working.
tDict = tDict.iloc[row_order].reset_index(drop=True)
tDict

Unnamed: 0,variable_name,label,file,field,source,link,start_column,end_column,start_date,end_date,filename_in_release4
0,stfips,State name,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,1,1,-1,-1,casesAndDeaths.csv
1,stabbr,State abbreviation,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,2,2,-1,-1,casesAndDeaths.csv
2,stname,State FIPS code,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,3,3,-1,-1,casesAndDeaths.csv
3,ctyfips,County name,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,4,4,-1,-1,casesAndDeaths.csv
4,ctyname,County FIPS code,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,5,5,-1,-1,casesAndDeaths.csv
5,scfips,FIPS code of state-county,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,6,6,-1,-1,casesAndDeaths.csv
6,urbanrural,NCHS urban-rural classification,county_casesAndDeaths.csv,cases and deaths,CDC,https://data.cdc.gov/NCHS/Provisional-COVID-19...,1562,1562,-1,-1,
7,confirmed_cases_yyyymmdd,No of confirmed cases on yyyymmdd,county_casesAndDeaths.csv,cases and deaths,CSSE,https://github.com/CSSEGISandData/COVID-19,7,783,20200122,20220308,casesAndDeaths.csv
8,deaths_yyyymmdd,No of deaths on yyyymmdd,county_casesAndDeaths.csv,cases and deaths,CSSE,https://github.com/CSSEGISandData/COVID-19,784,1560,20200122,20220308,casesAndDeaths.csv
9,death_startdate,First date of data period,county_casesAndDeaths.csv,cases and deaths,CDC,https://data.cdc.gov/NCHS/Provisional-COVID-19...,1561,1561,-1,-1,


In [26]:
# Rows 10 and 11 hold the two CDC death-count variables; refresh their names
# with the current end date.
for row_label, name_prefix in ((10, 'death_all_'), (11, 'death_covid_')):
    tDict.loc[row_label, 'variable_name'] = name_prefix + endDate

In [27]:
# Confirm the variable names after the reorder and the end-date update.
tDict.variable_name

0                       stfips
1                       stabbr
2                       stname
3                      ctyfips
4                      ctyname
5                       scfips
6                   urbanrural
7     confirmed_cases_yyyymmdd
8              deaths_yyyymmdd
9              death_startdate
10          death_all_20220402
11        death_covid_20220402
Name: variable_name, dtype: object

In [28]:
# get_dict comes from the project-local get_dict_function module.
# NOTE(review): judging by the dictionary printed before and after this call,
# it refreshes start_column / end_column / end_date to match `final` --
# confirm against the helper's source.
tDict = get_dict(final, tDict)

0
Single column for this item...
1
Single column for this item...
2
Single column for this item...
3
Single column for this item...
4
Single column for this item...
5
Single column for this item...
6
Single column for this item...
7
Multiple columns for this item...
8
Multiple columns for this item...
9
Single column for this item...
10
Single column for this item...
11
Single column for this item...


In [29]:
# Write the refreshed 'cases and deaths' rows back into the full dictionary.
# NOTE(review): presumably relies on tDict's row labels matching the labels of
# the masked rows in cDict -- confirm if the dictionary ever gains other fields
# ahead of these rows.
cDict[cDict['field'] == 'cases and deaths'] = tDict.copy()

In [30]:
# Preview the updated dictionary.
cDict

Unnamed: 0,variable_name,label,file,field,source,link,start_column,end_column,start_date,end_date,filename_in_release4
0,stfips,State name,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,1,1,-1,-1,casesAndDeaths.csv
1,stabbr,State abbreviation,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,2,2,-1,-1,casesAndDeaths.csv
2,stname,State FIPS code,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,3,3,-1,-1,casesAndDeaths.csv
3,ctyfips,County name,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,4,4,-1,-1,casesAndDeaths.csv
4,ctyname,County FIPS code,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,5,5,-1,-1,casesAndDeaths.csv
5,scfips,FIPS code of state-county,county_casesAndDeaths.csv,cases and deaths,Census,https://www.census.gov/geographies/reference-f...,6,6,-1,-1,casesAndDeaths.csv
6,urbanrural,NCHS urban-rural classification,county_casesAndDeaths.csv,cases and deaths,CDC,https://data.cdc.gov/NCHS/Provisional-COVID-19...,7,7,-1,-1,
7,confirmed_cases_yyyymmdd,No of confirmed cases on yyyymmdd,county_casesAndDeaths.csv,cases and deaths,CSSE,https://github.com/CSSEGISandData/COVID-19,8,817,20200122,20220410,casesAndDeaths.csv
8,deaths_yyyymmdd,No of deaths on yyyymmdd,county_casesAndDeaths.csv,cases and deaths,CSSE,https://github.com/CSSEGISandData/COVID-19,818,1627,20200122,20220410,casesAndDeaths.csv
9,death_startdate,First date of data period,county_casesAndDeaths.csv,cases and deaths,CDC,https://data.cdc.gov/NCHS/Provisional-COVID-19...,1628,1628,-1,-1,


In [31]:
# Save the refreshed data dictionary, without the row index.
# The object written here must be cDict: writing `final` to this path would
# overwrite the dictionary file with the county data table.
cDict.to_csv("../Pandemic/county_data_dictionary.csv", index = False)