# COVID-19 Case Surveillance Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dask
import dask.dataframe as dd

In [2]:
np.__version__

'1.22.4'

In [4]:
# Pin NumPy to 1.22.4 (the version reported by np.__version__ above); presumably
# required for compatibility with the installed Dask/pandas — TODO confirm.
# A kernel restart may be needed for the pinned version to take effect.
%pip install -U numpy==1.22.4

Note: you may need to restart the kernel to use updated packages.


In [3]:
dask.__version__

'2023.7.0'

### Using Dask to Make Smaller Chunks in Data

In [39]:
# Open the full CDC case-surveillance CSV as a lazy, partitioned Dask DataFrame.
# dtypes are pinned for the columns below — presumably because Dask's sample-based
# dtype inference did not match the full file; TODO confirm.
df = dd.read_csv(
    '../data/cdc/COVID-19_Case_Surveillance_Public_Use_Data_with_Geography.csv',
    dtype={
        'county_fips_code': 'float64',
        'state_fips_code': 'float64',
        'res_county': 'string',
        'underlying_conditions_yn': 'string',
        'case_month': 'string',
    },
)

In [19]:
df

Unnamed: 0_level_0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
npartitions=219,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,string,object,float64,string,float64,object,object,object,object,float64,float64,object,object,object,object,object,object,object,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [20]:
# NOTE(review): .compute() materializes all 219 partitions into a single in-memory
# pandas DataFrame just to display it; df.head() (used a few cells below) previews
# the data without loading everything.
df.compute()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-08,WI,55.0,OCONTO,55083.0,65+ years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Probable Case,Symptomatic,No,Missing,No,
1,2022-05,NY,36.0,WARREN,36113.0,0 - 17 years,Female,Unknown,Unknown,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,No,
2,2020-11,MA,25.0,ESSEX,25009.0,18 to 49 years,Female,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
3,2021-12,OH,39.0,LORAIN,39093.0,18 to 49 years,Female,Unknown,Unknown,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
4,2020-12,MO,29.0,PLATTE,29165.0,18 to 49 years,Female,Unknown,Unknown,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Unknown,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456919,2020-09,MI,26.0,OTTAWA,26139.0,65+ years,Female,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
456920,2021-02,KS,20.0,SALINE,20169.0,0 - 17 years,Male,White,Non-Hispanic/Latino,1.0,0.0,Laboratory reported,Yes,Probable Case,Symptomatic,No,Missing,No,
456921,2021-12,IN,18.0,CLINTON,18023.0,50 to 64 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Probable Case,Missing,No,Missing,No,
456922,2022-07,UT,49.0,CARBON,49007.0,65+ years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,Unknown,


In [21]:
# Per-column memory footprint in bytes. deep=True also counts the Python string
# objects held by object-dtype columns, not just the references to them.
df.memory_usage(deep=True).compute()

Index                                   28032
age_group                          6885112968
case_month                         6332967976
case_onset_interval                 791621024
case_positive_specimen_interval     791621024
county_fips_code                    791621024
current_status                     7912860700
death_yn                           6081659810
ethnicity                          6435103628
exposure_yn                        6311836596
hosp_yn                            6157649396
icu_yn                             6317221234
process                            6390613168
race                               5746227555
res_county                         6222672888
res_state                          5838163958
sex                                6064827904
state_fips_code                     791621024
symptom_status                     6480151359
underlying_conditions_yn           4031409354
dtype: int64

In [22]:
df.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-08,WI,55.0,OCONTO,55083.0,65+ years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Probable Case,Symptomatic,No,Missing,No,
1,2022-05,NY,36.0,WARREN,36113.0,0 - 17 years,Female,Unknown,Unknown,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,No,
2,2020-11,MA,25.0,ESSEX,25009.0,18 to 49 years,Female,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
3,2021-12,OH,39.0,LORAIN,39093.0,18 to 49 years,Female,Unknown,Unknown,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
4,2020-12,MO,29.0,PLATTE,29165.0,18 to 49 years,Female,Unknown,Unknown,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Unknown,


In [50]:
# Select the 2023 cases. case_month is a 'YYYY-MM' string, so an anchored literal
# prefix test is the precise check; str.contains would run an unanchored regex
# search and match '2023' anywhere in the value.
df_2023 = df.loc[df['case_month'].str.startswith('2023')]
# .compute() runs the filter over the whole CSV and loads the matching rows into pandas.
df_2023.compute()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
65,2023-01,ID,16.0,KOOTENAI,16055.0,50 to 64 years,Female,,,0.0,,Laboratory reported,Missing,Laboratory-confirmed case,Unknown,Unknown,Missing,Unknown,
95,2023-05,OH,39.0,BUTLER,39017.0,,,,,0.0,0.0,Missing,Missing,Probable Case,Symptomatic,Missing,Missing,Unknown,
98,2023-04,MI,26.0,HURON,26063.0,18 to 49 years,,,,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
159,2023-03,IL,17.0,KNOX,17095.0,50 to 64 years,Male,White,Non-Hispanic/Latino,1.0,0.0,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
162,2023-03,OK,40.0,KAY,40071.0,,,,,,,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,Unknown,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456870,2023-06,WI,55.0,PIERCE,55093.0,,,,,,0.0,Laboratory reported,Yes,Probable Case,Symptomatic,No,Missing,No,
456874,2023-02,WA,53.0,CLALLAM,53009.0,50 to 64 years,Female,Missing,Unknown,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Unknown,Missing,Missing,
456877,2023-03,LA,22.0,LAFOURCHE,22057.0,0 - 17 years,Male,,,0.0,,Missing,Missing,Probable Case,Unknown,Unknown,Unknown,Missing,
456878,2023-02,TX,48.0,WEBB,48479.0,0 - 17 years,Male,,,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Missing,


In [52]:
# Select the 2022 cases. case_month is a 'YYYY-MM' string, so an anchored literal
# prefix test is the precise check; str.contains would run an unanchored regex
# search and match '2022' anywhere in the value.
df_2022 = df.loc[df['case_month'].str.startswith('2022')]
# .compute() runs the filter over the whole CSV and loads the matching rows into pandas.
df_2022.compute()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
1,2022-05,NY,36.0,WARREN,36113.0,0 - 17 years,Female,Unknown,Unknown,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,No,
6,2022-08,NY,36.0,GREENE,36039.0,18 to 49 years,Male,White,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
10,2022-01,IN,18.0,POSEY,18129.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,
12,2022-02,SC,45.0,ANDERSON,45007.0,65+ years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Unknown,Missing,No,
15,2022-01,CA,6.0,LOS ANGELES,6037.0,0 - 17 years,Male,Missing,Unknown,,,Missing,Missing,Laboratory-confirmed case,Unknown,No,Missing,Missing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456901,2022-08,TN,47.0,HENDERSON,47077.0,0 - 17 years,Male,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
456902,2022-01,GA,13.0,CLAYTON,13063.0,18 to 49 years,Male,Unknown,Missing,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Missing,
456910,2022-01,MI,26.0,ISABELLA,26073.0,18 to 49 years,Male,Unknown,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
456915,2022-05,SD,46.0,LINCOLN,46083.0,18 to 49 years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,


In [54]:
# Select the 2021 cases. case_month is a 'YYYY-MM' string, so an anchored literal
# prefix test is the precise check; str.contains would run an unanchored regex
# search and match '2021' anywhere in the value.
df_2021 = df.loc[df['case_month'].str.startswith('2021')]
# .compute() runs the filter over the whole CSV and loads the matching rows into pandas.
df_2021.compute()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-08,WI,55.0,OCONTO,55083.0,65+ years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Probable Case,Symptomatic,No,Missing,No,
3,2021-12,OH,39.0,LORAIN,39093.0,18 to 49 years,Female,Unknown,Unknown,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
5,2021-01,VA,51.0,PAGE,51139.0,18 to 49 years,Male,,,0.0,,Routine surveillance,Missing,Probable Case,Missing,Missing,Missing,,
11,2021-08,IN,18.0,WHITLEY,18183.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,
13,2021-11,ID,16.0,BONNER,16017.0,65+ years,Male,White,Non-Hispanic/Latino,0.0,,Clinical evaluation,Missing,Laboratory-confirmed case,Missing,Missing,Missing,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456912,2021-09,IL,17.0,DUPAGE,17043.0,0 - 17 years,Female,White,Non-Hispanic/Latino,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,Missing,
456914,2021-12,KY,21.0,LETCHER,21133.0,18 to 49 years,Female,White,Non-Hispanic/Latino,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,Yes
456916,2021-11,OH,39.0,WILLIAMS,39171.0,18 to 49 years,Female,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Probable Case,Missing,Missing,Missing,No,
456920,2021-02,KS,20.0,SALINE,20169.0,0 - 17 years,Male,White,Non-Hispanic/Latino,1.0,0.0,Laboratory reported,Yes,Probable Case,Symptomatic,No,Missing,No,


In [55]:
# Select the 2020 cases. case_month is a 'YYYY-MM' string, so an anchored literal
# prefix test is the precise check; str.contains would run an unanchored regex
# search and match '2020' anywhere in the value.
df_2020 = df.loc[df['case_month'].str.startswith('2020')]
# .compute() runs the filter over the whole CSV and loads the matching rows into pandas.
df_2020.compute()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
2,2020-11,MA,25.0,ESSEX,25009.0,18 to 49 years,Female,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
4,2020-12,MO,29.0,PLATTE,29165.0,18 to 49 years,Female,Unknown,Unknown,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Unknown,
7,2020-08,TX,48.0,MAVERICK,48323.0,18 to 49 years,Male,White,Hispanic/Latino,,-1.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
8,2020-09,MN,27.0,BECKER,27005.0,50 to 64 years,Female,White,Non-Hispanic/Latino,0.0,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,Missing,Missing,No,
9,2020-11,KY,21.0,LINCOLN,21137.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Clinical evaluation,Yes,Laboratory-confirmed case,Unknown,No,Missing,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456909,2020-12,AZ,4.0,PIMA,4019.0,18 to 49 years,Female,White,Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
456913,2020-11,AR,5.0,GREENE,5055.0,18 to 49 years,Female,White,Non-Hispanic/Latino,,,Missing,Yes,Probable Case,Symptomatic,Unknown,Unknown,Unknown,Yes
456917,2020-12,TN,47.0,HAWKINS,47073.0,50 to 64 years,Female,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
456919,2020-09,MI,26.0,OTTAWA,26139.0,65+ years,Female,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,


#### Exporting Smaller Chunks to CSV

In [57]:
# Materialize the lazy 2023 subset as a pandas DataFrame and write it to one CSV.
# to_csv's default index=True writes the row index as the first, unnamed column
# (it is read back with index_col=0 further down).
df_2023.compute().to_csv('../data/cdc/CDC_2023.csv')

In [59]:
# Materialize the lazy 2022 subset as a pandas DataFrame and write it to one CSV.
# to_csv's default index=True writes the row index as the first, unnamed column.
df_2022.compute().to_csv('../data/cdc/CDC_2022.csv')

In [60]:
# Materialize the lazy 2021 subset as a pandas DataFrame and write it to one CSV.
# to_csv's default index=True writes the row index as the first, unnamed column.
df_2021.compute().to_csv('../data/cdc/CDC_2021.csv')

In [61]:
# Materialize the lazy 2020 subset as a pandas DataFrame and write it to one CSV.
# to_csv's default index=True writes the row index as the first, unnamed column.
df_2020.compute().to_csv('../data/cdc/CDC_2020.csv')

### Reading in Smaller CSVs with Pandas

In [77]:
# Load the 2023 extract into pandas. This rebinds df_2023, which previously held
# the lazy Dask subset. index_col=0 restores the index column written by the
# export; res_county is read as pandas' nullable 'string' dtype.
df_2023 = pd.read_csv(
    '../data/cdc/CDC_2023.csv',
    dtype={'res_county': 'string'},
    index_col=0,
)

In [78]:
df_2023.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
65,2023-01,ID,16.0,KOOTENAI,16055.0,50 to 64 years,Female,,,0.0,,Laboratory reported,Missing,Laboratory-confirmed case,Unknown,Unknown,Missing,Unknown,
95,2023-05,OH,39.0,BUTLER,39017.0,,,,,0.0,0.0,Missing,Missing,Probable Case,Symptomatic,Missing,Missing,Unknown,
98,2023-04,MI,26.0,HURON,26063.0,18 to 49 years,,,,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
159,2023-03,IL,17.0,KNOX,17095.0,50 to 64 years,Male,White,Non-Hispanic/Latino,1.0,0.0,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
162,2023-03,OK,40.0,KAY,40071.0,,,,,,,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,Unknown,


In [79]:
# Order rows newest month first. Rebinding the name (instead of inplace=True)
# produces the same frame while keeping the step chainable and easy to re-run.
df_2023 = df_2023.sort_values(by='case_month', ascending=False)

In [80]:
# Replace the index with a fresh 0..n-1 range and discard the old labels
# (drop=True). Rebinding the name instead of mutating with inplace=True.
df_2023 = df_2023.reset_index(drop=True)

In [81]:
df_2023.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2023-06,AR,5.0,BENTON,5007.0,18 to 49 years,Female,,,,,Missing,Unknown,Probable Case,Symptomatic,Unknown,Unknown,Unknown,
1,2023-06,MI,26.0,WAYNE,26163.0,65+ years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
2,2023-06,MI,26.0,INGHAM,26065.0,18 to 49 years,Female,White,Hispanic/Latino,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
3,2023-06,CA,6.0,SACRAMENTO,6067.0,0 - 17 years,Female,,,,,Missing,Missing,Laboratory-confirmed case,Unknown,Missing,Missing,Missing,
4,2023-06,TX,48.0,SMITH,48423.0,50 to 64 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Missing,


In [82]:
df_2023.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3891832 entries, 0 to 3891831
Data columns (total 19 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   case_month                       object 
 1   res_state                        object 
 2   state_fips_code                  float64
 3   res_county                       string 
 4   county_fips_code                 float64
 5   age_group                        object 
 6   sex                              object 
 7   race                             object 
 8   ethnicity                        object 
 9   case_positive_specimen_interval  float64
 10  case_onset_interval              float64
 11  process                          object 
 12  exposure_yn                      object 
 13  current_status                   object 
 14  symptom_status                   object 
 15  hosp_yn                          object 
 16  icu_yn                           object 
 17  death_yn

In [83]:
df_2023.isna().sum()

case_month                               0
res_state                              173
state_fips_code                        173
res_county                          310721
county_fips_code                    310721
age_group                           138301
sex                                 294099
race                                957003
ethnicity                          1092425
case_positive_specimen_interval    2032083
case_onset_interval                2957564
process                                  0
exposure_yn                              0
current_status                           0
symptom_status                           0
hosp_yn                                  0
icu_yn                                   0
death_yn                            181484
underlying_conditions_yn           3808457
dtype: int64

In [93]:
# Keep only rows that have a state FIPS code. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells do not write into
# a filtered slice (which raises SettingWithCopyWarning).
df_2023 = df_2023.loc[df_2023['state_fips_code'].notna()].copy()
df_2023

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2023-06,AR,5.0,BENTON,5007.0,18 to 49 years,Female,,,,,Missing,Unknown,Probable Case,Symptomatic,Unknown,Unknown,Unknown,
1,2023-06,MI,26.0,WAYNE,26163.0,65+ years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
2,2023-06,MI,26.0,INGHAM,26065.0,18 to 49 years,Female,White,Hispanic/Latino,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
3,2023-06,CA,6.0,SACRAMENTO,6067.0,0 - 17 years,Female,,,,,Missing,Missing,Laboratory-confirmed case,Unknown,Missing,Missing,Missing,
4,2023-06,TX,48.0,SMITH,48423.0,50 to 64 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Missing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3891827,2023-01,FL,12.0,PALM BEACH,12099.0,18 to 49 years,Female,White,Hispanic/Latino,0.0,,Missing,Missing,Probable Case,Missing,Missing,Missing,Missing,
3891828,2023-01,NY,36.0,WESTCHESTER,36119.0,18 to 49 years,Female,White,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
3891829,2023-01,FL,12.0,PALM BEACH,12099.0,18 to 49 years,Female,White,Hispanic/Latino,0.0,,Missing,Missing,Probable Case,Missing,No,Missing,Missing,
3891830,2023-01,NY,36.0,NASSAU,36059.0,50 to 64 years,Female,Unknown,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,


> Above I dropped the rows with NaN in the 'state_fips_code' column, since these entries have no location data at all.

In [94]:
# Cast the state FIPS codes from float to integer; rows with a missing code were
# filtered out earlier, so the cast cannot hit NaN. assign() returns a new frame,
# so this does not write into a filtered slice (the SettingWithCopyWarning case).
# Note: 'int' resolves to the platform's default integer width.
df_2023 = df_2023.assign(state_fips_code=df_2023['state_fips_code'].astype('int'))
df_2023['state_fips_code']

0           5
1          26
2          26
3           6
4          48
           ..
3891827    12
3891828    36
3891829    12
3891830    36
3891831    39
Name: state_fips_code, Length: 3891659, dtype: int32

In [98]:
# Inspect the rows that still have no county FIPS code.
df_2023[df_2023['county_fips_code'].isnull()]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
697,2023-06,WI,55,,,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,,
699,2023-06,WI,55,,,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,,
702,2023-06,WI,55,,,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,,
705,2023-06,WI,55,,,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,,
711,2023-06,WI,55,,,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3883879,2023-01,KS,20,,,0 - 17 years,Male,White,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
3883956,2023-01,KS,20,,,0 - 17 years,Male,White,,,0.0,Missing,Missing,Probable Case,Symptomatic,Unknown,Missing,No,
3883964,2023-01,KS,20,,,0 - 17 years,Male,White,,,0.0,Missing,Missing,Probable Case,Symptomatic,Unknown,Missing,No,
3883975,2023-01,KS,20,,,0 - 17 years,Male,White,,,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,No,Missing,No,


> There are still a lot of NaNs in the 'county_fips_code' column, so pandas won't let me convert it to the 'int' dtype directly.

In [95]:
df_2023.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2023-06,AR,5,BENTON,5007.0,18 to 49 years,Female,,,,,Missing,Unknown,Probable Case,Symptomatic,Unknown,Unknown,Unknown,
1,2023-06,MI,26,WAYNE,26163.0,65+ years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
2,2023-06,MI,26,INGHAM,26065.0,18 to 49 years,Female,White,Hispanic/Latino,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
3,2023-06,CA,6,SACRAMENTO,6067.0,0 - 17 years,Female,,,,,Missing,Missing,Laboratory-confirmed case,Unknown,Missing,Missing,Missing,
4,2023-06,TX,48,SMITH,48423.0,50 to 64 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Missing,


In [112]:
# Cast county FIPS codes to integer. Missing codes are filled with the sentinel 0
# first, because a plain integer column cannot hold NaN; 0 therefore means
# "county unknown" from here on.
# fillna(0) replaces replace(np.NaN, 0): the np.NaN alias was removed in NumPy 2.0.
# assign() returns a new frame, so this does not write into a filtered slice.
df_2023 = df_2023.assign(
    county_fips_code=df_2023['county_fips_code'].fillna(0).astype(int)
)
df_2023['county_fips_code']

0           5007
1          26163
2          26065
3           6067
4          48423
           ...  
3891827    12099
3891828    36119
3891829    12099
3891830    36059
3891831    39003
Name: county_fips_code, Length: 3891659, dtype: int32

> I changed the NaN values to 0 in order to change the dtype to 'int'

In [116]:
# Rows whose county FIPS code is the 0 placeholder, i.e. the ones that were NaN
# before the fill in the previous code cell.
df_2023[df_2023['county_fips_code'] == 0]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
697,2023-06,WI,55,,0,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,,
699,2023-06,WI,55,,0,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,,
702,2023-06,WI,55,,0,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,,
705,2023-06,WI,55,,0,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,,
711,2023-06,WI,55,,0,18 to 49 years,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3883879,2023-01,KS,20,,0,0 - 17 years,Male,White,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
3883956,2023-01,KS,20,,0,0 - 17 years,Male,White,,,0.0,Missing,Missing,Probable Case,Symptomatic,Unknown,Missing,No,
3883964,2023-01,KS,20,,0,0 - 17 years,Male,White,,,0.0,Missing,Missing,Probable Case,Symptomatic,Unknown,Missing,No,
3883975,2023-01,KS,20,,0,0 - 17 years,Male,White,,,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,No,Missing,No,


#### Exporting cleaned df_2023 to CSV

In [118]:
# Write the cleaned 2023 frame. index=False omits the row index, which has gaps
# after the rows without a state FIPS code were removed.
df_2023.to_csv('../data/cdc/CDC_2023_cleaned.csv', index=False)