# Cleaning and Chunking CDC Case Surveillance Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 2020 Dataset

In [2]:
# Load the raw 2020 CDC extract. The first CSV column becomes the index, and
# the two columns listed in `dtype` are read with the pandas 'string' dtype.
df_2020 = pd.read_csv(
    '../data/cdc/CDC_2020.csv',
    index_col=0,
    dtype={
        'res_county': 'string',
        'underlying_conditions_yn': 'string',
    },
)

In [3]:
# Peek at the first five rows of the freshly loaded frame.
df_2020.head(n=5)

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
2,2020-11,MA,25.0,ESSEX,25009.0,18 to 49 years,Female,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
4,2020-12,MO,29.0,PLATTE,29165.0,18 to 49 years,Female,Unknown,Unknown,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Unknown,
7,2020-08,TX,48.0,MAVERICK,48323.0,18 to 49 years,Male,White,Hispanic/Latino,,-1.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
8,2020-09,MN,27.0,BECKER,27005.0,50 to 64 years,Female,White,Non-Hispanic/Latino,0.0,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,Missing,Missing,No,
9,2020-11,KY,21.0,LINCOLN,21137.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Clinical evaluation,Yes,Laboratory-confirmed case,Unknown,No,Missing,No,Yes


In [4]:
# Print the index range, column names and dtypes of the loaded frame.
df_2020.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20109295 entries, 2 to 456923
Data columns (total 19 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   case_month                       object 
 1   res_state                        object 
 2   state_fips_code                  float64
 3   res_county                       string 
 4   county_fips_code                 float64
 5   age_group                        object 
 6   sex                              object 
 7   race                             object 
 8   ethnicity                        object 
 9   case_positive_specimen_interval  float64
 10  case_onset_interval              float64
 11  process                          object 
 12  exposure_yn                      object 
 13  current_status                   object 
 14  symptom_status                   object 
 15  hosp_yn                          object 
 16  icu_yn                           object 
 17  death_yn     

In [5]:
# Order the rows from the latest case_month to the earliest. Rebinding the
# name (rather than sorting in place) keeps the cell a plain expression.
df_2020 = df_2020.sort_values('case_month', ascending=False)

In [6]:
# Show the rows that carry no state FIPS code.
df_2020.loc[df_2020['state_fips_code'].isna()]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
45405,2020-12,,,,,,,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
45409,2020-12,,,,,,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
45413,2020-12,,,,,,,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
45417,2020-12,,,,,,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
45389,2020-12,,,,,,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50726,2020-01,,,,,,,,,50.0,,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,
50710,2020-01,,,,,,,,,11.0,,Missing,Yes,Laboratory-confirmed case,Symptomatic,Yes,Missing,No,Yes
50714,2020-01,,,,,,,,,50.0,,Laboratory reported,Yes,Laboratory-confirmed case,Symptomatic,No,Missing,No,
50718,2020-01,,,,,,,,,47.0,,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,No,No,


#### Dropping the rows with no location information

In [7]:
# Keep only the rows that have a state FIPS code; rows where it is null
# are discarded.
df_2020 = df_2020.dropna(subset=['state_fips_code'])

In [8]:
# Display the frame after the rows with a null state_fips_code were removed.
df_2020

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
106080,2020-12,CA,6.0,ORANGE,6059.0,18 to 49 years,Male,White,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Unknown,Missing,Missing,Missing,
299113,2020-12,AZ,4.0,COCHISE,4003.0,18 to 49 years,Female,Multiple/Other,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
299333,2020-12,MI,26.0,LENAWEE,26091.0,18 to 49 years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,Yes
329729,2020-12,TX,48.0,CAMERON,48061.0,0 - 17 years,Male,Missing,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
299319,2020-12,OH,39.0,ASHTABULA,39007.0,50 to 64 years,Male,Unknown,Unknown,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Unknown,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173108,2020-01,MI,26.0,,,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
3609,2020-01,FL,12.0,ORANGE,12095.0,,,,,105.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,Missing,
214904,2020-01,NY,36.0,,,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
88464,2020-01,TN,47.0,DAVIDSON,47037.0,,,,,48.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Missing,


In [9]:
# Cast the state FIPS codes from float to integer (rows with a null state
# code were dropped in an earlier cell, so the cast cannot hit NaN).
# DataFrame.astype returns a new frame, so nothing is assigned into the
# boolean-filtered frame (which can raise SettingWithCopyWarning), and the
# cell gives the same result when re-run.
# The explicit 'int32' yields the same dtype on every platform, whereas the
# width of 'int' depends on the platform / NumPy version.
df_2020 = df_2020.astype({'state_fips_code': 'int32'})
df_2020['state_fips_code']

106080     6
299113     4
299333    26
329729    48
299319    39
          ..
173108    26
3609      12
214904    36
88464     47
448182    36
Name: state_fips_code, Length: 20108838, dtype: int32

In [10]:
# Cast the county FIPS codes from float to integer. Missing codes are filled
# with 0 first because a NumPy integer column cannot hold NaN, so a value of
# 0 means "county not recorded".
# fillna(0) replaces the former replace(np.NaN, 0): the np.NaN alias was
# removed in NumPy 2.0 and raises AttributeError there.
# assign() returns a new frame instead of writing into the filtered one, and
# 'int32' pins the dtype regardless of platform.
df_2020 = df_2020.assign(
    county_fips_code=df_2020['county_fips_code'].fillna(0).astype('int32')
)
df_2020['county_fips_code']

106080     6059
299113     4003
299333    26091
329729    48061
299319    39007
          ...  
173108        0
3609      12095
214904        0
88464     47037
448182    36065
Name: county_fips_code, Length: 20108838, dtype: int32

> NaN values are replaced with 0 so that the column can be cast to an integer dtype; a `county_fips_code` of 0 therefore marks a record whose county is missing.

#### Splitting the dataset into 6-month periods

In [11]:
# case_month holds 'YYYY-MM' strings, so plain string comparison is enough
# to separate January-June from July-December.
df_2020_1 = df_2020.loc[df_2020['case_month'] <= '2020-06']
df_2020_2 = df_2020.loc[df_2020['case_month'] >= '2020-07']

In [12]:
# Give each half-year chunk a fresh 0..n-1 index; the old index labels are
# discarded rather than kept as a column.
df_2020_1 = df_2020_1.reset_index(drop=True)
df_2020_2 = df_2020_2.reset_index(drop=True)

In [13]:
# Display the first-half chunk (case_month <= '2020-06') after the index reset.
df_2020_1

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2020-06,MO,29,JASPER,29097,18 to 49 years,Female,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Unknown,
1,2020-06,NC,37,GASTON,37071,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,0.0,Missing,Unknown,Laboratory-confirmed case,Symptomatic,No,Unknown,No,
2,2020-06,SC,45,DILLON,45033,18 to 49 years,Male,Black,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
3,2020-06,WI,55,BROWN,55009,18 to 49 years,Female,Multiple/Other,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,,
4,2020-06,NJ,34,SOMERSET,34035,18 to 49 years,Male,Unknown,Missing,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3008091,2020-01,MI,26,,0,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
3008092,2020-01,FL,12,ORANGE,12095,,,,,105.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,Missing,
3008093,2020-01,NY,36,,0,,,,,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
3008094,2020-01,TN,47,DAVIDSON,47037,,,,,48.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Missing,


In [14]:
# Display the second-half chunk (case_month >= '2020-07') after the index reset.
df_2020_2

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2020-12,CA,6,ORANGE,6059,18 to 49 years,Male,White,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Unknown,Missing,Missing,Missing,
1,2020-12,AZ,4,COCHISE,4003,18 to 49 years,Female,Multiple/Other,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
2,2020-12,MI,26,LENAWEE,26091,18 to 49 years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,Yes
3,2020-12,TX,48,CAMERON,48061,0 - 17 years,Male,Missing,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
4,2020-12,OH,39,ASHTABULA,39007,50 to 64 years,Male,Unknown,Unknown,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,Missing,Missing,Unknown,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17100737,2020-07,TX,48,MAVERICK,48323,18 to 49 years,Male,White,Hispanic/Latino,,-3.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
17100738,2020-07,TX,48,VICTORIA,48469,50 to 64 years,Female,Unknown,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
17100739,2020-07,TX,48,MCLENNAN,48309,18 to 49 years,Male,Unknown,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Unknown,Missing,Missing,
17100740,2020-07,NY,36,ALBANY,36001,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,


#### Exporting cleaned 2020 datasets

In [15]:
# Write the full cleaned frame and both half-year chunks to CSV, leaving the
# index out of each file.
for frame, out_path in (
    (df_2020, '../data/cdc/CDC_2020_cleaned.csv'),
    (df_2020_1, '../data/cdc/CDC_2020_1_cleaned.csv'),
    (df_2020_2, '../data/cdc/CDC_2020_2_cleaned.csv'),
):
    frame.to_csv(out_path, index=False)