# Cleaning and Chunking CDC Case Surveillance Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 2021 Dataset

In [2]:
# Load the raw 2021 CDC case surveillance extract.
# The first CSV column holds the saved row labels, so it is used as the index.
# The two columns below are pinned to the pandas 'string' dtype instead of
# being inferred (presumably to avoid mixed-type inference -- TODO confirm).
df_2021 = pd.read_csv(
    '../data/cdc/CDC_2021.csv',
    index_col=0,
    dtype=dict.fromkeys(('res_county', 'underlying_conditions_yn'), 'string'),
)

In [3]:
# Preview the first rows to sanity-check the load.
df_2021.head()

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-08,WI,55.0,OCONTO,55083.0,65+ years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Probable Case,Symptomatic,No,Missing,No,
3,2021-12,OH,39.0,LORAIN,39093.0,18 to 49 years,Female,Unknown,Unknown,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Unknown,
5,2021-01,VA,51.0,PAGE,51139.0,18 to 49 years,Male,,,0.0,,Routine surveillance,Missing,Probable Case,Missing,Missing,Missing,,
11,2021-08,IN,18.0,WHITLEY,18183.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,
13,2021-11,ID,16.0,BONNER,16017.0,65+ years,Male,White,Non-Hispanic/Latino,0.0,,Clinical evaluation,Missing,Laboratory-confirmed case,Missing,Missing,Missing,,


In [4]:
# Summarize the index size and the dtype of every column.
df_2021.info()

<class 'pandas.core.frame.DataFrame'>
Index: 34437682 entries, 0 to 456921
Data columns (total 19 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   case_month                       object 
 1   res_state                        object 
 2   state_fips_code                  float64
 3   res_county                       string 
 4   county_fips_code                 float64
 5   age_group                        object 
 6   sex                              object 
 7   race                             object 
 8   ethnicity                        object 
 9   case_positive_specimen_interval  float64
 10  case_onset_interval              float64
 11  process                          object 
 12  exposure_yn                      object 
 13  current_status                   object 
 14  symptom_status                   object 
 15  hosp_yn                          object 
 16  icu_yn                           object 
 17  death_yn     

In [5]:
# Order the cases from the most recent month to the oldest.
# 'case_month' holds 'YYYY-MM' strings, so a descending string sort is also
# a descending chronological sort.
df_2021 = df_2021.sort_values('case_month', ascending=False)

In [6]:
# Inspect the rows that carry no state FIPS code.
df_2021.loc[df_2021['state_fips_code'].isna()]

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
47641,2021-11,,,,,,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
47657,2021-11,,,,,,,,,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Missing,
47645,2021-11,,,,,,,,,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
47653,2021-11,,,,,,,,,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Missing,
47649,2021-11,,,,,,,,,,,Missing,Missing,Probable Case,Missing,Missing,Missing,Missing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44982,2021-01,,,,,,,,,,,Missing,Yes,Laboratory-confirmed case,Asymptomatic,No,Missing,No,
44958,2021-01,,,,,,,,,,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,
44962,2021-01,,,,,,,,,,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,
44966,2021-01,,,,,,,,,,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,


#### Dropping the rows with no location information

In [7]:
# Keep only the rows that have a state FIPS code.
# The explicit .copy() makes the result an independent frame: the following
# cells assign converted columns back into df_2021, and assigning into a
# boolean-mask selection raises SettingWithCopyWarning on pandas versions
# without copy-on-write.
df_2021 = df_2021[df_2021['state_fips_code'].notnull()].copy()

In [8]:
# Display the frame again now that rows with a null state_fips_code are gone.
df_2021

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
456921,2021-12,IN,18.0,CLINTON,18023.0,50 to 64 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Probable Case,Missing,No,Missing,No,
115503,2021-12,NY,36.0,NASSAU,36059.0,65+ years,Male,Missing,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
119111,2021-12,AZ,4.0,MARICOPA,4013.0,50 to 64 years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
397465,2021-12,LA,22.0,ACADIA,22001.0,50 to 64 years,Female,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Unknown,Unknown,Unknown,Missing,
115537,2021-12,NJ,34.0,ESSEX,34013.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390832,2021-01,PA,42.0,PHILADELPHIA,42101.0,18 to 49 years,Female,,,0.0,,Missing,Missing,Laboratory-confirmed case,Unknown,Unknown,Unknown,Unknown,
258196,2021-01,TX,48.0,LUBBOCK,48303.0,18 to 49 years,Male,White,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
258207,2021-01,KY,21.0,DAVIESS,21059.0,18 to 49 years,Female,White,Non-Hispanic/Latino,,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,No,Missing,No,
390816,2021-01,TX,48.0,DALLAS,48113.0,50 to 64 years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,


In [9]:
# Rows with a null state_fips_code were removed earlier, so the float column
# can be cast to an integer dtype directly.
# The width is spelled out as 'int32' because plain 'int' resolves to a
# platform-dependent size; 'int32' matches the dtype in the recorded output.
df_2021['state_fips_code'] = df_2021['state_fips_code'].astype('int32')
df_2021['state_fips_code']

456921    18
115503    36
119111     4
397465    22
115537    34
          ..
390832    42
258196    48
258207    21
390816    48
300097     8
Name: state_fips_code, Length: 34437165, dtype: int32

In [10]:
# An integer column cannot hold NaN, so any missing county code is replaced
# with the sentinel 0 before casting (0 therefore means "county unknown").
# fillna(0) is used instead of replace(np.NaN, 0): the np.NaN alias was
# removed in NumPy 2.0 and raises AttributeError there.
# 'int32' is explicit because plain 'int' is platform-dependent; it matches
# the dtype in the recorded output.
df_2021['county_fips_code'] = df_2021['county_fips_code'].fillna(0).astype('int32')
df_2021['county_fips_code']

456921    18023
115503    36059
119111     4013
397465    22001
115537    34013
          ...  
390832    42101
258196    48303
258207    21059
390816    48113
300097     8031
Name: county_fips_code, Length: 34437165, dtype: int32

> Missing (NaN) county FIPS codes are replaced with 0 so that the column can be cast to an integer dtype; a value of 0 in `county_fips_code` therefore means the county is unknown.

#### Splitting the dataset into two 6-month periods

In [11]:
# 'case_month' is a 'YYYY-MM' string, so plain string comparison orders the
# months chronologically.
# January-June 2021
df_2021_1 = df_2021.loc[df_2021['case_month'] <= '2021-06']
# July-December 2021
df_2021_2 = df_2021.loc[df_2021['case_month'] >= '2021-07']

In [12]:
# Give each half-year frame a fresh 0..n-1 index, discarding the row labels
# inherited from the full frame.
for half_year_frame in (df_2021_1, df_2021_2):
    half_year_frame.reset_index(drop=True, inplace=True)

In [13]:
# First half of 2021 (case_month <= '2021-06'), shown after the index reset.
df_2021_1

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-06,MA,25,NORFOLK,25021,18 to 49 years,Male,,,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
1,2021-06,MI,26,OAKLAND,26125,50 to 64 years,Male,Unknown,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
2,2021-06,MI,26,GENESEE,26049,18 to 49 years,Female,Black,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Unknown,
3,2021-06,FL,12,MIAMI-DADE,12086,65+ years,Female,White,Hispanic/Latino,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
4,2021-06,RI,44,WASHINGTON,44009,0 - 17 years,Male,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12747354,2021-01,PA,42,PHILADELPHIA,42101,18 to 49 years,Female,,,0.0,,Missing,Missing,Laboratory-confirmed case,Unknown,Unknown,Unknown,Unknown,
12747355,2021-01,TX,48,LUBBOCK,48303,18 to 49 years,Male,White,Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
12747356,2021-01,KY,21,DAVIESS,21059,18 to 49 years,Female,White,Non-Hispanic/Latino,,0.0,Missing,Yes,Laboratory-confirmed case,Symptomatic,No,Missing,No,
12747357,2021-01,TX,48,DALLAS,48113,50 to 64 years,Female,White,Non-Hispanic/Latino,,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,


In [14]:
# Second half of 2021 (case_month >= '2021-07'), shown after the index reset.
df_2021_2

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2021-12,IN,18,CLINTON,18023,50 to 64 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Probable Case,Missing,No,Missing,No,
1,2021-12,NY,36,NASSAU,36059,65+ years,Male,Missing,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,No,
2,2021-12,AZ,4,MARICOPA,4013,50 to 64 years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,Missing,No,
3,2021-12,LA,22,ACADIA,22001,50 to 64 years,Female,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Unknown,Unknown,Unknown,Missing,
4,2021-12,NJ,34,ESSEX,34013,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,,Missing,Missing,Laboratory-confirmed case,Missing,No,Missing,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21689801,2021-07,CA,6,LOS ANGELES,6037,18 to 49 years,Male,White,Non-Hispanic/Latino,,0.0,Missing,Missing,Probable Case,Symptomatic,No,Missing,Missing,
21689802,2021-07,TX,48,TARRANT,48439,18 to 49 years,Female,Unknown,Unknown,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,
21689803,2021-07,OH,39,CUYAHOGA,39035,18 to 49 years,Male,White,Non-Hispanic/Latino,1.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,Missing,Unknown,
21689804,2021-07,TX,48,DENTON,48121,65+ years,Missing,Missing,Missing,,,Missing,Missing,Laboratory-confirmed case,Missing,Missing,Missing,Missing,


#### Exporting cleaned 2021 datasets

In [15]:
# Write the full cleaned frame and both half-year frames to CSV.
# index=False leaves the row labels out of the files.
cleaned_outputs = {
    '../data/cdc/CDC_2021_cleaned.csv': df_2021,
    '../data/cdc/CDC_2021_1_cleaned.csv': df_2021_1,
    '../data/cdc/CDC_2021_2_cleaned.csv': df_2021_2,
}
for csv_path, frame in cleaned_outputs.items():
    frame.to_csv(csv_path, index=False)