# EDA of Infrequently Reported Notifiable Diseases
Wikipedia defines a "notifiable disease" as "any disease that is required by law to be reported to government authorities." These datasets from the Centers for Disease Control and Prevention website (data.cdc.gov) contain information about cases of selected infrequently reported notifiable diseases from 2014 to 2018 in all 50 states.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Start from seaborn's "whitegrid" style but override it to switch the axes grid off.
sns.set_style("whitegrid", {'axes.grid' : False})

In [2]:
# The five yearly CSV exports live side by side in the project repository and
# differ only by the year embedded in the file name, so the location is built
# from a single template instead of five hand-copied URLs.
DATA_URL_TEMPLATE = (
    'https://raw.githubusercontent.com/jchen2186/mortal-determination/master/raw_data/'
    'NNDSS_-_Table_I._infrequently_reported_notifiable_diseases_{year}.csv'
)

# One DataFrame per year, 2014 through 2018 (range end is exclusive).
df2014, df2015, df2016, df2017, df2018 = (
    pd.read_csv(DATA_URL_TEMPLATE.format(year=year)) for year in range(2014, 2019)
)

In [3]:
# Preview the first rows of the 2014 table to see what the raw columns look like.
df2014.head()

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2014,"Cum 2014, flag",5-year weekly average†,"5-year weekly average†, flag",Total cases reported 2013,"Total cases reported 2013, flag",Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",Total cases reported 2010,"Total cases reported 2010, flag",Total cases reported 2009,"Total cases reported 2009, flag",States reporting cases during current week (No.)
0,Anthrax,2014,1,,-,,-,,-,,-,,-,1.0,,,-,1.0,,
1,"Arboviral diseases, California serogroup virus...",2014,1,,-,,-,,-,81.0,,81.0,,137.0,,75.0,,55.0,,
2,"Arboviral diseases, Eastern equine encephaliti...",2014,1,,-,,-,0.0,,6.0,,15.0,,4.0,,10.0,,4.0,,
3,"Arboviral diseases, Powassan virus disease§,¶",2014,1,,-,,-,0.0,,12.0,,7.0,,16.0,,8.0,,6.0,,
4,"Arboviral diseases, St. Louis encephalitis vir...",2014,1,,-,,-,0.0,,,-,3.0,,6.0,,10.0,,12.0,,


In [4]:
# Keep the yearly tables in one list, ordered 2014 -> 2018, so later cells can
# loop over them.
dfs = [df2014, df2015, df2016, df2017, df2018]

# Print each table's column labels to compare the layouts across years.
for df in dfs:
    print(df.columns)

Index(['Disease', 'MMWR year', 'MMWR week', 'Current week',
       'Current week, flag', 'Cum 2014', 'Cum 2014, flag',
       '5-year weekly average†', '5-year weekly average†, flag',
       'Total cases reported  2013', 'Total cases reported  2013, flag',
       'Total cases reported 2012', 'Total cases reported 2012, flag',
       'Total cases reported 2011', 'Total cases reported 2011, flag',
       'Total cases reported 2010', 'Total cases reported 2010, flag',
       'Total cases reported 2009', 'Total cases reported 2009, flag',
       'States reporting cases during current week (No.) '],
      dtype='object')
Index(['Disease', 'MMWR year', 'MMWR week', 'Current week',
       'Current week, flag', 'Cum 2015', 'Cum 2015, flag',
       '5-year weekly average§', '5-year weekly average§, flag',
       'Total cases reported  2014', 'Total cases reported  2014, flag',
       'Total cases reported 2013', 'Total cases reported 2013, flag',
       'Total cases reported 2012', 'Total cases

In [5]:
# Print (rows, columns) for every yearly table.
for df in dfs:
    print(df.shape)

(2915, 20)
(3176, 20)
(3362, 20)
(3329, 20)
(3073, 20)


## Clean Data
The disease column contains unicode footnote symbols (such as §, ¶, and †) that can make the same disease appear under several distinct names. In this section, my goal is to remove those characters from the column in order to make the disease names as uniform as possible. I also want to make everything lowercase because some diseases are written in all capital letters, which would otherwise fail to match the same names written in title case.

In [6]:
def remove_nonalpha(string):
    """Normalize a raw disease label so equivalent names compare equal.

    The footnote markers (§, ¶, *, †) are dropped, each doubled comma that
    remains is reduced to a single comma, leading/trailing colons, commas
    and spaces are trimmed, and the result is lowercased.
    """
    # Deleting all four marker characters in one translate pass gives the same
    # text as removing them one after another.
    footnote_markers = str.maketrans('', '', '§¶*†')
    without_markers = string.translate(footnote_markers)
    single_commas = without_markers.replace(',,', ',')
    return single_commas.strip(':, ').lower()

# Cleaned copy of every year's 'Disease' column, in the same order as `dfs`.
cleaned_dfs = [df['Disease'].apply(remove_nonalpha) for df in dfs]

# Report, per year, how many distinct disease names exist before and after
# cleaning. `dfs` starts at 2014 and advances one year per table.
year = 2014
for df, cleaned in zip(dfs, cleaned_dfs):
    distinct_before = len(set(df['Disease']))
    distinct_after = len(set(cleaned))

    print('Dataset for year', year)
    print('Original number of diseases:', distinct_before)
    print('Number of diseases after cleaning:', distinct_after)
    print('----------------------------------')

    year += 1

Dataset for year 2014
Original number of diseases: 55
Number of diseases after cleaning: 55
----------------------------------
Dataset for year 2015
Original number of diseases: 124
Number of diseases after cleaning: 104
----------------------------------
Dataset for year 2016
Original number of diseases: 90
Number of diseases after cleaning: 68
----------------------------------
Dataset for year 2017
Original number of diseases: 99
Number of diseases after cleaning: 84
----------------------------------
Dataset for year 2018
Original number of diseases: 118
Number of diseases after cleaning: 95
----------------------------------


In [7]:
# Store each year's cleaned names next to the raw ones. `cleaned_dfs` holds
# the cleaned series in year order (2014 first), matching this list.
yearly_frames = [df2014, df2015, df2016, df2017, df2018]
for frame, cleaned_names in zip(yearly_frames, cleaned_dfs):
    frame['Disease (cleaned)'] = cleaned_names

# From here on `cleaned_dfs` refers to the full tables (which now carry the
# cleaned column) rather than to the bare cleaned series.
cleaned_dfs = yearly_frames

df2016.head()

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2016,"Cum 2016, flag",5-year weekly average§,"5-year weekly average§, flag",Total cases reported 2015,...,Total cases reported 2014,"Total cases reported 2014, flag",Total cases reported 2013,"Total cases reported 2013, flag",Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",States reporting cases during current week (No.),Disease (cleaned)
0,Anthrax,2016,1,,-,,-,,-,,...,,-,,-,,-,1.0,,,anthrax
1,"Arboviral diseases¶,** Chikungunya virus††",2016,1,,NN,,NN,,NN,,...,,NN,,NN,,NN,,NN,,"arboviral diseases, chikungunya virus"
2,"Arboviral diseases¶,** Eastern equine encephal...",2016,1,,-,,-,0.0,,5.0,...,8.0,,8.0,,15.0,,4.0,,,"arboviral diseases, eastern equine encephaliti..."
3,"Arboviral diseases¶,** Jamestown Canyon virus§§",2016,1,,-,,-,,-,7.0,...,11.0,,22.0,,2.0,,3.0,,,"arboviral diseases, jamestown canyon virus"
4,"Arboviral diseases¶,** La Crosse virus§§",2016,1,,-,,-,,-,48.0,...,80.0,,85.0,,78.0,,130.0,,,"arboviral diseases, la crosse virus"


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the 2014 table.
df2014.describe()

Unnamed: 0,MMWR year,MMWR week,Current week,Cum 2014,5-year weekly average†,Total cases reported 2013,Total cases reported 2012,Total cases reported 2011,Total cases reported 2010,Total cases reported 2009
count,2915.0,2915.0,765.0,1968.0,2318.0,2335.0,2438.0,2279.0,2332.0,2438.0
mean,2014.0,27.0,3.657516,75.380081,7.642796,153.559315,129.478261,137.651163,129.772727,1081.652174
std,0.0,15.299683,6.269709,135.445781,44.692998,233.582844,197.685127,188.258387,190.802887,6368.341081
min,2014.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
25%,2014.0,14.0,1.0,7.0,0.0,13.0,17.0,15.0,10.0,10.0
50%,2014.0,27.0,2.0,23.0,1.0,81.0,60.0,82.0,69.0,72.5
75%,2014.0,40.0,4.0,88.0,3.0,187.0,161.0,166.0,136.75,174.0
max,2014.0,53.0,87.0,1156.0,742.0,1299.0,1111.0,870.0,846.0,43774.0


## Cleaning Diseases
Some of the diseases aren't named the same way even though they refer to the same disease, so I had to replace the names.

In [9]:
# Display the 2014 table, which now includes the 'Disease (cleaned)' column.
df2014

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2014,"Cum 2014, flag",5-year weekly average†,"5-year weekly average†, flag",Total cases reported 2013,...,Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",Total cases reported 2010,"Total cases reported 2010, flag",Total cases reported 2009,"Total cases reported 2009, flag",States reporting cases during current week (No.),Disease (cleaned)
0,Anthrax,2014,1,,-,,-,,-,,...,,-,1.0,,,-,1.0,,,anthrax
1,"Arboviral diseases, California serogroup virus...",2014,1,,-,,-,,-,81.0,...,81.0,,137.0,,75.0,,55.0,,,"arboviral diseases, california serogroup virus..."
2,"Arboviral diseases, Eastern equine encephaliti...",2014,1,,-,,-,0.0,,6.0,...,15.0,,4.0,,10.0,,4.0,,,"arboviral diseases, eastern equine encephaliti..."
3,"Arboviral diseases, Powassan virus disease§,¶",2014,1,,-,,-,0.0,,12.0,...,7.0,,16.0,,8.0,,6.0,,,"arboviral diseases, powassan virus disease"
4,"Arboviral diseases, St. Louis encephalitis vir...",2014,1,,-,,-,0.0,,,...,3.0,,6.0,,10.0,,12.0,,,"arboviral diseases, st. louis encephalitis vir..."
5,"Arboviral diseases, Western equine encephaliti...",2014,1,,-,,-,,-,,...,,-,,-,,-,,-,,"arboviral diseases, western equine encephaliti..."
6,"Botulism, total",2014,1,1.0,,1.0,,3.0,,131.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
7,"Botulism, foodborne",2014,1,,-,,-,0.0,,5.0,...,27.0,,24.0,,7.0,,10.0,,,"botulism, foodborne"
8,"Botulism, infant",2014,1,,-,,-,2.0,,115.0,...,123.0,,97.0,,80.0,,83.0,,,"botulism, infant"
9,"Botulism, other (wound and unspecified)",2014,1,1.0,,1.0,,1.0,,11.0,...,18.0,,32.0,,25.0,,25.0,,CA (1 ),"botulism, other (wound and unspecified)"


### Finding possible diseases that need to be cleaned

In [10]:
# For every year, list the cleaned disease names whose number of rows differs
# from the highest MMWR week number in that table. Names that do not occur
# exactly that many times are candidates for typos or inconsistent naming.
year = 2014

for df in cleaned_dfs:
    print('Working with year', year)
    num_weeks = df['MMWR week'].max()

    # Count the rows of every disease in a single pass instead of re-filtering
    # the whole table once per disease.
    rows_per_disease = df['Disease (cleaned)'].value_counts()
    mismatched = rows_per_disease[rows_per_disease != num_weeks]
    candidate_diseases = sorted(mismatched.index)

    print('\n'.join(candidate_diseases))
    print('\n---------------------------------------\n')

    year += 1

Working with year 2014


---------------------------------------

Working with year 2015
a,c,y, and w-135
arboviral diseases, california serogroup virus disease
arboviral diseases, chikungunya virus neuroinvasive disease
arboviral diseases, eastern equine encephalitis virus disease
arboviral diseases, powassan virus disease
arboviral diseases, western equine encephalitis virus disease
botulism, foodborne
botulism, infant
botulism, other (wound and unspecified)
california serogroup
chikuangunya
chronic
crimean-congo fever
cyclosporiasis
cyclosporosis
eastern equine
ebola hemorrhagic fever
foodborne
guanarito fever
haemophilus influenzae, invasive disease (age <5 yrs), nonserotype b
haemophilus influenzae, invasive disease (age <5 yrs), serotype b
haemophilus influenzae, invasive disease (age <5 yrs), unknown serotype
hantavirus infection (nhps)
hantavirus infection (non-hps)
hantavirus pulmonary syn
hantavirus pulmonary syndrome (hps)
hemolytic uremic syndrome
hemolytic uremic syndrome,

In [11]:
# df2016[df2016['Disease (cleaned)'] == 'zika virus, zika virus disease, non-congenital infection']

### Replacing diseases' names if there were typos or discrepancies

In [12]:
# Map truncated, misspelled, or inconsistently written cleaned names onto a
# single spelling per disease. Matching is on the whole cell value.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form is chained assignment through
# an in-place method, which warns on recent pandas and does not update the
# parent DataFrame once Copy-on-Write is in effect.
df2015['Disease (cleaned)'] = df2015['Disease (cleaned)'].replace({
    'cyclosporosis': 'cyclosporiasis',
    'hantavirus pulmonary syn': 'hantavirus pulmonary syndrome (hps)',
    'hepatitis b, vir perinatal': 'hepatitis b, virus infection perinatal',
    'influenza-associated ped': 'influenza-associated pediatric mortality',
    'sars cov': 'sars-cov',
    'streptococcal toxic-shock': 'streptococcal toxic-shock syndrome',
    'toxic-shock syndrome': 'toxic-shock syndrome (staphylococcal)',
})


df2016['Disease (cleaned)'] = df2016['Disease (cleaned)'].replace({
    'haemophilus influenzae invasive disease (age <5 yrs), non typable serotype':
        'haemophilus influenzae invasive disease (age <5 yrs), non typeable serotype',
    'hantavirus infections, hantavirus infection (non-hps)':
        'hantavirus infections, hantavirus infection disease (non-hps)',
})


# In 2017 several arboviral entries lack the trailing "disease".
df2017['Disease (cleaned)'] = df2017['Disease (cleaned)'].replace({
    'arboviral diseases, chikungunya virus': 'arboviral diseases, chikungunya virus disease',
    'arboviral diseases, eastern equine encephalitis virus': 'arboviral diseases, eastern equine encephalitis virus disease',
    'arboviral diseases, jamestown canyon virus': 'arboviral diseases, jamestown canyon virus disease',
    'arboviral diseases, la crosse virus': 'arboviral diseases, la crosse virus disease',
    'arboviral diseases, powassan virus': 'arboviral diseases, powassan virus disease',
    'arboviral diseases, st. louis encephalitis virus': 'arboviral diseases, st. louis encephalitis virus disease',
    'arboviral diseases, western equine encephalitis virus': 'arboviral diseases, western equine encephalitis virus disease',
})


In [13]:
# df2014[df2014['Disease (cleaned)'] == 'jamestown canyon virus']
# set(df2017['Disease (cleaned)'])
# df2016[df2016['Disease (cleaned)'] == 'arboviral diseases, jamestown canyon virus']

In [14]:
# df2017[df2017['Disease (cleaned)'] == 'zika virus, zika virus disease, non-congenital infection']