# EDA of Infrequently Reported Notifiable Diseases
Wikipedia defines "notifiable diseases" as "any disease that is required by law to be reported to government authorities." These datasets from the Centers for Disease Control and Prevention website (data.cdc.gov) contain information about cases of selected infrequently reported notifiable diseases from 2014 to 2018 in all 50 states.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme but switch the axes grid lines off.
sns.set_style("whitegrid", {'axes.grid' : False})

In [2]:
# Every yearly NNDSS Table I export sits at the same GitHub path; only the
# year embedded in the file name changes.
_csv_url = ('https://raw.githubusercontent.com/jchen2186/mortal-determination/'
            'master/raw_data/'
            'NNDSS_-_Table_I._infrequently_reported_notifiable_diseases_{}.csv')

# Download the five files in chronological order and bind one frame per year.
df2014, df2015, df2016, df2017, df2018 = [
    pd.read_csv(_csv_url.format(report_year))
    for report_year in range(2014, 2019)
]

In [3]:
# Preview the first rows of the 2014 data to see the column layout.
df2014.head()

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2014,"Cum 2014, flag",5-year weekly average†,"5-year weekly average†, flag",Total cases reported 2013,"Total cases reported 2013, flag",Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",Total cases reported 2010,"Total cases reported 2010, flag",Total cases reported 2009,"Total cases reported 2009, flag",States reporting cases during current week (No.)
0,Anthrax,2014,1,,-,,-,,-,,-,,-,1.0,,,-,1.0,,
1,"Arboviral diseases, California serogroup virus...",2014,1,,-,,-,,-,81.0,,81.0,,137.0,,75.0,,55.0,,
2,"Arboviral diseases, Eastern equine encephaliti...",2014,1,,-,,-,0.0,,6.0,,15.0,,4.0,,10.0,,4.0,,
3,"Arboviral diseases, Powassan virus disease§,¶",2014,1,,-,,-,0.0,,12.0,,7.0,,16.0,,8.0,,6.0,,
4,"Arboviral diseases, St. Louis encephalitis vir...",2014,1,,-,,-,0.0,,,-,3.0,,6.0,,10.0,,12.0,,


In [4]:
# Collect the yearly frames in chronological order so later cells can loop
# over them.
dfs = [
    df2014,
    df2015,
    df2016,
    df2017,
    df2018,
]

# Print each year's column index to compare the headers across years.
for frame in dfs:
    column_index = frame.columns
    print(column_index)

Index(['Disease', 'MMWR year', 'MMWR week', 'Current week',
       'Current week, flag', 'Cum 2014', 'Cum 2014, flag',
       '5-year weekly average†', '5-year weekly average†, flag',
       'Total cases reported  2013', 'Total cases reported  2013, flag',
       'Total cases reported 2012', 'Total cases reported 2012, flag',
       'Total cases reported 2011', 'Total cases reported 2011, flag',
       'Total cases reported 2010', 'Total cases reported 2010, flag',
       'Total cases reported 2009', 'Total cases reported 2009, flag',
       'States reporting cases during current week (No.) '],
      dtype='object')
Index(['Disease', 'MMWR year', 'MMWR week', 'Current week',
       'Current week, flag', 'Cum 2015', 'Cum 2015, flag',
       '5-year weekly average§', '5-year weekly average§, flag',
       'Total cases reported  2014', 'Total cases reported  2014, flag',
       'Total cases reported 2013', 'Total cases reported 2013, flag',
       'Total cases reported 2012', 'Total cases

In [5]:
# Report the (rows, columns) shape of each yearly dataset.
for frame in dfs:
    rows_and_cols = frame.shape
    print(rows_and_cols)

(2915, 20)
(3176, 20)
(3362, 20)
(3329, 20)
(3073, 20)


## Clean Data

### Clean Disease Column
The disease column contains footnote symbols (such as §, ¶, * and †) that can make two entries for the same disease look like different values. In this section, my goal is to remove those characters from the column in order to make the disease names as uniform as possible. I also want to make everything lowercase because some diseases are written in all capital letters, which would otherwise not match the same names written in title case.

In [6]:
def remove_nonalpha(string):
    """Return a normalised, lowercase version of a disease name.

    The footnote marks '§', '¶', '*' and '†' are deleted, each ',,' pair is
    collapsed to a single ',' (one left-to-right pass), leading and trailing
    colons, commas and spaces are stripped, and the result is lowercased.
    """
    # One translation table deletes all four footnote marks in a single pass.
    footnote_marks = str.maketrans('', '', '§¶*†')
    without_marks = string.translate(footnote_marks)

    # Deleting a mark that sat between two commas leaves ',,' behind.
    collapsed = without_marks.replace(',,', ',')

    return collapsed.strip(':, ').lower()

# Clean the 'Disease' column of every yearly dataset and report how many
# distinct names there are before and after cleaning.
# The year label comes from enumerate() instead of a hand-incremented counter,
# so it cannot drift out of step with the position in `dfs`
# (dfs[0] is the 2014 data).
cleaned_dfs = []

for year, df in enumerate(dfs, start=2014):
    cleaned = df['Disease'].apply(remove_nonalpha)
    cleaned_dfs.append(cleaned)
    print('Dataset for year', year)
    print('Original number of diseases:', len(set(df['Disease'])))
    print('Number of diseases after cleaning:', len(set(cleaned)))
    print('----------------------------------')

Dataset for year 2014
Original number of diseases: 55
Number of diseases after cleaning: 55
----------------------------------
Dataset for year 2015
Original number of diseases: 124
Number of diseases after cleaning: 104
----------------------------------
Dataset for year 2016
Original number of diseases: 90
Number of diseases after cleaning: 68
----------------------------------
Dataset for year 2017
Original number of diseases: 99
Number of diseases after cleaning: 84
----------------------------------
Dataset for year 2018
Original number of diseases: 118
Number of diseases after cleaning: 95
----------------------------------


In [7]:
# Attach each cleaned name Series to its own year's frame; the Series in
# cleaned_dfs are in the same chronological order as the frames listed here.
for frame, cleaned in zip((df2014, df2015, df2016, df2017, df2018), cleaned_dfs):
    frame['Disease (cleaned)'] = cleaned

# From here on cleaned_dfs holds the full frames rather than the bare Series.
cleaned_dfs = [
    df2014,
    df2015,
    df2016,
    df2017,
    df2018,
]

df2016.head()

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2016,"Cum 2016, flag",5-year weekly average§,"5-year weekly average§, flag",Total cases reported 2015,...,Total cases reported 2014,"Total cases reported 2014, flag",Total cases reported 2013,"Total cases reported 2013, flag",Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",States reporting cases during current week (No.),Disease (cleaned)
0,Anthrax,2016,1,,-,,-,,-,,...,,-,,-,,-,1.0,,,anthrax
1,"Arboviral diseases¶,** Chikungunya virus††",2016,1,,NN,,NN,,NN,,...,,NN,,NN,,NN,,NN,,"arboviral diseases, chikungunya virus"
2,"Arboviral diseases¶,** Eastern equine encephal...",2016,1,,-,,-,0.0,,5.0,...,8.0,,8.0,,15.0,,4.0,,,"arboviral diseases, eastern equine encephaliti..."
3,"Arboviral diseases¶,** Jamestown Canyon virus§§",2016,1,,-,,-,,-,7.0,...,11.0,,22.0,,2.0,,3.0,,,"arboviral diseases, jamestown canyon virus"
4,"Arboviral diseases¶,** La Crosse virus§§",2016,1,,-,,-,,-,48.0,...,80.0,,85.0,,78.0,,130.0,,,"arboviral diseases, la crosse virus"


### Finding possible diseases that need to be cleaned
Some of the diseases aren't named the same way even though they refer to the same disease, so I had to replace the names.

In [10]:
# For each year, list every cleaned disease name whose number of rows differs
# from the highest MMWR week number in that year's data. Such names are
# candidates for typos or alternate spellings of another disease.
for year, df in enumerate(cleaned_dfs, start=2014):
    print('Working with year', year)

    # Series.max() skips NaN. The builtin max() is order-dependent when the
    # column contains NaN (it returns NaN if the first value is NaN), which
    # would flag every disease as a candidate.
    num_weeks = df['MMWR week'].max()

    # Count the rows of every disease in one pass instead of filtering the
    # whole frame once per disease.
    rows_per_disease = df['Disease (cleaned)'].value_counts()
    mismatched = rows_per_disease[rows_per_disease != num_weeks]
    candidate_diseases = sorted(mismatched.index)

    print('\n'.join(candidate_diseases))
    print('\n---------------------------------------\n')

Working with year 2014


---------------------------------------

Working with year 2015
a,c,y, and w-135
arboviral diseases, california serogroup virus disease
arboviral diseases, chikungunya virus neuroinvasive disease
arboviral diseases, eastern equine encephalitis virus disease
arboviral diseases, powassan virus disease
arboviral diseases, western equine encephalitis virus disease
botulism, foodborne
botulism, infant
botulism, other (wound and unspecified)
california serogroup
chikuangunya
chronic
crimean-congo fever
cyclosporiasis
cyclosporosis
eastern equine
ebola hemorrhagic fever
foodborne
guanarito fever
haemophilus influenzae, invasive disease (age <5 yrs), nonserotype b
haemophilus influenzae, invasive disease (age <5 yrs), serotype b
haemophilus influenzae, invasive disease (age <5 yrs), unknown serotype
hantavirus infection (nhps)
hantavirus infection (non-hps)
hantavirus pulmonary syn
hantavirus pulmonary syndrome (hps)
hemolytic uremic syndrome
hemolytic uremic syndrome,

### Replacing diseases' names if there were typos or discrepancies

In [12]:
# Normalise disease names that are typos, truncations, or alternate spellings
# of a name used elsewhere in the same year's data.
#
# The result is assigned back to the column instead of calling
# ``df[col].replace(..., inplace=True)``. That form mutates an intermediate
# Series (chained assignment): pandas warns about it, and with Copy-on-Write
# enabled it leaves the DataFrame unchanged.
#
# Matching is on the whole cell value, and no replacement value is itself a
# key, so one mapping per year gives the same result as replacing one name
# at a time.
df2015['Disease (cleaned)'] = df2015['Disease (cleaned)'].replace({
    'cyclosporosis': 'cyclosporiasis',
    'hantavirus pulmonary syn': 'hantavirus pulmonary syndrome (hps)',
    'hepatitis b, vir perinatal': 'hepatitis b, virus infection perinatal',
    'influenza-associated ped': 'influenza-associated pediatric mortality',
    'sars cov': 'sars-cov',
    'streptococcal toxic-shock': 'streptococcal toxic-shock syndrome',
    'toxic-shock syndrome': 'toxic-shock syndrome (staphylococcal)',
})


df2016['Disease (cleaned)'] = df2016['Disease (cleaned)'].replace({
    'haemophilus influenzae invasive disease (age <5 yrs), non typable serotype':
        'haemophilus influenzae invasive disease (age <5 yrs), non typeable serotype',
    'hantavirus infections, hantavirus infection (non-hps)':
        'hantavirus infections, hantavirus infection disease (non-hps)',
})


# In 2017 some arboviral rows omit the trailing "disease".
df2017['Disease (cleaned)'] = df2017['Disease (cleaned)'].replace({
    'arboviral diseases, chikungunya virus':
        'arboviral diseases, chikungunya virus disease',
    'arboviral diseases, eastern equine encephalitis virus':
        'arboviral diseases, eastern equine encephalitis virus disease',
    'arboviral diseases, jamestown canyon virus':
        'arboviral diseases, jamestown canyon virus disease',
    'arboviral diseases, la crosse virus':
        'arboviral diseases, la crosse virus disease',
    'arboviral diseases, powassan virus':
        'arboviral diseases, powassan virus disease',
    'arboviral diseases, st. louis encephalitis virus':
        'arboviral diseases, st. louis encephalitis virus disease',
    'arboviral diseases, western equine encephalitis virus':
        'arboviral diseases, western equine encephalitis virus disease',
})


In [13]:
# df2014[df2014['Disease (cleaned)'] == 'jamestown canyon virus']
# set(df2017['Disease (cleaned)'])
# df2016[df2016['Disease (cleaned)'] == 'arboviral diseases, jamestown canyon virus']

In [14]:
# df2017[df2017['Disease (cleaned)'] == 'zika virus, zika virus disease, non-congenital infection']

In [18]:
# Display df2014 to compare the original 'Disease' values with 'Disease (cleaned)'.
df2014

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2014,"Cum 2014, flag",5-year weekly average†,"5-year weekly average†, flag",Total cases reported 2013,...,Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",Total cases reported 2010,"Total cases reported 2010, flag",Total cases reported 2009,"Total cases reported 2009, flag",States reporting cases during current week (No.),Disease (cleaned)
0,Anthrax,2014,1,,-,,-,,-,,...,,-,1.0,,,-,1.0,,,anthrax
1,"Arboviral diseases, California serogroup virus...",2014,1,,-,,-,,-,81.0,...,81.0,,137.0,,75.0,,55.0,,,"arboviral diseases, california serogroup virus..."
2,"Arboviral diseases, Eastern equine encephaliti...",2014,1,,-,,-,0.0,,6.0,...,15.0,,4.0,,10.0,,4.0,,,"arboviral diseases, eastern equine encephaliti..."
3,"Arboviral diseases, Powassan virus disease§,¶",2014,1,,-,,-,0.0,,12.0,...,7.0,,16.0,,8.0,,6.0,,,"arboviral diseases, powassan virus disease"
4,"Arboviral diseases, St. Louis encephalitis vir...",2014,1,,-,,-,0.0,,,...,3.0,,6.0,,10.0,,12.0,,,"arboviral diseases, st. louis encephalitis vir..."
5,"Arboviral diseases, Western equine encephaliti...",2014,1,,-,,-,,-,,...,,-,,-,,-,,-,,"arboviral diseases, western equine encephaliti..."
6,"Botulism, total",2014,1,1.0,,1.0,,3.0,,131.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
7,"Botulism, foodborne",2014,1,,-,,-,0.0,,5.0,...,27.0,,24.0,,7.0,,10.0,,,"botulism, foodborne"
8,"Botulism, infant",2014,1,,-,,-,2.0,,115.0,...,123.0,,97.0,,80.0,,83.0,,,"botulism, infant"
9,"Botulism, other (wound and unspecified)",2014,1,1.0,,1.0,,1.0,,11.0,...,18.0,,32.0,,25.0,,25.0,,CA (1 ),"botulism, other (wound and unspecified)"


In [40]:
# Count the missing (NaN) values in each column of the 2018 data.
df2018.isna().sum()

Disease                                                  0
MMWR year                                                1
MMWR week                                                1
Current week                                          2455
Current week, flag                                     619
Cum 2018                                              1237
Cum 2018, flag                                        1837
5-year weekly average§                                 981
5-year weekly average§, flag                          2141
Total cases reported for pervious years  2017          958
Total cases reported for pervious years 2017, flag    2164
Total cases reported for pervious years 2016           961
Total cases reported for pervious years 2016, flag    2161
Total cases reported for pervious years 2015           817
Total cases reported for pervious years 2015, flag    2305
Total cases reported for pervious years 2014           913
Total cases reported for pervious years 2014, flag    22

In [35]:
# Select the 2018 rows whose reported 2017 total is exactly zero.
# (The column name, including its double space and spelling, is taken
# verbatim from the CSV header.)
prior_year_total = df2018['Total cases reported for pervious years  2017']
mask = prior_year_total.eq(0)
df2018[mask]

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2018,"Cum 2018, flag",5-year weekly average§,"5-year weekly average§, flag",Total cases reported for pervious years 2017,...,Total cases reported for pervious years 2016,"Total cases reported for pervious years 2016, flag",Total cases reported for pervious years 2015,"Total cases reported for pervious years 2015, flag",Total cases reported for pervious years 2014,"Total cases reported for pervious years 2014, flag",Total cases reported for pervious years 2013,"Total cases reported for pervious years 2013, flag",States reporting cases during current week (No.),Disease (cleaned)


## Drop Rows Where Data is Null
There are some columns that cannot be null if we want to use them: Disease, MMWR year, MMWR week. If there are any rows where they are null, we must drop them. After looking at the datasets, I noticed that only df2017 and df2018 have such rows.

In [43]:
# A row is unusable unless it names a disease and carries its MMWR year and week.
_required_cols = ['Disease', 'MMWR year', 'MMWR week']

df2017 = df2017.dropna(subset=_required_cols)
df2018 = df2018.dropna(subset=_required_cols)

## Separate Files
The data containing the total number of cases reported for each of the previous years is the same for all of the rows containing info for the same diseases. We can separate each of the datasets into two parts: one part that contains info for just the current year and another part that contains info for the previous years.

In [94]:
# Rebuild cleaned_dfs: dropna() returned new DataFrames for df2017 and df2018,
# so the list is recreated to pick up the filtered versions.
cleaned_dfs = [df2014, df2015, df2016, df2017, df2018]

In [95]:
# Build one "previous-year totals" frame per year.
#
# For every cleaned disease name, keep='last' retains the final row in which
# that name appears. The columns that describe the current year only are then
# dropped, leaving MMWR year/week, the previous-year totals with their flags,
# and 'Disease (cleaned)'.
#
# The five yearly frames differ only in the year inside the 'Cum <year>'
# columns and in the footnote mark on the 5-year average column ('†' in the
# 2014 file, '§' in the others), so one loop replaces five copy-pasted calls.
dfs_totals = []

for year, df in enumerate(cleaned_dfs, start=2014):
    cum_col = 'Cum {}'.format(year)
    avg_col = '5-year weekly average' + ('†' if year == 2014 else '§')

    current_year_cols = [
        'Disease',
        'Current week', 'Current week, flag',
        cum_col, cum_col + ', flag',
        avg_col, avg_col + ', flag',
        # The trailing space is part of the column name in the CSV header.
        'States reporting cases during current week (No.) ',
    ]

    last_rows = df.drop_duplicates(subset=['Disease (cleaned)'], keep='last')
    # drop() returns a new frame, so nothing derived from df is mutated in place.
    dfs_totals.append(last_rows.drop(current_year_cols, axis=1))

df2014_totals, df2015_totals, df2016_totals, df2017_totals, df2018_totals = dfs_totals

In [100]:

# Preview df2018_totals with NaN filled as 0 and '-' values replaced by 0.
# The result is not assigned, so df2018_totals itself is left unchanged.
df2018_totals.fillna(0).replace('-', 0)


Unnamed: 0,MMWR year,MMWR week,Total cases reported for pervious years 2017,"Total cases reported for pervious years 2017, flag",Total cases reported for pervious years 2016,"Total cases reported for pervious years 2016, flag",Total cases reported for pervious years 2015,"Total cases reported for pervious years 2015, flag",Total cases reported for pervious years 2014,"Total cases reported for pervious years 2014, flag",Total cases reported for pervious years 2013,"Total cases reported for pervious years 2013, flag",Disease (cleaned)
2177,2018.0,34.0,156.0,0,247.0,0,896.0,0,0.0,NN,0.0,NN,"arboviral diseases, chikungunya virus"
2178,2018.0,34.0,5.0,0,7.0,0,6.0,0,8.0,0,8.0,0,"arboviral diseases, eastern equine encephaliti..."
2179,2018.0,34.0,75.0,0,15.0,0,11.0,0,11.0,0,22.0,0,"arboviral diseases, jamestown canyon virus"
2180,2018.0,34.0,63.0,0,35.0,0,55.0,0,80.0,0,85.0,0,"arboviral diseases, la crosse virus"
2181,2018.0,34.0,34.0,0,22.0,0,7.0,0,8.0,0,12.0,0,"arboviral diseases, powassan virus"
2182,2018.0,34.0,11.0,0,8.0,0,23.0,0,10.0,0,1.0,0,"arboviral diseases, st. louis encephalitis virus"
2183,2018.0,34.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,"arboviral diseases, western equine encephaliti..."
2193,2018.0,34.0,33.0,0,30.0,0,29.0,0,40.0,0,31.0,0,haemophilus influenzae invasive disease (age <...
2194,2018.0,34.0,200.0,0,196.0,0,175.0,0,128.0,0,141.0,0,haemophilus influenzae invasive disease (age <...
2195,2018.0,34.0,189.0,0,159.0,0,135.0,0,266.0,0,233.0,0,haemophilus influenzae invasive disease (age <...


In [63]:
# Keep one row per cleaned disease name (its last occurrence), then show only
# the names whose surviving row falls in the highest MMWR week of 2014.
final_week = max(df2014['MMWR week'])
df2014_totals = df2014.drop_duplicates(subset=['Disease (cleaned)'], keep='last')

mask = df2014_totals['MMWR week'].eq(final_week)
df2014_totals[mask]

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2014,"Cum 2014, flag",5-year weekly average†,"5-year weekly average†, flag",Total cases reported 2013,...,Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",Total cases reported 2010,"Total cases reported 2010, flag",Total cases reported 2009,"Total cases reported 2009, flag",States reporting cases during current week (No.),Disease (cleaned)
2860,Anthrax,2014,53,,-,,-,,-,,...,,-,1.0,,,-,1.0,,,anthrax
2861,"Arboviral diseases, California serogroup virus...",2014,53,,-,80.0,,0.0,,95.0,...,81.0,,137.0,,75.0,,55.0,,,"arboviral diseases, california serogroup virus..."
2862,"Arboviral diseases, Eastern equine encephaliti...",2014,53,,-,8.0,,0.0,,8.0,...,15.0,,4.0,,10.0,,4.0,,,"arboviral diseases, eastern equine encephaliti..."
2863,"Arboviral diseases, Powassan virus disease§,¶",2014,53,,-,8.0,,0.0,,12.0,...,7.0,,16.0,,8.0,,6.0,,,"arboviral diseases, powassan virus disease"
2864,"Arboviral diseases, St. Louis encephalitis vir...",2014,53,,-,9.0,,0.0,,1.0,...,3.0,,6.0,,10.0,,12.0,,,"arboviral diseases, st. louis encephalitis vir..."
2865,"Arboviral diseases, Western equine encephaliti...",2014,53,,-,,-,,-,,...,,-,,-,,-,,-,,"arboviral diseases, western equine encephaliti..."
2866,"Botulism, total",2014,53,,-,139.0,,3.0,,152.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
2867,"Botulism, foodborne",2014,53,,-,11.0,,0.0,,4.0,...,27.0,,24.0,,7.0,,10.0,,,"botulism, foodborne"
2868,"Botulism, infant",2014,53,,-,109.0,,3.0,,136.0,...,123.0,,97.0,,80.0,,83.0,,,"botulism, infant"
2869,"Botulism, other (wound and unspecified)",2014,53,,-,19.0,,0.0,,12.0,...,18.0,,32.0,,25.0,,25.0,,,"botulism, other (wound and unspecified)"


In [64]:
# Keep one row per cleaned disease name (its last occurrence), then show only
# the names whose surviving row falls in the highest MMWR week of 2015.
final_week = max(df2015['MMWR week'])
df2015_totals = df2015.drop_duplicates(subset=['Disease (cleaned)'], keep='last')

mask = df2015_totals['MMWR week'].eq(final_week)
df2015_totals[mask]

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2015,"Cum 2015, flag",5-year weekly average§,"5-year weekly average§, flag",Total cases reported 2014,...,Total cases reported 2013,"Total cases reported 2013, flag",Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",Total cases reported 2010,"Total cases reported 2010, flag",States reporting cases during current week (No.),Disease (cleaned)
3115,ANTHRAX,2015,52,,-,,-,,-,,...,,-,,-,1.0,,,-,,anthrax
3116,CALIFORNIA SEROGROUP,2015,52,,-,56.0,,0.0,,96.0,...,95.0,,81.0,,137.0,,75.0,,,california serogroup
3117,CHIKUANGUNYA,2015,52,,NN,,NN,,-,,...,,NN,,NN,,NN,,NN,,chikuangunya
3118,EASTERN EQUINE,2015,52,,-,4.0,,0.0,,8.0,...,8.0,,15.0,,4.0,,10.0,,,eastern equine
3119,POWASSAN,2015,52,,-,5.0,,,-,8.0,...,12.0,,7.0,,16.0,,8.0,,,powassan
3120,ST. LOUIS,2015,52,,-,19.0,,0.0,,10.0,...,1.0,,3.0,,6.0,,10.0,,,st. louis
3121,WESTERN EQUINE,2015,52,,-,,-,,-,,...,,-,,-,,-,,-,,western equine
3122,"BOTULISM, TOTAL",2015,52,,-,177.0,,3.0,,161.0,...,152.0,,168.0,,153.0,,112.0,,,"botulism, total"
3123,FOODBORNE,2015,52,,-,40.0,,0.0,,15.0,...,4.0,,27.0,,24.0,,7.0,,,foodborne
3124,INFANT,2015,52,,-,118.0,,3.0,,127.0,...,136.0,,123.0,,97.0,,80.0,,,infant


In [60]:
# Exploratory check of whether the previous-year totals really are constant
# across all rows of the same disease in the 2014 data.
# Only the last expression in the cell is displayed.
set(df2014['Disease (cleaned)'])

# Previous-year total columns and their flags, exactly as spelled in the 2014
# header (the 2013 names contain a double space).
prior_year_cols = [
    'Total cases reported  2013', 'Total cases reported  2013, flag',
    'Total cases reported 2012', 'Total cases reported 2012, flag',
    'Total cases reported 2011', 'Total cases reported 2011, flag',
    'Total cases reported 2010', 'Total cases reported 2010, flag',
    'Total cases reported 2009', 'Total cases reported 2009, flag',
]

# One row per distinct (disease, previous-year totals) combination.
tmp = df2014.drop_duplicates(subset=['Disease (cleaned)'] + prior_year_cols)

# A disease that still appears more than once has totals that changed between rows.
mask = tmp.duplicated(subset='Disease (cleaned)')
tmp[mask]

tmp[tmp['Disease (cleaned)'].eq('botulism, total')]

Unnamed: 0,Disease,MMWR year,MMWR week,Current week,"Current week, flag",Cum 2014,"Cum 2014, flag",5-year weekly average†,"5-year weekly average†, flag",Total cases reported 2013,...,Total cases reported 2012,"Total cases reported 2012, flag",Total cases reported 2011,"Total cases reported 2011, flag",Total cases reported 2010,"Total cases reported 2010, flag",Total cases reported 2009,"Total cases reported 2009, flag",States reporting cases during current week (No.),Disease (cleaned)
6,"Botulism, total",2014,1,1.0,,1.0,,3.0,,131.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
61,"Botulism, total",2014,2,,-,1.0,,2.0,,135.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
116,"Botulism, total",2014,3,,-,2.0,,2.0,,136.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
173,"Botulism, total",2014,4,,-,5.0,,2.0,,140.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
227,"Botulism, total",2014,5,,-,6.0,,2.0,,141.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
337,"Botulism, total",2014,7,2.0,,15.0,,2.0,,144.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
392,"Botulism, total",2014,8,1.0,,16.0,,3.0,,145.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
448,"Botulism, total",2014,9,,-,21.0,,3.0,,148.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
502,"Botulism, total",2014,10,1.0,,25.0,,3.0,,149.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
557,"Botulism, total",2014,11,,-,26.0,,2.0,,150.0,...,168.0,,153.0,,112.0,,118.0,,,"botulism, total"
