In [101]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
%matplotlib inline

In [102]:
#First step: Cleaning up US census data

In [103]:
pop = pd.read_csv('statepop.csv', header=1)

In [104]:
pop['Estimates Base'] = pop['Estimates Base'].str.strip('.')

In [105]:
indices = [0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 51,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,]

In [106]:
pop.iloc[8,0] = 'Washington DC'

In [107]:
pop = pop.reindex(indices)

In [108]:
pop.set_index('Estimates Base', inplace=True)

In [109]:
pop.sort_index(inplace=True)

In [110]:
pop.drop(columns=['2016', '2017', '2018', '2019'], inplace=True)

In [111]:
pop.shape

(52, 16)

In [112]:
pop = pop.T

In [113]:
pop = pop.values.flatten()

In [114]:
len(pop)

832

In [115]:
#Second step: drop a few years from my data where there was not reliable census data,
#and drop some rows where there was not reliable census data (Guam, Paula) as well as a
#Multi state row, which cannot be normalized without underlying data that is not accessible

In [116]:
df = pd.read_csv('outbreaks.csv')

In [117]:
df[df['State'] == 'Maine']

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
475,1998,May,Maine,Restaurant,,,,,,5,0.0,0.0
476,1998,May,Maine,Restaurant,,,,,,3,0.0,0.0
591,1998,June,Maine,Restaurant,,,,,,3,,
720,1998,July,Maine,,"Other Milk, Unpasteurized",Milk,"Escherichia coli, Shiga toxin-producing",O157:H7,Confirmed,2,2.0,0.0
946,1998,September,Maine,Private Home/Residence,,,"Escherichia coli, Shiga toxin-producing",O157:H7,Confirmed,2,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16027,2012,May,Maine,Restaurant,,,Salmonella enterica,Enteritidis,Confirmed,14,0.0,0.0
16211,2012,August,Maine,Grocery Store,,,Salmonella enterica,Newport,Confirmed,2,1.0,0.0
17376,2014,January,Maine,,"Milk, Unpasteurized","Milk, Unpasteurized",Cryptosporidium parvum,IIaA15G2R1,Confirmed,4,0.0,0.0
17941,2014,August,Maine,,,,Listeria monocytogenes,,Confirmed,2,2.0,0.0


In [118]:
df = df.groupby(['Year', 'State']).sum()['Illnesses']

In [119]:
df = df.reset_index()

In [120]:
df.drop(df[df['State'] == 'Guam'].index, inplace=True)

In [121]:
df.drop(df[df['State'] == 'Republic of Palau'].index, inplace=True)

In [122]:
df.drop(df[df['State'] == 'Multistate'].index, inplace=True)

In [123]:
df.drop(df[df['Year'] == 1998].index, inplace=True)

In [124]:
df.drop(df[df['Year'] == 1999].index, inplace=True)

In [125]:
df[df['State'] == 'Maine']

Unnamed: 0,Year,State,Illnesses
115,2000,Maine,55
163,2001,Maine,214
213,2002,Maine,126
263,2003,Maine,82
312,2004,Maine,44
359,2005,Maine,278
409,2006,Maine,388
459,2007,Maine,2590
509,2008,Maine,730
558,2009,Maine,50


In [126]:
#Third step, make a skeleton dataframe on which to concat my data, so that I can then 
#iterate over segments of data that will have uniform lenghths

In [127]:
states = sorted(df['State'].unique())

In [128]:
len(states)

52

In [129]:
year_list = []
year = 2000
for j in range(2000,2016):
    for i in range(len(states)):
        year_list.append(year)
    year +=1

In [130]:
state_list = []
for i in range(2000,2016):
    for state in states:
        state_list.append(state)

In [131]:
len(state_list)

832

In [132]:
len(year_list)

832

In [133]:
#Fifth step: left merge my data onto skeleton

In [134]:
left = pd.DataFrame(data=[year_list,state_list])

In [135]:
left.shape

(2, 832)

In [136]:
left = left.T

In [137]:
left.shape

(832, 2)

In [138]:
left

Unnamed: 0,0,1
0,2000,Alabama
1,2000,Alaska
2,2000,Arizona
3,2000,Arkansas
4,2000,California
...,...,...
827,2015,Washington
828,2015,Washington DC
829,2015,West Virginia
830,2015,Wisconsin


In [139]:
left.rename(columns = {0:'Year', 1:'State'}, inplace=True)

In [140]:
df = left.merge(df, how='left', on=['Year', 'State'])

In [141]:
df[df['State'] == 'Maine']

Unnamed: 0,Year,State,Illnesses
18,2000,Maine,55.0
70,2001,Maine,214.0
122,2002,Maine,126.0
174,2003,Maine,82.0
226,2004,Maine,44.0
278,2005,Maine,278.0
330,2006,Maine,388.0
382,2007,Maine,2590.0
434,2008,Maine,730.0
486,2009,Maine,50.0


In [142]:
#Sixth: add population data and normalize

In [143]:
df['Population'] = pd.DataFrame(pop)

In [144]:
df['Illnesses / Population'] = df['Illnesses'] / df['Population']

In [145]:
df = df.fillna(0)

In [146]:
#Finally, graphs

In [147]:
y_list = []
for state in states:
    y_list.append(df[df['State'] == state]['Illnesses / Population'])

In [160]:
fig,ax=plt.subplots(figsize=(25,15))
for i in range(len(states)):
    ax.plot(range(2000,2016), y_list[i])
ax.set_xlabel('Year', size=40)
ax.set_ylabel('Cases (Normalized to Population)', size=40)
ax.set_title('Annual Cases of Foodborne Illness by State', size=50)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.tight_layout
plt.legend(states, ncol=3, fontsize=18)
plt.save_fig('annual_states';

SyntaxError: invalid syntax (<ipython-input-160-82016666db83>, line 11)

In [149]:
df[df['Illnesses / Population'] > 0.0004].sort_values(by = 'Illnesses / Population', ascending=False)

Unnamed: 0,Year,State,Illnesses,Population,Illnesses / Population
382,2007,Maine,2590.0,1327040,0.001952
137,2002,North Dakota,541.0,638168,0.000848
155,2002,Wyoming,374.0,500017,0.000748
204,2003,Washington DC,320.0,568502,0.000563
675,2012,Wyoming,319.0,576305,0.000554
434,2008,Maine,730.0,1330509,0.000549
256,2004,Washington DC,282.0,567754,0.000497
317,2006,Colorado,2212.0,4720423,0.000469
152,2002,Washington DC,240.0,573158,0.000419
501,2009,North Dakota,267.0,664968,0.000402
