In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('outbreaks.csv')

In [3]:
df = df[df['Food'].notnull()][df['Species'].notnull()]

  """Entry point for launching an IPython kernel.


In [4]:
df.head()

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
3,1998,January,California,Restaurant,"Fish, Ahi",,Scombroid toxin,,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"Lasagna, Unspecified; Eggs, Other",,Salmonella enterica,Enteritidis,Confirmed,26,3.0,0.0
7,1998,January,California,Restaurant,"Stuffing, Unspecified; Sandwich, Turkey",,Salmonella enterica,Enteritidis,Confirmed,4,3.0,0.0
8,1998,January,Colorado,Restaurant,"Salad, Unspecified",,Virus,,Suspected,21,,
9,1998,January,Colorado,Restaurant,"Peppers, Chili",,Virus,,Suspected,3,,0.0


In [5]:
food_set = df['Food'].str.lower().str.replace(',','').str.replace(';','').str.split()
food_set.head()

3                                  [fish, ahi]
4          [lasagna, unspecified, eggs, other]
7    [stuffing, unspecified, sandwich, turkey]
8                         [salad, unspecified]
9                             [peppers, chili]
Name: Food, dtype: object

In [6]:
species_set = df['Species'].str.lower().str.replace(';',',').str.split(',').map(lambda x:[i.strip() for i in x])
species_set.head()

3        [scombroid toxin]
4    [salmonella enterica]
7    [salmonella enterica]
8                  [virus]
9                  [virus]
Name: Species, dtype: object

In [7]:
all_foods = set()
for i in food_set:
    all_foods |= set(i)
len(all_foods)

1041

In [8]:
all_toxins = set()
for i in species_set:
    all_toxins |= set(i)
len(all_toxins)

80

In [9]:
food_toxin_matrix = pd.DataFrame(columns = all_foods, index = all_toxins, data = 0)
food_toxin_matrix.head()

Unnamed: 0,grape,pumpkin,popsicles,sauces,bruschetta,caesar,peanut,papaya,seabass,cupcakes,...,undercooked,soup,stew,squash,cheesecake,grain,hot,diced,orange,grey
bacillus cereus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bacillus unknown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
salmonella,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
salmonella enterica,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
enteropathogenic,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
for food, toxin in zip(food_set, species_set):
    for f in food:
        for t in toxin:
            food_toxin_matrix.loc[t,f] +=1

In [11]:
food_toxin_matrix.sum(0), food_toxin_matrix.sum(1)

(grape          4
 pumpkin        1
 popsicles      1
 sauces        21
 bruschetta    15
               ..
 grain          1
 hot           29
 diced          1
 orange         9
 grey           5
 Length: 1041, dtype: int64, bacillus cereus           1281
 bacillus unknown             5
 salmonella                   5
 salmonella enterica       3870
 enteropathogenic            44
                           ... 
 rotavirus                   19
 enterococcus faecalis        2
 bacillus other              11
 sapovirus                    9
 listeria monocytogenes     138
 Length: 80, dtype: int64)

In [14]:
food_toxin_matrix.to_csv('food_toxin_matrix.csv')