In [204]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [123]:
df = pd.read_csv('outbreaks.csv')

In [125]:
df = df[df['Food'].notnull()][df['Species'].notnull()]

  """Entry point for launching an IPython kernel.


In [140]:
df.head()

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
3,1998,January,California,Restaurant,"Fish, Ahi",,Scombroid toxin,,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"Lasagna, Unspecified; Eggs, Other",,Salmonella enterica,Enteritidis,Confirmed,26,3.0,0.0
7,1998,January,California,Restaurant,"Stuffing, Unspecified; Sandwich, Turkey",,Salmonella enterica,Enteritidis,Confirmed,4,3.0,0.0
8,1998,January,Colorado,Restaurant,"Salad, Unspecified",,Virus,,Suspected,21,,
9,1998,January,Colorado,Restaurant,"Peppers, Chili",,Virus,,Suspected,3,,0.0


In [145]:
df['Food'] = df['Food'].str.lower()

In [147]:
df['Species'] = df['Species'].str.lower()

In [187]:
df.head(20)

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
3,1998,January,California,Restaurant,"fish, ahi",,scombroid toxin,,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"lasagna, unspecified; eggs, other",,salmonella enterica,Enteritidis,Confirmed,26,3.0,0.0
7,1998,January,California,Restaurant,"stuffing, unspecified; sandwich, turkey",,salmonella enterica,Enteritidis,Confirmed,4,3.0,0.0
8,1998,January,Colorado,Restaurant,"salad, unspecified",,virus,,Suspected,21,,
9,1998,January,Colorado,Restaurant,"peppers, chili",,virus,,Suspected,3,,0.0
15,1998,January,Florida,Restaurant,"ethnic style, unspecified",,clostridium perfringens,,Suspected,3,0.0,0.0
16,1998,January,Florida,Restaurant,greek salad,,norovirus genogroup i,,Suspected,2,,
17,1998,January,Florida,Restaurant,"ground beef, cheeseburger",,staphylococcus aureus,,Suspected,2,0.0,0.0
18,1998,January,Florida,Restaurant,"pasta, unspecified",,bacillus cereus,,Suspected,2,0.0,0.0
19,1998,January,Florida,Restaurant,"sauces, unspecified; tuna, unspecified",,scombroid toxin,,Suspected,2,2.0,0.0


In [159]:
species = df['Species'].str.lower().str.replace(';', '').str.split(expand=True)

In [166]:
toxins = []
for row in range(species.shape[0]):
    toxins.extend(species.iloc[row,:].unique())
toxins = list(set([t.strip() for t in toxins if t is not None]))
stop_list = ['(msg)', 'herbal', 'cleaning', 'other', 'toxin-producing', 'neurotoxic', 'a', 'unknown',
            'genogroup', 'ii', 'i', 'toxin', 'or', 'poison', 'group', 'fish', 'plant', 'chemical', 'heavy',
            'fish', 'plant', 'agents']
toxins = [t for t in toxins if t not in stop_list]

In [175]:
toxins

['faecalis',
 'enterica',
 'vibrio',
 'giardia',
 'shellfish',
 'mycotoxins',
 'rotavirus',
 'coli',
 'enterocolitica',
 'boydii',
 'histamine',
 'flexneri',
 'spiralis',
 'bacterium',
 'parvum',
 'monocytogenes',
 'sonnei',
 'parasite',
 'adenovirus',
 'puffer',
 'brucella',
 'cloacae',
 'trichinella',
 'jejuni',
 'streptococcus',
 'campylobacter',
 'ciguatoxin',
 'cholerae',
 'cayatenensis',
 'virus',
 'enterococcus',
 'parahaemolyticus',
 'amnesic',
 'escherichia',
 'hepatitis',
 'hominis',
 'metals',
 'enteroaggregative',
 'shiga',
 'botulinum',
 'listeria',
 'salmonella',
 'coli,',
 'vulnificus',
 'enterotoxigenic',
 'shigella',
 'scombroid',
 'cereus',
 'cryptosporidium',
 'glutamate',
 'sapovirus',
 'norovirus',
 'cyclospora',
 'intestinalis',
 'toxins',
 'anisakis',
 'monosodium',
 'tetrodotoxin',
 'pesticides',
 'aureus',
 'paralytic',
 'enterobacter',
 'yersinia',
 'perfringens',
 'clostridium',
 'enteropathogenic',
 'staphylococcus',
 'bacillus']

In [168]:
foods = df['Food'].str.lower().str.replace(',', ' ').str.replace(';', ' ').str.replace('/', ' ').str.replace(':', ' ').str.replace('(', '').str.replace(')', '').str.replace('-', ' ').str.split(expand = True)

In [169]:
vocab = []
for row in range(foods.shape[0]):
    vocab.extend(foods.iloc[row,:].unique())

In [170]:
vocab = list(set([v for v in vocab if v is not None]))
stop_words = ['other', 'unspecified', '1%', 'contains', 'sa', 'con', 'cut', 'stir', 'royal',
             'head', 'heads', 'cooked', 'bites', 'unknown', 'chef', 'menu', 'foods', 'de', 'with',
             'easy', 'tri', 'w', 'multiple', 'bottled', 'food', 'see', 'of', 'made', 'sink',
             'based', '&', 'prepackaged', 'seven', 'soft', 'mini', 'ethnic', 'common', 'home', 'containing',
             'grilled', 'specialty', 'king', 'miscellaneous', 'sum', 'le', 'pre', 'packaged', 'whole',
             'mechanical', 'creamed', 'pasteurized', 'from', '2%', 'dietary', 'acid', 'commercial', 'pouch',
             'types', 'mashed', 'killed', 'cuisine', 'prime', 'dish', 'appetizer', 'waldorf', 'club', 'nine',
             "chef's", 'mixed', '*****', 'drink', 'tenderized', 'francaise', 'aus', 'acetic', 'crisp', 'tres',
             'puffed', 'ala', 'platter', 'twist', 'tray', 'shredded', 'lo', 'items', 'blackened', 'side', 'and',
             'white', 'homegrown', 'mediterranean', 'a', 'homemade', 'au', 'house', 'oriental', 'style',
             'ingredients', 'or', 'undercooked', 'free', 'served']
vocab = [v for v in vocab if not v in stop_words]

In [176]:
vocab

['cannelloni',
 'basil',
 'pastry',
 'filling',
 'leafy',
 'quesadillas',
 'goulash',
 'tomatilla',
 'vegetables',
 'peaches',
 'feet',
 'akule',
 'knifejaw',
 'grains',
 'cottage',
 'gifilte',
 'plantains',
 'scalloped',
 'potato',
 'rib',
 'landjaeger',
 'tandori',
 'carrots',
 'gratin',
 'scallop',
 'kitfo',
 'melon',
 'french',
 'cheesecake',
 'burritos',
 'couscous',
 'walnut',
 'mamey',
 'powdered',
 'chips',
 'poi',
 'vanilla',
 'iceberg',
 'cheeses',
 'frosting',
 'duck',
 'newburgh',
 'salad',
 'colostrum',
 'petit',
 'chilis',
 'puff',
 'brownies',
 'feta',
 'wrap',
 'conch',
 'skins',
 'crispy',
 'blueberries',
 'shooters',
 'iced',
 "d'oeuvres",
 'intestine',
 'fresco',
 'ladder',
 'carne',
 'mein',
 'bologna',
 'spicy',
 'drop',
 'wings',
 'gomes',
 'chimichanga',
 'fried',
 'tamale',
 'wild',
 'pretzel',
 'bar',
 'dumplings',
 'vegetarian',
 'zucchini',
 'grouper',
 'tarantulas',
 'vinagarette',
 'ricotta',
 'mimosa',
 'shumai',
 'mostaccioli',
 'kale',
 'horchata',
 'bra

In [172]:
source = pd.DataFrame(columns = vocab, index = toxins)

In [173]:
source

Unnamed: 0,cannelloni,basil,pastry,filling,leafy,quesadillas,goulash,tomatilla,vegetables,peaches,...,crawfish,beet,tomatillo,cactus,meringue,ranch,octopus,lettuce,peas,mango
faecalis,,,,,,,,,,,...,,,,,,,,,,
enterica,,,,,,,,,,,...,,,,,,,,,,
vibrio,,,,,,,,,,,...,,,,,,,,,,
giardia,,,,,,,,,,,...,,,,,,,,,,
shellfish,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
perfringens,,,,,,,,,,,...,,,,,,,,,,
clostridium,,,,,,,,,,,...,,,,,,,,,,
enteropathogenic,,,,,,,,,,,...,,,,,,,,,,
staphylococcus,,,,,,,,,,,...,,,,,,,,,,


In [186]:
df.iloc[0,:]['Species'].split(', ')

['scombroid toxin']

In [185]:
df.iloc[0,:]['Food'].split(', ')

['fish', 'ahi']

In [188]:
'ahi' in 'fishahi'

True

In [205]:
for row in tqdm(df.iterrows()):
    for food in vocab:
        for toxin in toxins:
            if food in row[1]['Food'] and toxin in row[1]['Species']:
                source.loc[toxin, food] +=1

7245it [39:02,  3.09it/s]


In [196]:
for row in df.iterrows():
    x = row
    break

In [202]:
x[1]

Year                            1998
Month                        January
State                     California
Location                  Restaurant
Food                       fish, ahi
Ingredient                       NaN
Species              scombroid toxin
Serotype/Genotype                NaN
Status                     Confirmed
Illnesses                          4
Hospitalizations                   0
Fatalities                         0
Name: 3, dtype: object

In [201]:
x

(3, Year                            1998
 Month                        January
 State                     California
 Location                  Restaurant
 Food                       fish, ahi
 Ingredient                       NaN
 Species              scombroid toxin
 Serotype/Genotype                NaN
 Status                     Confirmed
 Illnesses                          4
 Hospitalizations                   0
 Fatalities                         0
 Name: 3, dtype: object)