In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [2]:
df = pd.read_csv('outbreaks.csv')

In [3]:
df = df[df['Food'].notnull()][df['Species'].notnull()]

  """Entry point for launching an IPython kernel.


In [4]:
df.head()

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
3,1998,January,California,Restaurant,"Fish, Ahi",,Scombroid toxin,,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"Lasagna, Unspecified; Eggs, Other",,Salmonella enterica,Enteritidis,Confirmed,26,3.0,0.0
7,1998,January,California,Restaurant,"Stuffing, Unspecified; Sandwich, Turkey",,Salmonella enterica,Enteritidis,Confirmed,4,3.0,0.0
8,1998,January,Colorado,Restaurant,"Salad, Unspecified",,Virus,,Suspected,21,,
9,1998,January,Colorado,Restaurant,"Peppers, Chili",,Virus,,Suspected,3,,0.0


In [5]:
df['Food'] = df['Food'].str.lower()

In [6]:
df['Species'] = df['Species'].str.lower()

In [7]:
df.head(20)

Unnamed: 0,Year,Month,State,Location,Food,Ingredient,Species,Serotype/Genotype,Status,Illnesses,Hospitalizations,Fatalities
3,1998,January,California,Restaurant,"fish, ahi",,scombroid toxin,,Confirmed,4,0.0,0.0
4,1998,January,California,Private Home/Residence,"lasagna, unspecified; eggs, other",,salmonella enterica,Enteritidis,Confirmed,26,3.0,0.0
7,1998,January,California,Restaurant,"stuffing, unspecified; sandwich, turkey",,salmonella enterica,Enteritidis,Confirmed,4,3.0,0.0
8,1998,January,Colorado,Restaurant,"salad, unspecified",,virus,,Suspected,21,,
9,1998,January,Colorado,Restaurant,"peppers, chili",,virus,,Suspected,3,,0.0
15,1998,January,Florida,Restaurant,"ethnic style, unspecified",,clostridium perfringens,,Suspected,3,0.0,0.0
16,1998,January,Florida,Restaurant,greek salad,,norovirus genogroup i,,Suspected,2,,
17,1998,January,Florida,Restaurant,"ground beef, cheeseburger",,staphylococcus aureus,,Suspected,2,0.0,0.0
18,1998,January,Florida,Restaurant,"pasta, unspecified",,bacillus cereus,,Suspected,2,0.0,0.0
19,1998,January,Florida,Restaurant,"sauces, unspecified; tuna, unspecified",,scombroid toxin,,Suspected,2,2.0,0.0


In [8]:
species = df['Species'].str.lower().str.replace(';', '').str.split(expand=True)

In [9]:
toxins = []
for row in range(species.shape[0]):
    toxins.extend(species.iloc[row,:].unique())
toxins = list(set([t.strip() for t in toxins if t is not None]))
stop_list = ['(msg)', 'herbal', 'cleaning', 'other', 'toxin-producing', 'neurotoxic', 'a', 'unknown',
            'genogroup', 'ii', 'i', 'toxin', 'or', 'poison', 'group', 'fish', 'plant', 'chemical', 'heavy',
            'fish', 'plant', 'agents']
toxins = [t for t in toxins if t not in stop_list]

In [10]:
toxins

['enteroaggregative',
 'paralytic',
 'trichinella',
 'aureus',
 'brucella',
 'campylobacter',
 'ciguatoxin',
 'monosodium',
 'rotavirus',
 'monocytogenes',
 'streptococcus',
 'listeria',
 'intestinalis',
 'coli',
 'coli,',
 'hominis',
 'tetrodotoxin',
 'jejuni',
 'parasite',
 'cereus',
 'cyclospora',
 'mycotoxins',
 'puffer',
 'scombroid',
 'shellfish',
 'faecalis',
 'hepatitis',
 'adenovirus',
 'anisakis',
 'parahaemolyticus',
 'toxins',
 'botulinum',
 'yersinia',
 'cloacae',
 'enterica',
 'cholerae',
 'perfringens',
 'staphylococcus',
 'glutamate',
 'escherichia',
 'norovirus',
 'bacillus',
 'bacterium',
 'vulnificus',
 'cryptosporidium',
 'shigella',
 'parvum',
 'enterococcus',
 'enteropathogenic',
 'clostridium',
 'metals',
 'vibrio',
 'salmonella',
 'giardia',
 'enterotoxigenic',
 'pesticides',
 'boydii',
 'cayatenensis',
 'flexneri',
 'histamine',
 'spiralis',
 'virus',
 'enterocolitica',
 'shiga',
 'enterobacter',
 'sapovirus',
 'sonnei',
 'amnesic']

In [11]:
foods = df['Food'].str.lower().str.replace(',', ' ').str.replace(';', ' ').str.replace('/', ' ').str.replace(':', ' ').str.replace('(', '').str.replace(')', '').str.replace('-', ' ').str.split(expand = True)

In [12]:
vocab = []
for row in range(foods.shape[0]):
    vocab.extend(foods.iloc[row,:].unique())

In [13]:
vocab = list(set([v for v in vocab if v is not None]))
stop_words = ['other', 'unspecified', '1%', 'contains', 'sa', 'con', 'cut', 'stir', 'royal',
             'head', 'heads', 'cooked', 'bites', 'unknown', 'chef', 'menu', 'foods', 'de', 'with',
             'easy', 'tri', 'w', 'multiple', 'bottled', 'food', 'see', 'of', 'made', 'sink',
             'based', '&', 'prepackaged', 'seven', 'soft', 'mini', 'ethnic', 'common', 'home', 'containing',
             'grilled', 'specialty', 'king', 'miscellaneous', 'sum', 'le', 'pre', 'packaged', 'whole',
             'mechanical', 'creamed', 'pasteurized', 'from', '2%', 'dietary', 'acid', 'commercial', 'pouch',
             'types', 'mashed', 'killed', 'cuisine', 'prime', 'dish', 'appetizer', 'waldorf', 'club', 'nine',
             "chef's", 'mixed', '*****', 'drink', 'tenderized', 'francaise', 'aus', 'acetic', 'crisp', 'tres',
             'puffed', 'ala', 'platter', 'twist', 'tray', 'shredded', 'lo', 'items', 'blackened', 'side', 'and',
             'white', 'homegrown', 'mediterranean', 'a', 'homemade', 'au', 'house', 'oriental', 'style',
             'ingredients', 'or', 'undercooked', 'free', 'served']
vocab = [v for v in vocab if not v in stop_words]

In [238]:
# source = pd.DataFrame(columns = vocab, index = toxins)

In [272]:
# source = source.fillna(0)

In [271]:
# for row in tqdm(df.iterrows()):
#     for food in vocab:
#         for toxin in toxins:
#             if food in row[1]['Food'] and toxin in row[1]['Species']:
#                 source.loc[toxin, food] +=1

In [270]:
# source.drop(['in', 'the'], axis=1, inplace=True)

In [255]:
# source['chicken'] = source['chicken'] + source['chick']

In [257]:
# source.drop(['chick'], axis=1, inplace=True)

In [269]:
# source.drop(['ground'], axis=1, inplace=True)

In [268]:
# source.drop(['raw'], axis=1, inplace=True)

In [273]:
# source.drop(['tip'], axis=1, inplace=True)

In [275]:
# source['salad'] = source['salads'] + source['salad']

In [276]:
# source.drop(['salads'], axis=1, inplace=True)

In [278]:
# source.drop(['key'], axis=1, inplace=True)

In [280]:
# source.drop(['roast'], axis=1, inplace=True)

In [282]:
# source.drop(['pot'], axis=1, inplace=True)

In [284]:
# source['oyster'] = source['oyster'] + source['oysters']

In [288]:
# source.drop(['oysters'], axis=1, inplace=True)

In [290]:
# source.drop(['green'], axis=1, inplace=True)

In [291]:
# source.drop(['roasted', 'bar'], axis=1, inplace=True)

In [294]:
# source.drop(['refried', 'mix'], axis=1, inplace=True)

In [296]:
# source['sub'] = source['sub'] + source['submarine']

In [297]:
# source.drop(['submarine'], axis=1, inplace=True)

In [300]:
# source['egg'] = source['egg'] + source['eggs']

In [301]:
# source.drop(['eggs'], axis=1, inplace=True)

In [20]:
# source.drop(['coli,'], inplace=True)

In [110]:
source = pd.read_csv('source.csv')

In [111]:
source.drop(['Unnamed: 0'], axis=1, inplace=True)

In [112]:
source

Unnamed: 0,species,cannelloni,basil,pastry,filling,leafy,quesadillas,goulash,tomatilla,vegetables,...,crawfish,beet,tomatillo,cactus,meringue,ranch,octopus,lettuce,peas,mango
0,faecalis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,enterica,1,2,1,1,0,1,0,1,6,...,2,0,0,0,2,1,0,18,1,7
2,vibrio,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,giardia,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,shellfish,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,perfringens,0,0,0,0,0,0,2,0,2,...,1,2,0,0,0,0,0,5,2,0
63,clostridium,0,0,0,0,0,0,2,0,2,...,1,4,0,0,0,0,0,5,3,0
64,enteropathogenic,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,staphylococcus,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [109]:
source.to_csv('source.csv')