In [1]:
import pandas as pd # data manipulation

import spacy # nlp library

# Locations of the trained NER pipeline and of the translated data file
MODEL_PATH = 'seafood_model_final'
DATA_PATH = 'data/dflabeledtranslated_edited.csv'

# NER model used for entity extraction; its evaluation results are listed below
nlp = spacy.load(MODEL_PATH)

# Translated data, one row per record
sf = pd.read_csv(DATA_PATH)

# Show the first five rows
sf.head()

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,seafood_yn,translatedText
0,2,Chicken gumbo,,111,117,1895,1960,0.1,0.8,0,Chicken gumbo
1,3,Tomato aux croutons,,13,13,1893,1917,0.25,0.4,0,Tomato with croutons
2,4,Onion au gratin,,41,41,1900,1971,0.25,1.0,0,Onion au gratin
3,7,Radishes,,3262,3346,1854,2928,0.0,25.0,0,Radishes
4,8,Chicken soup with rice,,48,49,1897,1961,0.1,0.6,0,Chicken soup with rice


*Results from training on 553 examples:*

| Label    | Precision | Recall | F-Score |
| -------- | --------- | ------ | ------- |
| LOCATION | 73.913    | 77.273 | 75.556  |
| SEAFOOD  | 90.244    | 82.222 | 86.047  |
| METHOD   | 100.000   | 84.615 | 91.667  |
| SIDE     | 76.471    | 59.091 | 66.667  |

In [2]:
# Keep only the rows flagged as seafood, use the existing `id` column as the
# index, and discard the columns that are not needed from here on
sf = (
    sf.loc[sf['seafood_yn'] == 1]
    .set_index('id')
    .drop(columns=['name', 'description', 'seafood_yn'])
)

sf.head() # preview data

Unnamed: 0_level_0,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,translatedText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,14,16,1899,1962,0.15,0.4,Clam broth (cup)
11,157,157,1893,1937,0.25,60.0,Clear green turtle
12,2,2,1900,1900,0.0,0.0,"Striped bass saute, meuniere"
14,4,4,1899,1900,0.0,0.0,Fresh lobsters in every style
17,505,534,1880,1987,0.0,75.0,Caviar


In [18]:
def ent_extract(text):
    """Run the NER model on one piece of text and collect its entities.

    Parameters
    ----------
    text : str
        Text to analyse. Any non-string value (for example the float NaN
        that pandas uses for a missing cell, or None) is treated as
        "no text" instead of being passed to the model.

    Returns
    -------
    dict
        Maps each entity's lower-cased text to its label. Empty when no
        entity is found or when ``text`` is not a string.

    Notes
    -----
    Keys are the lower-cased entity text, so when the same text is tagged
    more than once only the label of the last occurrence is kept.
    """
    if not isinstance(text, str): # the spaCy pipeline expects a string; skip missing values
        return {}
    doc = nlp(text) # perform nlp on text
    # record every found entity together with its label
    return {ent.text.lower(): ent.label_ for ent in doc.ents}

# Extract the entities of every translated text and keep them in a new column
sf['entities'] = sf['translatedText'].map(ent_extract)

# Show the first rows together with the new column
sf.head()

Unnamed: 0_level_0,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,translatedText,entities
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9,14,16,1899,1962,0.15,0.4,Clam broth (cup),{'clam': 'SEAFOOD'}
11,157,157,1893,1937,0.25,60.0,Clear green turtle,{}
12,2,2,1900,1900,0.0,0.0,"Striped bass saute, meuniere",{'striped bass': 'SEAFOOD'}
14,4,4,1899,1900,0.0,0.0,Fresh lobsters in every style,{'lobsters': 'SEAFOOD'}
17,505,534,1880,1987,0.0,75.0,Caviar,{'caviar': 'SEAFOOD'}


In [29]:
# Look at the (entity text, label) pairs found for the row at position 29
row_entities = sf['entities'].iloc[29]
row_entities.items()

dict_items([('fried', 'METHOD'), ('flounders', 'SEAFOOD')])