# Exploration of Translated Items

> The purpose of this exploration is to understand what the translated data sets in the GitHub repo look like and how they could be used for named-entity recognition (NER) or relation extraction.

In [1]:
# library for working with paths
import pathlib
# data manipulation library
import pandas as pd 
# lists directory items into list objects
from glob import glob

In [2]:
# glob pattern matching every CSV file in ./data under the current working directory
data_path = pathlib.Path.cwd().joinpath('data', '*.csv')

In [3]:
# read every CSV matched by the glob pattern into a list of DataFrames.
# glob() returns matches in an arbitrary, filesystem-dependent order, so the
# paths are sorted to keep dfs[0], dfs[1], ... bound to the same files on
# every machine and every run.
# NOTE(review): after sorting, the positions may differ from the ones the
# saved outputs below were produced with -- confirm which file is dfs[0].
csv_paths = sorted(glob(str(data_path)))
if not csv_paths:
    # fail here with a clear message rather than an IndexError at dfs[0] later
    raise FileNotFoundError('no CSV files matched {}'.format(data_path))
dfs = [pd.read_csv(path) for path in csv_paths]

In [4]:
# preview the first rows of the first loaded frame
dfs[0].head()

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,seafood_yn,translatedText
0,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,0,Royal Spring Consume
1,5,St. Emilion,,66,68,1881,1981,0.0,18.0,0,St. Emilion
2,16,Pim-olas,,145,148,1897,1918,0.15,35.0,0,Pim-waves
3,23,Pomard,,11,11,1880,1950,0.75,5.0,0,Pomard
4,29,G. H. Mumm & Co's Extra Dry,,14,14,1895,1914,2.0,4.0,0,G. H. Mumm & Co's Extra Dry


In [5]:
# preview the first rows of the second loaded frame
# NOTE(review): the saved preview shows last_appeared=2928 for 'Radishes',
# which looks like an invalid year -- confirm whether the dates need cleaning
dfs[1].head()

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,seafood_yn,translatedText
0,2,Chicken gumbo,,111,117,1895,1960,0.1,0.8,0,Chicken gumbo
1,3,Tomato aux croutons,,13,13,1893,1917,0.25,0.4,0,Tomato with croutons
2,4,Onion au gratin,,41,41,1900,1971,0.25,1.0,0,Onion au gratin
3,7,Radishes,,3262,3346,1854,2928,0.0,25.0,0,Radishes
4,8,Chicken soup with rice,,48,49,1897,1961,0.1,0.6,0,Chicken soup with rice


In [6]:
# preview the first rows of the third loaded frame
# NOTE(review): the saved preview is identical to the one for dfs[0] --
# possibly the same data stored under two file names; confirm
dfs[2].head()

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,seafood_yn,translatedText
0,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,0,Royal Spring Consume
1,5,St. Emilion,,66,68,1881,1981,0.0,18.0,0,St. Emilion
2,16,Pim-olas,,145,148,1897,1918,0.15,35.0,0,Pim-waves
3,23,Pomard,,11,11,1880,1950,0.75,5.0,0,Pomard
4,29,G. H. Mumm & Co's Extra Dry,,14,14,1895,1914,2.0,4.0,0,G. H. Mumm & Co's Extra Dry


In [7]:
# list the distinct values found in the 'description' column of the first frame
dfs[0]['description'].unique()

array([nan])

In [8]:
# work from the first frame, keeping only the rows flagged as seafood (seafood_yn == 1)
is_seafood = dfs[0]['seafood_yn'].eq(1)
df = dfs[0].loc[is_seafood]

In [17]:
# tokens taken to signal that an item name mentions a side / accompaniment
sides = [
    'and',
    'served',
    'with',
    'on',
    ',',
    'paired',
    '-',
]

In [18]:
# preview the first rows of the current working frame `df`
df.head()

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,seafood_yn,translatedText
48,340,Raw on the Half Shell,,1,1,0,0,0.0,0.0,1,Raw on the Half Shell
173,1129,Gerostete Sardinen auf Toast,,13,13,1899,1901,0.0,0.0,1,Roasted sardines on toast
201,1208,Roast on the Half Shell,,1,1,1900,1900,0.3,0.3,1,Roast on the Half Shell
312,1827,Extra filet steak au beurre d'anchois,,2,2,1900,1900,0.0,0.0,1,Extra filet steak with anchovy butter
532,3419,Lucines sur coquille,,5,5,1900,1906,0.0,0.0,1,Lucines on shell


In [19]:
# function that checks whether any word of a translated item is a side marker
def checker(text, markers=None):
    """Return True if `text` contains one of the side-dish marker tokens.

    Parameters
    ----------
    text : str
        Translated item name. Any non-string value (for example the NaN that
        pandas uses for a missing cell) is treated as "no marker found".
    markers : collection of str, optional
        Tokens to look for. Defaults to the notebook-level `sides` list.

    Returns
    -------
    bool
        True when a whitespace-separated token of `text` equals a marker, or
        starts/ends with a punctuation marker such as ','; False otherwise.
        Matching is case-sensitive.
    """
    if markers is None:
        markers = sides  # notebook-level default defined in an earlier cell
    if not isinstance(text, str):
        return False  # missing translations have no .split()

    # Punctuation markers (',' or '-') are usually glued to a neighbouring
    # word ("toast,"), so an exact token comparison alone would miss them.
    # Only the edges of a token are inspected, so an interior hyphen as in
    # "Pim-waves" is not mistaken for a separator.
    punctuation = tuple(
        marker for marker in markers
        if marker and not any(ch.isalnum() for ch in marker)
    )

    for word in text.split():  # splits text by whitespace
        if word in markers:
            return True
        if punctuation and (word.startswith(punctuation) or word.endswith(punctuation)):
            return True
    # always hand back a real boolean instead of falling through with None
    return False

In [20]:
# keep only the rows whose translated text is flagged by checker();
# comparing with True also discards rows where checker() returned None
has_side = df['translatedText'].apply(checker).eq(True)
df = df.loc[has_side]

In [16]:
# the 30 most frequent translated item names among the remaining rows
df['translatedText'].value_counts().head(30)

Scrambled eggs with smoked salmon                                                                                   5
Scrambled eggs with smoked eel                                                                                      5
Anchovies on toast                                                                                                  5
Caviar on toast                                                                                                     4
Turtle Soup with Sherry                                                                                             4
Fresh lobster with mayonnaise                                                                                       4
Lobster mayonnaise with butter and toast                                                                            3
Turbot with Dutch sauce                                                                                             3
Fried plaice with lemon                                 