In [40]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from IPython.display import Image, display

import pandas as pd
import numpy as np
import json
%matplotlib notebook

from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import CountVectorizer

# Read ImageNet Words List

In [41]:
# Load the tab-separated ImageNet word list (no header row) and name its columns.
words_path = '/projects/kawa7978/words.txt'
df_words = pd.read_csv(words_path, header=None, sep="\t")
df_words.columns = ['id', 'words']
# Preview the first ten rows.
df_words.head(10)

Unnamed: 0,id,words
0,n00001740,entity
1,n00001930,physical entity
2,n00002137,"abstraction, abstract entity"
3,n00002452,thing
4,n00002684,"object, physical object"
5,n00003553,"whole, unit"
6,n00003993,congener
7,n00004258,"living thing, animate thing"
8,n00004475,"organism, being"
9,n00005787,benthos


# Read Ingredients List

In [44]:
# Load the ingredient records; the context manager closes the file handle
# as soon as parsing is finished instead of leaving it open for the session.
with open('/projects/kawa7978/det_ingrs.json', 'r') as f:
    ingr = json.load(f)

An example record looks like this:

In [97]:
# Print every field of the first record, each followed by a separator line.
first_record = ingr[0]
separator = '-'*100
for field in first_record:
    print(field)
    print(first_record[field])
    print(separator)


valid
[True, True, True, True, True, True, True, True, False, False, True, True, True, False]
----------------------------------------------------------------------------------------------------
id
000018c8a5
----------------------------------------------------------------------------------------------------
ingredients
[{'text': 'penne'}, {'text': 'cheese sauce'}, {'text': 'cheddar cheese'}, {'text': 'gruyere cheese'}, {'text': 'dried chipotle powder'}, {'text': 'unsalted butter'}, {'text': 'all - purpose flour'}, {'text': 'milk'}, {'text': '14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)'}, {'text': '2 ounces semisoft cheese (page 23), grated (1/2 cup)'}, {'text': 'kosher salt'}, {'text': 'dried chipotle powder'}, {'text': 'garlic powder'}, {'text': '(makes about 4 cups)'}]
----------------------------------------------------------------------------------------------------
ingredient_ids
[{'penne': 'n07701147'}, {'cheese sauce': 'n07836731'}, {'cheddar cheese': ''}, {'

# Strategy
## 1. Read Ingredients List
## 2. Loop over all ingredients
## 3. Find each ingredient in the Words List
## 4. If found, compute the "similarity" of each candidate and keep the best-scoring one above 90%

## 5. In the original dataset, store for each ingredient a dictionary {'ingredient': ID}, where ID is an empty string if no match was found

First, define a function that computes a similarity score and returns the most similar word and its ID within the ImageNet word list.

In [83]:
from fuzzywuzzy import fuzz

def similar_word_and_id(df, searchword, similarity_thres=90, similarity_func=None):
    """
    Find the entry of `df` that is most similar to `searchword`.

    Candidate rows are those whose 'words' value contains `searchword` as a
    literal (non-regex) substring. Each candidate is scored with
    `similarity_func`, and the best score strictly above `similarity_thres`
    wins; on ties the first such row is kept.

    df: pd.DataFrame
        Requires df to have columns ['id', 'words']
    searchword: str
        Text to look up.
    similarity_thres: number
        A candidate is accepted only if its score is strictly greater than this.
    similarity_func: callable or None
        Called as similarity_func(searchword, word) and must return a number.
        None (the default) means fuzzywuzzy's fuzz.ratio, resolved at call time.

    Returns
    -------
    (word, id, score): tuple
        The matched 'words' value, its 'id' and its score, or
        ('', '', -np.inf) when no candidate scores above the threshold.

    Raises
    ------
    TypeError
        If searchword is not a string.
    """
    # Fail with a clear message instead of swallowing the lookup error and
    # crashing later on an undefined variable.
    if not isinstance(searchword, str):
        raise TypeError('searchword must be a string, got %r' % (searchword,))

    if similarity_func is None:
        from fuzzywuzzy import fuzz
        similarity_func = fuzz.ratio

    # Boolean mask of rows containing the search word; na=False treats missing
    # 'words' values as non-matches.
    mask = df["words"].str.contains(searchword, regex=False, na=False)
    candidates = df.loc[mask]

    highest = -np.inf
    ret_word = ''
    ret_id = ''

    for word, word_id in zip(candidates['words'], candidates['id']):
        similarity = similarity_func(searchword, word)
        # Strict comparisons: the score must beat the threshold, and only a
        # strictly better score replaces an earlier match.
        if similarity > similarity_thres and similarity > highest:
            highest = similarity
            ret_word = word
            ret_id = word_id

    return ret_word, ret_id, highest


# Quick check of the lookup with a low threshold of 10.
demo_match = similar_word_and_id(df_words, searchword='tea', similarity_thres=10)
word, ID, score = demo_match
print(*demo_match)

tea n07933274 100


Now loop over all the data

In [84]:
# Show the fields and values of the first record before processing.
print(ingr[0].keys())
print(ingr[0].values())

print('-'*100)
n_data = len(ingr)
print('No. of Data: %s'%(n_data))

# Temporarily for small dataset: only the first 1/100 of the records is processed.
n_data = n_data // 100

# Cache results per ingredient text so the word list is scanned only once
# for each distinct text, even when it appears several times.
match_cache = {}

# loop for all data
for record in ingr[:n_data]:
    ingredient_ids = []
    matched_words = []
    similarity_scores = []
    for ingredient in record['ingredients']:
        searchword = ingredient['text']
        if searchword not in match_cache:
            match_cache[searchword] = similar_word_and_id(df_words, searchword, 90)
        word, ID, score = match_cache[searchword]
        ingredient_ids.append({searchword: ID})
        matched_words.append({searchword: word})
        similarity_scores.append({searchword: score})
    # One single-key dict per ingredient, in the same order as record['ingredients'].
    record['ingredient_ids'] = ingredient_ids
    record['ingredient_matched_word'] = matched_words
    record['ingredient_similarity_score'] = similarity_scores
        

dict_keys(['valid', 'id', 'ingredients', 'ingredient_ids', 'ingredient_matched_word', 'ingredient_similarity_score'])
dict_values([[True, True, True, True, True, True, True, True, False, False, True, True, True, False], '000018c8a5', [{'text': 'penne'}, {'text': 'cheese sauce'}, {'text': 'cheddar cheese'}, {'text': 'gruyere cheese'}, {'text': 'dried chipotle powder'}, {'text': 'unsalted butter'}, {'text': 'all - purpose flour'}, {'text': 'milk'}, {'text': '14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)'}, {'text': '2 ounces semisoft cheese (page 23), grated (1/2 cup)'}, {'text': 'kosher salt'}, {'text': 'dried chipotle powder'}, {'text': 'garlic powder'}, {'text': '(makes about 4 cups)'}], [{'penne': 'n07701147'}, {'cheese sauce': 'n07836731'}, {'cheddar cheese': ''}, {'gruyere cheese': ''}, {'dried chipotle powder': ''}, {'unsalted butter': ''}, {'all - purpose flour': ''}, {'milk': 'n05399034'}, {'14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)': ''}, {

In [87]:
# Write the annotated ingredient records to disk as JSON.
# NOTE(review): unmatched ingredients carry a score of -inf, which json.dump
# writes as the non-standard token -Infinity; confirm downstream readers accept it.
output_path = "/projects/kawa7978/ingrs_with_imagenet_id.json"
with open(output_path, mode="w") as out_file:
    json.dump(ingr, out_file)

In [94]:
# Display the first record, including the fields added by the matching loop.
ingr[0]

{'valid': [True,
  True,
  True,
  True,
  True,
  True,
  True,
  True,
  False,
  False,
  True,
  True,
  True,
  False],
 'id': '000018c8a5',
 'ingredients': [{'text': 'penne'},
  {'text': 'cheese sauce'},
  {'text': 'cheddar cheese'},
  {'text': 'gruyere cheese'},
  {'text': 'dried chipotle powder'},
  {'text': 'unsalted butter'},
  {'text': 'all - purpose flour'},
  {'text': 'milk'},
  {'text': '14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)'},
  {'text': '2 ounces semisoft cheese (page 23), grated (1/2 cup)'},
  {'text': 'kosher salt'},
  {'text': 'dried chipotle powder'},
  {'text': 'garlic powder'},
  {'text': '(makes about 4 cups)'}],
 'ingredient_ids': [{'penne': 'n07701147'},
  {'cheese sauce': 'n07836731'},
  {'cheddar cheese': ''},
  {'gruyere cheese': ''},
  {'dried chipotle powder': ''},
  {'unsalted butter': ''},
  {'all - purpose flour': ''},
  {'milk': 'n05399034'},
  {'14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)': ''},
  {'2 ounces

In [None]:
# Load the tab-separated URL list (no header row) and name its columns.
# on_bad_lines='warn' skips malformed rows with a warning; it replaces
# error_bad_lines=False, which was removed in pandas 2.0.
df_urls = pd.read_csv('/projects/kawa7978/fall11_urls.txt', sep="\t", header=None, on_bad_lines='warn')
df_urls.columns = ['id', 'urls']
df_urls.head(10)