In [27]:
from nltk.corpus import wordnet as wn
from lxml import etree
import numpy as np
import pandas as pd
from os import path
from collections import Counter

In [28]:
taxonomy = pd.read_csv('taxonomy.csv', header=0)
taxonomy.columns

Index(['liquid-alcohol-soft', 'liquid-alcohol-soft-bw', 'liquid-alcohol',
       'liquid-juices', 'liquid-juices-citric', 'solid-fruits',
       'solid-fruits-citric', 'solid-veggies', 'te-syrups', 'te-sweet', 'te',
       'others', 'Unnamed: 12'],
      dtype='object')

In [29]:
ingredients_list = {
    ('liquid', 'alcohol', 'soft', 'bw'): [],
    ('liquid', 'alcohol', 'soft'): [],
    ('liquid', 'alcohol'): [],
    ('liquid', 'juices'): [],
    ('liquid', 'juices', 'citric'): [],
    ('solid', 'fruits'): [],
    ('solid', 'fruits', 'citric'): [],
    ('solid', 'veggies'): [],
    ('te', 'syrups'): [],
    ('te', 'sweet'): [],
    ('te',): [],
    ('others',): []
}

In [30]:
tree = etree.parse('ccc_cocktails.xml')

recipes = tree.findall('recipe')
# process each sentence in the file
titles = []
all_ingredients = []
cocktails = {}
for recipe in recipes:
    title = recipe[0].text
    titles.append(title)
    ingredients = recipe[1]
    ingredients_list = {   
        ('liquid', 'alcohol', 'soft', 'bw'): [],
        ('liquid', 'alcohol', 'soft'): [],
        ('liquid', 'alcohol'): [],
        ('liquid', 'juices'): [],
        ('liquid', 'juices', 'citric'): [],
        ('solid', 'fruits'): [],
        ('solid', 'fruits', 'citric'): [],
        ('solid', 'veggies'): [],
        ('te', 'syrups'): [],
        ('te', 'sweet'): [],
        ('te',): [],
        ('others',): []
    }
    for ingredient in ingredients:
        ingredient_val = ingredient.values()[-1].lower()
        all_ingredients.append(ingredient_val)
        for col in taxonomy.columns:
            col_tup = tuple(col.split('-'))

            if taxonomy[col].isin([ingredient_val]).any():
                ingredients_list[col_tup].append(ingredient_val)

    cocktails[title] = ingredients_list

In [31]:
cocktails

{"Ti'punch": {('liquid', 'alcohol', 'soft', 'bw'): [],
  ('liquid', 'alcohol', 'soft'): [],
  ('liquid', 'alcohol'): ['white rum'],
  ('liquid', 'juices'): [],
  ('liquid', 'juices', 'citric'): [],
  ('solid', 'fruits'): [],
  ('solid', 'fruits', 'citric'): ['lime'],
  ('solid', 'veggies'): [],
  ('te', 'syrups'): ['syrup'],
  ('te', 'sweet'): [],
  ('te',): [],
  ('others',): []},
 'Exotic Cocktail passion fruit': {('liquid', 'alcohol', 'soft', 'bw'): [],
  ('liquid', 'alcohol', 'soft'): [],
  ('liquid', 'alcohol'): ['blue curacao'],
  ('liquid', 'juices'): [],
  ('liquid', 'juices', 'citric'): ['lemonade', 'lemon juice'],
  ('solid', 'fruits'): [],
  ('solid', 'fruits', 'citric'): [],
  ('solid', 'veggies'): [],
  ('te', 'syrups'): ['passion fruit syrup'],
  ('te', 'sweet'): [],
  ('te',): [],
  ('others',): ['ice cube']},
 'Builder': {('liquid', 'alcohol', 'soft', 'bw'): [],
  ('liquid', 'alcohol', 'soft'): [],
  ('liquid', 'alcohol'): [],
  ('liquid', 'juices'): [],
  ('liquid', 'j

## PMI table

In [37]:
unique_ingredients = set(all_ingredients)
df_pmi = pd.DataFrame(0, columns=unique_ingredients, index=unique_ingredients)
for recipe in recipes:
    title = recipe[0].text
    titles.append(title)
    ingredients = recipe[1]
    for ingredient_row in ingredients:
        ingredient_val_row = ingredient_row.values()[-1].lower()
        for ingredient_col in ingredients:
            ingredient_val_col = ingredient_col.values()[-1].lower()
            if ingredient_val_row != ingredient_val_col:
                df_pmi.loc[ingredient_val_row, ingredient_val_col] += 1


In [40]:
from math import log
def pmi_score(ingredient1, ingredient2, df_pmi):
    fij = df_pmi.loc[ingredient1, ingredient2]
    fi = df_pmi.loc[ingredient1, :].sum()
    fj = df_pmi.loc[ingredient2, :].sum()
    sum_all = df_pmi.values.sum()
    print('fij', fij)
    print('fifj', fi*fj)
    p = fij/(fi*fj/sum_all)
    '''
    pmi = log(p, 2)
    if pmi < 0:
        pmi = 0
    return pmi, p
    '''
    return p
print(df_pmi.loc['tequila', 'lemon'])
pmi_score('tequila', 'lime', df_pmi)

0
fij 1
fifj 1121


1.6931311329170384

## Similarity in taxonomy

In [23]:
# ingredient_tup => (cat1, cat2, cat3, ingredient)
def taxonomy_score(ingredient_tup1, ingredient_tup2):
    common_ancestors = set(ingredient_tup1).intersection(set(ingredient_tup2))
    all_ancestors = set(ingredient_tup1).union(set(ingredient_tup2))
    return len(common_ancestors)/len(all_ancestors)

In [25]:
print(taxonomy_score(('liquid', 'alcohol', 'vodka'), ('liquid', 'alcohol', 'soft', 'bw', 'orange')))
print(taxonomy_score(('liquid', 'alcohol', 'vodka'), ('liquid', 'alcohol', 'tequila')))

0.3333333333333333
0.5


## Similarities preferences

In [286]:
cols = [('liquid', 'alcohol', 'soft', 'bw'), 
       ('liquid', 'alcohol', 'soft'), 
       ('liquid', 'alcohol'), 
       ('liquid', 'juices'), 
       ('liquid', 'juices', 'citric'),     
       ('solid', 'fruits'),
       ('solid', 'fruits', 'citric'), 
       ('solid', 'veggies'),
       ('te', 'syrups'): [],
       ('te', 'sweet'),
       ('te',), 
       ('others',)]
idx = ['alco', 'sweet', 'sour', 'tutti-frutti']

alco_vals =  [0.9, 0.95, 1, 0.7, 0.7, 0.7, 0.7, 0.65, 0.75, 0.7, 0.8, 0.85]

sweet_vals = [0.9, 0.95, 0.8, 0.85, 0.5, 0.9, 0.65, 0.5, 1, 1, 0.8, 0.8]

sour_vals = [0.8, 0.6, 0.8, 0.7, 1, 0.75, 1, 0.8, 0.6, 0.6, 0.85, 0.8]

frutti_vals = [0.85, 0.9, 0.7, 1, 1, 1, 1, 0.7, 0.9, 0.65, 0.7, 0.85]

df_preferences = pd.DataFrame(np.array([alco_vals, sweet_vals, sour_vals, frutti_vals]), columns=cols, index=idx)
df_preferences

Unnamed: 0,"(liquid, alcohol, soft, bw)","(liquid, alcohol, soft)","(liquid, alcohol)","(liquid, juices)","(liquid, juices, citric)","(liquid, syrups)","(solid, fruits)","(solid, fruits, citric)","(solid, veggies)","(te, sweet)","(te,)","(others,)"
alco,0.9,0.95,1.0,0.7,0.7,0.75,0.7,0.7,0.65,0.7,0.8,0.85
sweet,0.9,0.95,0.8,0.85,0.5,1.0,0.9,0.65,0.5,1.0,0.8,0.8
sour,0.8,0.6,0.8,0.7,1.0,0.6,0.75,1.0,0.8,0.6,0.85,0.8
tutti-frutti,0.85,0.9,0.7,1.0,1.0,0.9,1.0,1.0,0.7,0.65,0.7,0.85


In [299]:
df_preferences[('solid', 'fruits', 'citric')]['alco']

0.7

## Match cocktails

In [300]:
def matchCocktails(q_ingredients_list, preference='alco'):
    res = {}
    categories = list(q_ingredients_list.keys())
    for title in cocktails.keys():
        cocktail = cocktails[title]
        
        matches = {
            ('liquid', 'alcohol', 'soft', 'bw'): [],
            ('liquid', 'alcohol', 'soft'): [],
            ('liquid', 'alcohol'): [],
            ('liquid', 'juices'): [],
            ('liquid', 'juices', 'citric'): [],
            ('solid', 'fruits'): [],
            ('solid', 'fruits', 'citric'): [],
            ('solid', 'veggies'): [],
            ('te', 'syrups'): [],
            ('te', 'sweet'): [],
            ('te',): [],
            ('others',): []
        }
        c_matches = 0
        for cat in categories:
            for q_ingredient in q_ingredients_list[cat]:
                if q_ingredient in cocktail[cat]:
                    matches[cat] = q_ingredient
                    ######### normalize!
                    c_matches += df_preferences[cat][preference]
        res[title] = (c_matches, matches)
    return res

In [305]:
q= {
    ('liquid', 'alcohol', 'soft', 'bw'): [],
    ('liquid', 'alcohol', 'soft'): [],
    ('liquid', 'alcohol'): ['vodka'],
    ('liquid', 'juices'): ['lemon juice'],
    ('liquid', 'juices', 'citric'): [],
    ('solid', 'fruits'): [],
    ('solid', 'fruits', 'citric'): ['lime', 'orange'],
    ('solid', 'veggies'): [],
    ('te', 'syrups'): [],
    ('te', 'sweet'): ['sugar'],
    ('te',): [],
    ('others',): []
}
matches = matchCocktails(q)
titles_sort = sorted(matches, key=lambda x: matches[x][0], reverse=True )
max_val = matches[titles_sort[0]][0]
final_match = {}
top_n = 3
c_top = 0
for title in titles_sort:
    if matches[title][0] == max_val or c_top < top_n:
        final_match[title] = matches[title][1]
        c_top += 1
final_match

{'Cava Cocktail Vodka Lemon': {('liquid', 'alcohol', 'soft', 'bw'): [],
  ('liquid', 'alcohol', 'soft'): [],
  ('liquid', 'alcohol'): 'vodka',
  ('liquid', 'juices'): [],
  ('liquid', 'juices', 'citric'): [],
  ('liquid', 'syrups'): [],
  ('solid', 'fruits'): [],
  ('solid', 'fruits', 'citric'): 'lime',
  ('solid', 'veggies'): [],
  ('te', 'sweet'): [],
  ('te',): [],
  ('others',): []},
 'Cocktail appetizer': {('liquid', 'alcohol', 'soft', 'bw'): [],
  ('liquid', 'alcohol', 'soft'): [],
  ('liquid', 'alcohol'): [],
  ('liquid', 'juices'): [],
  ('liquid', 'juices', 'citric'): [],
  ('liquid', 'syrups'): [],
  ('solid', 'fruits'): [],
  ('solid', 'fruits', 'citric'): 'orange',
  ('solid', 'veggies'): [],
  ('te', 'sweet'): 'sugar',
  ('te',): [],
  ('others',): []},
 'Light cocktail martini': {('liquid', 'alcohol', 'soft', 'bw'): [],
  ('liquid', 'alcohol', 'soft'): [],
  ('liquid', 'alcohol'): [],
  ('liquid', 'juices'): [],
  ('liquid', 'juices', 'citric'): [],
  ('liquid', 'syrups')

In [166]:
len(unique_ingredients), unique_ingredients

(157,
 {'amber rum',
  'angostura bitter',
  'anise basil',
  'apple',
  'apple cider',
  'apple juice',
  'apple liqueur',
  'apricot',
  'apricot juice',
  'apricot liqueur',
  'banana',
  'banana juice',
  'beer',
  'benedictine',
  'berry',
  'berry juice',
  'blackcurrant',
  'blood orange',
  'blue curacao',
  'blush wine',
  'brown sugar',
  'brown_sugar',
  'cachaca',
  'calvados',
  'campari',
  'cane sugar',
  'cane sugar syrup',
  'cava',
  'celery salt',
  'champagne',
  'cherry',
  'cherry juice',
  'cinnamon',
  'citrus fruit',
  'clementine orange',
  'coca-cola',
  'coconut',
  'coconut milk',
  'coffee',
  'coffee liqueur',
  'cognac',
  'cointreau',
  'coriander',
  'cranberry juice',
  'cremant',
  'creme de cafe',
  'creme de cassis',
  'crushed ice',
  'cucumbers',
  'currant',
  'currant syrup',
  'dark rum',
  'dry white wine',
  'egg',
  'fruit',
  'fruit syrup',
  'get_27',
  'gin',
  'ginger',
  'grand marnier',
  'granulated sugar',
  'grape juice',
  'grapef

## Check similarity with wordnet

In [167]:
words_found = []
words_notfound = []
for w in unique_ingredients:
    syn = wn.synsets(w)
    if not syn:
        words_notfound.append(w)
    else:
        words_found.append(w)


In [168]:
len(words_notfound), len(words_found)

(102, 55)

In [169]:
# split the multi-word ingredients
ingredient_words = []
for ingr in all_ingredients:
    if '_' in ingr:
        print(ingr)
        ingr = ingr.replace('_',' ')
    if ingr == 'get_27':
        ingr = 'get27'
    elif ingr == 'manguo juice':
        ingr = 'mango juice'
    #elif ingr == 'cachaça':
    #    ingr = 'cachaca'
    ingredient_words.extend(ingr.split())

noilly_prat
brown_sugar
get_27


In [170]:
def w2v_get_weights(words, n_words, emb_dim, w2v_model):
    c = 0
    not_found = []
    embedding_matrix = np.zeros((n_words, emb_dim))
    for i, word in enumerate(words):
        try:
            embedding_vector = w2v_model.wv[word]
        except KeyError:
            embedding_vector = None
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            c += 1
            not_found.append(word)
    print(c, 'were not found')
    return embedding_matrix, not_found

In [171]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

dir_folder = 'glove'
glove_name = 'glove_42B_300d.txt'
weights_file = path.join(dir_folder, glove_name.split('.')[0] + '.npy')
glove_file = path.join(dir_folder, glove_name)
emb_file = '2' + glove_name
w2v_glove_file = path.join(dir_folder, emb_file)


if not path.exists(w2v_glove_file):
    glove2word2vec(glove_file, w2v_glove_file)

if not path.exists(weights_file):
    w2v_model = KeyedVectors.load_word2vec_format(w2v_glove_file, binary=False)
    emb_weights, not_found = w2v_get_weights(ingredient_words,
                                len(ingredient_words),
                                300,
                                w2v_model)
    del w2v_model
    np.save(path.join(dir_folder,weights_file), emb_weights)
else:
    emb_weights = np.load(weights_file)

In [None]:
# print embedding file
embeddings_index = {}
with open(path.join(dir_folder, glove_name)) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [173]:
emb_weights.shape

(781, 300)