### This notebook evaluates our models by computing Mean Average Precision (MAP), Mean Reciprocal Rank (MRR), RR@5, and RR@10.

In [1]:
import pandas as pd
import numpy as np
import random
import ast
from gensim.models import KeyedVectors
from gensim.test.utils import datapath



In [2]:
# Load the trained food embeddings, stored in the word2vec text format (binary=False).
word_vectors = KeyedVectors.load_word2vec_format('../Output/food_embeddings.txt', binary=False)
# Sanity check: display the 5 nearest neighbours of one food URI together with their similarity scores.
word_vectors.most_similar('http://idea.rpi.edu/heals/kb/usda#01001', topn=5)

[('http://idea.rpi.edu/heals/kb/usda#09221', 0.9428136944770813),
 ('http://idea.rpi.edu/heals/kb/usda#04584', 0.8614482879638672),
 ('http://idea.rpi.edu/heals/kb/usda#09404', 0.8404527306556702),
 ('http://idea.rpi.edu/heals/kb/usda#04529', 0.8273051381111145),
 ('http://idea.rpi.edu/heals/kb/usda#04582', 0.8045340180397034)]

**Food Categories**

In [3]:
# Read the food/category table and left-pad every NDB number with zeros to
# 5 characters so it can be appended to the food URI prefix in the next cell.
df = (
    pd.read_csv('../Input Data/food_category.csv')
    .assign(NDB_No=lambda frame: frame['NDB_No'].astype(str).str.rjust(5, '0'))
)

In [4]:
# Build the three lookup tables column-wise instead of row by row:
#   food2cat        food URI   -> food group description
#   food_label_map  food URI   -> stripped long description
#   food_id_map     label      -> food URI
# On duplicate keys the last row wins, exactly as with sequential assignment.
food_uris = ['http://idea.rpi.edu/heals/kb/usda#' + ndb_no for ndb_no in df['NDB_No']]
food_labels = [long_desc.strip() for long_desc in df['Long_Desc']]

food2cat = dict(zip(food_uris, df['FdGrp_Desc']))
food_label_map = dict(zip(food_uris, food_labels))
food_id_map = dict(zip(food_labels, food_uris))

**Functions**

In [5]:
def get_simscore_ingrank_onlyss_multisamerank(query_food, subs):
    """Rank of `subs` among the embedding neighbours of `query_food`.

    The neighbour list is the one returned by ``word_vectors.most_similar``
    when asked for as many entries as there are keys in the vocabulary.

    Returns the 1-based position of `subs` in that list, or the vocabulary
    size when `subs` does not appear in it.
    """
    vocab_size = len(word_vectors.index_to_key)
    neighbours = word_vectors.most_similar(query_food, topn=vocab_size)
    return next(
        (
            position
            for position, (candidate, _score) in enumerate(neighbours, start=1)
            if candidate == subs
        ),
        vocab_size,  # fallback rank when the substitute is never encountered
    )

In [6]:
def get_simscore_ingrank_category_multisamerank(query_food, subs):
    """Rank of `subs` among the neighbours of `query_food` that share its category.

    Walks the list returned by ``word_vectors.most_similar`` and counts only
    neighbours that have an entry in ``food2cat`` whose category equals the
    category of `subs`; every other neighbour is skipped.

    Returns the 1-based position of `subs` within that filtered sequence, or
    the vocabulary size when `subs` is never reached.
    """
    vocab_size = len(word_vectors.index_to_key)
    neighbours = word_vectors.most_similar(query_food, topn=vocab_size)
    same_category_seen = 0
    for candidate, _score in neighbours:
        # Only foods with a known category equal to the substitute's count.
        if candidate in food2cat and food2cat[subs] == food2cat[candidate]:
            if candidate == subs:
                return same_category_seen + 1
            same_category_seen += 1
    return vocab_size

In [7]:
def mrr_map_new(scraped_subs_dict, opt = 1):
    """Print MAP, MRR, RR@5 and RR@10 for a set of ground-truth substitutions.

    Parameters
    ----------
    scraped_subs_dict : dict
        Maps each query food to a collection of its ground-truth substitutes.
    opt : int, default 1
        Ranking strategy used for every (query, substitute) pair:
        1 -> ``get_simscore_ingrank_onlyss_multisamerank``,
        2 -> ``get_simscore_ingrank_category_multisamerank``.

    Raises
    ------
    ValueError
        If `opt` is neither 1 nor 2.

    Notes
    -----
    The metrics are printed, not returned. For each query, the best (smallest)
    rank among its substitutes drives MRR and the RR@k counters, while average
    precision is the mean over substitutes of
    ``(#substitutes ranked at or above this one) / rank``.
    A query without any substitute contributes 0 to MAP and MRR and is not
    counted as a hit for RR@5 / RR@10.
    """
    # Validate the strategy up-front: an unknown value used to surface as an
    # UnboundLocalError, or silently reuse the rank left over from a
    # previous query.
    if opt == 1:
        rank_of = get_simscore_ingrank_onlyss_multisamerank
    elif opt == 2:
        rank_of = get_simscore_ingrank_category_multisamerank
    else:
        raise ValueError(f'opt must be 1 or 2, got {opt!r}')

    n_queries = len(scraped_subs_dict)
    print('number of ings: ', n_queries)
    if n_queries == 0:
        # Nothing to average over; avoid dividing by zero further down.
        print('no query foods to evaluate; metrics are undefined')
        return

    rank_scores = []
    ave_p = []
    in_top_5 = 0
    in_top_10 = 0

    for query_food in sorted(scraped_subs_dict.keys()):
        relevant_ranks = [rank_of(query_food, subs) for subs in scraped_subs_dict[query_food]]
        if not relevant_ranks:
            # No ground truth for this query: it scores zero everywhere.
            rank_scores.append(0.0)
            ave_p.append(0.0)
            continue

        min_rank = min(relevant_ranks)
        rank_scores.append(1.0 / min_rank)
        if min_rank <= 5:
            in_top_5 += 1
        if min_rank <= 10:
            in_top_10 += 1

        # Precision at the rank of each relevant item; substitutes that share
        # a rank all count as retrieved at that rank.
        precisions = [
            sum(1 for other in relevant_ranks if other <= rank) / rank
            for rank in relevant_ranks
        ]
        ave_p.append(np.mean(precisions))

    print(f'MAP: {(np.mean(ave_p)):.4f}')
    print(f'MRR: {(np.mean(rank_scores)):.4f}')
    print(f'RR@5: {(in_top_5/n_queries):.4f}')
    print(f'RR@10: {(in_top_10/n_queries):.4f}')

**Ground Truth**

In [8]:
# Ground-truth (food, substitution) pairs; the file is ';'-separated.
subs_df = pd.read_csv('../Input Data/final_substitution.csv', sep=';')

# The original cell re-assigned both id columns to themselves, which changes
# nothing. Check explicitly that the columns read by the next cell exist.
missing_columns = {'Food id', 'Substitution id'} - set(subs_df.columns)
assert not missing_columns, f'final_substitution.csv is missing columns: {sorted(missing_columns)}'

In [9]:
# Group the ground-truth substitutes by query food. A pair is kept only when
# both ids are present in the embeddings and in the category table; every
# other pair is reported and dropped.
scraped_subs_dict = {}

for food, subs in zip(subs_df['Food id'], subs_df['Substitution id']):
    pair_is_usable = all(uri in word_vectors and uri in food2cat for uri in (food, subs))
    if not pair_is_usable:
        print(food, subs, 'not in embeddings')
        continue
    scraped_subs_dict.setdefault(food, set()).add(subs)

**Evaluation**

In [10]:
# Restrict the ground truth to the foods that were in the test set.
foods = pd.read_csv('../Output/foods_2_test.csv')

test_food_ids = np.unique(np.array(foods['id']))
new_scraped_subs_dict = {
    food_id: scraped_subs_dict[food_id]  # KeyError if a test food has no ground truth
    for food_id in test_food_ids
}

In [11]:
# opt=2 selects the category-filtered ranking function.
mrr_map_new(new_scraped_subs_dict, opt=2)

number of ings:  216
MAP: 0.0940
MRR: 0.2394
RR@5: 0.3750
RR@10: 0.4491
