**------------------------------------------------------------------------------------------------------------------------------------------------------**

**Input: "Food" Embeddings**

**Evaluates the model by computing MAP, MRR, and RR@k for k in {5, 10}**

**Output: Metrics**

**------------------------------------------------------------------------------------------------------------------------------------------------------**

# Libraries

In [1]:
import pandas as pd
import numpy as np
import random
import ast
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import warnings
warnings.simplefilter("ignore")



In [2]:
# Name of the embedding model under evaluation. It is interpolated into the
# folder and file names of the embedding and test-set files loaded below.
# Alternative: vModel = 'GraphSAGE'
vModel = 'GAT'

In [5]:
# Load the food embeddings (word2vec text format) written for the selected model;
# this reads the file with index 0 under the k_fold_5_kg output folder.
embeddings_path = f'../Output/k_fold_5_kg/{vModel}/{vModel}_0_food_embeddings.txt'
word_vectors = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)

# Sanity check: show the five nearest neighbours of one food URI.
word_vectors.most_similar('http://idea.rpi.edu/heals/kb/usda#01001', topn=5)

[('http://idea.rpi.edu/heals/kb/usda#09201', 0.9915021657943726),
 ('http://idea.rpi.edu/heals/kb/usda#07010', 0.9912489652633667),
 ('http://idea.rpi.edu/heals/kb/usda#09152', 0.98865807056427),
 ('http://idea.rpi.edu/heals/kb/usda#10864', 0.9885743856430054),
 ('http://idea.rpi.edu/heals/kb/usda#04582', 0.9872804880142212)]

**Get Food Categories**

In [6]:
# Read the food-category table and left-pad every NDB number with zeros to a
# width of five characters (the URIs built from it below use this padded form).
df = pd.read_csv('../Input Data/food_category.csv').assign(
    NDB_No=lambda table: table['NDB_No'].astype(str).str.pad(5, side='left', fillchar='0')
)

In [7]:
# Lookup tables derived from the category table:
#   food2cat       : food URI -> food group description
#   food_label_map : food URI -> long description (surrounding whitespace stripped)
#   food_id_map    : long description -> food URI
food2cat, food_label_map, food_id_map = {}, {}, {}
for ndb_no, long_desc, group in zip(df['NDB_No'], df['Long_Desc'], df['FdGrp_Desc']):
    food_uri = 'http://idea.rpi.edu/heals/kb/usda#' + ndb_no
    food_name = long_desc.strip()
    food2cat[food_uri] = group
    food_label_map[food_uri] = food_name
    food_id_map[food_name] = food_uri

**Utils to Evaluate Model**

In [8]:
def get_simscore_ingrank_category_multisamerank(query_food, subs, vectors=None, categories=None):
    """Return the 1-based rank of `subs` among the neighbours of `query_food`.

    The neighbours of `query_food` are walked in decreasing similarity order,
    but only those that have a known category equal to the category of `subs`
    are counted. The rank is the position of `subs` inside that filtered list.

    Parameters
    ----------
    query_food : key of the food whose neighbours are ranked.
    subs : key of the candidate substitute to locate.
    vectors : optional embedding container exposing `index_to_key` and
        `most_similar(key, topn=...)`; defaults to the notebook-level
        `word_vectors`.
    categories : optional mapping food key -> category; defaults to the
        notebook-level `food2cat`.

    Returns
    -------
    int
        The rank of `subs` (1 = most similar same-category neighbour), or the
        vocabulary size when `subs` is not found or has no known category.

    Any exception raised by `most_similar` (e.g. for a `query_food` that is
    not in the vocabulary) is propagated unchanged.
    """
    # Resolve the defaults at call time so the notebook globals can be reloaded.
    if vectors is None:
        vectors = word_vectors
    if categories is None:
        categories = food2cat

    # Vocabulary size doubles as the "not found" (worst possible) rank.
    TOPK = len(vectors.index_to_key)

    # A substitute without a category can never pass the category filter.
    if subs not in categories:
        return TOPK
    target_category = categories[subs]  # loop-invariant, looked up once

    same_category_seen = 0
    for candidate, _similarity in vectors.most_similar(query_food, topn=TOPK):
        if candidate not in categories or target_category != categories[candidate]:
            continue
        if candidate == subs:
            return same_category_seen + 1
        same_category_seen += 1
    return TOPK

In [9]:
def _average_precision(relevant_ranks):
    """Average precision of one query given the 1-based ranks of its relevant items."""
    if not relevant_ranks:
        return 0.0
    # Precision at each relevant item = (#relevant items ranked at or above it) / its rank.
    precisions = [
        sum(1 for other in relevant_ranks if other <= rank) / rank
        for rank in relevant_ranks
    ]
    return np.mean(precisions)


def evaluate_model(scraped_subs_dict, rank_fn=None):
    """Print MAP, MRR, RR@5 and RR@10 for a ground truth of substitutions.

    Parameters
    ----------
    scraped_subs_dict : mapping query food -> collection of ground-truth
        substitutes.
    rank_fn : optional callable `(query_food, subs) -> rank` returning a
        1-based rank; defaults to `get_simscore_ingrank_category_multisamerank`.

    Returns
    -------
    (top_5_food, low_5_food) : two lists of query foods, in sorted query order.
        `top_5_food` holds the queries whose best-ranked substitute has rank
        <= 5; `low_5_food` holds the queries whose best rank is > 10.

    Printed metrics
    ---------------
    MAP   : mean over queries of the average precision of their substitutes.
    MRR   : mean over queries of 1 / (best rank among their substitutes).
    RR@k  : fraction of queries whose best rank is <= k (k = 5, 10).

    An empty `scraped_subs_dict` yields no metrics (they would be 0/0); a
    notice is printed and two empty lists are returned.
    """
    top_5_food = []
    low_5_food = []

    n_queries = len(scraped_subs_dict)
    print('number of ings: ', n_queries)
    if n_queries == 0:
        # Avoids the division by zero in RR@k and the NaN means.
        print('No query foods to evaluate; metrics are undefined.')
        return top_5_food, low_5_food

    if rank_fn is None:
        rank_fn = get_simscore_ingrank_category_multisamerank

    reciprocal_ranks = []
    average_precisions = []
    in_top_5 = 0
    in_top_10 = 0

    for query_food in sorted(scraped_subs_dict):
        relevant_ranks = [rank_fn(query_food, subs) for subs in scraped_subs_dict[query_food]]
        # Sentinel keeps the reciprocal rank of a query without substitutes at ~0.
        best_rank = min(relevant_ranks, default=9999999999999999)
        reciprocal_ranks.append(1.0 / best_rank)

        if best_rank <= 5:
            in_top_5 += 1
            top_5_food.append(query_food)
        if best_rank <= 10:
            in_top_10 += 1
        else:
            low_5_food.append(query_food)

        average_precisions.append(_average_precision(relevant_ranks))

    print(f'MAP: {(np.mean(average_precisions)):.3f}')
    print(f'MRR: {(np.mean(reciprocal_ranks)):.3f}')
    print(f'RR@5: {(in_top_5/n_queries):.3f}')
    print(f'RR@10: {(in_top_10/n_queries):.3f}')

    return top_5_food, low_5_food

**Get Ground Truth**

In [10]:
# Load the ground-truth substitution pairs (semicolon-separated CSV).
subs_df = pd.read_csv('../Input Data/final_substitution.csv', sep=';')

# Fail early, with a readable message, if the two columns used to build the
# ground truth are absent (still a KeyError, as a direct column access would be).
missing_columns = {'Food id', 'Substitution id'} - set(subs_df.columns)
if missing_columns:
    raise KeyError(f'final_substitution.csv is missing columns: {sorted(missing_columns)}')

In [11]:
# Ground truth as {food URI: set of substitute URIs}. A pair is kept only when
# both URIs have an embedding and a known category; other pairs are reported.
scraped_subs_dict = {}

for food, subs in zip(subs_df['Food id'], subs_df['Substitution id']):
    usable = all(uri in word_vectors and uri in food2cat for uri in (food, subs))
    if not usable:
        print (food, subs, 'not in embeddings')
        continue
    scraped_subs_dict.setdefault(food, set()).add(subs)

**Evaluate Model (against a ground truth restricted to the foods that are also in the test set)**

In [12]:
# Foods listed in the test file written for the selected model.
foods = pd.read_csv(f'../Output/k_fold_5_kg/{vModel}/{vModel}_0_foods_2_test.csv')

# Keep only the ground-truth entries whose query food appears in that file;
# test foods without any ground truth are reported and skipped.
new_scraped_subs_dict = {}
for food in np.unique(np.asarray(foods['id'])):
    if food not in scraped_subs_dict:
        print (food, 'not in ground truth')
        continue
    new_scraped_subs_dict[food] = scraped_subs_dict[food]

**Get Nutri-Scores + Nutri-Values**

In [13]:
# Map every food identifier (`NDB_No` column) to its `nutri_values` entry.
nutri_scores = pd.read_csv('../Output/nutri_scores.csv')
food_2_score = {
    record['NDB_No']: record['nutri_values']
    for _, record in nutri_scores.iterrows()
}

**Get some Examples**

In [14]:
# Evaluate on the test-set-restricted ground truth. The call prints the metrics
# and returns the query foods whose best-ranked substitute is within the top 5
# (top_5_food) and those whose best rank is worse than 10 (low_5_food).
top_5_food, low_5_food = evaluate_model(new_scraped_subs_dict)

number of ings:  121
MAP: 0.331
MRR: 0.508
RR@5: 0.645
RR@10: 0.678


In [19]:
# First five query foods whose best-ranked substitute has rank <= 5
top_5_food[:5]

['http://idea.rpi.edu/heals/kb/usda#01001',
 'http://idea.rpi.edu/heals/kb/usda#01004',
 'http://idea.rpi.edu/heals/kb/usda#01006',
 'http://idea.rpi.edu/heals/kb/usda#01011',
 'http://idea.rpi.edu/heals/kb/usda#01020']

In [20]:
# First five query foods whose best-ranked substitute has rank > 10
low_5_food[:5]

['http://idea.rpi.edu/heals/kb/usda#01184',
 'http://idea.rpi.edu/heals/kb/usda#01238',
 'http://idea.rpi.edu/heals/kb/usda#04641',
 'http://idea.rpi.edu/heals/kb/usda#06080',
 'http://idea.rpi.edu/heals/kb/usda#06116']

In [24]:
# Food URI inspected in the example cells below
query_food = 'http://idea.rpi.edu/heals/kb/usda#01001'

In [25]:
# Value stored for the query food in food_2_score
# (taken from the `nutri_values` column of nutri_scores.csv)
food_2_score[query_food]

25

In [26]:
# Ground truth: substitutes recorded for the query food in the
# test-set-restricted dictionary (a set, so the listing order is arbitrary)
list(new_scraped_subs_dict[query_food])

['http://idea.rpi.edu/heals/kb/usda#04053',
 'http://idea.rpi.edu/heals/kb/usda#04668',
 'http://idea.rpi.edu/heals/kb/usda#04034',
 'http://idea.rpi.edu/heals/kb/usda#04615',
 'http://idea.rpi.edu/heals/kb/usda#04518',
 'http://idea.rpi.edu/heals/kb/usda#04556',
 'http://idea.rpi.edu/heals/kb/usda#04582',
 'http://idea.rpi.edu/heals/kb/usda#04073',
 'http://idea.rpi.edu/heals/kb/usda#04128',
 'http://idea.rpi.edu/heals/kb/usda#04669',
 'http://idea.rpi.edu/heals/kb/usda#04047',
 'http://idea.rpi.edu/heals/kb/usda#04581',
 'http://idea.rpi.edu/heals/kb/usda#04060',
 'http://idea.rpi.edu/heals/kb/usda#04601',
 'http://idea.rpi.edu/heals/kb/usda#04679']

In [27]:
# Print the category-filtered rank of every ground-truth substitute of the query food
for subs in scraped_subs_dict[query_food]:
    rank = get_simscore_ingrank_category_multisamerank(query_food, subs)
    print(subs, rank, sep=' - ')

http://idea.rpi.edu/heals/kb/usda#04053 - 3
http://idea.rpi.edu/heals/kb/usda#04668 - 25
http://idea.rpi.edu/heals/kb/usda#04034 - 109
http://idea.rpi.edu/heals/kb/usda#04615 - 7
http://idea.rpi.edu/heals/kb/usda#04518 - 9
http://idea.rpi.edu/heals/kb/usda#04556 - 1
http://idea.rpi.edu/heals/kb/usda#04582 - 4
http://idea.rpi.edu/heals/kb/usda#04073 - 5
http://idea.rpi.edu/heals/kb/usda#04128 - 80
http://idea.rpi.edu/heals/kb/usda#04669 - 10
http://idea.rpi.edu/heals/kb/usda#04047 - 6
http://idea.rpi.edu/heals/kb/usda#04581 - 27
http://idea.rpi.edu/heals/kb/usda#04060 - 22
http://idea.rpi.edu/heals/kb/usda#04601 - 13
http://idea.rpi.edu/heals/kb/usda#04679 - 14


In [28]:
# Candidates proposed by our algorithm: among the 200 foods most similar to the
# query food, list those in the query food's category whose score in
# food_2_score is strictly lower than the query food's score.
# NOTE: the listing is not capped at five entries; every match is printed.
query_category = food2cat[query_food]  # loop-invariant lookups done once
query_score = food_2_score[query_food]
mostSimilar = word_vectors.most_similar(query_food, topn=200)
for e, sim in mostSimilar:
    if e not in food2cat or query_category != food2cat[e]:
        continue
    # A candidate without a score cannot be compared; skip it rather than raise KeyError.
    if e not in food_2_score or food_2_score[e] >= query_score:
        continue
    print(str(e) + ' - ' + str(food_2_score[e]))

http://idea.rpi.edu/heals/kb/usda#01012 - 1
http://idea.rpi.edu/heals/kb/usda#01036 - 5
http://idea.rpi.edu/heals/kb/usda#01156 - 14
http://idea.rpi.edu/heals/kb/usda#01004 - 19
http://idea.rpi.edu/heals/kb/usda#01011 - 15
http://idea.rpi.edu/heals/kb/usda#01095 - 16
http://idea.rpi.edu/heals/kb/usda#01019 - 18
http://idea.rpi.edu/heals/kb/usda#01020 - 17
http://idea.rpi.edu/heals/kb/usda#01118 - -2
http://idea.rpi.edu/heals/kb/usda#01023 - 17
http://idea.rpi.edu/heals/kb/usda#01088 - 1
http://idea.rpi.edu/heals/kb/usda#01009 - 17
http://idea.rpi.edu/heals/kb/usda#01090 - 23
http://idea.rpi.edu/heals/kb/usda#01085 - -1
http://idea.rpi.edu/heals/kb/usda#01096 - 4
http://idea.rpi.edu/heals/kb/usda#01179 - 5
http://idea.rpi.edu/heals/kb/usda#01052 - 12
http://idea.rpi.edu/heals/kb/usda#01032 - 20
http://idea.rpi.edu/heals/kb/usda#01040 - 9
http://idea.rpi.edu/heals/kb/usda#01056 - 11
http://idea.rpi.edu/heals/kb/usda#01256 - -5
http://idea.rpi.edu/heals/kb/usda#01059 - -2
http://idea.rpi.