**------------------------------------------------------------------------------------------------------------------------------------------------------**

**Input: "Food" Embeddings**

**Evaluates the model by computing MAP, MRR, and RR@5 / RR@10**

**Output: Metrics**

**------------------------------------------------------------------------------------------------------------------------------------------------------**

# Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import ast
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import warnings
warnings.simplefilter("ignore")

In [None]:
# Model whose outputs are evaluated; the value is interpolated into the
# '../Output/{vModel}_...' file names read further down.
# Alternative: vModel = 'GraphSAGE'
vModel = 'GAT'

In [None]:
# Load the food embeddings written for the selected model (text word2vec format).
embeddings_path = f'../Output/{vModel}_food_embeddings.txt'
word_vectors = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)

# Sanity check: display the five nearest neighbours of one food URI.
word_vectors.most_similar('http://idea.rpi.edu/heals/kb/usda#01001', topn=5)

**Get Food Categories**

In [None]:
# Read the food-category table and turn NDB_No into a string that is
# left-padded with '0' to at least five characters, so it can be appended
# to the USDA URI prefix when the lookup tables are built.
df = pd.read_csv('../Input Data/food_category.csv').assign(
    NDB_No=lambda frame: frame['NDB_No'].astype(str).str.pad(5, side='left', fillchar='0')
)

In [None]:
# Build the lookup tables from the category table:
#   food2cat:       food URI -> food group description
#   food_label_map: food URI -> stripped long description
#   food_id_map:    stripped long description -> food URI
usda_prefix = 'http://idea.rpi.edu/heals/kb/usda#'
food2cat, food_label_map, food_id_map = {}, {}, {}
for ndb_no, long_desc, group in zip(df['NDB_No'], df['Long_Desc'], df['FdGrp_Desc']):
    food_uri = usda_prefix + ndb_no
    food_name = long_desc.strip()
    food2cat[food_uri] = group
    food_label_map[food_uri] = food_name
    food_id_map[food_name] = food_uri

**Utils to Evaluate Model**

In [None]:
def get_simscore_ingrank_multisamerank(query_food, subs):
    """Return the 1-based rank of `subs` among the embedding neighbours of `query_food`.

    The neighbours come from the global `word_vectors.most_similar`, asked for
    as many results as there are keys in `word_vectors.index_to_key`. When
    `subs` is not among the returned neighbours, that key count is returned
    as the rank instead.
    """
    vocab_size = len(word_vectors.index_to_key)
    neighbours = word_vectors.most_similar(query_food, topn=vocab_size)
    ranked_keys = [key for key, _score in neighbours]
    if subs in ranked_keys:
        return ranked_keys.index(subs) + 1
    return vocab_size

In [None]:
def get_simscore_ingrank_category_multisamerank(query_food, subs):
    """Return the 1-based rank of `subs` among same-category neighbours of `query_food`.

    The neighbours of `query_food` (from the global `word_vectors`) are walked
    in the order `most_similar` returns them. Only neighbours that have an
    entry in the global `food2cat` and share the category of `subs` are
    counted, so the rank is relative to the category of `subs`.

    Returns the number of keys in `word_vectors.index_to_key` when `subs` is
    not found, including when `subs` has no entry in `food2cat` (such a
    substitute is skipped by the category filter and can never match).
    """
    vocab_size = len(word_vectors.index_to_key)
    # Queried before the category check so an unknown query food still fails here.
    neighbours = word_vectors.most_similar(query_food, topn=vocab_size)

    if subs not in food2cat:
        # Previously this case raised KeyError or returned the not-found rank
        # depending on which neighbours happened to be categorised.
        return vocab_size
    target_category = food2cat[subs]  # loop-invariant, looked up once

    same_category = (
        candidate
        for candidate, _score in neighbours
        if candidate in food2cat and food2cat[candidate] == target_category
    )
    for position, candidate in enumerate(same_category, start=1):
        if candidate == subs:
            return position
    return vocab_size

In [None]:
def evaluate_model(scraped_subs_dict, rank_fn=None):
    """Print MAP, MRR, RR@5 and RR@10 for a ground-truth substitution mapping.

    Parameters
    ----------
    scraped_subs_dict : dict
        Maps each query food to a collection of its ground-truth substitutes.
    rank_fn : callable, optional
        ``rank_fn(query_food, subs)`` returning the 1-based rank of ``subs``
        for ``query_food``. Defaults to
        ``get_simscore_ingrank_category_multisamerank``.

    Returns
    -------
    None
        The metrics are printed, each formatted to three decimals. For an
        empty mapping only the query count and a notice are printed.
    """
    if rank_fn is None:
        rank_fn = get_simscore_ingrank_category_multisamerank

    n_queries = len(scraped_subs_dict)
    print('number of ings: ', n_queries)
    if n_queries == 0:
        # Nothing to average over; the ratios below would divide by zero.
        print('no query foods to evaluate')
        return

    reciprocal_ranks = []
    average_precisions = []
    in_top_5 = 0
    in_top_10 = 0

    for query_food in sorted(scraped_subs_dict):
        relevant_ranks = [
            rank_fn(query_food, subs) for subs in scraped_subs_dict[query_food]
        ]

        # Best (lowest) rank of any ground-truth substitute; a query without
        # substitutes gets an infinite rank, i.e. a reciprocal rank of 0.
        best_rank = min(relevant_ranks, default=float('inf'))
        reciprocal_ranks.append(1.0 / best_rank)
        if best_rank <= 5:
            in_top_5 += 1
        if best_rank <= 10:
            in_top_10 += 1

        # Precision at each substitute's rank: ground-truth substitutes ranked
        # at or above it, divided by that rank.
        # NOTE(review): if rank_fn can give two substitutes the same rank, this
        # ratio can exceed 1 - confirm that is intended.
        precisions = [
            sum(1 for other in relevant_ranks if other <= rank) / rank
            for rank in relevant_ranks
        ]
        average_precisions.append(np.mean(precisions) if precisions else 0.0)

    print(f'MAP: {(np.mean(average_precisions)):.3f}')
    print(f'MRR: {(np.mean(reciprocal_ranks)):.3f}')
    print(f'RR@5: {(in_top_5/n_queries):.3f}')
    print(f'RR@10: {(in_top_10/n_queries):.3f}')

**Get Ground Truth**

In [None]:
# Ground-truth substitution pairs; the file is ';'-separated.
subs_df = pd.read_csv('../Input Data/final_substitution.csv', sep=';')

# Fail fast if a column read by the following cells is missing. This replaces
# two self-assignments that changed nothing and only raised on a missing column.
missing_columns = {'Food id', 'Substitution id'} - set(subs_df.columns)
if missing_columns:
    raise KeyError(f'final_substitution.csv is missing columns: {sorted(missing_columns)}')

In [None]:
# Ground truth: query food -> set of its substitutes. A pair is kept only when
# both foods are present in the embeddings and in food2cat; every other pair
# is printed and skipped.
scraped_subs_dict = {}

for _idx, pair in subs_df.iterrows():
    food, subs = pair['Food id'], pair['Substitution id']
    has_embedding = food in word_vectors and subs in word_vectors
    has_category = food in food2cat and subs in food2cat
    if not (has_embedding and has_category):
        print (food, subs, 'not in embeddings')
        continue
    scraped_subs_dict.setdefault(food, set()).add(subs)

**Restrict the ground truth to the foods that are also in the model's test set (the model is evaluated against this subset)**

In [None]:
# Keep only the ground-truth entries whose query food appears in the 'id'
# column of this model's test-food file; test foods without ground truth
# are printed.
foods = pd.read_csv(f'../Output/{vModel}_foods_2_test.csv')
test_food_ids = np.unique(np.array(foods['id']))

new_scraped_subs_dict = {}
for food in test_food_ids:
    if food not in scraped_subs_dict:
        print (food, 'not in ground truth')
        continue
    new_scraped_subs_dict[food] = scraped_subs_dict[food]

**Get Nutri-Scores + Nutri-Values**

In [None]:
# Map each value of the 'NDB_No' column to the 'nutri_values' value of its row.
# NOTE(review): later cells index this dict with food URIs, so 'NDB_No'
# presumably holds URIs in this file - confirm.
nutri_scores = pd.read_csv('../Output/nutri_scores.csv')
food_2_score = {
    entry['NDB_No']: entry['nutri_values']
    for _idx, entry in nutri_scores.iterrows()
}

**Evaluate**

In [None]:
# Print the metrics for the ground truth restricted to the test-set foods.
evaluate_model(new_scraped_subs_dict)

**Get Example**

In [None]:
# Define the query food (a food URI) used by the example cells that follow.
query_food = 'http://idea.rpi.edu/heals/kb/usda#01001'

In [None]:
# Nutri-Score value stored for the query food (KeyError if it has no entry).
food_2_score[query_food]

In [None]:
# Ground-truth substitutes of the query food, taken from the mapping that is
# restricted to the test-set foods.
list(new_scraped_subs_dict[query_food])

In [None]:
# Print the rank of every ground-truth substitute of the query food.
# The category-filtered ranking is used here; call
# get_simscore_ingrank_multisamerank instead for the unfiltered ranking.
for subs in scraped_subs_dict[query_food]:
    rank = get_simscore_ingrank_category_multisamerank(query_food, subs)
    print(f'{subs} - {rank}')

In [None]:
# Top 5 suggestions of our algorithm: among the 100 nearest neighbours of the
# query food, keep those that share its food category and whose Nutri-Score
# value is strictly lower than the query food's, and print the first five.
N_SUGGESTIONS = 5
mostSimilar = word_vectors.most_similar(query_food, topn=100)
i = 0  # number of suggestions printed so far
for e, sim in mostSimilar:
    if e not in food2cat or food2cat[query_food] != food2cat[e]:
        continue
    if food_2_score[e] >= food_2_score[query_food]:
        continue
    print(str(e) + ' - ' + str(food_2_score[e]))
    i += 1
    if i == N_SUGGESTIONS:
        # The counter was never advanced before, so every match was printed.
        break