**------------------------------------------------------------------------------------------------------------------------------------------------------**

k-fold cross-validation

**Input: "Food" Embeddings**

**Evaluates Model by calculating MAP, MRR, and RR@[5, 10]**

**Output: Metrics**

**------------------------------------------------------------------------------------------------------------------------------------------------------**

# Libraries

In [17]:
import pandas as pd
import numpy as np
import random
import ast
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import warnings
warnings.simplefilter("ignore")

In [18]:
use_filter = 1
#vModel = 'GraphSAGE'
vModel = 'GAT'

In [19]:
k_fold = 10

**Get Food Categories**

In [20]:
df = pd.read_csv('../Input Data/food_category.csv')
df['NDB_No'] = df['NDB_No'].astype(str).str.rjust(5,'0')

In [21]:
food2cat = {}
food_label_map = {}
food_id_map = {}
for i, row in df.iterrows():
    uri = 'http://idea.rpi.edu/heals/kb/usda#' + row['NDB_No']
    label = row['Long_Desc'].strip()
    cat = row['FdGrp_Desc']
    food2cat[uri] = cat
    food_label_map[uri] = label
    food_id_map[label] = uri

**Utils to Evaluate Model**

In [22]:
def get_simscore_ingrank_multisamerank(query_food, subs):
    # return rank based on KG embedding 
    TOPK =  len(list(word_vectors.index_to_key))
    mostSimilar = word_vectors.most_similar(query_food, topn=TOPK)
    for i,(e, sim) in enumerate(mostSimilar):
        if e == subs:
            return i+1    
    return TOPK

In [23]:
def get_simscore_ingrank_category_multisamerank(query_food, subs):
    # return rank based on KG embedding + foods' category
    TOPK = len(list(word_vectors.index_to_key))
    mostSimilar = word_vectors.most_similar(query_food, topn=TOPK)
    i = 0
    for e, sim in mostSimilar:
        if e not in food2cat or food2cat[subs] != food2cat[e]: continue
        if e == subs:
            return i+1 
        i+=1
    return TOPK

In [71]:
def evaluate_model(scraped_subs_dict, specific_food_group):
    
    count = 0
    
    rank_scores = []
    ave_p = []
    in_top_5 = 0
    in_top_10 = 0

    for query_food in sorted(scraped_subs_dict.keys()):
        
        if food2cat[query_food] != specific_food_group:
            continue
        count = count+1
        
        relevant_ranks = []
        min_rank = 9999999999999999
        for subs in scraped_subs_dict[query_food]:
            if use_filter == 0:
                rank = get_simscore_ingrank_multisamerank(query_food, subs)
            elif use_filter == 1:
                rank = get_simscore_ingrank_category_multisamerank(query_food, subs)   
            relevant_ranks.append(rank)
            if rank < min_rank:
                min_rank = rank
        rank = min_rank
        rank_scores.append(1.0/rank)
        if min_rank <= 5:
            in_top_5 += 1
        if min_rank <= 10:
            in_top_10 += 1
        precisions = []
        for rank in relevant_ranks:
            good_docs = len([r for r in relevant_ranks if r <= rank])
            precisions.append(good_docs/rank)
        if len(precisions) == 0:
            precisions = [0]
        ave_p.append(np.mean(precisions))
        
    if count != 0:
        return (np.mean(ave_p)), (np.mean(rank_scores)), (in_top_5/count), (in_top_10/count)
    else:
        return 0,0,0,0

**Get Ground Truth**

In [72]:
subs_df = pd.read_csv('../Input Data/final_substitution.csv', sep=';')
subs_df['Food id'] = subs_df['Food id']
subs_df['Substitution id'] = subs_df['Substitution id']

In [73]:
def get_scraped_subs_dict(word_vectors):
    
    scraped_subs_dict = dict()

    for i,row in subs_df.iterrows():
        food = row['Food id']
        subs = row['Substitution id']
        if food in word_vectors and subs in word_vectors and food in food2cat and subs in food2cat:
            if food not in scraped_subs_dict:
                scraped_subs_dict[food]= set()

            scraped_subs_dict[food].add(subs)
        else:
            print (food, subs, 'not in embeddings')
            
    return scraped_subs_dict

**Get Evaluations**

In [74]:
specific_food_group = 'Fats and Oils'

In [75]:
df

Unnamed: 0,NDB_No,Long_Desc,FdGrp_Cd,FdGrp_Desc
0,01001,"Butter, salted",100,Dairy and Egg Products
1,01002,"Butter, whipped, with salt",100,Dairy and Egg Products
2,01003,"Butter oil, anhydrous",100,Dairy and Egg Products
3,01004,"Cheese, blue",100,Dairy and Egg Products
4,01005,"Cheese, brick",100,Dairy and Egg Products
...,...,...,...,...
8784,36629,"Restaurant, Chinese, orange chicken",3600,Restaurant Foods
8785,36630,"Restaurant, Italian, spaghetti with meat sauce",3600,Restaurant Foods
8786,36631,"OLIVE GARDEN, spaghetti with meat sauce",3600,Restaurant Foods
8787,36632,"CARRABBA'S ITALIAN GRILL, spaghetti with meat ...",3600,Restaurant Foods


In [None]:
for specific_food_group in np.unique(df['FdGrp_Desc']):
    
    print('--------------------------------------------------')
    print(specific_food_group)
    
    mean_average_precision = []
    mean_reciprocal_rank = []
    recall_rate_5 = []
    recall_rate_10 = []

    for i in range(k_fold): 

        #Get Embeddings and Ground Truth
        word_vectors = KeyedVectors.load_word2vec_format(f'../Output/k_fold_10_kg/GAT/{vModel}_{i}_food_embeddings.txt', binary=False)
        scraped_subs_dict = get_scraped_subs_dict(word_vectors)

        #Restrict Ground Truth to the Test Foods
        foods = pd.read_csv(f'../Output/k_fold_10_kg/GAT/{vModel}_{i}_foods_2_test.csv')
        new_scraped_subs_dict = dict()
        for food in np.unique(np.array(foods['id'])):
            if food in scraped_subs_dict:
                new_scraped_subs_dict[food] = scraped_subs_dict[food]
            else:
                print (food, 'not in ground truth')

        map_, mrr_, rr5_, rr10_ = evaluate_model(new_scraped_subs_dict, specific_food_group)
        mean_average_precision.append(map_)
        mean_reciprocal_rank.append(mrr_)
        recall_rate_5.append(rr5_)
        recall_rate_10.append(rr10_)

    print(round(np.array(mean_average_precision).mean(), 3))
    print(round(np.array(mean_reciprocal_rank).mean(), 3))
    print(round(np.array(recall_rate_5).mean(), 3))
    print(round(np.array(recall_rate_10).mean(), 3))

--------------------------------------------------
American Indian/Alaska Native Foods
0.0
0.0
0.0
0.0
--------------------------------------------------
Baby Foods
0.0
0.0
0.0
0.0
--------------------------------------------------
Baked Products
0.0
0.0
0.0
0.0
--------------------------------------------------
Beef Products
1.44
0.896
0.933
0.933
--------------------------------------------------
Beverages
0.0
0.0
0.0
0.0
--------------------------------------------------
Breakfast Cereals
0.0
0.0
0.0
0.0
--------------------------------------------------
Cereal Grains and Pasta
0.261
0.396
0.652
0.756
--------------------------------------------------
Dairy and Egg Products
0.346
0.631
0.788
0.855
--------------------------------------------------
Fast Foods
0.0
0.0
0.0
0.0
--------------------------------------------------
Fats and Oils
0.732
0.842
0.898
0.922
--------------------------------------------------
Finfish and Shellfish Products
0.215
0.408
0.574
0.841
-----------------