In [10]:
import pandas as pd
import numpy as np
import random
import ast
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

In [11]:
# Load the food embeddings stored in plain-text word2vec format (binary=False).
embeddings_path = '../Output/food_embeddings.txt'
word_vectors = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)

# Sanity check: display the five nearest neighbours of one food URI.
probe_uri = 'http://idea.rpi.edu/heals/kb/usda#01001'
word_vectors.most_similar(probe_uri, topn=5)

[('http://idea.rpi.edu/heals/kb/usda#04002', 0.8940567970275879),
 ('http://idea.rpi.edu/heals/kb/usda#20018', 0.8897666931152344),
 ('http://idea.rpi.edu/heals/kb/usda#09079', 0.8848801851272583),
 ('http://idea.rpi.edu/heals/kb/usda#20317', 0.8738973736763),
 ('http://idea.rpi.edu/heals/kb/usda#20016', 0.8738973736763)]

### Food Categories

In [12]:
# Food category table. NDB numbers are rendered as 5-character, left
# zero-padded strings so they can be appended to the USDA URI prefix below.
df = pd.read_csv('../Input Data/food_category.csv')
ndb_codes = df['NDB_No'].astype(str)
df['NDB_No'] = ndb_codes.str.pad(width=5, side='left', fillchar='0')

In [13]:
# Lookup tables built from the category table:
#   food2cat       : food URI   -> food group description
#   food_label_map : food URI   -> stripped long description
#   food_id_map    : food label -> food URI (later rows win on duplicate labels)
food2cat, food_label_map, food_id_map = {}, {}, {}
for ndb_no, long_desc, food_group in zip(df['NDB_No'], df['Long_Desc'], df['FdGrp_Desc']):
    food_uri = 'http://idea.rpi.edu/heals/kb/usda#' + ndb_no
    food_name = long_desc.strip()
    food2cat[food_uri] = food_group
    food_label_map[food_uri] = food_name
    food_id_map[food_name] = food_uri

### Functions

In [14]:
def get_simscore_ingrank_onlyss_multisamerank(fromt, asdf):
    """Return the rank of ``asdf`` among the embedding neighbours of ``fromt``.

    Every key in ``word_vectors`` is requested from ``most_similar`` and the
    result list is scanned in the order returned.

    Parameters
    ----------
    fromt : key of the query food in ``word_vectors``.
    asdf : key of the candidate substitute whose rank is wanted.

    Returns
    -------
    int
        1-based position of ``asdf`` in the neighbour list, or the vocabulary
        size when ``asdf`` does not appear in it.
    """
    vocab_size = len(word_vectors.index_to_key)
    neighbours = word_vectors.most_similar(fromt, topn=vocab_size)
    return next(
        (
            position
            for position, (entity, _score) in enumerate(neighbours, start=1)
            if entity == asdf
        ),
        vocab_size,  # fallback rank when the candidate is never seen
    )

In [15]:
def get_simscore_ingrank_category_multisamerank(fromt, asdf):
    """Return the rank of ``asdf`` among same-category neighbours of ``fromt``.

    Neighbours come from ``word_vectors.most_similar`` over the whole
    vocabulary; only those present in ``food2cat`` and sharing the category of
    ``asdf`` are counted towards the rank.

    Parameters
    ----------
    fromt : key of the query food in ``word_vectors``.
    asdf : key of the candidate substitute; must be a key of ``food2cat``
        (a ``KeyError`` is raised otherwise once a categorised neighbour is
        examined).

    Returns
    -------
    int
        1-based position of ``asdf`` within the category-filtered neighbour
        list, or the vocabulary size when ``asdf`` is not found.
    """
    vocab_size = len(word_vectors.index_to_key)
    neighbours = word_vectors.most_similar(fromt, topn=vocab_size)

    # Lazily keep only neighbours that have a known category equal to the
    # candidate's category.
    same_category = (
        entity
        for entity, _score in neighbours
        if entity in food2cat and food2cat[entity] == food2cat[asdf]
    )
    for position, entity in enumerate(same_category, start=1):
        if entity == asdf:
            return position
    return vocab_size

In [16]:
def mrr_map_new(scraped_subs_dict, opt = 1):
    """Print MAP, MRR, RR@5 and RR@10 for a set of ground-truth substitutions.

    For every query food the rank of each ground-truth substitute is obtained
    from one of the two ranking helpers; the best (smallest) rank feeds the
    reciprocal-rank and top-k counters, and all ranks feed average precision.

    Parameters
    ----------
    scraped_subs_dict : dict
        Maps each query food to an iterable of its ground-truth substitutes.
        Queries are processed in sorted key order.
    opt : int, default 1
        Ranking strategy. ``1`` uses
        ``get_simscore_ingrank_onlyss_multisamerank`` (embedding similarity
        only); ``2`` uses ``get_simscore_ingrank_category_multisamerank``
        (embedding similarity restricted to the substitute's category).

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    ValueError
        If ``opt`` is neither 1 nor 2.
    """
    # Validate up front: an unknown option would otherwise leave the rank
    # unassigned (NameError) or silently reuse the previous iteration's rank.
    if opt not in (1, 2):
        raise ValueError(f'opt must be 1 or 2, got {opt!r}')

    n_queries = len(scraped_subs_dict)
    print('number of ings: ', n_queries)
    if n_queries == 0:
        # Nothing to average; the ratios below would divide by zero.
        print('no ingredients to evaluate; metrics are undefined')
        return

    if opt == 1:
        rank_of = get_simscore_ingrank_onlyss_multisamerank
    else:
        rank_of = get_simscore_ingrank_category_multisamerank

    rank_scores = []   # reciprocal of the best rank, one entry per query
    ave_p = []         # average precision, one entry per query
    in_top_5 = 0
    in_top_10 = 0

    for fromt in sorted(scraped_subs_dict):
        relevant_ranks = [rank_of(fromt, asdf) for asdf in scraped_subs_dict[fromt]]

        if not relevant_ranks:
            # A query without substitutes contributes zero to both averages
            # and cannot be a top-k hit.
            rank_scores.append(0.0)
            ave_p.append(0.0)
            continue

        min_rank = min(relevant_ranks)
        rank_scores.append(1.0 / min_rank)
        if min_rank <= 5:
            in_top_5 += 1
        if min_rank <= 10:
            in_top_10 += 1

        # Precision at each relevant rank: number of relevant items ranked at
        # or above that position, divided by the position.
        precisions = [
            sum(1 for r in relevant_ranks if r <= rank) / rank
            for rank in relevant_ranks
        ]
        ave_p.append(np.mean(precisions))

    print(f'MAP: {(np.mean(ave_p)):.4f}')
    print(f'MRR: {(np.mean(rank_scores)):.4f}')
    print(f'RR@5: {(in_top_5/n_queries):.4f}')
    print(f'RR@10: {(in_top_10/n_queries):.4f}')

### Ground Truth

In [17]:
# Ground-truth substitution pairs from a semicolon-delimited file. The
# 'Food id' and 'Substitution id' columns are used exactly as read: later
# cells look them up directly in the embedding vocabulary and in food2cat.
# (The former self-assignments of these two columns were no-ops and have been
# dropped.)
subs_df = pd.read_csv('../Input Data/final_substitution.csv', sep=';')

In [18]:
# Every distinct value appearing on either side of a ground-truth pair.
ground_truth_foods = {
    *subs_df['Food id'].unique(),
    *subs_df['Substitution id'].unique(),
}

In [19]:
# Map each query food to the set of its ground-truth substitutes, keeping only
# pairs where both foods have an embedding and a known category.
scraped_subs_dict = {}

for food, subs in zip(subs_df['Food id'], subs_df['Substitution id']):
    usable = all(
        item in word_vectors and item in food2cat
        for item in (food, subs)
    )
    if not usable:
        # Also printed when only the category lookup is missing.
        print (food, subs, 'not in embeddings')
        continue
    scraped_subs_dict.setdefault(food, set()).add(subs)

In [None]:
# Evaluate with ranking option 1: embedding similarity only.
mrr_map_new(scraped_subs_dict, opt=1)

number of ings:  370


In [None]:
# Evaluate with ranking option 2: embedding similarity within the food category.
mrr_map_new(scraped_subs_dict, opt=2)