**------------------------------------------------------------------------------------------------------------------------------------------------------**

**Input: "Food" Embeddings**

**Task: Predict New Links for Evaluation**

**Output: New Links**

**------------------------------------------------------------------------------------------------------------------------------------------------------**

# Libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import warnings
warnings.simplefilter("ignore")



In [2]:
# Load the food embeddings (word2vec text format, not binary).
word_vectors = KeyedVectors.load_word2vec_format(
    '../Output/GAT_food_embeddings.txt',
    binary=False,
)
# Sanity check: display the five nearest neighbours of one food URI.
word_vectors.most_similar('http://idea.rpi.edu/heals/kb/usda#01001', topn=5)

[('http://idea.rpi.edu/heals/kb/usda#04615', 0.9956585168838501),
 ('http://idea.rpi.edu/heals/kb/usda#01088', 0.9955866932868958),
 ('http://idea.rpi.edu/heals/kb/usda#04058', 0.9953868985176086),
 ('http://idea.rpi.edu/heals/kb/usda#02041', 0.9951779246330261),
 ('http://idea.rpi.edu/heals/kb/usda#01023', 0.9951662421226501)]

In [3]:
# Distinct values of the 'subject' column, as a sorted NumPy array.
foods = np.unique(pd.read_csv('../Input Data/data/all_foods.csv')['subject'])
print(f'# Foods = {len(foods)}')

# Foods = 9372


**Get Top 10 Food Substitutes**

In [4]:
# Number of nearest neighbours retrieved for every food.
topn = 10

# One record per (food, neighbour) pair.
# - The embedding index is queried once per food; the neighbour list already
#   contains both the substitute id and its similarity score.
# - The frame is built in a single step rather than grown with
#   DataFrame.append inside the loop (quadratic, and removed in pandas 2.0).
# Rows are ordered food by food, neighbours by decreasing similarity, and get
# a unique 0..n-1 index.
records = [
    (food, substitute, similarity)
    for food in foods
    for substitute, similarity in word_vectors.most_similar(food, topn=topn)
]
df_results = pd.DataFrame(
    records,
    columns=['Food id', 'Substitution id', 'Similarity Scores'],
)

**Get Food Labels**

In [5]:
food_labels = pd.read_excel('../Input Data/ABBREV.xlsx', sheet_name='ABBREV')
# Turn each NDB number into a food URI: left-pad to five characters with '0'
# and prepend the namespace prefix.
food_labels['NDB_No'] = (
    'http://idea.rpi.edu/heals/kb/usda#'
    + food_labels['NDB_No'].astype(str).str.rjust(5, '0')
)
# Lookup table: food URI -> short description ('Shrt_Desc').
# Built column-wise; a per-row iterrows() loop creates a Series for every row
# only to read two cells from it.
food_2_label = dict(zip(food_labels['NDB_No'], food_labels['Shrt_Desc']))

**Get Nutri-Values + Nutri-Scores**

In [6]:
nutri_scores = pd.read_csv('../Output/nutri_scores.csv')
# Lookup table: 'NDB_No' -> 'nutri_values'.
# Built column-wise instead of with a per-row iterrows() loop; for repeated
# keys the last row wins, exactly as with sequential assignment.
food_2_score = dict(zip(nutri_scores['NDB_No'], nutri_scores['nutri_values']))

**Get Food Categories**

In [7]:
food_cat = pd.read_csv('../Input Data/food_category.csv')
# Normalise the NDB number to a five-character, zero-padded string.
food_cat['NDB_No'] = food_cat['NDB_No'].astype(str).str.rjust(5, '0')
# Lookup table: food URI -> food group description ('FdGrp_Desc').
# The URI prefix is applied to the whole column at once rather than inside a
# per-row iterrows() loop.
food_2_cat = dict(
    zip(
        'http://idea.rpi.edu/heals/kb/usda#' + food_cat['NDB_No'],
        food_cat['FdGrp_Desc'],
    )
)

**Add Food Labels, Nutri-Values + Nutri-Scores, Food Categories to df_results**

In [8]:
# Placeholders written when a (food, substitute) pair cannot be fully resolved.
MISSING_TEXT = 'not found'
MISSING_SCORE = 999


def _paired_lookup(mapping, food_ids, subs_ids, missing):
    """Resolve every (food, substitute) pair against ``mapping``.

    A pair is resolved only when BOTH ids are keys of ``mapping``; otherwise
    both sides of the pair receive ``missing``.

    Parameters
    ----------
    mapping : dict
        Lookup table keyed by food id.
    food_ids, subs_ids : iterable
        Query-food ids and substitute ids, in matching order.
    missing : object
        Placeholder used for both sides of an unresolved pair.

    Returns
    -------
    tuple of (list, list)
        Values for the query foods and for the substitutes, one per pair.
    """
    food_values, subs_values = [], []
    for food_id, subs_id in zip(food_ids, subs_ids):
        if food_id in mapping and subs_id in mapping:
            food_values.append(mapping[food_id])
            subs_values.append(mapping[subs_id])
        else:
            food_values.append(missing)
            subs_values.append(missing)
    return food_values, subs_values


# Iterate over the two id columns directly; iterrows() would build a Series
# for every row just to read these two values.
pair_food_ids = df_results['Food id']
pair_subs_ids = df_results['Substitution id']

# Columns are added in this order: labels, Nutri-Values, categories.
df_results['Food label'], df_results['Substitution label'] = _paired_lookup(
    food_2_label, pair_food_ids, pair_subs_ids, MISSING_TEXT)
df_results['Food Nutri-Value'], df_results['Substitution Nutri-Value'] = _paired_lookup(
    food_2_score, pair_food_ids, pair_subs_ids, MISSING_SCORE)
df_results['Food Category'], df_results['Substitution Category'] = _paired_lookup(
    food_2_cat, pair_food_ids, pair_subs_ids, MISSING_TEXT)

In [9]:
# Number of (food, substitute) rows after the annotation step.
df_results.shape[0]

93720

In [10]:
# Keep only rows whose label AND category were resolved, i.e. neither column
# carries the 'not found' placeholder.
has_label = df_results['Food label'] != 'not found'
has_category = df_results['Food Category'] != 'not found'
df_results = df_results[has_label & has_category]

**Keep only substitutes with a strictly lower Nutri-Value than their query food (substitutes with a higher or equal Nutri-Value are filtered out)**

In [11]:
# Rows remaining once unresolved labels/categories have been dropped.
df_results.shape[0]

75626

In [12]:
# Keep a pair only when the substitute's Nutri-Value is strictly below the
# query food's Nutri-Value.
subs_is_lower = df_results['Substitution Nutri-Value'].lt(df_results['Food Nutri-Value'])
df_results = df_results[subs_is_lower]

**Filter out substitutes that are not in the same food category as their query food, and build the final dataset, which will be labelled by our Nutri-Scholars**

In [13]:
# Rows remaining after the Nutri-Value filter.
df_results.shape[0]

35758

In [14]:
# Number of distinct query foods that still have at least one candidate.
np.unique(df_results['Food id']).size

7341

In [15]:
def get_subs(df_results):
    """Pick two distinct same-category substitutes (A and B) per query food.

    Only rows whose 'Food Category' equals their 'Substitution Category' are
    considered, and a food is used only if it has at least two such rows.

    For each remaining food (processed in sorted 'Food id' order):

    * the candidates are ranked by similarity ('Similarity Scores' descending,
      ties broken by lower 'Substitution Nutri-Value') and by nutrition
      ('Substitution Nutri-Value' ascending, ties broken by higher similarity);
    * if the two rankings have different top entries, Subs A is the most
      similar candidate and Subs B the one with the lowest Nutri-Value;
    * if both rankings agree on the top entry, that entry becomes Subs B and
      Subs A is the second most similar candidate.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Must contain the columns 'Food id', 'Substitution id',
        'Similarity Scores', 'Substitution Nutri-Value', 'Food Category' and
        'Substitution Category'.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(subs_a, subs_b)``: one row per selected food, in the same food
        order in both frames, keeping the columns and index labels of
        ``df_results``. Both frames are empty (but keep the input columns)
        when no food qualifies.
    """
    # Evaluate the same-category condition once for the whole frame instead
    # of rebuilding the full boolean mask several times for every food.
    same_category = df_results[
        df_results['Food Category'] == df_results['Substitution Category']
    ]

    picks_a = []
    picks_b = []
    # sort=True visits the foods in sorted id order.
    for _, candidates in same_category.groupby('Food id', sort=True):
        if len(candidates) < 2:
            continue

        by_similarity = candidates.sort_values(
            by=['Similarity Scores', 'Substitution Nutri-Value'],
            ascending=[False, True],
        )
        by_nutrition = candidates.sort_values(
            by=['Substitution Nutri-Value', 'Similarity Scores'],
            ascending=[True, False],
        )
        most_similar = by_similarity.iloc[[0]]
        healthiest = by_nutrition.iloc[[0]]

        if most_similar['Substitution id'].iloc[0] != healthiest['Substitution id'].iloc[0]:
            picks_a.append(most_similar)
            picks_b.append(healthiest)
        else:
            # Both criteria chose the same substitute: keep it as B and fall
            # back to the runner-up by similarity for A so the pair differs.
            picks_a.append(by_similarity.iloc[[1]])
            picks_b.append(most_similar)

    if not picks_a:
        # pd.concat rejects an empty list; return empty frames that still
        # carry the input columns so downstream column selection works.
        empty = df_results.iloc[0:0]
        return empty, empty.copy()

    # Concatenate once; DataFrame.append was removed in pandas 2.0.
    return pd.concat(picks_a), pd.concat(picks_b)

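**Subs A = highest similarity score, Subs B = lowest Nutri-Value (if both criteria select the same substitute, it becomes Subs B and Subs A is the second most similar substitute)**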

In [16]:
# Split the candidate table into the two per-food picks (A and B).
subs_a, subs_b = get_subs(df_results=df_results)

In [17]:
# Keep the label/Nutri-Value columns and tag the substitute columns as "A".
# rename() returns a new frame; renaming in place on a column slice raises a
# SettingWithCopyWarning.
subs_a = (
    subs_a[['Food label', 'Substitution label', 'Food Nutri-Value', 'Substitution Nutri-Value']]
    .rename(columns={'Substitution label': 'Subs A',
                     'Substitution Nutri-Value': 'Subs A Nutri-Value'})
)

In [18]:
# Keep the label/Nutri-Value columns and tag the substitute columns as "B".
# rename() returns a new frame; renaming in place on a column slice raises a
# SettingWithCopyWarning.
subs_b = (
    subs_b[['Food label', 'Substitution label', 'Food Nutri-Value', 'Substitution Nutri-Value']]
    .rename(columns={'Substitution label': 'Subs B',
                     'Substitution Nutri-Value': 'Subs B Nutri-Value'})
)

**Concatenate Both Datasets**

In [19]:
# subs_a and subs_b are row-aligned: get_subs adds one row to each per query
# food, in the same order. Pair them by position. A key-less pd.merge would
# join on ('Food label', 'Food Nutri-Value'), so two foods sharing a label and
# value would be cross-joined and receive each other's substitutes.
assert len(subs_a) == len(subs_b), 'Subs A and Subs B must hold one row per query food'
new_links = pd.concat(
    [
        subs_a.reset_index(drop=True),
        subs_b[['Subs B', 'Subs B Nutri-Value']].reset_index(drop=True),
    ],
    axis=1,
)

In [20]:
# Number of (food, Subs A, Subs B) triples in the final dataset.
new_links.shape[0]

508

In [21]:
# Persist the unshuffled pairs; the row index is written as the first column.
new_links.to_csv('../Output/new_links_before_shuffle.csv', index=True)

**Shuffle Dataset for Nutri-Scholars**

In [22]:
# Work on an explicit copy so that adding a column does not write into a
# slice of the previous frame (SettingWithCopyWarning).
new_links = new_links[['Food label', 'Subs A', 'Subs B']].copy()
# Remember which candidate is Subs B before the A/B order is randomised.
new_links['True Label'] = new_links['Subs B']

In [23]:
# Fixed seed so the A/B order given to the Nutri-Scholars can be regenerated
# exactly on a fresh kernel.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)

# Permute the two candidates independently within every row (axis=1), so the
# column a candidate ends up in no longer reveals whether it was A or B.
shuffle = np.apply_along_axis(rng.permutation, 1, new_links[['Subs A', 'Subs B']])
df = pd.DataFrame(shuffle, columns=['Subs A (shuffle)', 'Subs B (shuffle)'])

In [24]:
# Drop the ordered A/B columns; only the food and the answer key remain.
new_links = new_links.loc[:, ['Food label', 'True Label']]

In [25]:
# Attach the shuffled candidates by position. `df` was built row by row from
# new_links, so positional assignment is the intended pairing; a length
# mismatch raises instead of being silently filled with NaN by index
# alignment. assign() returns a new frame, avoiding a write into a slice.
new_links = new_links.assign(**{
    'Subs A': df['Subs A (shuffle)'].to_numpy(),
    'Subs B': df['Subs B (shuffle)'].to_numpy(),
})

In [26]:
# Persist the shuffled dataset; the row index is written as the first column.
new_links.to_csv('../Output/new_links_after_shuffle.csv', index=True)