# EDA for DietDupe

### load the data

In [1]:
import pickle
import pandas as pd

# load the data of node embeddings in FlavourGraph
with open('data/FlavorGraph_node_embedding.pickle', 'rb') as f:
    data = pickle.load(f)

# cast node_ids to integer
data = {int(k): v for k, v in data.items()}


In [2]:
# read information abo
nodes= pd.read_csv('data/nodes_191120.csv')

#delete nan in node name column
nodes = nodes.dropna(subset=['name'])

# make sure name is a string
nodes['name'] = nodes['name'].astype(str)

nodes.head()

Unnamed: 0,node_id,name,id,node_type,is_hub
0,0,1%_fat_buttermilk,,ingredient,no_hub
1,1,1%_fat_cottage_cheese,,ingredient,no_hub
2,3,10%_cream,,ingredient,no_hub
3,4,100%_bran,,ingredient,no_hub
4,5,10_inch_flour_tortilla,,ingredient,no_hub


In [3]:
# merge the dataframe and dictionnary by name and display projection

nodes['embeddings'] = nodes['node_id'].map(data)

In [4]:
nodes.head()

Unnamed: 0,node_id,name,id,node_type,is_hub,embeddings
0,0,1%_fat_buttermilk,,ingredient,no_hub,"[-0.10600116, 0.047149494, 0.10841199, 0.07235..."
1,1,1%_fat_cottage_cheese,,ingredient,no_hub,"[-0.015829312, 0.09736368, -0.0006226096, 0.13..."
2,3,10%_cream,,ingredient,no_hub,"[-0.10132008, 0.033723958, 0.064727835, 0.1566..."
3,4,100%_bran,,ingredient,no_hub,"[-0.10309663, 0.03204953, 0.08858223, 0.105722..."
4,5,10_inch_flour_tortilla,,ingredient,no_hub,"[-0.09346332, 0.120890595, 0.10606088, 0.10007..."


## Our first extrenal dataset will concern the kcal and nutritional values for foods - for example for the search of a lower calorie alternative of a food, or richer in a certain vitamin group

In [5]:
# calories dataset
nutri_data= pd.read_csv('data/ABBREV.csv')
nutri_data.head()


Unnamed: 0,index,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),...,Vit_K_(µg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,Refuse_Pct
0,0,1001,"BUTTER,WITH SALT",15.87,717,0.85,81.11,2.11,0.06,0.0,...,7.0,51.368,21.021,3.043,215.0,5.0,"1 pat, (1"" sq, 1/3"" high)",14.2,1 tbsp,0.0
1,1,1002,"BUTTER,WHIPPED,W/ SALT",16.72,718,0.49,78.3,1.62,2.87,0.0,...,4.6,45.39,19.874,3.331,225.0,3.8,"1 pat, (1"" sq, 1/3"" high)",9.4,1 tbsp,0.0
2,2,1003,"BUTTER OIL,ANHYDROUS",0.24,876,0.28,99.48,0.0,0.0,0.0,...,8.6,61.924,28.732,3.694,256.0,12.8,1 tbsp,205.0,1 cup,0.0
3,3,1004,"CHEESE,BLUE",42.41,353,21.4,28.74,5.11,2.34,0.0,...,2.4,18.669,7.778,0.8,75.0,28.35,1 oz,17.0,1 cubic inch,0.0
4,4,1005,"CHEESE,BRICK",41.11,371,23.24,29.68,3.18,2.79,0.0,...,2.5,18.764,8.598,0.784,94.0,132.0,"1 cup, diced",113.0,"1 cup, shredded",0.0


## Preliminarly match the data

In [6]:
# embed the strings
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


  from .autonotebook import tqdm as notebook_tqdm


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
embedded_names = []
for i, name in enumerate(nodes["name"]):
    if i%500 == 0:
        print(f"Processing {i} : {name}")
    inputs = tokenizer(name.replace("_", " "), return_tensors='pt')
    outputs = model(**inputs)
    embedded_names.append(outputs[0].detach().numpy().squeeze().mean(axis=0))


Processing 0 : 1%_fat_buttermilk
Processing 500 : bleached_all_purpose_flour
Processing 1000 : chamomile_tea_bag
Processing 1500 : corn
Processing 2000 : dry_penne_pasta
Processing 2500 : frozen_hash_browns_with_onions_and_pepper
Processing 3000 : hazelnut_extract
Processing 3500 : lemon_juice_concentrate
Processing 4000 : mixed_salad_green
Processing 4500 : pillsbury_ready_made_pie_dough
Processing 5000 : reduced_fat_creamy_peanut_butter
Processing 5500 : smoked_link_sausage
Processing 6000 : tia_maria
Processing 6500 : whole_cumin_seed
Processing 7000 : cis-3-Hexenyl_pentanoate
Processing 7500 : Citronellyl_propionate
Processing 8000 : 3,4-Hexanedione


In [8]:
embedded_external_data = []
for i, name in enumerate(nutri_data["Shrt_Desc"]):
    if i%500 == 0:
        print(f"Processing {i} : {name}")
    inputs = tokenizer(name.lower().replace(",", " "), return_tensors='pt')
    outputs = model(**inputs)
    embedded_external_data.append(outputs[0].detach().numpy().squeeze().mean(axis=0))

Processing 0 : BUTTER,WITH SALT
Processing 500 : BABYFOOD,TURKEY,RICE&VEG,TODD
Processing 1000 : TURKEY FROM WHL,NECK,MEAT ONLY,CKD,SIMMRD
Processing 1500 : CAMPBELL'S RED & WHITE - MICROWAVEABLE BOWLS,TOMATO SOUP
Processing 2000 : RICE & WHEAT CRL BAR
Processing 2500 : PORK,FRSH,LOIN,WHL,LN&FAT,CKD,BRLD
Processing 3000 : LETTUCE,COS OR ROMAINE,RAW
Processing 3500 : TURNIPS,CKD,BLD,DRND,W/SALT
Processing 4000 : BEEF,RIB,LRG END (RIBS 6-9),LN&FAT,1/8"FAT,PRIME,CKD,RSTD
Processing 4500 : FISH,COD,PACIFIC,RAW (MAY HAVE BEEN PREVIOUSLY FROZEN)
Processing 5000 : PEANUTS,VIRGINIA,OIL-ROASTED,WO/SALT
Processing 5500 : LAMB,NZ,IMP,HIND-SHANK,LN & FAT,CKD,BRSD
Processing 6000 : TOSTADA SHELLS,CORN
Processing 6500 : MILLET,COOKED
Processing 7000 : MCDONALD'S,RANCH SNACK WRAP,CRISPY
Processing 7500 : BEEF,NZ,IMP,KNUCKLE,CKD,FAST FRIED
Processing 8000 : MOTHER'S,COCNT COCADAS COOKIES
Processing 8500 : CARRABBA'S ITALIAN GRILL,SPAGHETTI W/ POMODORO SAU


### Match the data using BERT model and compute similarity scores

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assuming 'embedded_names' and 'external_data_embed' are your lists of BERT embeddings
embedded_names_np = np.array(embedded_names)
external_data_embed_np = np.array(embedded_external_data)

# Compute the cosine similarity between all pairs of vectors
similarity_matrix = cosine_similarity(embedded_names_np, external_data_embed_np)

# Find the index of the most similar vector for each vector in 'embedded_names'
most_similar_indices = np.argmax(similarity_matrix, axis=1)
most_similar_values = np.max(similarity_matrix, axis=1)

# Create a list of tuples where each tuple is a pair of indices of the most similar embeddings
most_similar = list(zip(range(len(embedded_names)), most_similar_indices, most_similar_values))

In [10]:
#append the embedding and name froom extrernal data to nodes dataframe
nodes['best_match'] = None
nodes['similarity_of_best_match'] = None
# map nodes
for i, j, sim in most_similar:
    nodes.loc[i, 'best_match'] = nutri_data.loc[j, 'Shrt_Desc']
    nodes.loc[i, 'similarity_of_best_match'] = sim

In [11]:
nodes.head(20)

Unnamed: 0,node_id,name,id,node_type,is_hub,embeddings,best_match,similarity_of_best_match
0,0.0,1%_fat_buttermilk,,ingredient,no_hub,"[-0.10600116, 0.047149494, 0.10841199, 0.07235...","MILK,BUTTERMILK,DRIED",0.859678
1,1.0,1%_fat_cottage_cheese,,ingredient,no_hub,"[-0.015829312, 0.09736368, -0.0006226096, 0.13...","CHEESE,COTTAGE,LOWFAT,1% MILKFAT",0.84297
2,3.0,10%_cream,,ingredient,no_hub,"[-0.10132008, 0.033723958, 0.064727835, 0.1566...","TURTLE,GREEN,RAW",0.738032
3,4.0,100%_bran,,ingredient,no_hub,"[-0.10309663, 0.03204953, 0.08858223, 0.105722...","TURTLE,GREEN,RAW",0.745624
4,5.0,10_inch_flour_tortilla,,ingredient,no_hub,"[-0.09346332, 0.120890595, 0.10606088, 0.10007...","Tortilla chips, yellow, plain, salted",0.864151
5,7.0,12_inch_pizza_crust,,ingredient,no_hub,"[0.019675368, 0.0335593, -0.05435514, 0.112564...","PIZZA HUT 12"" PEPPERONI PIZZA,PAN CRUST",0.822786
6,9.0,18%_table_cream,,ingredient,no_hub,"[-0.17582406, 0.14053129, -0.022183008, 0.0534...","PORK,GROUND,84% LN / 16% FAT,RAW",0.781034
7,10.0,2%_buttermilk,,ingredient,no_hub,"[-0.07227368, 0.033502933, 0.037231416, 0.1503...","MILK,BUTTERMILK,DRIED",0.871265
8,11.0,2%_cheddar_cheese,,ingredient,no_hub,"[0.12324965, -0.26437616, -0.09264475, 0.06749...","CHEESE,CHEDDAR",0.879752
9,12.0,2%_evaporated_milk,,ingredient,no_hub,"[-0.23566853, -0.048889615, 0.09088527, 0.1119...","MILK,PRODUCER,FLUID,3.7% MILKFAT",0.757393


In [12]:
# save matched nodes
#nodes.to_csv('data/nodes_191120_matched.csv', index=False)

## Exploration of the external data

In [13]:
# get list of columns ending with 'mg'
micro_columns_list = [element for element in nutri_data.columns if element.endswith('mg)')]

# get dataframe with subset of columns for micronutrients
df_micro = nutri_data[micro_columns_list]

# get summary statistics of micronutrient columns
pd.DataFrame(
	data={
		'average':df_micro.mean(),
		'standard_deviation':df_micro.std()
		}
)

Unnamed: 0,average,standard_deviation
Calcium_(mg),76.738214,203.527453
Iron_(mg),2.699674,5.68756
Magnesium_(mg),35.295988,57.416785
Phosphorus_(mg),165.142126,204.704214
Potassium_(mg),279.47274,375.483729
Sodium_(mg),312.495923,943.431341
Zinc_(mg),2.117438,3.437209
Copper_mg),0.195984,0.582596
Manganese_(mg),0.658156,7.248609
Vit_C_(mg),9.231134,68.854696
