-
Notifications
You must be signed in to change notification settings - Fork 0
/
nutrimap.py
424 lines (377 loc) · 13 KB
/
nutrimap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
import altair as alt
import pandas as pd
import panel as pn
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as pca
from scipy.cluster import hierarchy
pn.extension('vega')
# Load the recommended daily intake (RDI) table and the per-food nutrient table
rdi_url = 'https://raw.githubusercontent.com/joelostblom/nutrimap/main/data/processed/matched_rdi_sr_nih.csv'
rdis = pd.read_csv(rdi_url, comment='#')
food_url = 'https://raw.githubusercontent.com/joelostblom/nutrimap/main/data/processed/foods.csv'
foods = pd.read_csv(food_url, index_col=0)

# Amounts that were looked up by hand because the table lacks them
foods.loc["Oats", "Sugar"] = 1.1
foods.loc["Oats", "Selenium"] = 28.9
foods.loc["Oats", "Vitamin E"] = 0.42
foods.loc["Oats", "Vitamin K"] = 2
foods.loc["Oats", "beta-Carotene"] = 0
foods.loc["Oats", "alpha-Carotene"] = 0
foods.loc["Oats", "beta-Cryptoxanthin"] = 0
foods.loc["Oats", "Lycopene"] = 0
foods.loc["Oats", "Lutein + zeaxanthin"] = 180
foods.loc["Quinoa, uncooked", "Sugar"] = 6.1
foods.loc["Quinoa, uncooked", "Vitamin C"] = 0
foods.loc["Buckwheat", "Sugar"] = 1.9
foods.loc["Millet, raw", "Sugar"] = 1.5

# Reshape to long form: one row per (food, nutrient) pair holding its amount
foods = foods.reset_index().melt(
    id_vars="food",
    value_name="amount",
    ignore_index=False,
)
foods = foods.rename(columns={"variable": "nutrient"})
# Attach the unit of each nutrient from the RDI table
foods["unit"] = foods["nutrient"].map(rdis.set_index("MatchedNutrient")["Unit"])
# Express the RDI as a proportion: amount divided by the recommended amount
foods["rdi"] = foods["amount"] / foods["nutrient"].map(
    rdis.set_index("MatchedNutrient")["Amount"]
)
# Mapping from food group name to the foods (as named in the foods table)
# that belong to it. The group names are offered as options in the UI.
food_groups = {
    # TODO add corn on the cob as veggie
    "grains": [
        "Quinoa, uncooked",
        "Amaranth grain, uncooked",
        "Oats",
        "Barley, pearled, raw",
        "Corn grain, yellow",
        "Buckwheat",
        "Rice, brown, long-grain, raw",
        "Wild rice, raw",
        "Millet, raw",
        "Bulgur, dry",
        "Spelt, uncooked",
        "Wheat, durum",
        "Wheat, hard red winter",
        "Sorghum grain",
        "Wheat, kamut khorasan, uncooked",
        "Rice, white, long-grain, regular, raw, unenriched",
        "Teff, uncooked",
        "Rye grain",
    ],
    "vegetables": [
        "Brussels sprouts, raw",
        "Beets, raw",
        "Broccoli, raw",
        "Cauliflower, raw",
        "Eggplant, raw",
        "Tomatoes, red, ripe, raw, year round average",
        "Peas, green, frozen, unprepared",
    ],
    "greens": [
        "Kale, raw",
        "Spinach, raw",
        "Lettuce, cos or romaine, raw",
        "Chard, swiss, raw",
        "Arugula, raw",
        "Collards, raw",
        "Lettuce, iceberg (includes crisphead types), raw",
        "Beet greens, raw",
        "Mustard greens, raw",
        "Cabbage, chinese (pak-choi), raw",
        "Cabbage, raw",
    ],
    "legumes": [
        "Beans, black turtle, mature seeds, raw",
        "Lentils, raw",
        "Lentils, pink or red, raw",
        "Beans, snap, green, raw",
        "Soybeans, green, raw",
        "Beans, pinto, mature seeds, sprouted, raw",
        "Beans, adzuki, mature seeds, raw",
        "Beans, black, mature seeds, raw",
        "Beans, kidney, all types, mature seeds, raw",
        "Beans, pinto, mature seeds, raw",
        "Beans, small white, mature seeds, raw",
        "Soybeans, mature seeds, raw",
        "Broadbeans (fava beans), mature seeds, raw",
        "Peas, green, raw",
        "Chickpeas (garbanzo beans, bengal gram), mature seeds, raw",
        "Peas, green, split, mature seeds, raw",
    ],
    "nuts": [
        # Each name below is lower-cased and prefixed with "Nuts, "
        *(
            f"Nuts, {nut_name.lower()}"
            for nut_name in (
                "Brazilnuts, dried, unblanched",
                "Cashew nuts, raw",
                "Hazelnuts or filberts",
                "Macadamia nuts, raw",
                "Pine nuts, dried",
                "Pistachio nuts, raw",
                "Walnuts, english",
                "Coconut meat, dried (desiccated), not sweetened",
            )
        ),
        # Peanuts are listed without the "Nuts, " prefix
        "Peanuts, all types, raw",
    ],
    "fruits": [
        "Apples, raw, with skin",
        "Pineapple, raw, all varieties",
        "Plums, raw",
        "Pears, raw",
        "Apricots, raw",
        "Avocados, raw, all commercial varieties",
        "Blackberries, frozen, unsweetened",
        "Blueberries, raw",
        "Cranberries, raw",
        "Raspberries, raw",
        "Clementines, raw",
        "Strawberries, raw",
        "Dates, medjool",
        "Plantains, green, raw",
        "Pomegranates, raw",
        "Bananas, raw",
        "Plantains, yellow, raw",
        "Kiwifruit, green, raw",
        "Figs, dried, uncooked",
        "Oranges, raw, all commercial varieties",
        "Apricots, dried, sulfured, uncooked",
        "Olives, ripe, canned (small-extra large)",
        "Raisins, dark, seedless",
    ],
    "meats": [
        "Fish, salmon, atlantic, farmed, raw",
        "Fish, salmon, atlantic, wild, raw",
        "Fish, cod, atlantic, raw",
        "Chicken, broiler or fryers, breast, skinless, boneless, meat only, raw",
        "Pork, fresh, ground, raw",
        "Beef, grass-fed, ground, raw",
        "Egg, whole, raw, fresh",
    ],
}
# Mapping from nutrient group name to the nutrients (as named in the foods
# table) that belong to it. The group names are offered as options in the UI.
nutrient_groups = {
    "macros": [
        "Energy",
        "Carbs",
        "Protein",
        "Fiber",
        "Fat",
    ],
    "detailed_macros": [
        "Sugar",
        "Fat (mono)",
        "Fat (poly)",
        "Fat (sat)",
        "Cholesterol",
    ],
    "minerals": [
        "Calcium",
        "Copper",
        "Iron",
        "Magnesium",
        "Manganese",
        "Phosphorus",
        "Potassium",
        "Selenium",
        "Sodium",
        "Zinc",
    ],
    # TODO shorten names by removing "vitamin" and adding suffixes https://en.wikipedia.org/wiki/B_vitamins
    # add b1 k2 etc annotation
    "vitamins": [
        "Thiamin (B1)",
        "Riboflavin (B2)",
        "Niacin (B3)",
        "Pantothenic acid (B5)",
        "Pyridoxine (B6)",
        "Folate (B9)",
        "Cobalmins (B12)",
        "Vitamin A",
        "Vitamin C",
        "Vitamin D (D2 + D3)",
        "Vitamin E",
        "Vitamin K",
        # Currently left out:
        # 'Vitamin K (Dihydrophylloquinone)',
        # 'Vitamin K (Menaquinone-4)',
        # 'Vitamin K (phylloquinone)'
    ],
    # Essential amino acids plus cysteine and tyrosine
    "aas": [
        "Cystine",
        "Histidine",
        "Isoleucine",
        "Leucine",
        "Lysine",
        "Methionine",
        "Phenylalanine",
        "Threonine",
        "Tryptophan",
        "Tyrosine",
        "Valine",
    ],
    "caretonoids": [
        "Lycopene",
        "Lutein + zeaxanthin",
        "alpha-Carotene",
        "beta-Carotene",
        # Currently left out: 'Retinol'
        "beta-Cryptoxanthin",
    ],
}
# Multi-choice widget for picking the food groups to display
food_group = pn.widgets.MultiChoice(
    name="Food Groups",
    options=list(food_groups),
    value=["vegetables", "grains"],
)

# Multi-choice widget for picking the nutrient groups to display
nutrient_group = pn.widgets.MultiChoice(
    name="Nutrient Groups",
    options=list(nutrient_groups),
    value=["macros", "detailed_macros"],
)

# Slider for the largest RDI percentage shown; larger values are clipped to
# it, which caps the upper end of the heatmap color range
max_dv = pn.widgets.IntSlider(
    name="Cap color scale at this RDI",
    start=0,
    end=600,
    value=300,
)
# Look up the food group that a food belongs to
def get_food_group(food, groups=None) -> "str | None":
    """Return the name of the first group whose member list contains `food`.

    Parameters
    ----------
    food
        Food name to look up.
    groups
        Optional mapping from group name to a collection of food names.
        Defaults to the module-level `food_groups` mapping.

    Returns
    -------
    The matching group name, or None when no group lists `food`.
    """
    if groups is None:
        groups = food_groups
    for group_name, members in groups.items():
        if food in members:
            return group_name
    # Made explicit: unknown foods have no group
    return None
# fill NA values of wide-form foods data with column mean value
def fill_na_mean(data):
    """Replace missing values in every numeric column with that column's mean.

    Non-numeric columns (for example food names or food groups) are left
    untouched, because a mean is undefined for them and asking for one raises
    a TypeError. A numeric column without any values has a NaN mean and
    therefore stays all-NaN.

    Parameters
    ----------
    data
        Wide-form DataFrame; it is modified in place.

    Returns
    -------
    The same DataFrame object that was passed in.
    """
    for col in data.select_dtypes(include="number").columns:
        column = data[col]
        if column.isnull().any():
            data[col] = column.fillna(column.mean())
    return data
# selects interval over scatterplot for filtering heatmap
#brush = alt.selection_interval(fields = ["food"])
# perform PCA to reduce dataframe to 2 dimensions
def pca_2_components(data):
    """Project each food onto two principal components of its nutrient profile.

    Parameters
    ----------
    data
        Long-form DataFrame with 'food', 'nutrient' and 'rdi' columns.

    Returns
    -------
    DataFrame with one row per food and the columns 'component_1',
    'component_2', 'food' and 'food_group'. Components that cannot be
    computed (no foods, a single food, or fewer than two usable nutrients)
    are set to 0 instead of raising.
    """
    if data.empty:
        # Nothing to project; return an empty frame with the usual columns
        return pd.DataFrame(
            {
                "component_1": pd.Series(dtype=float),
                "component_2": pd.Series(dtype=float),
                "food": pd.Series(dtype=object),
                "food_group": pd.Series(dtype=object),
            }
        )

    wide = pd.pivot(data, index="food", columns="nutrient", values="rdi")
    # A nutrient without a single value has a NaN mean, so it would stay NaN
    # after the mean fill and PCA rejects NaN input; it carries no
    # information about similarity, so drop it.
    wide = wide.dropna(axis=1, how="all").copy()
    # fill remaining NA values with column mean
    wide = fill_na_mean(wide)

    n_foods, n_nutrients = wide.shape
    coordinates = np.zeros((n_foods, 2))
    # PCA cannot extract more components than min(n_samples, n_features)
    n_components = min(2, n_foods, n_nutrients)
    if n_foods > 1 and n_components > 0:
        X = wide.to_numpy()
        # standardize every nutrient to zero mean and unit variance
        scaler = StandardScaler()
        scaler.fit(X)
        X_scaled = scaler.transform(X)
        # reduce the scaled data with PCA
        pca_2 = pca(n_components=n_components, random_state=2023)
        pca_2.fit(X_scaled)
        coordinates[:, :n_components] = pca_2.transform(X_scaled)

    # convert numpy array to dataframe
    pca_2_df = pd.DataFrame(coordinates, columns=["component_1", "component_2"])
    pca_2_df["food"] = wide.index.to_list()
    pca_2_df["food_group"] = [get_food_group(food) for food in wide.index]
    return pca_2_df
def make_scatter(pca_data):
    """Build the food similarity scatter plot and wrap it in a Vega pane.

    `pca_data` supplies the 'component_1', 'component_2', 'food' and
    'food_group' fields. The chart carries an interval selection named
    "brush"; points outside the brush are drawn in light gray.
    """
    brush = alt.selection_interval(name="brush")

    chart_title = alt.Title(
        ' ',
        subtitle="Food similarity (drag to select)",
        anchor="start",
    )
    # Both axes are drawn without domain line, ticks or labels
    hidden_axis = dict(domain=False, ticks=False, labels=False)
    x_position = alt.X("component_1", title="").axis(**hidden_axis)
    y_position = alt.Y("component_2", title="").axis(**hidden_axis)
    point_color = alt.condition(
        brush,
        alt.Color("food_group:N", title=""),
        alt.value("lightgray"),
    )

    base = alt.Chart(pca_data, width=200, height=200, title=chart_title)
    points = base.mark_circle(size=50).encode(
        x_position,
        y_position,
        color=point_color,
        tooltip="food",
    )
    return pn.pane.Vega(points.add_params(brush))
# sort data using hierarchical clustering and optimal leaf-ordering
def sort_similar_foods(filtered_df):
    """Order foods so that foods with similar nutrient profiles are adjacent.

    Requires that the data matches the input of the create_heatmap function,
    i.e. a long-form DataFrame with 'food', 'nutrient' and 'rdi' columns.

    Returns
    -------
    List of food names in clustered order, or an empty list when there is
    nothing to sort (fewer than two foods, or no nutrient with any value).
    """
    # No sorting needed if there are less than 2 data points
    if filtered_df['food'].nunique() < 2:
        return []
    wide_data = pd.pivot(filtered_df, index="food", columns="nutrient", values="rdi")
    # A nutrient without a single value has a NaN mean and would stay NaN
    # after the mean fill; linkage only accepts finite distances, so drop it.
    wide_data = wide_data.dropna(axis=1, how="all").copy()
    if wide_data.shape[1] == 0:
        return []
    # fill remaining NA values with column mean
    wide_data = fill_na_mean(wide_data)
    # using average method
    Z = hierarchy.linkage(wide_data.to_numpy(), method="average", optimal_ordering=True)
    # find the optimal order of row indexes according to the clustering algorithm
    optimal_order = hierarchy.leaves_list(Z)
    return wide_data.index[optimal_order].tolist()
# create a heatmap chart using filtered data
def create_heatmap(filtered_df, selection):
    """Build the food-by-nutrient heatmap, restricted to the brushed foods.

    Parameters
    ----------
    filtered_df
        Long-form DataFrame with 'food', 'nutrient', 'amount', 'unit' and
        'rdi' columns.
    selection
        Value of the scatter plot's brush selection. When truthy it is
        indexed by 'component_1' and 'component_2', each giving the two
        bounds of the brushed interval; when falsy all foods are kept.

    Returns
    -------
    A Vega pane with the heatmap, or None when no rows remain.
    """
    # Include only food items selected in the scatter plot. The PCA
    # coordinates are only needed for this, so skip them without a brush.
    if selection:
        pca_df = pca_2_components(filtered_df)
        # Sort the bounds so the test does not depend on their order
        x_low, x_high = sorted(selection["component_1"])
        y_low, y_high = sorted(selection["component_2"])
        inside_brush = (
            pca_df["component_1"].between(x_low, x_high)
            & pca_df["component_2"].between(y_low, y_high)
        )
        brushed_foods = pca_df.loc[inside_brush, "food"].unique()
        filtered_df = filtered_df[filtered_df["food"].isin(brushed_foods)]

    # No need to create a chart if there are no points selected
    if filtered_df.shape[0] == 0:
        return None

    heatmap = alt.Chart(filtered_df).mark_rect().transform_calculate(
        # Amount rounded to two decimals followed by its unit, for the tooltip
        tooltip_amount_and_unit="round(100 * datum.amount) / 100 + ' ' + datum.unit"
    ).encode(
        alt.X(
            'nutrient',
            title='',
            axis=alt.Axis(
                orient='top',
                labelAngle=-45
            )
        ),
        # Rows are ordered so that similar foods end up next to each other
        alt.Y('food', title='', axis=alt.Axis(orient='right'), sort=sort_similar_foods(filtered_df)),
        alt.Color('rdi', title="Percent of RDI", legend=alt.Legend(format='.0%')),
        tooltip=[
            alt.Tooltip('food', title='Food'),
            alt.Tooltip('nutrient', title='Nutrient'),
            alt.Tooltip('rdi', title='RDI', format='.1%'),
            alt.Tooltip('tooltip_amount_and_unit:N', title='Amount'),
        ]
    )
    return pn.pane.Vega(heatmap)
# Page template; the sidebar starts with a heading followed by the three
# settings widgets
template = pn.template.BootstrapTemplate(
    site="Nutrimap",
    title="A cure for food label indigestion",
    sidebar=[
        pn.pane.Markdown("## Settings"),
        food_group,
        nutrient_group,
        max_dv,
    ],
)
# Re-filter the dataframe and re-create the charts when the widget values change
@pn.depends(food_group.param.value, nutrient_group.param.value, max_dv.param.value)
def update_charts(food_group, nutrient_group, max_dv):
    """Rebuild the scatter plot and heatmap for the current widget values.

    Parameters
    ----------
    food_group
        Selected food group names (keys of `food_groups`).
    nutrient_group
        Selected nutrient group names (keys of `nutrient_groups`).
    max_dv
        Upper cap for the RDI, given as a percentage.

    Returns
    -------
    A Panel column holding the heatmap and a hidden copy of the scatter plot.
    """
    # Find all the values of each selected group (e.g. all the food names for type "vegetable")
    selected_foods = [
        food for group in food_group for food in food_groups[group]
    ]
    selected_nutrients = [
        nutrient for group in nutrient_group for nutrient in nutrient_groups[group]
    ]
    filtered_df = foods.assign(
        rdi=lambda df: df['rdi'].clip(upper=max_dv / 100)  # From percentage to proportion
    ).query(
        'food.isin(@selected_foods)'
        '& nutrient.isin(@selected_nutrients)'
    )
    # Create the Altair charts
    pca_data = pca_2_components(filtered_df)
    scatter = make_scatter(pca_data)
    # Set the heatmap up to listen to the selection in the scatter plot
    heatmap = pn.bind(create_heatmap, filtered_df, scatter.selection.param.brush)
    # extend() takes an iterable of components, so wrap the single pane.
    # NOTE(review): every call adds one more scatter pane to the sidebar and
    # earlier ones are never removed - confirm this is intended.
    template.sidebar.extend([scatter])
    return pn.Column(heatmap, pn.Column(scatter, visible=False))
# Add the widget-driven chart function to the template's main area
template.main.append(update_charts)
# Mark the template as servable
template.servable()