In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, widgets
import requests
from urllib.parse import quote
import regex as re

import config # Imports a private API Key to test the functions

# Team Casimir Funk Nutritional Content Notebook
In this notebook, we create our nutritional content functions. Moreover, we add the nutritional data to our food prices dataset.

## Nutritional Content Functions

In [4]:
def handle_query_nc(query, food_df_dict, api_key, num_results = 10):
    '''
    Description
    --------------------------------------------------
    This is a function to handle the Food Query
    for our user. It searches the USDA FoodData
    Central API, stores the results in food_df_dict
    under the key 'food_df', and displays the first
    ten columns of the results.
    
    If the API does not answer with HTTP status 200,
    the status code is printed, food_df_dict is left
    untouched, and nothing is displayed.
    
    Inputs
    --------------------------------------------------
    + query : string; keywords to search the USDA 
            FoodData Central API for the user's
            desired food product
    + food_df_dict : dictionary; allows for food_df
            to be implemented and updated in 
            different functions
    + api_key : string; user's FoodData Central API
            key
    + num_results : integer; the number of results 
            the user wants in their query
    
    Outputs
    --------------------------------------------------
    + food_df is displayed (the function itself
            returns None)
    '''
    encoded_query = quote(query)
    page_number = 1
    page_size = num_results

    url = f'https://api.nal.usda.gov/fdc/v1/foods/search?api_key={api_key}&query={encoded_query}&pageSize={page_size}&pageNumber={page_number}'

    response = requests.get(url)

    # Stop here on a failed request: there is no payload to parse, and
    # continuing would fail on an undefined `data` variable.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return

    data = response.json()

    # A successful payload without a 'foods' entry is treated as "no results".
    # NOTE(review): assumes the JSON payload is an object (dict) - confirm.
    food_df = pd.DataFrame(data.get('foods', []))
    
    food_df_dict['food_df'] = food_df
    display(food_df.iloc[:, :10])

In [5]:
def get_nutritional_content(food_df, idx = 0, computation = False):
    '''
    Description
    --------------------------------------------------
    Builds two small tables for one row of food_df:
    detail_df (identifying details of the product) and
    nutritional_df (the product's nutrient list), and
    either displays them or returns nutritional_df.
    
    Inputs
    --------------------------------------------------
    + food_df : pandas dataframe; contains all the
            details & nutritional information for the
            different food products (must have a
            'foodNutrients' column)
    + idx : integer; row label of the food product
            the user wants to look at
    + computation : boolean; False displays detail_df
            and nutritional_df (used by the widget),
            True returns nutritional_df instead (used
            by the data analysis section)
    
    Outputs
    --------------------------------------------------
    computation = False : 
    + detail_df & nutritional_df are displayed
    computation = True : 
    + nutritional_df : pandas dataframe; nutritional
            content of a single food product of 
            interest
    '''
    # Only keep the descriptive fields this result set actually provides.
    detail_fields = [
        name
        for name in ('fdcId', 'description', 'brandOwner', 'brandName', 'marketCountry')
        if name in food_df.columns
    ]
    # The selected row becomes a single column, relabelled 'Details'.
    detail_df = pd.DataFrame(food_df.loc[idx, detail_fields]).rename(columns = {idx : 'Details'})

    # Expand the row's 'foodNutrients' entry into its own table.
    all_nutrients = pd.DataFrame(food_df.loc[idx, 'foodNutrients'])
    nutrient_fields = ('nutrientName', 'value', 'unitName', 'percentDailyValue')
    nutritional_df = all_nutrients[
        [name for name in nutrient_fields if name in all_nutrients.columns]
    ]

    if not computation:
        display(detail_df)
        display(nutritional_df)
        return None

    return nutritional_df

In [6]:
def interactive_query(api_key):
    '''
    Description
    --------------------------------------------------
    Wraps handle_query_nc() in an ipywidgets control
    panel (a text box for the query and a slider for
    the number of results) and displays it.

    Inputs
    --------------------------------------------------
    + api_key : string; user's FoodData Central API
            key
    
    Outputs
    --------------------------------------------------
    + food_df_dict : dictionary; the same dictionary
            object that is passed to
            handle_query_nc() on every widget update,
            so its contents change as queries run
    '''
    shared_results = {}

    # Fixed entries are passed through unchanged; the others become controls.
    controls = {
        'food_df_dict' : widgets.fixed(shared_results),
        'query' : widgets.Text(description='Enter Query Here'),
        'num_results' : (1, 20, 1),
        'api_key' : widgets.fixed(api_key),
    }

    query_widget = interactive(handle_query_nc, **controls)
    display(query_widget)

    return shared_results

In [7]:
def interactive_get_nc(food_df_dict):
    '''
    Description
    --------------------------------------------------
    This is a function that makes 
    get_nutritional_content() interactive.

    Inputs
    --------------------------------------------------
    + food_df_dict : dictionary; allows for food_df
            to be implemented and updated in 
            different functions. The results are read
            from the key 'food_df'.
    
    Outputs
    --------------------------------------------------
    + Interactive widget is displayed. If food_df_dict
            has no 'food_df' entry, or that entry has
            no rows, a message is printed and no
            widget is created.
    '''
    food_df = food_df_dict.get('food_df')

    # Without any rows there is nothing to browse, and the slider's upper
    # bound (len - 1) would fall below its lower bound of 0.
    if food_df is None or len(food_df) == 0:
        print('No food search results available; run a query first.')
        return

    widget = interactive(get_nutritional_content,
                         food_df = widgets.fixed(food_df),
                         idx = widgets.IntSlider(value = 0, min = 0, max = len(food_df) - 1, step = 1, description='Food Index'),
                         computation = widgets.fixed(False)
                        )
    display(widget)

## Joining Our Food Price and NC Data

The first step is to load in the Food Price Data that we manually collected before we automate the process of adding the nutritional data for all of the food products at the stores within our analysis scope.

In [10]:
# Load the collected price sheet, index it by food item, and replace
# missing cells with 0.
price = (
    pd.read_csv('./data/food_prices_dummy.csv')
    .set_index('Food item')
    .fillna(0)
)

# Preview the first five rows and first eleven columns.
price.iloc[:5, :11]

Unnamed: 0_level_0,GTIN/UPC,Price at TJs,Quantity,Unit,Brand/Type at TJs,GTIN/UPC.1,Price at Safeway,Quantity,Unit.1,Brand/Type at Safeway,GTIN/UPC.2
Food item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Bread,0.0,$3.49,24.0,oz,TJs Sourdough Sandwich Bread,0.0,$4.99,24.0,oz,Signature SELECT Bread Long Sliced Loaf San Fr...,0.0
Rice,0.0,$3.29,3.0,lb,TJs Jasmine Rice,21130500000.0,$6.79,5.0,lb,Mahatma Jasmine Thai Fragrant Long Grain Rice ...,0.0
Pasta,0.0,$0.99,1.0,lb,TJs Orzo Italian Pasta,0.0,$1.50,16.0,oz,Signature SELECT Pasta Penne Rigate Box,0.0
Apples,0.0,$1.29,1.0,each,Honeycrisp Apples,0.0,$2,1.0,each,Honeycrisp apples,0.0
Tomatoes,0.0,$2.99,1.0,lb,TJs Campari Tomatoes,0.0,$3.49,1.0,lb,Red roma tomato,0.0


Examining a subset of our data, it becomes evident that it is not in a format from which we can quickly pull all of the requisite nutritional data: we have too many columns, and each row is not unique to a specific food product at a specific store. Thus, we must reformat the data such that we only have seven columns:
1. `Food Item` : The food product
2. `Store` : The store in which the `Food Item` is sold
3. `GTIN/UPC` : A unique identifier in the USDA FoodData Central database for the `Food Item` from a specific `Store`
4. `Price` : The price of the `Food Item`
5. `Quantity` : The quantity of the `Food Item`
6. `Unit` : The units of the `Food Item`
7. `Brand/Type` : The name of the `Food Item`

In order to achieve this desired format we will go through the following process:
1. Create a dictionary `price_df_dict` where we will keep individual dataframes for each of the stores in question with information on the aforementioned variables.
2. Combine all of these sub-dataframes into one large one `price_rf` that has all of our information in our desired format.

In [12]:
# Reshape the wide price sheet (one block of columns per store) into a long
# table with one row per food item per store.
price_df_dict = {}

# Store names are taken from the 'Brand/Type at <store>' column headers.
stores = [
    re.findall(r'Brand/Type at (.+)', col)[0]
    for col in price.filter(regex = '^Brand').columns
]

# Each store is assumed to occupy `num_sub_df_cols` consecutive columns, in
# the same left-to-right order as the 'Brand/Type at ...' headers.
num_sub_df_cols = 5
num_sub_dfs = price.shape[1] // num_sub_df_cols  # not used in this cell

sub_df_cols = ['GTIN/UPC', 'Price', 'Quantity', 'Unit', 'Brand/Type']

for i, store in enumerate(stores):
    start = num_sub_df_cols * i
    # Copy the slice so that adding a column does not write into a view of
    # `price` (avoids SettingWithCopyWarning).
    sub_df = price.iloc[:, start : start + num_sub_df_cols].copy()
    sub_df.columns = sub_df_cols
    sub_df.insert(0, 'Store', store)
    # reset_index turns the 'Food item' index back into a regular column.
    price_df_dict[store] = sub_df.reset_index()

# Concatenate once instead of growing the frame inside a loop; fall back to
# an empty frame when no store columns were found.
price_rf = (
    pd.concat(list(price_df_dict.values()), ignore_index = True)
    if price_df_dict
    else pd.DataFrame()
)

In [13]:
# Show the reshaped table (one row per food item per store).
price_rf

Unnamed: 0,Food item,Store,GTIN/UPC,Price,Quantity,Unit,Brand/Type
0,Bread,TJs,0.0,$3.49,24.0,oz,TJs Sourdough Sandwich Bread
1,Rice,TJs,0.0,$3.29,3.0,lb,TJs Jasmine Rice
2,Pasta,TJs,0.0,$0.99,1.0,lb,TJs Orzo Italian Pasta
3,Apples,TJs,0.0,$1.29,1.0,each,Honeycrisp Apples
4,Tomatoes,TJs,0.0,$2.99,1.0,lb,TJs Campari Tomatoes
...,...,...,...,...,...,...,...
131,Bagels,Whole Foods,0.0,$4.99,18.0,oz,Organic Bagels Plain
132,Avocado,Whole Foods,0.0,$2.99,1.0,each,Del Rey Avocados
133,Coffee,Whole Foods,0.0,$9.49,10.0,oz,"Ground Coffee in Bag, French Roast - Bonne Nuit"
134,Chicken Meatball,Whole Foods,0.0,$5.39,10.0,oz,Korean BBQ Chicken Meatballs


Now that our data is properly formatted, we can add our nutritional data to our food prices dataset.

### Pull Dietary Reference Categories
Using the `handle_query_dr` function from our `dietary_references` notebook we are able to get all of the categories for which we need the nutritional content.

In [113]:
# Execute the dietary_references notebook in this kernel so that the names
# it defines can be used in the cells below.
%run dietary_references.ipynb

In [115]:
# NOTE(review): the meaning of the arguments (1, 'Male') and of the other
# elements of the result is defined in dietary_references - confirm there.
dr_result = handle_query_dr(1, 'Male')

# Keep only the 'Nutrition' column of the element at position 1, as a
# one-column dataframe.
drs = dr_result[1][['Nutrition']]
drs

Unnamed: 0,Nutrition
0,Energy
1,Protein
2,"Fiber, total dietary"
3,"Folate, DFE"
4,"Calcium, Ca"
5,"Carbohydrate, by difference"
6,"Iron, Fe"
7,"Magnesium, Mg"
8,Niacin
9,"Phosphorus, P"


### Get Nutritional Contents

In [168]:
def handle_query_nc_calc(query, api_key):
    '''
    Description
    --------------------------------------------------
    This function is an adaptation of our
    handle_query_nc() function that is optimized for the
    task of pulling nutritional data: it requests a
    single result and returns it instead of displaying
    it.
    
    Inputs
    --------------------------------------------------
    + query : string; keywords to search the USDA 
            FoodData Central API for the user's
            desired food product
    + api_key : string; user's FoodData Central API
            key
    
    Outputs
    --------------------------------------------------
    + food_df : pandas dataframe; contains all the
            details & nutritional information for the
            different food products. An empty
            dataframe is returned (after printing the
            status code) when the API does not answer
            with HTTP status 200.
    '''
    encoded_query = quote(query)
    page_number = 1
    page_size = 1

    url = f'https://api.nal.usda.gov/fdc/v1/foods/search?api_key={api_key}&query={encoded_query}&pageSize={page_size}&pageNumber={page_number}'

    response = requests.get(url)

    # On a failed request there is no payload to parse; report it and hand
    # back an empty frame rather than failing on an undefined `data`.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame()

    data = response.json()

    # A successful payload without a 'foods' entry is treated as "no results".
    # NOTE(review): assumes the JSON payload is an object (dict) - confirm.
    food_df = pd.DataFrame(data.get('foods', []))
    
    return food_df

In [174]:
# Column of price_rf whose values are used as the API search keywords.
search_col = 'Food item'

# Pick the product in the row labelled 1; a single .loc lookup replaces the
# chained price_rf[...][1] indexing and actually uses search_col.
fp = price_rf.loc[1, search_col]

API_KEY = config.API_KEY
food_df = handle_query_nc_calc(fp, API_KEY)

In [178]:
# computation = True makes the function return the nutrient table of the
# row labelled 0 instead of displaying it.
ncs = get_nutritional_content(food_df, idx = 0, computation = True)

In [180]:
# Left-join the product's nutrients onto the reference nutrient list so that
# every reference nutrient gets a row, even when the product does not list it.
ncs_joined = drs.merge(ncs, how = 'left', left_on = 'Nutrition', right_on = 'nutrientName')

ncs_m = (
    ncs_joined[['Nutrition', 'value', 'unitName']]
    .rename(columns = {'Nutrition' : 'Nutrient', 'value' : 'Value', 'unitName' : 'Unit'})
    # Nutrients missing from the product get a value of 0 and a '-' unit.
    .fillna({'Value' : 0, 'Unit' : '-'})
)

In [188]:
# Pivot to wide form: nutrient names become the column labels, leaving one
# 'Value' row and one 'Unit' row.
ncs_t = ncs_m.set_index('Nutrient').transpose()

In [190]:
# Show the wide table: a 'Value' row and a 'Unit' row, one column per nutrient.
ncs_t

Nutrient,Energy,Protein,"Fiber, total dietary","Folate, DFE","Calcium, Ca","Carbohydrate, by difference","Iron, Fe","Magnesium, Mg",Niacin,"Phosphorus, P","Potassium, K",Riboflavin,Thiamin,"Vitamin A, RAE",Vitamin B-12,Vitamin B-6,"Vitamin C, total ascorbic acid",Vitamin E (alpha-tocopherol),Vitamin K (phylloquinone),"Zinc, Zn"
Value,356.0,6.67,0.0,0.0,0.0,77.8,3.2,0.0,3.56,0.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Unit,KCAL,G,G,-,MG,G,MG,-,MG,-,MG,-,MG,-,-,-,MG,-,-,-


In [258]:
# One '<nutrient>_Unit' entry per reference nutrient, read from the 'Unit' row.
unit_cols = {f'{col}_Unit' : ncs_t.loc['Unit', col] for col in drs['Nutrition']}

# Start from the 'Value' row as a one-row frame and attach the unit columns.
ncs_rf = ncs_t.loc['Value', :].to_frame().transpose().assign(**unit_cols)

# Interleave the columns so that each nutrient is followed by its unit.
ncs_rf_col_order = np.array(
    [name for col in drs['Nutrition'] for name in (col, col + '_Unit')]
)
ncs_rf = ncs_rf[ncs_rf_col_order]

In [262]:
# Show the single-row table in which each nutrient value is followed by its unit.
ncs_rf

Nutrient,Energy,Energy_Unit,Protein,Protein_Unit,"Fiber, total dietary","Fiber, total dietary_Unit","Folate, DFE","Folate, DFE_Unit","Calcium, Ca","Calcium, Ca_Unit",...,Vitamin B-6,Vitamin B-6_Unit,"Vitamin C, total ascorbic acid","Vitamin C, total ascorbic acid_Unit",Vitamin E (alpha-tocopherol),Vitamin E (alpha-tocopherol)_Unit,Vitamin K (phylloquinone),Vitamin K (phylloquinone)_Unit,"Zinc, Zn","Zinc, Zn_Unit"
Value,356.0,KCAL,6.67,G,0.0,G,0.0,-,0.0,MG,...,0.0,-,0.0,MG,0.0,-,0.0,-,0.0,-
