In [37]:
# Install packages
!pip install -r requirements.txt >/dev/null
!pip install eep153_tools >/dev/null
!pip install gnupg 
!pip install gspread_pandas >/dev/null

# Import libraries
import numpy as np
import pandas as pd
import warnings
from  scipy.optimize import linprog as lp
import plotly.express as px
from eep153_tools.sheets import read_sheets
import fooddatacentral as fdc


# API Key
apikey = "Y7y8evhwYYL589qVEZzdHBCyKGeBkNdDWQfnWyIV"



## [A] Description of Population of Interest

Our group is interested in the comparison of the minimum cost diets of the inhabitants of **San-Francisco** and **Bakersfield**, two cities chosen based on their differences in cost-of-living, population density, and locale classification (urban vs agricultural, respectively).

## [A] Dietary Reference Intakes

A function that takes as arguments the characteristics of a given person (age, sex) and returns a Pandas.series of Dietary Reference Intakes (DRIs) or "Recommended Daily Allowances" (RDAs) of a variety of nutrients appropriate for the population of interest

In [40]:
# Read diet minimum data
diet_min = pd.read_csv("Spreadsheets/diet_min.csv",index_col=0)
diet_min

FileNotFoundError: [Errno 2] No such file or directory: 'Spreadsheets/diet_min.csv'

In [22]:
# Read diet maximum data
diet_max = pd.read_csv("Spreadsheets/diet_max.csv",index_col=0)
diet_max

FileNotFoundError: [Errno 2] No such file or directory: 'Spreadsheets/diet_max.csv'

In [24]:
def dietary_ref_intake(age,sex,df):
    """Takes in age and sex, and returns the dietary reference intake for the chosen population"""

    if age <= 3:
        col = 'C 1-3'
    age_ranges = [(4,8),(9,13),(14,18),(19,30),(31,50),(50,100)]
    for age_range in age_ranges:
        if age >= age_range[0] and age <= age_range[1]:
            col = sex + ' ' + str(age_range[0]) + '-' + str(age_range[1])
    return pd.Series(df[col])  

In [None]:
# Example of minimum dietary requirements for a male aged 22
dietary_ref_intake(age=22,sex='M',df=diet_min)

In [None]:
# Example of maximum dietary requirements for a male aged 22
dietary_ref_intake(age=22,sex='M',df=diet_max)

## [A] Data on prices for different foods

We constructed a google spreadsheet of the prices of approximately 28 ~ 29 different food products chosen from 2 different methods:  

* **US Consumer-Expenditure Survey Top-Results**: The first bin of foods were chosen as the most popular products per each main food groups (i.e. vegetables, fruits, meats, dairies) as concluded from the US Consumer-Expenditure Survey. This represents the baseline foods we wanted to establish to compare the two cities within our project.  Here is a link to what we looked at: https://www.ers.usda.gov/data-products/ag-and-food-statistics-charting-the-essentials/food-availability-and-consumption/  
* **Insta-Cart 2021 Most Popular Groceries by City**: We then compiled the most popular groceries per region by searching by city through Instacart's 2021 Delivered project: https://www.instacart.com/2021-delivered/

### Identifying and Uploading FDC

In [None]:
df = pd.read_csv('Spreadsheets/grocery_names.csv')
df['Food'] = df['Item']
df

In [None]:
# (IGNORE FOR PROJECT) idea for further automation/ease here: if you inputted a dataframe with your 
# chosen foods having already designated for the "foodCategory" column, that way you could have some 
# kind of if/else statement that filters to find the FIRST index row with that DESIRED (and more accurate) 
# food category as the value in that column of the FDC dataframe. This way you could weed out rando ones, 
# like when we get "Avocado Chunks" as the SECOND result in a search for Hass Avocadoes. 

import fooddatacentral as fdc
import warnings

apikey = 'Y7y8evhwYYL589qVEZzdHBCyKGeBkNdDWQfnWyIV'

def find_FDC_index(API_key, food_list, num_rows): 
    
    fdc_index_results = pd.DataFrame(columns=['Food', 'fdcId', 'description', 'foodCategory'])
    
    for food in food_list: 
        df_food = fdc.search(apikey, food)
        df_food_reduced = df_food[['fdcId', 'description', 'foodCategory']]
        df_food_reduced['Food'] = str(food)
        df_food_4_results = df_food_reduced.iloc[:num_rows]
        fdc_index_results = fdc_index_results.append(df_food_4_results, ignore_index = True)
        
    return fdc_index_results 

In [None]:
get_fdcIds = find_FDC_index(apikey, df['Item'], 1)
get_fdcIds

In [None]:
fdcId_array = get_fdcIds['fdcId']
print(fdcId_array)

In [None]:
file_path = "Spreadsheets/fdc_ids.csv"

# Save the DataFrame to a CSV file
fdcId_array.to_csv(file_path, index=False)

### Notes from FDC ID DataFrame: 
- for Gemelli Pasta, pd.iloc[0] will get fdcId = 2618997, change to fdcId = 1124597
- "pressed Juices" are fucked up, need to do that search separately
- "Bread" is outputting some cookies and biscuits shit, replace with fdcId = 1913550

In [20]:
gemelli_fdcID = 1124597
bread_fdcID = 1913550
pressedjuice_fdcID = 2095092

In [21]:
# getting FDC Codes

import fooddatacentral as fdc
import warnings
df1 = fdc.search(apikey, 'Cold Pressed Juice')
df1.head(5)

ModuleNotFoundError: No module named 'fooddatacentral'

### DataFrames from Google Sheets

In [18]:
food = 'https://docs.google.com/spreadsheets/d/1FTO5JQz5VwdM94m-h3KmOD3FAT7m9yk9NG3ZHEpIkvk/edit?usp=sharing'

def read_sheet(city):
    df = pd.read_excel(food,sheet_name=city)
    df = df.iloc[:,:5].dropna(subset=['FDC'])
    df = df.reset_index(drop=True)
    df['FDC ID'] = df['FDC ID'].astype(int)
    return df

In [19]:
sf = read_sheet('San_Francisco')
bakersfield = read_sheet('Bakersfield')

san_francisco

ValueError: Excel file format cannot be determined, you must specify an engine manually.

### Unit Conversion 

In [25]:
def convert(df):
    # Convert food quantities to FDC units
    df['FDC Quantity'] = df[['Quantity','Units']].T.apply(lambda x : fdc.units(x['Quantity'],x['Units']))

    # Now may want to filter df by time or place--need to get a unique set of food names.
    df['FDC Price'] = df['Price']/df['FDC Quantity']

    df.dropna(how='any') # Drop food with any missing data

    # To use minimum price observed
    Prices = df.groupby('Item',sort=False)['FDC Price'].min()

    return Prices

In [26]:
sf_price = convert(sf)
bakersfield_price = convert(bakersfield)

NameError: name 'sf' is not defined

# [A] Nutritional content of different foods

Now, using the FDC ID, we look up the nutritional content of the grocery lists we've created to construct DataFrames containing the results

In [30]:
def content(city):
    df = read_sheet(city)
    D = {}
    count = 0
    for food in  df.Food.tolist():
        try:
            FDC = df.loc[df.Food==Item,:].FDC[count]
            count+=1
            D[food] = fdc.nutrients(apikey,FDC).Quantity
        except AttributeError: 
            warnings.warn("Couldn't find FDC Code %s for food %s." % (food,FDC))
    return pd.DataFrame(D,dtype=float)

In [31]:
sf_content = content('San_Francisco')
bakersfield_content = content('Bakersfield')

ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [32]:
sf_content

NameError: name 'sf_content' is not defined

## [A] Solution

Here we create a single function `solve_subsistence_problem`.  By
isolating the logic of constructing and solving the subsistence
problem into a stand-alone function we reduce the scope for bugs, and
this modular approach at the same time makes testing easier.

We take the different pieces of the puzzle we&rsquo;ve developed and
put them together in the form of a linear program we can solve.
Recall that the mathematical problem we&rsquo;re trying to solve is
$$
    \min_x p'x
$$
such that
$$
     Ax \geq b
$$
If we buy a bag of groceries with quantities given by $x$, the total
cost of the bag of groceries is the inner product of prices and
quantities.  Since we&rsquo;ve converted our units above, this gives us a
vector of prices where quantities are all in 100 g or ml units.

The following code block defines a function
`solve_subsistence_problem`, which takes as arguments a dataframe
mapping different foods to nutrients; a series of prices for those
same foods; a series giving dietary recommended intake (DRI) minimums;
and a series giving dietary recommended maximums.



In [None]:
def solve_subsistence_problem(FoodNutrients,Prices,dietmin,dietmax,max_weight=None,tol=1e-6):
    """Solve Stigler's Subsistence Cost Problem.

    Inputs:
       - FoodNutrients : A pd.DataFrame with rows corresponding to foods, columns to nutrients.
       - Prices : A pd.Series of prices for different foods
       - diet_min : A pd.Series of DRIs, with index corresponding to columns of FoodNutrients,
                    describing minimum intakes.
       - diet_max : A pd.Series of DRIs, with index corresponding to columns of FoodNutrients,
                    describing maximum intakes.
       - max_weight : Maximum weight (in hectograms) allowed for diet.
       - tol : Solution values smaller than this in absolute value treated as zeros.
       
    """
    try: 
        p = Prices.apply(lambda x:x.magnitude)
    except AttributeError:  # Maybe not passing in prices with units?
        warnings.warn("Prices have no units.  BE CAREFUL!  We're assuming prices are per hectogram or deciliter!")
        p = Prices

    p = p.dropna()

    # Compile list that we have both prices and nutritional info for; drop if either missing
    use = p.index.intersection(FoodNutrients.columns)
    p = p[use]

    # Drop nutritional information for foods we don't know the price of,
    # and replace missing nutrients with zeros.
    Aall = FoodNutrients[p.index].fillna(0)

    # Drop rows of A that we don't have constraints for.
    Amin = Aall.loc[Aall.index.intersection(dietmin.index)]
    Amin = Amin.reindex(dietmin.index,axis=0)
    idx = Amin.index.to_frame()
    idx['type'] = 'min'
    #Amin.index = pd.MultiIndex.from_frame(idx)
    #dietmin.index = Amin.index
    
    Amax = Aall.loc[Aall.index.intersection(dietmax.index)]
    Amax = Amax.reindex(dietmax.index,axis=0)
    idx = Amax.index.to_frame()
    idx['type'] = 'max'
    #Amax.index = pd.MultiIndex.from_frame(idx)
    #dietmax.index = Amax.index

    # Minimum requirements involve multiplying constraint by -1 to make <=.
    A = pd.concat([Amin,
                   -Amax])

    b = pd.concat([dietmin,
                   -dietmax]) # Note sign change for max constraints

    # Make sure order of p, A, b are consistent
    A = A.reindex(p.index,axis=1)
    A = A.reindex(b.index,axis=0)

    if max_weight is not None:
        # Add up weights of foods consumed
        A.loc['Hectograms'] = -1
        b.loc['Hectograms'] = -max_weight
        
    # Now solve problem!  (Note that the linear program solver we'll use assumes
    # "less-than-or-equal" constraints.  We can switch back and forth by
    # multiplying $A$ and $b$ by $-1$.)

    result = lp(p, -A, -b, method='interior-point')

    result.A = A
    result.b = b
    
    if result.success:
        result.diet = pd.Series(result.x,index=p.index)
    else: # No feasible solution?
        warnings.warn(result.message)
        result.diet = pd.Series(result.x,index=p.index)*np.nan  

    return result

In [None]:
def solution(city,group):
    Prices, FoodNutrients = match(store)
    tol = 1e-6

    result = solve_subsistence_problem(FoodNutrients,Prices,diet_min[group],diet_max[group],tol=tol)

    print("Cost of diet for %s is $%4.2f per day.\n" % (group,result.fun))

    # Put back into nice series
    diet = result.diet

    print("\nDiet (in 100s of grams or milliliters):")
    print(diet[diet >= tol])  # Drop items with quantities less than precision of calculation.
    print()

    tab = pd.DataFrame({"Outcome":np.abs(result.A).dot(diet),"Recommendation":np.abs(result.b)})
    print("\nWith the following nutritional outcomes of interest:")
    print(tab)
    print()

    print("\nConstraining nutrients are:")
    excess = tab.diff(axis=1).iloc[:,1]
    print(excess.loc[np.abs(excess) < tol*100].index.tolist())


In [41]:
solution('San_Francisco', 'M 14-18')

NameError: name 'solution' is not defined

In [42]:
solution('Bakersfield', 'M 14-18')

NameError: name 'solution' is not defined