In [19]:
!pip install pint

import pandas as pd
import pint
from pint import UnitRegistry
import re
ureg = UnitRegistry()
from tqdm import notebook



In [0]:
from fractions import Fraction

def fraction_to_decimal(string):
  """
  Convert a whitespace-separated run of numbers/fractions into a single float.

  Each token is parsed with `Fraction` and the parts are added together, so a
  mixed number such as '1 1/2' becomes 1.5. A string without any token yields
  0.0; a token that `Fraction` cannot parse raises its error unchanged.
  """
  total = Fraction(0)
  for token in string.split():
    total += Fraction(token)
  return float(total)


In [0]:
# I_DF: the ingredient table; it is joined to the branded-food table on `fdc_id` further down.
# NOTE(review): presumably one row per scraped recipe ingredient (its quantity / unit / sub_qty /
# sub_unit columns are read later) — confirm against the CSV.
I_DF = pd.read_csv('/content/drive/My Drive/Data Science Projects/RecipeApp/data/myData/INGR_FINAL_COMPL_MERGE.csv', index_col=0)

In [5]:
# BF_DF: the branded food table from the FoodData Central (FDC) DB. It carries the serving-size
# data used below: serving_size, serving_size_unit and the free-text household_serving_fulltext.
BF_DF = pd.read_csv('/content/drive/My Drive/Data Science Projects/RecipeApp/data/FoodData_Central/branded_food.csv')
BF_DF.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,fdc_id,brand_owner,gtin_upc,ingredients,serving_size,serving_size_unit,household_serving_fulltext,branded_food_category,data_source,modified_date,available_date
0,346464,Conagra Brands,14900012704,"Meat Ingredients (Beef and Pork), Water, Tomat...",248.0,g,1 cup (248g),,GDSN,2018-10-26,2018-11-17
1,346470,Conagra Brands,14900432991,"Meat Ingredients (Beef and Pork), Water, Tomat...",248.0,g,1 cup (248g),,GDSN,2018-12-22,2018-12-23


In [0]:
# Split the free-text household serving (e.g. "1 cup (248g)") into its leading
# quantity token and the remaining unit text.
# - Only the FIRST space separates the two parts. Removing the quantity with
#   str.replace() would delete every occurrence of it, turning "2 cup (248g)"
#   into " cup (48g)".
# - The unit text no longer carries a leading space (it is stripped before the
#   unit lookup anyway).
# - Rows whose serving text is missing keep a NaN quantity and get an empty unit.
serving_parts = BF_DF['household_serving_fulltext'].str.partition(' ')
BF_DF['household_qty'] = serving_parts[0]
BF_DF['household_unit'] = serving_parts[2].fillna('')



In [0]:
# F_DF: the FDC `food.csv` table.
# NOTE(review): F_DF is not referenced by any other cell visible in this notebook — confirm it is
# still needed before keeping this read.
F_DF = pd.read_csv('/content/drive/My Drive/Data Science Projects/RecipeApp/data/FoodData_Central/food.csv')

In [0]:
# Inner join on fdc_id: keep only the ingredient rows that have a branded-food record.
I_BF_DF = pd.merge(I_DF, BF_DF, how='inner', on='fdc_id')

In [0]:
# Drop the branded-food bookkeeping columns that the mass calculation never reads.
I_BF_DF = I_BF_DF.drop(columns=['brand_owner', 'branded_food_category', 'data_source',
                                'modified_date', 'available_date'])

# Missing units become empty strings. The result is assigned back instead of calling
# fillna(inplace=True) on a column selection: that form is chained assignment, which
# recent pandas versions warn about and which leaves the frame untouched under
# copy-on-write.
I_BF_DF = I_BF_DF.fillna(value={'household_unit': '', 'unit': ''})

I_BF_DF = I_BF_DF.drop_duplicates()

In [0]:
# Fill the remaining gaps so every later step sees a usable value: text columns
# default to '' and the helper quantities to 0. Each column is listed exactly once
# (a repeated key in a dict literal is silently collapsed by Python).
I_BF_DF = I_BF_DF.fillna(value={
    'unit': '',
    'quantity': '',
    'sub_unit': '',
    'sub_qty': 0,
    'household_unit': '',
    'household_qty': 0,
})

In [14]:
def to_decimal(string):
  """
  Parse a quantity string into a number, tolerating malformed input.

  Accepted shapes:
    * plain numbers and fractions, optionally space separated: '2', '3/4', '1 1/2'
    * hyphenated mixed numbers: '1-1/2' -> 1.5 (the parts on either side of the
      first hyphen are added; anything after a second hyphen is ignored)

  PARAMS:
    string:  the quantity text to parse

  RETURNS:
    the parsed value as a float, or 0 when the text cannot be parsed
  """
  try:
    if '-' in string:
      # Only the first two hyphen-separated parts take part in the sum.
      left, right = string.split('-')[:2]
      return fraction_to_decimal(left) + fraction_to_decimal(right)
    return fraction_to_decimal(string)
  except (ValueError, ZeroDivisionError):
    # ValueError: a token is not a number. ZeroDivisionError: a '1/0' style token.
    # The hyphen branch is covered too, so one bad value cannot abort a whole
    # column-wide .apply().
    return 0

# Sanity check: a hyphenated mixed number is the sum of its two parts (expected: 1.5).
to_decimal('1-1/2')

1.5

In [0]:
# Remove every 'n' and every 'a' character from the quantity strings.
# NOTE(review): presumably this blanks out stringified 'nan' values — confirm.
I_BF_DF['quantity'] = (
    I_BF_DF['quantity']
    .replace(to_replace=r'n', value='', regex=True)
    .replace(to_replace=r'a', value='', regex=True)
)

# Parse the three quantity columns into floats with to_decimal.
for qty_column in ('quantity', 'household_qty', 'sub_qty'):
  I_BF_DF[qty_column] = I_BF_DF[qty_column].astype(str).apply(to_decimal).astype(float)

# Placeholder column; the conversion step fills it in row by row.
I_BF_DF['ingr_mass'] = ''


## Define Cases


1.   there is neither a qty nor unit for ingredient `('', '')('cooking spray')`
>SOLUTION: use serving size for branded food

2.   there is a qty but no unit for ingredient `('1', '') ('1', 'lemon')`
>SOLUTION: multiply ingredient quantity with serving quantity and multiply by serving size

3.   there is a unit but no qty for ingredient `('', 'cup')`  UNCOMMON

4.   there is a qty but no unit, AND a subunit for ingredient `('1', '', '(16 oz)')`
>SOLUTION: multiply ingredient qty by subunit and use that as the ingredient measure

5.   ingredient unit is of weight type and household unit is of volume type `('1', 'oz', '2', 'cup')`
>SOLUTION: directly convert from weight to serving size (if in grams)

6.   ingredient unit is volume type and household unit is weight type `('1', 'cup', '2', 'oz')`
>SOLUTION: get a general ballpark conversion from volume to weight and use that

7.   ingredient unit not a valid unit
>SOLUTION: use serving size of the food


In [0]:
def volume_to_grams(measure):
  """
  Ballpark a volume as grams by reporting its size in millilitres.

  No density is known here, so the millilitre magnitude is returned as-is and
  used as the gram figure (exact only for a density of 1 g/ml).

  PARAMS:
    measure:  a quantity object exposing `.to(unit)` and `.magnitude`

  RETURNS: the magnitude of `measure` after conversion to 'ml'
  """
  in_milliliters = measure.to('ml')
  return in_milliliters.magnitude


def convert_to_grams(i_qty, i_unit, h_qty, h_unit, serving_size, serving_unit, fallback_grams=100):
  """
  Convert to grams the measures that are included in the ingredients that were scraped for each recipe
  from allrecipes.com

  The household serving (h_qty h_unit) is taken to weigh `serving_size`, so an
  ingredient measured in a compatible unit is scaled by how many household
  servings it represents.

  PARAMETERS:
  i_qty:          ingredient quantity in branded_joined table
  i_unit:         ingredient unit in branded_joined table
  h_qty:          household quantity in branded_joined table
  h_unit:         household unit in branded_joined table
  serving_size:   serving size in branded_joined table
  serving_unit:   serving unit in branded_joined table. Kept for interface
                  compatibility; the calculation does not read it and treats
                  `serving_size` as a number of grams.
  fallback_grams: value returned when no conversion is possible (units that
                  cannot be parsed or are incompatible, or a zero household
                  quantity). Defaults to 100.

  RETURNS: number of grams in the allrecipes ingredient
  """
  try:
    h_measure = h_qty * ureg(h_unit)
    i_measure = i_qty * ureg(i_unit)

    # CASE 5: ingredient unit is of mass type and household unit is of volume type ('1', 'oz', '2', 'cup')
    if i_measure.check('[mass]') and h_measure.check('[volume]'):
      # SOLUTION: a mass converts to grams directly. Dividing by the serving size
      # and multiplying by it again only added a failure mode for a zero or
      # missing serving size.
      return i_measure.to('grams').magnitude

    # CASE 6: ingredient unit is volume type and household unit is weight type ('1', 'cup', '2', 'oz')
    if i_measure.check('[volume]') and h_measure.check('[mass]'):
      # SOLUTION: get a general ballpark conversion from volume to weight and use that
      return volume_to_grams(i_measure)

    # Normal case: both measures share a dimension, so express the ingredient as
    # a number of household servings and scale the serving size by it.
    if h_measure.magnitude == 0:
      # No household quantity to scale against.
      return fallback_grams
    servings = i_measure.to(h_measure.units).magnitude / h_measure.magnitude
    return servings * serving_size
  except Exception:
    # Best effort: unparsable or incompatible units fall back to a fixed estimate.
    return fallback_grams


In [0]:
# Canonical unit name -> the spellings accepted for it.
# Matching against these lists is case-sensitive: "T" is a tablespoon while "t"
# is a teaspoon.
UNITS = {
    "cup": ["cups", "cup", "c.", "c"],
    "fluid_ounce": ["fl. oz.", "fl oz", "fluid ounce", "fluid ounces"],
    "gallon": ["gal", "gal.", "gallon", "gallons"],
    "ounce": ["oz", "oz.", "ounce", "ounces", "onz", "oza"],
    "pint": ["pt", "pt.", "pint", "pints"],
    "pound": ["lb", "lb.", "pound", "pounds"],
    "quart": ["qt", "qt.", "qts", "qts.", "quart", "quarts"],
    "tablespoon": ["tbsp.", "tbsp", "T", "T.", "tablespoon", "tablespoons", "tbs.", "tbs"],
    "teaspoon": ["tsp.", "tsp", "t", "t.", "teaspoon", "teaspoons"],
    "gram": ["g", "g.", "gr", "gr.", "grm", "gram", "grams"],
    "kilogram": ["kg", "kg.", "kilogram", "kilograms"],
    "liter": ["l", "l.", "liter", "liters"],
    "milligram": ["mg", "mg.", "milligram", "milligrams"],
    "milliliter": ["ml", "ml.", "milliliter", "milliliters"],
}

def unit_lookup(unit):
  """
  Resolve a unit spelling to its canonical key in the UNITS dictionary.

  The comparison is an exact, case-sensitive match against each alias list.

  PARAMS:
    unit:   the unit of measurement to check

  RETURNS:
    (True, key):    when `unit` is one of the spellings listed under `key`
    (False, unit):  when no alias list contains `unit` (the input is returned unchanged)
  """
  for key, aliases in UNITS.items():
    if unit in aliases:
      # Stop at the first hit. Carrying on with the canonical key as the new
      # search term could remap it again if that key were ever listed as an
      # alias of a later entry.
      return True, key

  return False, unit

In [0]:

def _resolve_unit(raw_unit):
  """Look a unit up in UNITS, trying the spelling as written before its lowercase form."""
  text = raw_unit.strip()
  matched, unit = unit_lookup(text)
  if not matched:
    # 'T' (tablespoon) and 't' (teaspoon) differ only by case, so lowercase is
    # only a fallback for spellings such as 'Cup' or 'TBSP'.
    matched, unit = unit_lookup(text.lower())
  return matched, unit


def _ingredient_mass(row):
  """Pick the conversion case for one joined row and return its mass estimate."""
  quantity = row['quantity']
  serving_size = row['serving_size']
  i_match, i_unit = _resolve_unit(row['unit'])
  h_match, h_unit = _resolve_unit(row['household_unit'])

  # Both units are known: convert through the household serving.
  if i_match and h_match:
    return convert_to_grams(quantity, i_unit, row['household_qty'], h_unit,
                            serving_size, row['serving_size_unit'])

  # CASE 4: If there is a qty but no unit, AND a subunit for ingredient ('1', '', '(16 oz)')
  # Checked before the serving-size fallbacks so that it applies whether or not
  # the household unit was recognised.
  if quantity != 0 and row['unit'] == '' and row['sub_qty'] != 0:
    sub_match, sub_unit = _resolve_unit(row['sub_unit'])
    # Only a recognised sub unit is converted; anything else (e.g. a length)
    # falls through to the serving-size estimate below.
    if sub_match:
      # SOLUTION: multiply ingredient qty by subunit qty and use that as the ingredient measure
      return convert_to_grams(quantity * row['sub_qty'], sub_unit, serving_size, 'gram',
                              serving_size, row['serving_size_unit'])

  # CASE 2 / CASE 7: a quantity without a usable unit -> that many servings.
  if quantity != 0:
    return serving_size * quantity

  # CASE 1: neither a qty nor a usable unit ('cooking spray') -> one serving.
  # Multiplying by the zero quantity here would report a mass of 0.
  return serving_size


def standardize_quantities(branded_joined):
  """
  Fill the 'ingr_mass' column of the joined ingredient / branded-food table.

  PARAMS:
    branded_joined:  DataFrame with the columns quantity, unit, sub_qty, sub_unit,
                     household_qty, household_unit, serving_size and serving_size_unit

  RETURNS:
    the same DataFrame object; it is modified in place, not copied
  """
  for index, row in notebook.tqdm(branded_joined.iterrows(), total=len(branded_joined)):
    try:
      mass = _ingredient_mass(row)
    except Exception:
      # Best effort for rows that cannot be processed (e.g. a non-string unit):
      # just use serving size as total grams.
      mass = row['serving_size']
    branded_joined.at[index, 'ingr_mass'] = mass

  return branded_joined

  


In [0]:
# standardize_quantities fills `ingr_mass` on I_BF_DF in place and returns that same frame,
# so from here on BF_DF is an alias of I_BF_DF and no longer the raw branded-food table.
BF_DF = standardize_quantities(I_BF_DF)

In [0]:
# Keep the ingredient columns plus the computed mass, exposed under the name 'grams'.
output_columns = ['name', 'quantity', 'unit', 'recipe_index', 'fdc_id', 'sub_qty', 'sub_unit', 'ingr_mass']
BF_DF = BF_DF[output_columns].rename(columns={'ingr_mass': 'grams'})

In [0]:
# Persist the per-ingredient gram estimates (to_csv writes the frame's index as the first column by default).
BF_DF.to_csv('/content/drive/My Drive/Data Science Projects/RecipeApp/data/myData/BRANDED_INGR_MASS.csv')