# CookieCutter
The goal of this project is to predict the calories per serving of a recipe based on the ingredients list.

Training data was scraped from AllRecipes.com.

# Data Wrangling

## Import packages

In [None]:
import numpy as np
import pandas as pd
import re

## Define helper functions

In [None]:
def multireplace(string, replacements, ignore_case=False):
    """
    Given a string and a replacement map, it returns the replaced string.

    All replacements are applied in a single pass over ``string``, so text
    produced by one replacement is never scanned and replaced again.

    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :param bool ignore_case: whether the match should be case insensitive
    :rtype: str
    """
    # With nothing to replace, the alternation built below would be the empty
    # pattern, which matches at every position and then fails the dictionary
    # lookup with a KeyError. Return the input untouched instead.
    if not replacements:
        return string

    # If case insensitive, normalize the old string so that later a replacement
    # can be found. For instance with {"HEY": "lol"} we should match and find a
    # replacement for "hey", "HEY", "hEy", etc.
    if ignore_case:
        normalize_old = str.lower
        re_mode = re.IGNORECASE
    else:
        def normalize_old(s):
            return s

        re_mode = 0

    replacements = {
        normalize_old(key): val
        for key, val in replacements.items()
    }

    # Place longer ones first to keep shorter substrings from matching where
    # the longer ones should take place. For instance given the replacements
    # {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should
    # produce 'hey ABC' and not 'hey ABc'.
    rep_sorted = sorted(replacements, key=len, reverse=True)

    # Create a big OR regex that matches any of the (escaped) substrings
    pattern = re.compile("|".join(map(re.escape, rep_sorted)), re_mode)

    # For each match, look up the new string in the replacements, the key
    # being the normalized old string
    return pattern.sub(
        lambda match: replacements[normalize_old(match.group(0))], string)


def string_replace(orig_string):
    """
    Normalise spacing and turn line breaks into semicolon separators.

    Runs of two or more spaces are collapsed to a single space, every newline
    becomes ";", and each "; ;" sequence is then shortened to ";".
    """
    single_spaced = re.sub(' {2,}', ' ', orig_string)
    semicolon_separated = single_spaced.replace("\n", ";")
    return semicolon_separated.replace("; ;", ";")


def get_ingredients(orig_string):
    """
    Extract (quantity, ingredient text) pairs from a semicolon-terminated string.

    Each match is a 2-tuple of strings:
      * the quantity: digits with optional slashes and an optional second
        number, e.g. "1", "1/2" or "1 1/2";
      * the text that follows it, starting at a word character and running
        up to (not including) the next semicolon.

    Returns a list of such tuples in order of appearance; an input without
    any match yields an empty list.
    """
    # Raw string: \d, \s and \w are regex escapes, not Python string escapes
    # (a non-raw literal emits an invalid-escape warning on recent Pythons).
    ing_regex = r'(\d+/*\d*\s*\d*/*\d*)\s(\w+\s*.*?);'
    return re.findall(ing_regex, orig_string)


def get_quantity(regex_tuple):
    """
    Build a two-column DataFrame from a list of (quantity, ingredient) tuples.

    The first element of every tuple goes into the 'quantity' column and the
    second into the 'ingredient' column, one row per tuple.
    """
    amounts = []
    descriptions = []
    for pair in regex_tuple:
        amounts.append(pair[0])
        descriptions.append(pair[1])

    columns = {'quantity': amounts, 'ingredient': descriptions}
    return pd.DataFrame(columns)


def match_uids(originaldf, longdf):
    """
    Copy recipe-level columns onto the matching per-recipe ingredient frames.

    :param originaldf: DataFrame with one row per recipe containing the
        columns 'recipe_key', 'calPerServing', 'totalCal', 'servings' and 'name'
    :param longdf: list of DataFrames; the frame at position i receives the
        values from the i-th row of ``originaldf``
    :return: ``longdf`` itself; its DataFrames are modified in place
    :raises IndexError: if ``longdf`` has fewer entries than ``originaldf`` has rows
    """
    copied_columns = ('recipe_key', 'calPerServing', 'totalCal', 'servings',
                      'name')

    # Pair rows and frames by position rather than by index label, so the
    # result does not depend on originaldf carrying a 0..n-1 index.
    recipe_rows = originaldf[list(copied_columns)].itertuples(index=False,
                                                              name=None)
    for position, recipe_values in enumerate(recipe_rows):
        ingredient_df = longdf[position]
        for column, value in zip(copied_columns, recipe_values):
            # A scalar assignment repeats the value on every ingredient row
            ingredient_df[column] = value
    return longdf


# Measurement units, preparation words and other descriptors that are dropped
# from ingredient text. Kept as a frozenset for constant-time membership tests.
_UNWANTED_TEXT = frozenset([
    'dash', 'pinch', 'teaspoon', 'tablespoon', 'fluid', 'cup', 'pint',
    'quart', 'ounce', 'oz', 'pound', 'rack', 'small', 'medium', 'large',
    'crushed', 'grated', 'skinless', 'boneless', 'melted', 'fresh',
    'diced', 'minced', 'thinly', 'dry', 'dried', 'halved', 'taste',
    'frying', 'lean', 'drained', 'jars', 'grated', 'clove', 'slice',
    'eaches', 'whole', 'cube', 'thick', 'unit', 'freshly', 'finely',
    'splash', 'semisweet', 'chip', 'extract', 'spread', 'powder', 'room',
    'temperature', 'brown', 'cooking', 'yolk', 'ground', 'package', 'mix',
    'cake', 'plain', 'goody', 'light', 'wheat', 'piece', 'substitute',
    'mini', 'kosher', 'crispy', 'minature', 'chunk', 'dark', 'bit',
    'square', 'boiling', 'bag', 'crumb', 'popsicle', 'stick', 'zest',
    'cereal', 'bar', 'tart', 'nib', 'tennessee', 'turbinado', 'baking',
    'pack', 'spice', 'moist', 'miniarature', 'crunchy', 'morsel', 'nugget',
    'candy', 'crisp', 'super', 'fine', 'decoration', 'sucralose', 'puree',
    'pureed', 'rainbow', 'cut', 'frozen', 'broken', 'round', 'concentrate',
    'miniature', 'cooky', 'virgin', 'dusting', 'half', 'baby', 'food',
    'jar', 'seedless', 'container', 'box', 'granule', 'filling', 'cold',
    'super', 'ripe', 'moisture', 'packet', 'instant', 'mint', 'ripe',
    'sea', 'coarse', 'fun', 'size', 'funsize', 'bulk', 'chopped', 'torn',
    'inch', 'shell', 'quality', 'strap', 'bittersweet', 'gallon', 'pure',
    'cane', 'liquid', 'drop', 'hard', 'yellow', 'black', 'strap', 'kiss',
    'protein', 'supplement', 'dessert', 'topping',
])


def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove anything in parentheses
    2. Lowercase all text
    3. Remove punctuation other than hyphens, apostrophes and commas, then
       turn commas into spaces
    4. Remove words containing a hyphen or an apostrophe
    5. Remove numbers
    6. Remove plurals (noun lemmatization of words with an 's' among their
       last three letters)
    7. Collapse excess whitespace
    8. Remove all english stopwords & unwanted text
    9. Returns a list of the cleaned words

    Requires nltk with the 'stopwords' and 'wordnet' corpora available.
    """
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()

    # Load the stop-word list once per call instead of once per word, and
    # merge it with the unwanted words so a single set lookup filters both.
    excluded_words = set(stopwords.words('english')) | _UNWANTED_TEXT

    def singularize(match):
        word = match.group(0)
        return wnl.lemmatize(word, 'n') if 's' in word[-3:] else word

    # Remove anything in parenthesis
    mess = re.sub(r"\([^\)]+\)", '', mess)
    # Make everything lowercase
    mess = mess.lower()
    # Remove non-word punctuation
    mess = ' '.join(re.findall(
        r"[-,''\w]+", mess))  # This leaves some commas as a character #
    mess = re.sub(r"\,", ' ', mess)
    # Remove hyphenated words (and words containing an apostrophe)
    mess = re.sub(r"(?=\S*['-])([a-zA-Z'-]+)", '', mess)
    # Remove numbers
    mess = ''.join([i for i in mess if not i.isdigit()])
    # Remove plurals. Each word is rewritten where it stands, so lemmatizing
    # one word cannot alter a longer word that merely contains it.
    mess = re.sub(r"[a-z]+", singularize, mess)
    # Clean excess whitespace
    mess = re.sub(r"\s+", ' ', mess).strip()
    # Remove stopwords and unwanted text
    return [word for word in mess.split() if word.lower() not in excluded_words]


def convert_fractions(quantity):
    """
    Turn a quantity string into a decimal number.

    The string is split on whitespace, every part is parsed as a Fraction
    (e.g. "1", "1/2"), and the parts are added together, so "1 1/2" gives 1.5.
    An empty string gives 0.0.
    """
    from fractions import Fraction

    total = Fraction(0)
    for part in quantity.split():
        total += Fraction(part)
    return float(total)

## Clean ingredient text string

In [None]:
# Load data
df = pd.read_csv('cookie_recipes.csv')  # all cookie recipes

# Create unique id: the first run of digits found in the recipe URL
df['recipe_key'] = df['url'].apply(lambda x: int(re.findall(r"\d+", x)[0]))

# Calculate total calories per recipe
df['totalCal'] = df['calPerServing'] * df['servings']

# Filter for recipes with 12-64 servings and < 10,000 total calories
df = df[(df['servings'] <= 64) &
        (df['servings'] >= 12) &
        (df['totalCal'] < 10000)]
df = df.reset_index(drop=True)  # renumber rows 0..n-1 after filtering

# Clean ingredient text
# Every vulgar fraction maps to a space-prefixed ASCII fraction so that a mixed
# number such as "1⅓" becomes "1 1/3" rather than "11/3"; any doubled space
# this creates is collapsed again by string_replace below.
dict_unicode = {
    '\u2009': '',  # thin space
    '½': ' 1/2', '⅓': ' 1/3', '⅔': ' 2/3', '¼': ' 1/4', '¾': ' 3/4',
    '⅕': ' 1/5', '⅖': ' 2/5', '⅗': ' 3/5', '⅘': ' 4/5', '⅙': ' 1/6',
    '⅚': ' 5/6', '⅐': ' 1/7', '⅛': ' 1/8', '⅜': ' 3/8', '⅝': ' 5/8',
    '⅞': ' 7/8', '⅑': ' 1/9', '⅒': ' 1/10',
}
df['ingredients'] = [item + ';' for item in df['ingredients']]  # add semicolon at end of each string for easier regex filtering
df['ingredients'] = [multireplace(x, dict_unicode) for x in df['ingredients']]  # replace unicode characters
df['ingredients'] = [string_replace(x) for x in df['ingredients']]  # collapse repeated spaces, turn newlines into semicolons
ing = [get_ingredients(x) for x in df['ingredients']]  # per recipe: list of (quantity, ingredient text) tuples

# Separate quantity from ingredient string (one DataFrame per recipe)
df_ing = [get_quantity(x) for x in ing]

clean_df = match_uids(df, df_ing)  # copy recipe id, calories (outcome variable), servings and name from the original dataframe
clean_df = pd.concat(clean_df)  # concat list of pandas dataframes into one dataframe
clean_df['quantity'] = [convert_fractions(x) for x in clean_df['quantity']]  # convert fraction strings into decimal numbers
clean_df = clean_df.reset_index()  # the old per-recipe row index is kept as an 'index' column

## Save clean data

In [None]:
# Write the cleaned one-row-per-ingredient table to clean_df.csv in the working directory
clean_df.to_csv('clean_df.csv')

# Data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Force an edge colour on every patch so neighbouring bars in bar/histogram plots are visually separated
plt.rcParams["patch.force_edgecolor"] = True

In [None]:
# Histogram of calories per serving over the filtered recipes (20 bins, y-axis capped at 500)
ax = df['calPerServing'].plot.hist(bins=20, ylim=(0, 500))
ax.set_xlabel('Calories per Serving')
ax.set_ylabel('Number of Recipes')

In [None]:
# clean_df holds one row per ingredient, so keep a single row per recipe;
# otherwise every recipe is counted once per ingredient in the histogram.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 in favour of
# histplot; migrate once the seaborn version in use is confirmed.
sns.distplot(clean_df.drop_duplicates(subset='recipe_key')['totalCal'],
             kde=False, bins=20)

In [None]:
# clean_df holds one row per ingredient; fit on one row per recipe so that
# recipes with many ingredients do not weigh more heavily in the regression.
sns.regplot(x='servings', y='calPerServing',
            data=clean_df.drop_duplicates(subset='recipe_key'))