# CookieCutter
The goal of this project is to predict the calories per serving of a recipe based on the ingredients list.

Training data was scraped from AllRecipes.com.

## Import packages

In [None]:
import numpy as np
import pandas as pd
import re

## Define helper functions

In [None]:
def pos_tagger(tokens, pos_tag):
    """
    Keep only the tokens whose part-of-speech tag is listed in ``pos_tag``.

    tokens  -- sequence of word strings handed to ``nltk.pos_tag``
    pos_tag -- container of tag names to keep (the notebook passes ``['NN']``)

    Returns the matching words as a list, in their original order.
    """
    # Imported inside the function, so nltk is only needed once tagging
    # is actually requested.
    import nltk

    kept_words = []
    for tagged_token in nltk.pos_tag(tokens):
        word, tag = tagged_token[0], tagged_token[1]
        if tag in pos_tag:
            kept_words.append(word)
    return kept_words


def word_quantity(ing_column, norm_quant_column, orig_dataframe):
    """
    Build one quantity-weighted ingredient string per recipe.

    Every row's ingredient word (with a trailing space appended) is repeated
    as many times as its normalized quantity; the repeated strings of all
    rows sharing a ``recipe_key`` are then joined with a single space, in
    row order.  A quantity of zero or less contributes an empty string.

    Parameters
    ----------
    ing_column : str
        Name of the column holding the ingredient word.
    norm_quant_column : str
        Name of the column holding the integer repeat count.
    orig_dataframe : pandas.DataFrame
        One row per ingredient.  Must contain ``ing_column``,
        ``norm_quant_column`` and ``recipe_key``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``recipe_key`` (sorted ascending by the groupby), with a
        single ``ingredient`` column holding the joined string.
    """
    # Append the separator to the column the caller actually named, so the
    # repeated words stay space-separated whatever ``ing_column`` is.
    spaced_words = orig_dataframe[ing_column].astype(str) + ' '
    repeat_counts = orig_dataframe[norm_quant_column]
    repeated = [word * count for word, count in zip(spaced_words, repeat_counts)]

    # Pair each repeated string with its recipe key positionally (.values
    # drops the index), so a filtered or re-indexed input frame cannot
    # misalign the keys.
    per_ingredient = pd.DataFrame({
        'ingredient': repeated,
        'recipe_key': orig_dataframe['recipe_key'].values,
    })

    # Join all ingredient strings of a recipe into one string per recipe.
    X_ing = per_ingredient.groupby('recipe_key')['ingredient'].apply(' '.join)
    return pd.DataFrame(X_ing)

## Load clean data

In [None]:
# Load the cleaned ingredient table from 'clean_df.csv' (path is relative to
# the working directory).  Later cells use at least its `ingredient`,
# `quantity` and `recipe_key` columns.
clean_df = pd.read_csv('clean_df.csv')

# NLP

## Tokenize ingredient text

In [None]:
# Convert measurements to a normalized unit (1 unit = 1 gram).
# Each entry maps a measurement keyword to the approximate number of grams it
# stands for.  Order matters: when an ingredient text contains several
# keywords (e.g. "1 cup large ..."), the first entry in this list wins.
unit_grams = [
    ("dash", .3),
    ("pinch", .6),
    ("teaspoon", 5),
    ("tablespoon", 15),  # 1 tablespoon = 3 teaspoons; was 3, i.e. lighter than a teaspoon
    ("fluid", 30),
    ("cup", 240),
    ("pint", 473),
    ("quart", 980),  # NOTE(review): two pints would be 946 -- confirm 980 is intended
    ("ounce", 28),
    ("oz", 28),
    ("pound", 454),
    ("rack", 908),
    ("small", 50),
    ("medium", 60),
    ("large", 70),
]
default_grams = 3  # weight used when no keyword is present

# A keyword only counts when it is not preceded by a letter, so "oz" does not
# fire on "frozen"/"dozen" and "rack" does not fire on "cracked"/"crackers".
# Nothing is required after the keyword, so plurals ("cups", "ounces") and
# forms such as "8oz" still match.  Missing ingredient text matches nothing.
unit_conditions = [
    clean_df['ingredient']
    .str.contains(r'(?<![A-Za-z])' + keyword, na=False)
    .to_numpy(dtype=bool)
    for keyword, _ in unit_grams
]
# np.select takes the choice of the first condition that is True per row,
# which is the same priority the nested np.where chain expressed.
clean_df['unit'] = np.select(
    unit_conditions,
    [grams for _, grams in unit_grams],
    default=default_grams,
)

# Tokenization
# NOTE(review): `text_process` is not defined in the visible cells; it must
# already exist in the session.  `string` and `stopwords` are not used
# directly here -- presumably `text_process` relies on them; confirm.
import string
from nltk.corpus import stopwords
clean_df['ingredient'] = [text_process(x) for x in clean_df['ingredient']]

# Total quantity of each ingredient needed for the recipe (grams * quantity),
# rounded to a whole number so it can be used as a repeat count.
clean_df['norm_quant'] = (clean_df['unit'] * clean_df['quantity']).round().astype(int)

# One word per ingredient - keep only nouns, join multiple words as one string
clean_df['ingredient'] = [pos_tagger(tokens, ['NN']) for tokens in clean_df['ingredient']]
clean_df['ingredient'] = [''.join(tokens) for tokens in clean_df['ingredient']]

# Repeat word by normalized quantity
X_ing = word_quantity('ingredient','norm_quant',clean_df)
# NOTE(review): `df` is not defined in the visible cells; from its use here it
# must hold `recipe_key`, `ingredients`, `name` and `servings` -- confirm
# where it is loaded.
X_ing[['orig_ing', 'name', 'servings']] = df.set_index('recipe_key')[['ingredients', 'name', 'servings']]

## Train test split

In [None]:
# Create feature and outcome dataframe
# NOTE(review): `df` is not defined in the visible cells; it must already
# exist in the session with `recipe_key`, `totalCal`, `calPerServing`, `name`
# and `servings` columns.
y_cal = df.set_index('recipe_key')[['totalCal', 'calPerServing', 'name','servings']].sort_index().copy()
X_keys = df.reset_index(drop=True)['recipe_key']

# Train test split (80:20)
# Only the recipe keys are split.  The outcome frame used to be passed along
# as well, but it is sorted differently from X_keys and its split halves were
# overwritten straight away, so they are no longer requested.  The key split
# itself depends only on the number of keys and random_state.
from sklearn.model_selection import train_test_split
key_train, key_test = train_test_split(
    X_keys, test_size=0.2, random_state=101)

# Separate feature and outcome dataframes based on key, ordered by recipe key.
# Reassigning (instead of inplace=True) avoids mutating a .loc selection.
X_train = X_ing.loc[key_train].sort_index()
X_test = X_ing.loc[key_test].sort_index()
y_train = y_cal.loc[key_train].sort_index()
y_test = y_cal.loc[key_test].sort_index()

# Remove extreme edge cases
# These keys must be present in the test split; drop() raises KeyError otherwise.
edge_case_keys = [10392, 16571, 17337]
X_test = X_test.drop(edge_case_keys)
y_test = y_test.drop(edge_case_keys)

# Report the sizes of the frames actually used downstream, i.e. after the
# edge cases have been removed from the test set.
print("Training set contains {} recipes in total".format(len(X_train)))
print("Test set contains {} recipes in total".format(len(X_test)))

## Bag of words vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the bag-of-words vocabulary on the training recipes only.  Each recipe
# string is tokenized by text_process, and words appearing in fewer than 10
# recipes are left out (min_df = 10).
bow_transformer = CountVectorizer(analyzer=text_process, min_df = 10).fit(X_train['ingredient']) # Bag of Words
print(len(bow_transformer.vocabulary_)) # Print total number of vocab words
# vocabulary_ maps each word to its column index, so sorting the words by that
# index lists them in feature order.  This replaces get_feature_names(), which
# was removed in scikit-learn 1.2, and works on older releases as well.
print(sorted(bow_transformer.vocabulary_, key=bow_transformer.vocabulary_.get)) # Print all words

In [None]:
# Encode both splits with the fitted bag-of-words transformer
ingredient_bow_train = bow_transformer.transform(X_train['ingredient'])
ingredient_bow_test = bow_transformer.transform(X_test['ingredient'])

# Summarise the training matrix: one row per recipe, one column per vocabulary word
n_recipes, n_words = ingredient_bow_train.shape
print('Shape of Sparse Matrix: ', ingredient_bow_train.shape)
print('Amount of Non-Zero occurences: ', ingredient_bow_train.nnz)

# Percentage of cells that hold a non-zero count.  NOTE: despite the
# "sparsity" name and label, a higher value means a denser matrix.
sparsity = 100.0 * ingredient_bow_train.nnz / (n_recipes * n_words)
print('sparsity: {}'.format(sparsity))

## Save bag of words vectorization

In [None]:
import pickle

# Persist the fitted vectorizer and both encoded matrices.  Each file is
# opened in a `with` block so the handle is flushed and closed even if
# pickling fails part-way.
# NOTE: only unpickle these files from a trusted location; loading a pickle
# can execute arbitrary code.
with open('bow_transformer_3.sav', 'wb') as transformer_file:
    pickle.dump(bow_transformer, transformer_file)

with open('ingredient_bow_train_3.sav', 'wb') as train_file:
    pickle.dump(ingredient_bow_train, train_file)

with open('ingredient_bow_test_3.sav', 'wb') as test_file:
    pickle.dump(ingredient_bow_test, test_file)