# CookieCutter
The goal of this project is to predict the calories per serving of a recipe based on the ingredients list.

Training data was scraped from AllRecipes.com.

# Data Wrangling

## Import packages

In [None]:
import numpy as np
import pandas as pd
import re

## Import functions

In [None]:
def multireplace(string, replacements, ignore_case=False):
    """
    Replace several substrings of a string in a single pass.

    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :param bool ignore_case: whether the match should be case insensitive
    :returns: ``string`` with every key replaced by its value; ``string`` is
        returned unchanged when ``replacements`` is empty
    :rtype: str
    """
    # An empty map would compile to the empty pattern, which matches at every
    # position and then fails the dictionary lookup below with KeyError('').
    if not replacements:
        return string

    # If case insensitive, normalize the old string so that later a replacement
    # can be found. For instance with {"HEY": "lol"} we should match and find a
    # replacement for "hey", "HEY", "hEy", etc.
    if ignore_case:

        def normalize_old(s):
            return s.lower()

        re_mode = re.IGNORECASE

    else:

        def normalize_old(s):
            return s

        re_mode = 0

    replacements = {
        normalize_old(key): val
        for key, val in replacements.items()
    }

    # Place longer keys first to keep shorter substrings from matching where the
    # longer ones should. For instance {'ab': 'AB', 'abc': 'ABC'} applied to
    # 'hey abc' should produce 'hey ABC' and not 'hey ABc'.
    rep_sorted = sorted(replacements, key=len, reverse=True)

    # One big OR regex that matches any of the (escaped) substrings to replace
    pattern = re.compile("|".join(map(re.escape, rep_sorted)), re_mode)

    # For each match, look up the new string using the normalized matched text
    return pattern.sub(
        lambda match: replacements[normalize_old(match.group(0))], string)


def string_replace(orig_string):
    """
    Turn a newline-separated ingredient listing into a semicolon-separated one.

    Runs of two or more spaces are collapsed to a single space, every newline
    becomes ";", and each "; ;" left behind by a blank line is reduced to ";".
    """
    single_spaced = re.sub(' {2,}', ' ', orig_string)
    semicolon_separated = single_spaced.replace("\n", ";")
    return semicolon_separated.replace("; ;", ";")


def get_ingredients(orig_string):
    """
    Split a semicolon-terminated ingredient string into (quantity, text) pairs.

    The quantity is the leading number, optionally a fraction or a mixed number
    such as "2 1/2"; the text is everything after the following whitespace up
    to (not including) the next semicolon.

    :param str orig_string: ingredient listing in which every entry ends with ";"
    :returns: list of ``(quantity, text)`` string tuples, one per match
    """
    # Raw string: \d, \s and \w are regex escapes, not Python string escapes.
    ing_regex = r'(\d+/*\d*\s*\d*/*\d*)\s(\w+\s*.*?);'
    return re.findall(ing_regex, orig_string)


def get_quantity(regex_tuple):
    """
    Build a two-column DataFrame from a list of (quantity, text) tuples.

    The first element of every tuple goes to the 'quantity' column and the
    second to the 'ingredient' column.
    """
    columns = {
        'quantity': [pair[0] for pair in regex_tuple],
        'ingredient': [pair[1] for pair in regex_tuple],
    }
    return pd.DataFrame(columns)


def match_uids(originaldf, longdf):
    """
    Stamp each per-recipe DataFrame with its recipe's identifying columns.

    ``longdf`` is a list of DataFrames; the frame at position ``i`` receives the
    values found under label ``i`` of ``originaldf`` for the recipe key,
    calorie, servings and name columns. The frames are modified in place and
    the same list is returned.
    """
    shared_columns = ('recipe_key', 'calPerServing', 'totalCal', 'servings',
                      'name')
    for position in range(len(originaldf)):
        recipe_frame = longdf[position]
        for column in shared_columns:
            recipe_frame[column] = originaldf[column][position]
    return longdf


def text_process(mess):
    """
    Clean an ingredient string and return the words that remain.

    Steps, in order:
    1. Remove anything in parentheses
    2. Lowercase all text
    3. Keep only runs of word characters, hyphens, apostrophes and commas,
       then turn the commas into spaces
    4. Remove hyphenated words
    5. Remove digits
    6. Remove plurals (WordNet noun lemmatizer)
    7. Collapse excess whitespace
    8. Remove all English stopwords & unwanted text (units, sizes,
       preparation words, ...)

    :param str mess: raw ingredient text
    :returns: list of the cleaned words
    :rtype: list
    """
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()

    def lemmatize(string):
        # Only words with an 's' among their last three characters are sent to
        # the lemmatizer. NOTE: str.replace substitutes every occurrence of the
        # word in the text, including occurrences inside longer words.
        for word in re.findall(r"[a-z]+", string):
            string = string.replace(
                word,
                wnl.lemmatize(word, 'n') if 's' in word[-3:] else word)
        return string

    unwanted_text = [
        'dash', 'pinch', 'teaspoon', 'tablespoon', 'fluid', 'cup', 'pint',
        'quart', 'ounce', 'oz', 'pound', 'rack', 'small', 'medium', 'large',
        'crushed', 'grated', 'skinless', 'boneless', 'melted', 'fresh',
        'diced', 'minced', 'thinly', 'dry', 'dried', 'halved', 'taste',
        'frying', 'lean', 'drained', 'jars', 'grated', 'clove', 'slice',
        'eaches', 'whole', 'cube', 'thick', 'unit', 'freshly', 'finely',
        'splash', 'semisweet', 'chip', 'extract', 'spread', 'powder', 'room',
        'temperature', 'brown', 'cooking', 'yolk', 'ground', 'package', 'mix',
        'cake', 'plain', 'goody', 'light', 'wheat', 'piece', 'substitute',
        'mini', 'kosher', 'crispy', 'minature', 'chunk', 'dark', 'bit',
        'square', 'boiling', 'bag', 'crumb', 'popsicle', 'stick', 'zest',
        'cereal', 'bar', 'tart', 'nib', 'tennessee', 'turbinado', 'baking',
        'pack', 'spice', 'moist', 'miniarature', 'crunchy', 'morsel', 'nugget',
        'candy', 'crisp', 'super', 'fine', 'decoration', 'sucralose', 'puree',
        'pureed', 'rainbow', 'cut', 'frozen', 'broken', 'round', 'concentrate',
        'miniature', 'cooky', 'virgin', 'dusting', 'half', 'baby', 'food',
        'jar', 'seedless', 'container', 'box', 'granule', 'filling', 'cold',
        'super', 'ripe', 'moisture', 'packet', 'instant', 'mint', 'ripe',
        'sea', 'coarse', 'fun', 'size', 'funsize', 'bulk', 'chopped', 'torn',
        'inch', 'shell', 'quality', 'strap', 'bittersweet', 'gallon', 'pure',
        'cane', 'liquid', 'drop', 'hard', 'yellow', 'black', 'strap', 'kiss',
        'protein', 'supplement', 'dessert', 'topping'
    ]

    # Build the exclusion set once per call: the stopword list is loaded a
    # single time and membership tests are O(1) instead of a list scan per word.
    excluded = set(stopwords.words('english'))
    excluded.update(unwanted_text)

    # Remove anything in parenthesis
    mess = re.sub(r"\([^\)]+\)", '', mess)
    # Make everything lowercase
    mess = mess.lower()
    # Remove non-word punctuation
    mess = ' '.join(re.findall(
        r"[-,''\w]+", mess))  # This leaves some commas as a character #
    mess = re.sub(r"\,", ' ', mess)
    # Remove hyphenated words
    mess = re.sub(r"(?=\S*['-])([a-zA-Z'-]+)", '', mess)
    # Remove numbers
    mess = ''.join([i for i in mess if not i.isdigit()])
    # Remove plurals
    mess = lemmatize(mess)
    # Clean excess whitespace
    mess = re.sub(r"\s+", ' ', mess).strip()
    # Remove stopwords and unwanted text
    return [word for word in mess.split() if word.lower() not in excluded]


def convert_fractions(quantity):
    """
    Turn a quantity string such as "2 1/2" into its decimal value.

    The string is split on whitespace and the parts (integers or fractions)
    are added together exactly before the final conversion to float.
    """
    from fractions import Fraction
    total = Fraction(0)
    for part in quantity.split():
        total += Fraction(part)
    return float(total)


def pos_tagger(tokens, pos_tag):
    """
    Keep only the tokens whose part-of-speech tag is listed in ``pos_tag``.

    The tokens are tagged with ``nltk.pos_tag`` and returned in their original
    order.
    """
    import nltk
    return [word for word, tag in nltk.pos_tag(tokens) if tag in pos_tag]


def word_quantity(ing_column, norm_quant_column, orig_dataframe):
    """
    Repeat each ingredient word by its quantity and join the words per recipe.

    :param str ing_column: name of the column holding the ingredient word
    :param str norm_quant_column: name of the column holding the integer
        number of repetitions for that word
    :param orig_dataframe: one row per ingredient; must also contain the
        columns 'recipe_key', 'totalCal', 'calPerServing', 'name' and 'index'.
        Assumes a default 0..n-1 row index so the rows line up with the
        intermediate frame built here.
    :returns: DataFrame indexed by recipe_key with a single 'ingredient'
        column holding the space-joined repeated words of that recipe
    """
    # Pad each word with a trailing space so the repeated copies stay separate
    # words. The padded text is kept in a local Series (not written back under
    # a fixed column name) so the function works for any ``ing_column``.
    padded_words = orig_dataframe[ing_column].astype(str) + ' '
    repeated = [
        word * count
        for word, count in zip(padded_words, orig_dataframe[norm_quant_column])
    ]
    final_df = pd.DataFrame(repeated, columns=['ingredient'])
    final_df[[
        'recipe_key', 'totalCal', 'calPerServing', 'name', 'ingredient_key'
    ]] = orig_dataframe[[
        'recipe_key', 'totalCal', 'calPerServing', 'name', 'index'
    ]]

    # Create multiIndex / hierarchical Dataframe
    index = pd.MultiIndex.from_arrays(
        [final_df['recipe_key'], final_df['ingredient_key']],
        names=['recipe_key', 'ingredient_key'])
    final_df.set_index(index, inplace=True)
    # Rename the column so that 'recipe_key' below refers only to the index level
    final_df.rename(columns={'recipe_key': 'key'}, inplace=True)

    # Join the repeated words into one string per recipe
    X_ing = final_df.groupby('recipe_key')['ingredient'].apply(' '.join)
    return pd.DataFrame(X_ing)

## Clean ingredient text string

In [None]:
# Load data
df = pd.read_csv('cookie_recipes.csv') # all cookie recipes

# Create unique id: the first run of digits found in the recipe URL
df['recipe_key'] = df['url'].apply(lambda x:int(re.findall(r"\d+", x)[0]))

# Calculate total calories per recipe
df['totalCal'] = df['calPerServing']*df['servings']

# Filter for recipes with 12-64 servings and < 10,000 total calories
df = df[(df['servings']<=64) &
        (df['servings']>=12) &
        (df['totalCal']<10000)]
df.reset_index(inplace=True, drop=True)

# Clean ingredient text
# Every vulgar fraction maps to " n/d" WITH a leading space, so a mixed number such as
# "1⅓" becomes "1 1/3" (= 1.33) instead of "11/3" (= 3.67). Any doubled space this
# creates is collapsed again by string_replace below.
dict_unicode = {'\u2009': '', '½':' 1/2', '⅓':' 1/3', '⅔':' 2/3', '¼':' 1/4', '¾':' 3/4', '⅕':' 1/5',
                '⅖':' 2/5', '⅗':' 3/5', '⅘':' 4/5', '⅙':' 1/6', '⅚':' 5/6', '⅐':' 1/7', '⅛':' 1/8',
                '⅜':' 3/8', '⅝':' 5/8', '⅞':' 7/8', '⅑':' 1/9', '⅒':' 1/10'}
df['ingredients'] = [item + ';' for item in df['ingredients']] # add semicolon at end of each string for easier regex filtering
df['ingredients'] = [multireplace(x, dict_unicode) for x in df['ingredients']] # replace unicode characters
df['ingredients'] = [string_replace(x) for x in df['ingredients']] # collapse spaces, newlines -> semicolons
ing = [get_ingredients(x) for x in df['ingredients']] # one list of (quantity, text) tuples per recipe

df_ing = [get_quantity(x) for x in ing] # one pandas dataframe (quantity, ingredient) per recipe

clean_df = match_uids(df, df_ing) # copy recipe key, calories (outcome variable), servings and name onto each per-recipe dataframe
clean_df = pd.concat(clean_df) # concat list of pandas dataframes into one dataframe
clean_df['quantity'] = [convert_fractions(x) for x in clean_df['quantity']] # convert fraction strings into floats
clean_df = clean_df.reset_index() # keeps the per-recipe row number as an 'index' column (used later as ingredient_key)

## Normalize quantity of ingredients to grams

In [None]:
# Convert measurements to normalized unit  (1 Unit= 1 grams)
# The keywords are tested top to bottom and the first one contained in the ingredient
# text sets the grams-per-unit factor; text matching no keyword falls back to 3.
clean_df['unit'] = np.where(clean_df.ingredient.str.contains("dash"), .3,
            np.where(clean_df.ingredient.str.contains("pinch"), .6,
            np.where(clean_df.ingredient.str.contains("teaspoon"), 5,
            np.where(clean_df.ingredient.str.contains("tablespoon"), 15, # 1 tablespoon = 3 teaspoons (was 3, i.e. less than a teaspoon)
            np.where(clean_df.ingredient.str.contains("fluid"), 30,
            np.where(clean_df.ingredient.str.contains("cup"), 240,
            np.where(clean_df.ingredient.str.contains("pint"), 473,
            np.where(clean_df.ingredient.str.contains("quart"), 980,
            np.where(clean_df.ingredient.str.contains("ounce"), 28,
            np.where(clean_df.ingredient.str.contains("oz"), 28,
            np.where(clean_df.ingredient.str.contains("pound"), 454,
            np.where(clean_df.ingredient.str.contains("rack"), 908,
            np.where(clean_df.ingredient.str.contains("small"), 50,
            np.where(clean_df.ingredient.str.contains("medium"), 60,
            np.where(clean_df.ingredient.str.contains("large"), 70,
            3)))))))))))))))

## Tokenize ingredient text

In [None]:
# Tokenization = convert text string into list of tokens, or words, we want (i.e., cleaned version of words).
# NOTE(review): `string` and `stopwords` are not referenced in this cell; text_process does its own imports.
import string
from nltk.corpus import stopwords
clean_df['ingredient']=[text_process(x) for x in clean_df['ingredient']]

# Total quantity of each ingredient needed for recipe (grams* quantity) and condense into a list.
# Rounded and cast to int because word_quantity repeats each word norm_quant times.
clean_df['norm_quant'] = round(clean_df['unit']*clean_df['quantity'])
clean_df['norm_quant'] = clean_df['norm_quant'].astype(int)

# One word per ingredient - keep only nouns, join multiple words as one string
# (the nouns are concatenated without a separator, so each ingredient becomes a single token)
clean_df['ingredient'] = [pos_tagger(tokens, ['NN']) for tokens in clean_df['ingredient']]
clean_df['ingredient'] = [''.join(tokens) for tokens in clean_df['ingredient']]

# Repeat word by normalized quantity
X_ing = word_quantity('ingredient','norm_quant',clean_df)
# Attach the original ingredient text, name and servings of each recipe (aligned on recipe_key)
X_ing[['orig_ing', 'name', 'servings']] = df.set_index('recipe_key')[['ingredients', 'name', 'servings']]

# NLP

## Train test split

In [None]:
# Create feature and outcome dataframe
y_cal = df.set_index('recipe_key')[['totalCal', 'calPerServing', 'name','servings']].sort_index().copy()
X_keys = df.reset_index(drop=True)['recipe_key']

# Train test split (80:20)
# Only the split of the keys is used: y_cal is sorted by recipe_key while X_keys keeps
# df's row order, so the y_train / y_test returned here are replaced below by a
# key-based lookup.
from sklearn.model_selection import train_test_split
key_train, key_test, y_train, y_test = train_test_split(
    X_keys, y_cal, test_size=0.2, random_state=101)

# Separate feature and outcome dataframes based on key, sorted by recipe_key so that
# the rows of X and y line up
X_train = X_ing.loc[key_train].sort_index()
X_test = X_ing.loc[key_test].sort_index()
y_train = y_cal.loc[key_train].sort_index()
y_test = y_cal.loc[key_test].sort_index()

# Remove extreme edge cases (reassigned rather than dropped in place, since
# X_test / y_test are selections taken from X_ing / y_cal)
edge_cases = [10392, 16571, 17337]
X_test = X_test.drop(edge_cases)
y_test = y_test.drop(edge_cases)

# Report the sizes of the sets actually used, i.e. after removing the edge cases
print("Training set contains {} recipes in total".format(len(X_train)))
print("Test set contains {} recipes in total".format(len(X_test)))

## Bag of Words vectorization

In [None]:
# Fit the bag-of-words vocabulary on the training recipes only.
# text_process is the analyzer, so every recipe string is cleaned and tokenized by it;
# min_df=10 drops words that appear in fewer than 10 training recipes.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases - confirm the version this notebook targets.
from sklearn.feature_extraction.text import CountVectorizer 
bow_transformer = CountVectorizer(analyzer=text_process, min_df = 10).fit(X_train['ingredient']) # Bag of Words
print(len(bow_transformer.vocabulary_)) # Print total number of vocab words
print(bow_transformer.get_feature_names()) # Print all words

In [None]:
# Transform data to bag of words
ingredient_bow_train = bow_transformer.transform(X_train['ingredient']) # Transform train dataset to Bag of Words
ingredient_bow_test = bow_transformer.transform(X_test['ingredient']) # Transform test dataset to Bag of Words
print('Shape of Sparse Matrix: ', ingredient_bow_train.shape) # matrix size (number of recipes, total number of words)
print('Amount of Non-Zero occurences: ', ingredient_bow_train.nnz) 
# Percentage of matrix cells that are non-zero (nnz / (rows * columns) * 100)
sparsity = (100.0 * ingredient_bow_train.nnz / (ingredient_bow_train.shape[0] * ingredient_bow_train.shape[1]))
print('sparsity: {}'.format(sparsity)) # matrix sparsity

## Load saved objects using pickle

In [None]:
# Load pickle files
# These replace the bow_transformer / ingredient_bow_train / ingredient_bow_test
# objects computed in the cells above with previously saved versions.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
import pickle

# Context managers make sure every file handle is closed after loading
with open('bow_transformer_2.sav', 'rb') as pickle_file:
    bow_transformer = pickle.load(pickle_file)
print(len(bow_transformer.vocabulary_)) # Print total number of vocab words
print(bow_transformer.get_feature_names()) # Print all words
with open('ingredient_bow_train_2.sav', 'rb') as pickle_file:
    ingredient_bow_train = pickle.load(pickle_file)
with open('ingredient_bow_test_2.sav', 'rb') as pickle_file:
    ingredient_bow_test = pickle.load(pickle_file)

# Modeling

In [None]:
def modelfit(alg, dtrain, predictors, bow_transformer, performCV=True, useTrainCV=False, printFeatureImportance=True, cv_folds=5, early_stopping_rounds=50):
    """
    Fit a model on the training data and print its fit, cross-validation and feature importances.

    :param alg: estimator with ``fit`` / ``predict``; it is fitted in place
    :param dtrain: training feature matrix
    :param predictors: training target values (despite the name, this is passed
        to ``alg.fit`` as the target)
    :param bow_transformer: fitted vectorizer; its feature names label the importances
    :param bool performCV: True runs scikit-learn ``cross_val_score`` (Random Forest,
        Gradient Boosting); False runs ``xgb.cv`` instead (XGBoost estimators)
    :param useTrainCV: currently unused; kept for backward compatibility
    :param bool printFeatureImportance: print and plot the five largest feature importances
    :param int cv_folds: number of cross-validation folds
    :param int early_stopping_rounds: early stopping rounds passed to ``xgb.cv``
    :returns: None; results are printed and plotted
    """
    from scipy import stats
    from sklearn import metrics

    #Fit the algorithm on the data
    alg.fit(dtrain, predictors)

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain)

    #Perform cross-validation: performCV = True for Random Forest and Gradient Boosting. False for XGBoost.
    if performCV:
        from sklearn.model_selection import cross_val_score
        cv_score = cross_val_score(alg, dtrain, predictors, cv=cv_folds, scoring='neg_root_mean_squared_error')
    else:
        # Imported here so the function does not depend on a notebook-level `xgb` name
        import xgboost as xgb
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=predictors)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            early_stopping_rounds=early_stopping_rounds)
        # NOTE(review): the model was already fitted above, so the report below still
        # reflects the original n_estimators; only later fits use the updated value.
        alg.set_params(n_estimators=cvresult.shape[0])

    #Print model report: regression of the true values on the training predictions
    print ("\nModel Report")
    slope, intercept, r_value, p_value, std_err = stats.linregress(dtrain_predictions,predictors)
    print('Slope: ', round(slope,2))
    print('Intercept: ', round(intercept))
    print('Coefficient of Determinant: ', round(r_value**2,2))
    print('p-value: ', p_value)
    print('Standard Error: ', round(std_err,2)) # standard error of the slope
    print('RMSE:', round(np.sqrt(metrics.mean_squared_error(predictors, dtrain_predictions)),2))

    if performCV:
        print ("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

    #Print Feature Importance:
    if printFeatureImportance:
        import matplotlib.pyplot as plt
        feat_imp = pd.DataFrame(alg.feature_importances_,bow_transformer.get_feature_names(), columns=['coeff'])
        print(feat_imp.sort_values(by='coeff', ascending=False).head())
        feat_imp['coeff'].sort_values(ascending=False).head().plot(kind='barh', title='Feature Importances').invert_yaxis()
        # Label the axis only when the plot (and the plt import) exists
        plt.xlabel('Feature Importance Score')

    
def model_test(alg, bow_train, y_train, bow_test, y_test, linearmodel = False, plot = True, features = True, resid = False, QQ = False):
    """
    Fit a model on the training data and evaluate it on the test data.

    The model is trained on total calories; its predictions are divided by the
    number of servings and compared with the true calories per serving.
    Feature names are read from the notebook-level ``bow_transformer``.

    :param alg: estimator with ``fit`` / ``predict``; it is fitted in place
    :param bow_train: training feature matrix
    :param y_train: DataFrame with a 'totalCal' column
    :param bow_test: test feature matrix
    :param y_test: DataFrame with 'totalCal', 'servings' and 'calPerServing' columns
    :param bool linearmodel: True reads ``alg.coef_``; otherwise ``alg.feature_importances_``
    :param bool plot: plot predicted vs. true calories per serving
    :param bool features: plot the five largest coefficients / importances
    :param bool resid: plot the distribution of the residuals
    :param bool QQ: draw a QQ plot of the predicted total calories
    :returns: DataFrame with the predicted 'totalCal' and 'calPerServing' of the test recipes
    """
    # Imported up front so every combination of the plotting flags works
    # (they used to be imported only inside the `plot` branch).
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from sklearn import metrics

    alg.fit(bow_train, y_train['totalCal'])
    predictions = pd.DataFrame(y_test['totalCal'])  # reuse y_test's index
    predictions['totalCal'] = alg.predict(bow_test)
    predictions['calPerServing'] = predictions['totalCal']/y_test['servings']

    # Model Performance
    slope, intercept, r_value, p_value, std_err = stats.linregress(predictions['calPerServing'],y_test['calPerServing'])
    print('Slope: ', round(slope,2))
    print('Intercept: ', round(intercept))
    print('Coefficient of Determinant: ', round(r_value**2,2))
    print('p-value: ', p_value)
    print('Standard Error: ', round(std_err,2)) # standard error of the slope
    print('RMSE:', round(np.sqrt(metrics.mean_squared_error(y_test['calPerServing'], predictions['calPerServing'])),2))

    # Feature Importance
    feature_names = bow_transformer.get_feature_names()
    if linearmodel:
        # Use the model that was passed in, not the notebook-level `linreg`
        feat_imp = pd.DataFrame(alg.coef_, feature_names, columns=['coeff'])
        print(feat_imp.sort_values(by='coeff', ascending=False).head())
    else:
        feat_imp = pd.DataFrame(alg.feature_importances_, feature_names, columns=['coeff'])
        print('Number of features removed: ', len(feat_imp[feat_imp['coeff']==0])) # number of features removed
        print(feat_imp.sort_values(by='coeff', ascending=False).head())

    # Model visualization
    if plot:
        fig = plt.figure(figsize=(6,6))

        sns.set_context('poster', font_scale=1)
        # Keyword x/y: accepted by every seaborn release, positional data is not
        ax1 = sns.regplot(x=y_test['calPerServing'], y=predictions['calPerServing'])
        ax1.set_xlabel('True Calories per Serving')
        ax1.set_ylabel('Predicted Calories per Serving')

    if features:
        fig = plt.figure()
        sns.set_context('poster', font_scale=1)
        feat_imp.sort_values(by='coeff', ascending=False).head().plot(kind='barh', title='Feature Importances',legend=False).invert_yaxis()

    if resid:
        sns.set_context('notebook', font_scale=1)
        fig = plt.figure()
        ax3 = sns.distplot(y_test['calPerServing']-predictions['calPerServing'])
        ax3.set_xlabel('Residual Calories per Serving', fontsize=20)
        ax3.set_ylabel('Distribution', fontsize=20)

    if QQ:
        import statsmodels.api as sm
        sns.set_context('notebook', font_scale=1)
        fig = plt.figure()
        sm.qqplot(predictions['totalCal'], line='s')

    # Restore the default plotting context for later cells
    sns.set_context('notebook', font_scale=1)
    return predictions

## Linear Regression

In [None]:
# Baseline: ordinary least squares on the bag-of-words counts, evaluated on the test set.
# linearmodel=True makes model_test report coefficients instead of feature importances.
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
model_test(linreg, ingredient_bow_train, y_train, ingredient_bow_test, y_test, linearmodel=True)

## Random Forest

### RF baseline model

In [None]:
# Random forest with default hyperparameters: training fit plus 5-fold CV report from modelfit
from sklearn.ensemble import RandomForestRegressor
mid_model = RandomForestRegressor()
modelfit(mid_model, ingredient_bow_train, y_train['totalCal'],bow_transformer)

### Hyperparameter tuning

#### Grid search

In [None]:
# Tune the number of trees: 5-fold grid search over n_estimators = 10, 15, ..., 240,
# scored by negative RMSE (values closer to 0 are better)
from sklearn.model_selection import GridSearchCV
param_test1 = {'n_estimators':range(10,241,5)}
gsearch1 = GridSearchCV(estimator = RandomForestRegressor(max_features='sqrt', random_state=10), 
param_grid = param_test1, scoring='neg_root_mean_squared_error', n_jobs=4,cv=5)
gsearch1.fit(ingredient_bow_train, y_train['totalCal'])
gsearch1.best_params_, gsearch1.best_score_ # Output

In [None]:
# Tune the tree depth: max_depth = 45..54 with n_estimators fixed at 195
param_test2 = {'max_depth':range(45,55,1)}
gsearch2 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=195, max_features='sqrt', random_state=10), 
param_grid = param_test2, scoring='neg_root_mean_squared_error', n_jobs=4,cv=5)
gsearch2.fit(ingredient_bow_train, y_train['totalCal'])
gsearch2.best_params_, gsearch2.best_score_ # Output

In [None]:
# Tune min_samples_split with n_estimators=195 and max_depth=51 fixed.
# The grid starts at 2: an integer min_samples_split must be at least 2, so a
# value of 1 makes every fit for that candidate fail.
param_test3 = {'min_samples_split':range(2,20,1)}
gsearch3 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=195, max_depth=51, max_features='sqrt', random_state=10),
param_grid = param_test3, scoring='neg_root_mean_squared_error', n_jobs=4,cv=5)
gsearch3.fit(ingredient_bow_train, y_train['totalCal'])
gsearch3.best_params_, gsearch3.best_score_ # Output

In [None]:
# Tune min_samples_leaf = 1..14 with the previously chosen parameters fixed
param_test4 = {'min_samples_leaf':range(1,15,1)}
gsearch4 = GridSearchCV(estimator = RandomForestRegressor(n_estimators=195, max_depth=51, min_samples_split=2, max_features='sqrt', random_state=10), 
param_grid = param_test4, scoring='neg_root_mean_squared_error', n_jobs=4,cv=5)
gsearch4.fit(ingredient_bow_train, y_train['totalCal'])
gsearch4.best_params_, gsearch4.best_score_ # Output

#### Randomized grid search
This takes ~ 10 minutes.

In [None]:
# Randomized search: 500 parameter combinations sampled from the grid below, 5-fold CV each.
# NOTE(review): no `scoring` is passed here, so the estimator's default score is used and
# best_score_ is not on the same scale as the neg-RMSE grid searches above - confirm intent.
grid_param = {'n_estimators': [50, 100, 200, 400, 800] , #number of trees
             'max_features': ['auto','sqrt','log2'], # max number of features to consider at every split
             'max_depth': [10, 20, 30, 40, 50], # max levels in a tree
             'min_samples_split': [2,5,10,15,20], # min number of samples required to split a node
             'min_samples_leaf': [1,2,5,10,15] # min number of samples required at each leaf node
             }

from sklearn.model_selection import RandomizedSearchCV
RFR = RandomForestRegressor(random_state=10)
RFR_random = RandomizedSearchCV(estimator=RFR, param_distributions = grid_param, n_iter=500, cv=5, 
                                verbose=2, random_state=42, n_jobs=-1)
RFR_random.fit(ingredient_bow_train, y_train['totalCal'])
RFR_random.best_params_, RFR_random.best_score_ # Output

### Final model prediction

In [None]:
# Final random forest, using the hyperparameter values fixed during the grid searches above,
# evaluated on the held-out test set
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor(n_estimators=195, min_samples_split=2, min_samples_leaf=1, 
                                  max_features='sqrt', max_depth=51, random_state=10)

model_test(RF_model, ingredient_bow_train, y_train, ingredient_bow_test, y_test)

##  Gradient Boosting Regressor

### GBR baseline model

In [None]:
# Gradient boosting baseline with default hyperparameters and least-squares loss
# NOTE(review): newer scikit-learn releases spell this loss "squared_error" - confirm target version.
from sklearn.ensemble import GradientBoostingRegressor
mid_model = GradientBoostingRegressor(loss="ls")
modelfit(mid_model, ingredient_bow_train, y_train['totalCal'],bow_transformer)

### Hyperparameter tuning

#### Fix learning rate & Number of estimators for tuning tree-based parameters
`min_samples_split` should be 0.5-1% of the total dataset
`min_samples_leaf`should be ~1/10th of `min_samples_split`
`learning rate` standard is 0.1. Can go up to 0.3.
`n_estimators` should be < 100.

If the optimal number of estimators is around 20, lower the learning rate to 0.05 and rerun the grid search.
If the optimal number of estimators is too high (~100), increase the learning rate; otherwise tuning the other parameters will take a long time.

In [None]:
# Find the number of trees for a fixed learning rate of 0.4: n_estimators = 20, 30, ..., 240
from sklearn.model_selection import GridSearchCV
param_test1 = {'n_estimators':range(20,241,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.4, min_samples_split=20, min_samples_leaf=2, max_depth=8, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test1, scoring='neg_root_mean_squared_error', n_jobs=4,cv=5)
gsearch1.fit(ingredient_bow_train, y_train['totalCal'])
gsearch1.best_params_, gsearch1.best_score_ # Output

In [None]:
# Full per-candidate cross-validation results of the search above
gsearch1.cv_results_

#### Tune tree parameters

In [None]:
# Joint search over max_depth (5, 7, ..., 15) and min_samples_split (200, 400, ..., 2000)
# with learning_rate=0.4 and n_estimators=60 fixed
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,2001,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.4, n_estimators=60, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test2, scoring='neg_root_mean_squared_error', n_jobs=4, cv=5)
gsearch2.fit(ingredient_bow_train, y_train['totalCal'])
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Finer search: min_samples_split (900, 910, ..., 1190) x min_samples_leaf (2, 6, ..., 78), max_depth=7
param_test3 = {'min_samples_split':range(900,1200,10), 'min_samples_leaf':range(2,82,4)}
gsearch3 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.4, n_estimators=60, max_depth=7, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test3, scoring='neg_root_mean_squared_error', n_jobs=4, cv=5)
gsearch3.fit(ingredient_bow_train, y_train['totalCal'])
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Tune the number of features considered per split: max_features = 7, 9, ..., 29
param_test4 = {'max_features':range(7,30,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.4, n_estimators=60, max_depth=7, min_samples_split=930, min_samples_leaf=6, subsample=0.8, random_state=10),
param_grid = param_test4, scoring='neg_root_mean_squared_error', n_jobs=4, cv=5)
gsearch4.fit(ingredient_bow_train, y_train['totalCal'])
gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Refit the best estimator found by gsearch4 and print its training / CV report
modelfit(gsearch4.best_estimator_, ingredient_bow_train, y_train['totalCal'],bow_transformer)

#### Tune subsample

In [None]:
# Tune the fraction of samples drawn for each tree, with max_features=27 fixed
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.4, n_estimators=60, max_depth=7, min_samples_split=930, min_samples_leaf=6, subsample=0.8, max_features=27, random_state=10),
param_grid = param_test5, scoring='neg_root_mean_squared_error', n_jobs=4, cv=5)
gsearch5.fit(ingredient_bow_train, y_train['totalCal'])
gsearch5.best_params_, gsearch5.best_score_

#### Tune learning rate and number of trees

In [None]:
# 1/2 learning rate with 2X trees (n_estimators)
# i.e. learning_rate 0.4 -> 0.2 and n_estimators 60 -> 120, relative to the grid-search setting
gbm_tuned_1 = GradientBoostingRegressor(learning_rate=0.2, n_estimators=120, max_depth=7, min_samples_split=930, min_samples_leaf=6, subsample=0.85, max_features=27, random_state=10)
modelfit(gbm_tuned_1, ingredient_bow_train, y_train['totalCal'],bow_transformer)

In [None]:
# 1/10 learning rate with 10X trees (n_estimators)
# i.e. learning_rate 0.4 -> 0.04 and n_estimators 60 -> 600, relative to the grid-search setting
gbm_tuned_2 = GradientBoostingRegressor(learning_rate=0.04, n_estimators=600, max_depth=7, min_samples_split=930, min_samples_leaf=6, subsample=0.85, max_features=27, random_state=10)
modelfit(gbm_tuned_2, ingredient_bow_train, y_train['totalCal'],bow_transformer)

In [None]:
# 1/15 learning rate with 15X trees (n_estimators)
# NOTE(review): min_samples_split (950), subsample (0.8) and max_features (21) differ from
# the values used in gbm_tuned_1 / gbm_tuned_2 - confirm this is intentional.
gbm_tuned_3 = GradientBoostingRegressor(learning_rate=0.03, n_estimators=900, max_depth=7, min_samples_split=950, min_samples_leaf=6, subsample=0.8, max_features=21, random_state=10)
modelfit(gbm_tuned_3, ingredient_bow_train, y_train['totalCal'],bow_transformer)

### Final model prediction

In [None]:
# Final gradient boosting model (same hyperparameters as gbm_tuned_2), evaluated on the test set
from sklearn.ensemble import GradientBoostingRegressor
GBR_model = GradientBoostingRegressor(loss="ls", learning_rate=0.04, n_estimators=600, max_depth=7, min_samples_split=930, min_samples_leaf=6, subsample=0.85, max_features=27, random_state=10)
model_test(GBR_model, ingredient_bow_train, y_train, ingredient_bow_test, y_test)

## XGBoost

### XGBoost baseline model

In [None]:
# XGBoost baseline with default hyperparameters.
# performCV=False makes modelfit use xgb.cv instead of scikit-learn's cross_val_score;
# modelfit reads the notebook-level name `xgb` imported here.
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

XGB_model = XGBRegressor()
modelfit(XGB_model, ingredient_bow_train, y_train['totalCal'],bow_transformer, performCV=False)

### Hyperparameter tuning

#### Tune max depth & min child weight
These parameters have the highest impact on the model outcome.

In [None]:
# Joint 5-fold grid search over max_depth (5..7) and min_child_weight (5..10), scored by negative RMSE.
# The max_depth / min_child_weight given to the estimator are overridden by the grid values.
from sklearn.model_selection import GridSearchCV

param_test1 = {
 'max_depth':range(5,8,1),
 'min_child_weight':range(5,11,1)
}

gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:squarederror', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='neg_root_mean_squared_error',n_jobs=4, cv=5)

gsearch1.fit(ingredient_bow_train, y_train['totalCal'])
gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Full per-candidate cross-validation results of the search above
gsearch1.cv_results_

#### Tune gamma

In [None]:
# Tune gamma over 0.0, 0.1, ..., 0.4 with max_depth=6 and min_child_weight=9 fixed
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

gsearch2 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=140, max_depth=6,
 min_child_weight=9, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
 param_grid = param_test2, scoring='neg_root_mean_squared_error',n_jobs=4, cv=5)

gsearch2.fit(ingredient_bow_train, y_train['totalCal'])
# Report this search's own best score (the cell previously showed gsearch1's score)
gsearch2.best_params_, gsearch2.best_score_

#### Tune subsample and colsample_bytree

In [None]:
# Tune subsample (0.60, 0.65, ..., 0.85) and colsample_bytree (0.70, 0.75, 0.80, 0.85)
param_test3 = {
 'subsample':[i/100.0 for i in range(60,90,5)],
 'colsample_bytree':[i/100.0 for i in range(70,90,5)]
}

gsearch3 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=140, max_depth=6,
 min_child_weight=9, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
 param_grid = param_test3, scoring='neg_root_mean_squared_error',n_jobs=4, cv=5)

gsearch3.fit(ingredient_bow_train, y_train['totalCal'])
# Report this search's own best score (the cell previously showed gsearch1's score)
gsearch3.best_params_, gsearch3.best_score_

#### Tune regularization parameters
This will reduce overfitting

In [None]:
# Tune the L1 regularization strength (reg_alpha) with subsample=0.65 and colsample_bytree=0.7 fixed
param_test4 = {
 'reg_alpha':[1e-8, 1e-7, 1e-6, 1e-5, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 100]
}

gsearch4 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=140, max_depth=6,
 min_child_weight=9, gamma=0, subsample=0.65, colsample_bytree=0.7,
 objective= 'reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
 param_grid = param_test4, scoring='neg_root_mean_squared_error',n_jobs=4, cv=5)

gsearch4.fit(ingredient_bow_train, y_train['totalCal'])
# Report this search's own best score (the cell previously showed gsearch1's score)
gsearch4.best_params_, gsearch4.best_score_

Best parameters: 
* max_depth = 6
* min_child_weight = 9
* gamma = 0
* colsample_bytree=0.8
* subsample = 0.8

In [None]:
# Half the learning rate (0.1 -> 0.05) with twice the trees (140 -> 280), plus reg_alpha=1e-6;
# performCV=False makes modelfit run xgb.cv for this XGBoost estimator
model3 = XGBRegressor( learning_rate =0.05, n_estimators=280, max_depth=6,
 min_child_weight=9, gamma=0, subsample=0.65, colsample_bytree=0.7,
 objective= 'reg:squarederror', nthread=4, scale_pos_weight=1, reg_alpha=1e-6, seed=27)

modelfit(model3, ingredient_bow_train, y_train['totalCal'],bow_transformer, performCV=False)

### Final model prediction

In [None]:
# Final XGBoost model (same hyperparameters as model3), evaluated on the test set
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

xgb_model = XGBRegressor(learning_rate =0.05, n_estimators=280, max_depth=6,
     min_child_weight=9, gamma=0, subsample=0.65, colsample_bytree=0.7,
     objective= 'reg:squarederror', nthread=4, scale_pos_weight=1, reg_alpha=1e-6, seed=27)

model_test(xgb_model, ingredient_bow_train, y_train, ingredient_bow_test, y_test)

# Data visualization

In [None]:
# Plotting setup for the visualization cells below
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams["patch.force_edgecolor"] = True # Plot edges on bar plots

In [None]:
# Histogram of calories per serving, one observation per recipe in df (20 bins, y axis capped at 500)
ax = df['calPerServing'].plot(kind='hist',ylim=(0,500),bins=20)
ax.set_xlabel('Calories per Serving')
ax.set_ylabel('Number of Recipes')

In [None]:
# Histogram of total calories (20 bins, no density curve).
# NOTE(review): clean_df appears to hold one row per ingredient, so each recipe would be
# counted once per ingredient here - confirm whether df was intended.
sns.distplot(clean_df['totalCal'],kde=False,bins=20)

In [None]:
# Scatter plot with fitted regression line of calories per serving against number of servings.
# NOTE(review): clean_df appears to hold one row per ingredient, so recipes with more
# ingredients would carry more weight in the fit - confirm whether df was intended.
sns.regplot(x='servings', y='calPerServing', data=clean_df)