In [105]:
import numpy as np
import pandas as pd
import re

In [106]:
def multireplace(string, replacements, ignore_case=False):
    """
    Given a string and a replacement map, it returns the replaced string.
    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :param bool ignore_case: whether the match should be case insensitive
    :rtype: str
    """
    # If case insensitive, we need to normalize the old string so that later a replacement
    # can be found. For instance with {"HEY": "lol"} we should match and find a replacement for "hey",
    # "HEY", "hEy", etc.
    if ignore_case:
        def normalize_old(s):
            return s.lower()

        re_mode = re.IGNORECASE

    else:
        def normalize_old(s):
            return s

        re_mode = 0

    replacements = {normalize_old(key): val for key, val in replacements.items()}
    
    # Place longer ones first to keep shorter substrings from matching where the longer ones should take place
    # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce
    # 'hey ABC' and not 'hey ABc'
    rep_sorted = sorted(replacements, key=len, reverse=True)
    rep_escaped = map(re.escape, rep_sorted)
    
    # Create a big OR regex that matches any of the substrings to replace
    pattern = re.compile("|".join(rep_escaped), re_mode)
    
    # For each match, look up the new string in the replacements, being the key the normalized old string
    return pattern.sub(lambda match: replacements[normalize_old(match.group(0))], string)

def string_replace(x):
    new_string = re.sub(' {2,}', ' ', x).replace("\n", ";").replace("; ;", ";")
#    new_string = new_string.split(';')
    return(new_string)

def get_ingredients (x):
    ing_regex = ('(\d+/*\d*\s*\d*/*\d*)\s(\w+\s*.*?);')
    all_ing = re.findall(ing_regex, x)
    return(all_ing)

# def amt_ingredients(x):
#     amt_regex = ('(\d+/*\d*\s*\d*/*\d*).*?;')
#     amt_ing = re.findall(amt_regex, x)
#     return(amt_ing)

def num_ingredients(x):
    return(len(x))

def get_quantity(x):
    quantity = [y[0] for y in x] # use for df
    units_with_ingredient = [y[1] for y in x]
    df_of_units = pd.DataFrame({'ingredient':units_with_ingredient, 'quantity':quantity})
    return (df_of_units)

### MAY NOT NEED THIS ANYMORE ###
def get_units(x):
    quantity = [y[0] for y in x] # use for df
    units_with_ingredient = [y[1] for y in x]
    list_of_units = [re.findall("(\w+)\s*(.*)", x)[0] for x in units_with_ingredient]
    df_of_units = pd.DataFrame(list_of_units, columns = ['unit', 'ingredient'])
    df_of_units['quantity'] = quantity
    return (df_of_units)
### MAY NOT NEED THIS ANYMORE ###

def match_uids(originaldf, longdf):
    for row in range(0, len(originaldf)):
        longdf[row]['recipe_key']=originaldf['recipe_key'][row]
        longdf[row]['calPerServing']=originaldf['calPerServing'][row]
        longdf[row]['num_ingredients']=originaldf['num_ingredients'][row]
        longdf[row]['servings']=originaldf['servings'][row]
        longdf[row]['name']=originaldf['name'][row]
    return(longdf)

def convert_fractions (quantity):
    from fractions import Fraction
    return float(sum(Fraction(s) for s in quantity.split()))

def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    import string
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
    
    def lemmatize(string):
        for word in re.findall(r"[a-z]+", string):
            string = string.replace(word, wnl.lemmatize(word, 'n') if 's' in word[-3:] else word)
        return string

    # Remove anything in parenthesis
    mess = re.sub(r"\([^\)]+\)", '', mess)
    
    # Make everything lowercase
    mess = mess.lower()
    # Remove non-word punctuation
    mess =' '.join(re.findall(r"[-,''\w]+", mess)) # This leaves some commas as a character #
    mess = re.sub(r"\,", ' ', mess)
    # Remove punctuation and numbers
    #mess = ''.join([char for char in mess if char not in string.punctuation])
    mess = ''.join([i for i in mess if not i.isdigit()])
    # Remove plurals
    mess = lemmatize(mess)
    #clean excess whitespace
    mess = re.sub(r"\s+", ' ', mess).strip()
    # Remove stopwords
    return([word for word in mess.split() if word.lower() not in stopwords.words('english')])

def unit_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    import string
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    wnl = WordNetLemmatizer()
    
    def lemmatize(string):
        for word in re.findall(r"[a-z]+", string):
            string = string.replace(word, wnl.lemmatize(word, 'n') if 's' in word[-3:] else word)
        return string

    # Remove anything in parenthesis
    mess = re.sub(r"\([^\)]+\)", '', mess)
    
    # Make everything lowercase
    mess = mess.lower()
    # Remove non-word punctuation
    mess =' '.join(re.findall(r"[-,''\w]+", mess)) # This leaves some commas as a character #
    mess = re.sub(r"\,", ' ', mess)
    # Remove punctuation and numbers
    #mess = ''.join([char for char in mess if char not in string.punctuation])
    mess = ''.join([i for i in mess if not i.isdigit()])
    # Remove plurals
    mess = lemmatize(mess)
    #clean excess whitespace
    mess = re.sub(r"\s+", ' ', mess).strip()
    # Remove stopwords
    mess = [word for word in mess.split() if word.lower() not in stopwords.words('english')]
    if len(mess)>=1:
        return(mess[0])
    else:
        return('unit')

def unit_to_integer(mess):
    # Keep integers
    mess = ''.join([i for i in mess if  i.isdigit()])

    if len(mess)>=1:
        return(mess)
    else:
        return('1')

def join_strings(mess):
    # Keep integers
    mess = ' '.join(mess)

    if len(mess)>=1:
        return(mess)
    else:
        return('')

In [183]:
# Load data
df = pd.read_csv('dataset_allrecipes-test-v3_2020-06-02_01-37-25-797.csv')

# Clean column names
df.rename(columns = {'calories':'calPerServing', 'cook':'cookTime', 'prep':'prepTime','ready in':'totalTime'}, inplace = True) 

# Create unique id
df['recipe_key'] = df['url'].apply(lambda x:int(re.findall(r"\d+", x)[0]))

# Remove null values and reset index
df.dropna(axis=0, how='any',inplace=True)
df.reset_index(inplace=True, drop=True)

# Clean ingredient text
dict_unicode = {'\u2009': '', '½':' 1/2', '⅓':'1/3', '⅔':'2/3', '¼':'1/4', '¾':'3/4', '⅕':'1/5', 
                '⅖':'2/5', '⅗':'3/5', '⅘':'4/5', '⅙':'1/6', '⅚':'5/6', '⅐':'1/7', '⅛':'1/8', 
                '⅜':'3/8', '⅝':'5/8', '⅞':'7/8', '⅑':'1/9', '⅒':'1/10'}
df['recipeIngredient'] = [item + ';' for item in df['recipeIngredient']] # add semicolon at end of each string for easier regex filtering
df['recipeIngredient'] = [multireplace(x, dict_unicode) for x in df['recipeIngredient']] # replace unicode characters
df['recipeIngredient'] = [string_replace(x) for x in df['recipeIngredient']] # remove whitespace
ing = [get_ingredients(x) for x in df['recipeIngredient']] # separate ingredients into list of list of tupules of ingredient strings
df['num_ingredients'] = [len(x) for x in ing] # count number of ingredients per recipe


In [184]:
# new version of df_ing with get_quantity(x)
#df_ing = [get_quantity(x) for x in ing] # separate units of measure and ingredients & creates a pandas dataframe for each recipe

# original version
df_ing = [get_units(x) for x in ing] # separate units of measure and ingredients & creates a pandas dataframe for each recipe

clean_df = match_uids(df, df_ing) # pull unique id, calorie (outcome variable), number of servings, and number of ingredients from original dataframe
clean_df = pd.concat(clean_df) # concat list of pandas dataframes into one dataframe
clean_df['quantity'] = [convert_fractions(x) for x in clean_df['quantity']] # convert fractions into integers

In [185]:
# Create multiIndex / hierarchical Dataframe
clean_df = clean_df.reset_index()
arrays = [clean_df['recipe_key'],clean_df['index']]
tuples = tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=['recipe_key', 'ingredient_key'])
clean_df.set_index(index,inplace=True)
#clean_df['ingredient'].replace('',clean_df['unit'],inplace=True)
clean_df.drop('index', axis=1, inplace=True)

In [166]:
clean_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient,quantity,recipe_key,calPerServing,num_ingredients,servings,name
recipe_key,ingredient_key,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
229099,0,cup crushed corn flakes,0.75,229099,277.7,5,8,Breaded Parmesan Ranch Chicken
229099,1,cup grated Parmesan cheese,0.75,229099,277.7,5,8,Breaded Parmesan Ranch Chicken
229099,2,ounce) envelope ranch salad dressing mix,1.0,229099,277.7,5,8,Breaded Parmesan Ranch Chicken
229099,3,"ounce) skinless, boneless chicken breast halves",4.0,229099,277.7,5,8,Breaded Parmesan Ranch Chicken
229099,4,"cup butter, melted",0.5,229099,277.7,5,8,Breaded Parmesan Ranch Chicken


In [168]:
# save as csv for web app
clean_df.to_csv('clean_df.csv')

In [177]:
### For use with new version of df_ing with get_quantity(x)

# def remove_units(words):
#     stopwords = ['dash','pinch','teaspoon','fluid','cup','pint','quart','ounce','oz','pound','rack',
#                 'small','medium','large','crushed','grated','skinless','boneless','melted','fresh',
#                  'diced','minced','thinly','dry','dried','halved','taste','frying','lean','drained','jars','grated'
#                 'clove','slice','eaches','whole','cube','thick','unit','freshly','finely','splash']
#     ingredient_words = words.split()
#     resultwords  = [word for word in ingredient_words if word.lower() not in stopwords]
#     result = ' '.join(resultwords)
#     return(result)

# clean_df['unit'] = np.where(clean_df.ingredient.str.contains("dash"), "1/3",
#             np.where(clean_df.ingredient.str.contains("pinch"), "2/3",
#             np.where(clean_df.ingredient.str.contains("teaspoon"), "5", 
#             np.where(clean_df.ingredient.str.contains("tablespoon"), "15",
#             np.where(clean_df.ingredient.str.contains("fluid"), "30",
#             np.where(clean_df.ingredient.str.contains("cup"), "240", 
#             np.where(clean_df.ingredient.str.contains("pint"), "473",
#             np.where(clean_df.ingredient.str.contains("quart"), "950",
#             np.where(clean_df.ingredient.str.contains("ounce"), "28",
#             np.where(clean_df.ingredient.str.contains("oz"), "28", 
#             np.where(clean_df.ingredient.str.contains("pound"), "454",
#             np.where(clean_df.ingredient.str.contains("rack"), "908",
#             "1"))))))))))))
# clean_df['ingredient'] = [remove_units(x) for x in clean_df['ingredient']]

In [186]:
# Tokenization = convert text string into list of tokens, or words, we want (i.e., cleaned version of words).
import string
from nltk.corpus import stopwords
clean_df['ingredient']=[text_process(x) for x in clean_df['ingredient']]
# clean_df['unit']=[unit_process(x) for x in clean_df['unit']]

thresh = 600; # calorie threshold; mean = 500, median = 463, 75%quartile = 600
df['label'] = [1 if a_ > thresh else 0 for a_ in df['calPerServing']]
clean_df['label'] = [1 if a_ > thresh else 0 for a_ in clean_df['calPerServing']]

In [189]:
### This code needs to be fixed if using updated df_ing with get_quantity(x)

# Normalize units and present as list of values per column
dict_unit = {'dash': '1/3', 'pinch':'2/3', 'teaspoon':'5', 'tablespoon':'15', 'fluid':'30', 
                 'cup':'240', 'pint':'950', 'quart':'950', 'ounce':'28', 'oz':'28', 
                 'pound':'454','rack':'908',
             #'small':'50', 'medium':'60', 'large':'70', 'unit':'50' # don't take this into account since some ingredients are in the unit column (ex. onion, egg)
            }

### For use with updated df_ing with get_quantity(x)
#unit_int = clean_df['ingredient'].copy()

unit_int = clean_df['unit'].copy()
unit_int = [multireplace(x, dict_unit) for x in unit_int] # Normalize unit measurements to grams
unit_int = [unit_to_integer(x) for x in unit_int] # convert string of values into integers
unit_int = [convert_fractions(x) for x in unit_int] # convert fractions into integers
unit_quantity = pd.DataFrame(unit_int * clean_df['quantity']) # Multiply grams with quantity of unit
unit_quantity = unit_quantity.groupby('recipe_key').agg(lambda col: col.tolist()) # condense list of values to each recipe key

# NLP
1) Bag of words

2) TF-IDF (term frequency-inverse document frequency)

3) Combine word weights (from TF-IDF) with quantitative metrics (quantity, serving size), then model based on that

4) Train test split

5) Machine learning model


In [200]:
# Original working version
df.rename(columns = {'recipe_key':'key'}, inplace = True) 
clean_df.rename(columns = {'recipe_key':'key'}, inplace = True) 
X_ing = clean_df[['ingredient', 'key']].groupby('key').agg(sum) # # condense list of ingredients to each recipe key
X_ing['clean_ing'] = [join_strings(x) for x in X_ing['ingredient']] # joins list of strings into one string per recipe

df['totalCal'] = df['calPerServing']*df['servings']
y_cal = df.set_index('key')[['totalCal','calPerServing','name','label']].copy()

y_label = df.reset_index(drop=True)['label']
X_keys = df.reset_index(drop=True)['key']

from sklearn.model_selection import train_test_split # Train Test Split
key_train, key_test, y_train, y_test = train_test_split(X_keys, y_label, test_size=0.2, random_state=101) # won't work with multilevel index

X_train = X_ing.loc[key_train]
X_test = X_ing.loc[key_test]
y_cal_train = y_cal.loc[key_train]
y_cal_test = y_cal.loc[key_test]

print("Training set contains {} reviews in total".format(len(key_train)))
print("Test set contains {} reviews in total".format(len(key_test)))

Training set contains 688 reviews in total
Test set contains 173 reviews in total


In [211]:
X_train

Unnamed: 0_level_0,ingredient,clean_ing
key,Unnamed: 1_level_1,Unnamed: 2_level_1
8495,"[boneless, chicken, breast, half, salt, ground...",boneless chicken breast half salt ground black...
216758,"[butter, onion, chopped, rib, chopped, chopped...",butter onion chopped rib chopped chopped froze...
17143,"[coarse, kosher, salt, prime, rib, roast, grou...",coarse kosher salt prime rib roast ground blac...
22729,"[noodle, olive, oil, chopped, fresh, mushroom,...",noodle olive oil chopped fresh mushroom choppe...
143458,"[finely, chopped, fresh, cilantro, garlic, min...",finely chopped fresh cilantro garlic minced ch...
...,...,...
148238,"[thick, cut, pork, chop, smoked, gouda, cheese...",thick cut pork chop smoked gouda cheese fresh ...
21528,"[pre-baked, pizza, crust, pesto, tomato, chopp...",pre-baked pizza crust pesto tomato chopped gre...
276690,"[boneless, beef, chuck, roast, cut, -inch, thi...",boneless beef chuck roast cut -inch thick stri...
229123,"[ground, beef, finely, chopped, shredded, ched...",ground beef finely chopped shredded cheddar ch...


In [210]:
y_train

114    0
209    0
331    0
274    0
563    0
      ..
599    0
575    0
838    1
337    0
523    0
Name: label, Length: 688, dtype: int64

In [212]:
y_cal_train

Unnamed: 0_level_0,totalCal,calPerServing,name,label
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8495,1675.2,418.8,Chicken Cordon Bleu I,0
216758,3464.4,577.4,Mom's Fabulous Chicken Pot Pie with Biscuit Crust,0
17143,2346.0,391.0,Kosher Salt Encrusted Prime Rib Roast,0
22729,4320.0,360.0,Spinach Lasagna III,0
143458,4436.8,554.6,Amazing Southwest Cilantro Lime Mango Grilled ...,0
...,...,...,...,...
148238,1804.0,451.0,Gouda and Spinach Stuffed Pork Chops,0
21528,2362.2,393.7,Pesto Pizza,0
276690,2432.0,608.0,Miso-Braised Beef with King Mushrooms,1
229123,4665.6,583.2,Best Beef Enchiladas,0


In [150]:
# For web app
# X_ing = clean_df[['ingredient', 'name']].groupby('name').agg(sum) # # condense list of ingredients to each recipe key
# X_ing['clean_ing'] = [join_strings(x) for x in X_ing['ingredient']] # joins list of strings into one string per recipe
# X_ing = X_ing.head().reset_index()
# X_ing['key'] = df.reset_index(drop=True)['key']
# X_ing = X_ing.set_index('key',drop=True)

# df['totalCal'] = df['calPerServing']*df['servings']
# y_cal = df.set_index('key')[['totalCal','calPerServing','name']].copy()

# y_label = df.reset_index(drop=True)['label']
# X_keys = df.reset_index(drop=True)['key']

# from sklearn.model_selection import train_test_split # Train Test Split
# key_train, key_test, y_train, y_test = train_test_split(X_keys, y_label, test_size=0.2, random_state=101) # won't work with multilevel index

# X_train = X_ing.loc[key_train]
# X_test = X_ing.loc[key_test]
# y_cal_train = y_cal.loc[key_train]
# y_cal_test = y_cal.loc[key_test]

# print("Training set contains {} reviews in total".format(len(key_train)))
# print("Test set contains {} reviews in total".format(len(key_test)))

In [213]:
# Save data as csv
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')
y_cal_train.to_csv('y_cal_train.csv')
y_cal_test.to_csv('y_cal_test.csv')

In [132]:
df.head()

Unnamed: 0,calPerServing,carbohydratesg,cholesterolmg,cookTime,directions,ingredients,name,nutrition,prepTime,proteing,...,recipeDirection,recipeIngredient,servings,sodiummg,totalFatg,url,key,num_ingredients,label,totalCal
0,277.7,277.7 calories; 26.8 g protein; ...,101.7,45 mins,1. Preheat oven to 350 degrees F (175 degrees ...,"¾ cup crushed corn flakes, ¾ cup grated Parmes...",Breaded Parmesan Ranch Chicken,277.7 calories; 26.8 g protein; ...,10 mins,26.8,...,Preheat oven to 350 degrees F (175 degrees C)....,3/4 cup crushed corn flakes; 3/4 cup grated Pa...,8,277.7 calories; 26.8 g protein; ...,277.7 calories; 26.8 g protein; ...,https://www.allrecipes.com/recipe/229099/bread...,229099,5,0,2221.6
1,1138.8,1138.8 calories; 80.4 g protein; ...,283.6,50 mins,1. Preheat oven to 425 degrees F (220 degrees ...,"4 pounds skin-on, bone-in chicken thighs, 1 ta...",Greek Lemon Chicken and Potatoes,1138.8 calories; 80.4 g protein; ...,10 mins,80.4,...,Preheat oven to 425 degrees F (220 degrees C)....,"4 pounds skin-on, bone-in chicken thighs; 1 ta...",4,1138.8 calories; 80.4 g protein; ...,1138.8 calories; 80.4 g protein; ...,https://www.allrecipes.com/recipe/242352/greek...,242352,12,1,4555.2
2,524.0,524 calories; 34.5 g total fat; ...,153.0,45 mins,"1. Combine bread crumbs, onion, egg, parsley, ...","¼ cup fresh bread crumbs, ¼ cup finely diced o...",Instant Pot® Salisbury Steak with Onion and Mu...,524 calories; 34.5 g total fat; ...,10 mins,34.5,...,"Combine bread crumbs, onion, egg, parsley, Wor...",1/4 cup fresh bread crumbs; 1/4 cup finely dic...,4,524 calories; 34.5 g total fat; ...,524 calories; 34.5 g total fat; ...,https://www.allrecipes.com/recipe/265280/insta...,265280,18,0,2096.0
3,828.8,828.8 calories; 36.4 g protein; ...,189.3,30 mins,1. Heat the frying oil in a deep-fryer or larg...,"4 cups vegetable oil for frying, 3 eggs, ½ c...",Deb's General Tso's Chicken,828.8 calories; 36.4 g protein; ...,25 mins,36.4,...,Heat the frying oil in a deep-fryer or large s...,4 cups vegetable oil for frying; 3 eggs; 1/2 c...,6,828.8 calories; 36.4 g protein; ...,828.8 calories; 36.4 g protein; ...,https://www.allrecipes.com/recipe/92761/debs-g...,92761,14,1,4972.8
4,734.8,734.8 calories; 28.9 g protein; ...,159.2,10 mins,1. Bring a large pot of lightly salted water t...,"1 (16 ounce) package egg noodles, 1 pound lean...",Simple Hamburger Stroganoff,734.8 calories; 28.9 g protein; ...,20 mins,28.9,...,Bring a large pot of lightly salted water to a...,1 (16 ounce) package egg noodles; 1 pound lean...,6,734.8 calories; 28.9 g protein; ...,734.8 calories; 28.9 g protein; ...,https://www.allrecipes.com/recipe/23260/simple...,23260,8,1,4408.8


In [135]:
X_ing = clean_df[['ingredient', 'name']].groupby('name').agg(sum) # # condense list of ingredients to each recipe key
X_ing['clean_ing'] = [join_strings(x) for x in X_ing['ingredient']] # joins list of strings into one string per recipe
X_ing = X_ing.head().reset_index()
X_ing['key'] = df.reset_index(drop=True)['key']
X_ing = X_ing.set_index('key',drop=True)
X_ing

Unnamed: 0_level_0,name,ingredient,clean_ing
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
229099,"""Instant"" Mac and Cheese","[whole, milk, kosher, salt, taste, cayenne, pe...",whole milk kosher salt taste cayenne pepper dr...
242352,"""Pantry Raid"" Chicken Enchilada Casserole","[tomato, sauce, water, taco, seasoning, mix, c...",tomato sauce water taco seasoning mix chili po...
265280,A Scotsman's Shepherd Pie,"[mashed, boiled, potato, sour, cream, cream, c...",mashed boiled potato sour cream cream cheese b...
92761,Absolute Best Liver and Onions,"[sliced, beef, liver, milk, needed, butter, di...",sliced beef liver milk needed butter divided v...
23260,Accidental Fish,"[fillet, mahi, mahi, olive, oil, salted, butte...",fillet mahi mahi olive oil salted butter garli...


In [137]:
y_cal_train.loc[229099]

totalCal                                 2221.6
calPerServing                             277.7
name             Breaded Parmesan Ranch Chicken
Name: 229099, dtype: object

In [136]:
y_cal_test.loc[12009]

totalCal                             2218.4
calPerServing                        1109.2
name             Creamy Cajun Chicken Pasta
Name: 12009, dtype: object

In [94]:
y_cal_test[y_cal_test['name']=='Creamy Cajun Chicken Pasta']

Unnamed: 0_level_0,totalCal,calPerServing,name
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12009,2218.4,1109.2,Creamy Cajun Chicken Pasta


# NLP - separate steps
#### Bag of words vectorization

In [194]:
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
bow_transformer = CountVectorizer(analyzer=text_process).fit(X_train['clean_ing'])
print(len(bow_transformer.vocabulary_)) # Print total number of vocab words

# Example bag of words on one recipe
# bow0 = bow_transformer.transform([X_train['clean_ing'].loc[8495]])
# print(bow0) # tells us what words appear and the frequency
# print(bow_transformer.get_feature_names()[145]) # This tells us what the word is for a given index

ingredient_bow = bow_transformer.transform(X_train['clean_ing'])

906


In [195]:
# Bag of word counts are saved in a sparse matrix which compresses the information to save computer memory
print('Shape of Sparse Matrix: ', ingredient_bow.shape) # matrix size (number of recipes, total number of words)
print('Amount of Non-Zero occurences: ', ingredient_bow.nnz) 
sparsity = (100.0 * ingredient_bow.nnz / (ingredient_bow.shape[0] * ingredient_bow.shape[1]))
print('sparsity: {}'.format(sparsity)) # total number of words / (number of recipes * total number of words)*100% compares non-zero words versus total number of words

Shape of Sparse Matrix:  (688, 906)
Amount of Non-Zero occurences:  15980
sparsity: 2.56365829868063


#### TF-IDF (term frequency-inverse document frequency)

In [196]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(ingredient_bow) # weight of words over whole document

# Example TF-IDF on one recipe
# tfidf0 = tfidf_transformer.transform(bow0)
# print(tfidf0)

# Transform the entire bag-of-words corpus into TF-IDF corpus at once:
ingredient_tfidf = tfidf_transformer.transform(ingredient_bow)
print(ingredient_tfidf.shape)

(688, 906)


In [17]:
# To see document frequency of a specific word
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['rack']])

6.436628982345549


#### Train & test model

In [45]:
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Model
recipe_model = MultinomialNB().fit(ingredient_tfidf, y_train)

In [46]:
predictions = recipe_model.predict(ingredient_tfidf)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_train))

              precision    recall  f1-score   support

           0       1.00      0.76      0.86       685
           1       0.02      1.00      0.04         3

    accuracy                           0.76       688
   macro avg       0.51      0.88      0.45       688
weighted avg       1.00      0.76      0.86       688



### Save ML model using pickle

In [206]:
import pickle
pickle.dump(ingredient_bow,open('ingredient_bow.sav','wb'))
pickle.dump(ingredient_tfidf,open('ingredient_tfidf.sav','wb'))

In [209]:
ingredient_bow = pickle.load(open('ingredient_bow.sav','rb'))
predictions = ingredient_bow.predict(X_test,y_test)
predictions

AttributeError: predict not found

#### Save BoW and TF-IDF vectorization as csv - doesn't work

In [47]:
ingredient_bow.to_csv('ingredient_bow.csv')
ingredient_tfidf.to_csv('ingredient_tfidf.csv')

AttributeError: to_csv not found

# Logistic Regression

In [192]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
#    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

[1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0]
              precision    recall  f1-score   support

           0       0.90      0.86      0.88       147
           1       0.36      0.46      0.41        26

    accuracy                           0.80       173
   macro avg       0.63      0.66      0.64       173
weighted avg       0.82      0.80      0.81       173



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LogisticRegression()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

# Linear Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.linear_model import LinearRegression
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
#    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', LinearRegression()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(X_train['clean_ing'],y_cal_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

In [None]:
predictions

# Naive Bayes

bag of words gives good precision but very bad recall

TF-IDF is assigning all recipes to one label, so recall=0.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Model
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
#    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Model
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

# Decision tree

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
#    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', DecisionTreeClassifier()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', DecisionTreeClassifier()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

# Random Forest 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
#    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', RandomForestClassifier(n_estimators=200)),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
    
pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', RandomForestClassifier(n_estimators=200)),  # train on TF-IDF vectors w/ Naive Bayes classifier
])
    
pipeline.fit(X_train['clean_ing'],y_train) # Fit Model using training data
predictions = pipeline.predict(X_test['clean_ing']) # Predict using test data
print(predictions)

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(predictions,y_test))

# Note to self:

ML priority
* Priority is to to TF-IDF on the list of words.
* Then, add the other features (quantity, label) before doing the machine learning.
* Then, do machine learning.

Additional features
* Make "total calories" as a feature (calPerServing * servings)
* Remove/Tag certain word classes

In [None]:
small_df = clean_df[['ingredient', 'key']]
X_ing = small_df.groupby('key').agg(sum)
y_label = df.reset_index(drop=True)['label']
X_keys = df.reset_index(drop=True)['recipe_key']

# IF need to sum integer values per key
clean_df.groupby('key').agg({'b': 'sum', 'c': lambda x: ' '.join(x)})

In [None]:
# flattens list of list into one list. but lose the recipe key
# [val for sublist in X_ing['ingredient'] for val in sublist]

# Data Exploration

In [None]:
clean_df.describe()

In [None]:
clean_df.index.get_level_values(0) # now this is a multilevel index / hierarchical dataframe

# This is how to select row of a multilevel index: df.loc[recipe_key, ingredient_key]
clean_df.loc[229099,0]
# Data Exploration
# Look at what units are used for a specific ingredients
clean_df[clean_df['ingredient']=='water']['unit'].unique()

# Unit normalization - work in progress...

Currently have the common units normalized to ml or grams (assuming 1 mL = 1 g).

If have time, assign an integer value to the specific units in the dictionary, instead of pulling out first word of the ingredient string.

In [None]:
# Histogram of unit count
from collections import Counter
letter_counts = Counter(clean_df['unit'])
count = pd.DataFrame.from_dict(letter_counts, orient='index')
count = count.sort_values(by=[0], ascending=False)

count[0:60].plot(kind='bar',figsize=(12,5))

In [None]:
clean_df[clean_df['unit']=='boneless']

# Clean text -  work in progress ...

1) For ingredients that got replaced with the "unit"value, need to replace the "unit" value as unit.
if ingredient[row]=nan, then ingredient[row]=unit[row], and unit[row]='unit'
* DON'T NEED TO DO THIS ANYMORE

2) Remove plural words using stemmer instead of wnl.lemmatize (ex. lammantize function doesnt change children to child)

3) Fix parse_text to improve removal of non-word punctuation


# Data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
sns.distplot(df['calPerServing'])

In [None]:
sns.distplot(df[df['servings']==4]['calPerServing'])

In [None]:
sns.distplot(df['num_ingredients'])
sns.distplot(df['servings'])

In [None]:
sns.distplot(df['servings'])

In [None]:
sns.regplot(x='num_ingredients', y='calPerServing', data=clean_df)

In [None]:
sns.regplot(x='servings', y='calPerServing', data=clean_df)

In [None]:
sns.regplot(x='servings', y='num_ingredients', data=clean_df)