Baking Multi-class Classification

In this notebook, we
1. Categorize a text-based dataset of 200,000+ recipes into 3 classes: cookies, brownies and cakes
2. Utilize regular expressions for data pre-processing/data extraction
3. Perform feature engineering and scaling to improve model accuracy
4. Implement several scikit-learn machine learning models including Multinomial Logistic Regression, K Nearest Neighbors, Decision Tree and an ensemble Random Forest classifier
5. Apply hyperparameter tuning on our best model to bolster results
6. Achieve an accuracy score of 0.786 on test set using our best model


In [1]:
#import libraries 
import re
import pandas as pd
import numpy as np
from fractions import Fraction
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
#read in data - source: https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m/data
df = pd.read_csv("recipes_data.csv")

#set up target vlsariable
df['class'] = np.where(df['title'].str.find('Cookies') != -1, "Cookies", 
                       np.where(df['title'].str.find('Brownies') != -1, "Brownies", 
                                np.where((df['title'].str.find('Cake') != -1), "Cake", ""))) 
df = df[~(df['title'].str.find('Crab') != -1)] #remove crab cakes
df = df[df['class']!=""].reset_index(drop=True)

Data Pre-Processing

In [3]:
#convert acronyms to full measurement names 
df['ingredients'] = df['ingredients'].str.lower()
df['ingredients'] = df['ingredients'].str.replace('c.', 'cup')
df['ingredients'] = df['ingredients'].str.replace(' c ', ' cup ')
df['ingredients'] = df['ingredients'].str.replace('cp.', 'cup')
df['ingredients'] = df['ingredients'].str.replace('/c', 'cup')
df['ingredients'] = df['ingredients'].str.replace('g/', ' gram ')
df['ingredients'] = df['ingredients'].str.replace('grams', 'gram ')
df['ingredients'] = df['ingredients'].str.replace('gr ', 'gram ')
df['ingredients'] = df['ingredients'].str.replace(' g ', ' gram ')
df['ingredients'] = df['ingredients'].str.replace('gr.', 'gram ')
df['ingredients'] = df['ingredients'].str.replace('tsp.', 'teaspoon')
df['ingredients'] = df['ingredients'].str.replace('tsp', 'teaspoon')
df['ingredients'] = df['ingredients'].str.replace('teaspoons', 'teaspoon')
df['ingredients'] = df['ingredients'].str.replace('tbsp.', 'tablespoon')
df['ingredients'] = df['ingredients'].str.replace('tbsp', 'tablespoon')
df['ingredients'] = df['ingredients'].str.replace('tbs', 'tablespoon')
df['ingredients'] = df['ingredients'].str.replace('tablespoons', 'tablespoon')
df['ingredients'] = df['ingredients'].str.replace('tbsl', 'tablespoon')
df['ingredients'] = df['ingredients'].str.replace('lb.', 'pound')
df['ingredients'] = df['ingredients'].str.replace('lb', 'pound')
df['ingredients'] = df['ingredients'].str.replace('blocks', 'stick')
df['ingredients'] = df['ingredients'].str.replace('lb', 'pound')
df['ingredients'] = df['ingredients'].str.replace('dozen', '12')
df['ingredients'] = df['ingredients'].str.replace('doz', '12')
df['ingredients'] = df['ingredients'].str.replace('oz.', 'ounce')
df['ingredients'] = df['ingredients'].str.replace('oz ', 'ounce')
df['ingredients'] = df['ingredients'].str.replace('ounces', 'ounce')
df['ingredients'] = df['ingredients'].str.replace('ounce,', 'ounce')
df['ingredients'] = df['ingredients'].str.replace('ounce.', 'ounce')
df['ingredients'] = df['ingredients'].str.replace('pkg.', 'package')

#convert similar words that could misrepresent counts
df['ingredients'] = df['ingredients'].str.replace('sugar cones', 'cones')
df['ingredients'] = df['ingredients'].str.replace('buttermilk', 'milk')
df['ingredients'] = df['ingredients'].str.replace('butter milk', 'milk')
df['ingredients'] = df['ingredients'].str.replace('butterfinger', 'candy')
df['ingredients'] = df['ingredients'].str.replace('butterscotch', 'candy')
df['ingredients'] = df['ingredients'].str.replace('nutter butter', '')
df['ingredients'] = df['ingredients'].str.replace('nutterbutter', '')
df['ingredients'] = df['ingredients'].str.replace('peanut butter', 'pb')


In [4]:
#data cleaning
df['ingredients'] = df['ingredients'].str.replace('9x13', '').str.replace('8x8', '').str.replace('350\\u00b0f. butter & flour 2 9', '').str.replace('\\u00b0', u'\N{DEGREE SIGN}').str.replace(
    '11o grams', '110 gram').str.replace('10x sugar', 'sugar').str.replace('4x sugar', 'sugar').str.replace('10x powdered sugar', 'powdered sugar').str.replace('34 cup', '1 cup') 

#implement regex fixes
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub("[\(].*?[\)]", "", x)) #get rid of extra instructions in parentheses
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)c', r'\1 cup', x)) #make sure cups are recognized as measurement
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)g', r'\1 gram', x)) #make sure grams are recognized as measurement
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)t', r'\1 teaspoon', x)) #make sure teaspoons are recognized as measurement
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)(?![\\\s\/,;:\'\"!?()`])', r'\1 ', x)) #add space after digits except special characters 
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('([a-z]+)(\d+)', r'\1 \2', x)) #add space before digits if preceeded by letter
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)([a-z]+)', r'\1 \2', x)) #add space after digits if followed by letter
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(,)(\d+)', r'\1 \2', x)) #add space after commas for those not preceeded by digits
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)(,)', r'\1 \2', x)) #add space before commas 
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('([a-z])(\/)(\d+)', r'\1 replace_this \3', x)) #remove slash instructions
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('( )(\/)(\d+)', r' replace_this \3', x)) #remove slash instructions
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)(\s+)(\/)(\d+)', r'\1\3\4', x)) #remove spaces in between fractions
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)(\/)(\s+)(\d+)', r'\1\2\4', x)) #remove spaces in between fractions
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d)(\/)(\d)(\/)(\d)', r'\1 \3\4\5', x)) #remove weird extra slashes between fractions

#more replacements
df['ingredients'] = df['ingredients'].str.replace('-', ' ').str.replace('&', ' ').str.replace('`', '').str.replace('amp;', '').str.replace('?', ' ').str.replace('+', ' ').str.replace('*', '').str.replace("#", '')
df['ingredients'] = df['ingredients'].str.replace('w/', 'with ').str.replace('or more', '').str.replace('plus1', 'plus 1').str.replace('..', ', ').str.replace(
     '\\t\\t\\t\\t', ' ').str.replace('\\t\\t', ' ').str.replace('\\t', ' ').str.replace('  \\\\', '/').str.replace('\\\\', '/').str.replace('mix.1', '').str.replace(
        'gm', 'grams').str.replace('powder\'1', 'powder, 1').str.replace('1//4', '1/4').str.replace('3/4/', '3/4 cup').str.replace(
            '1  23  ', '1 2/3 ').str.replace('1  12  ', '1 1/2 ').str.replace('1  14  ', '1 1/4 ').str.replace('2  12  ', '2 1/2 ').str.replace('2  14  ', '2 1/4 ').str.replace('.1/3', '1/3').str.replace(
            '1/ cup', '1 cup').str.replace('1/ cup', '1 cup').str.replace('3.sugar', 'sugar').str.replace('7.sugar', 'sugar').str.replace(
                '4, 3  ounce', '4.3 ounce').str.replace('4, 5  ounce', '4.5 ounce').str.replace('4,5  ounce', '4.5 ounce').str.replace('1, and 1/4', '1 and 1/4')
df['ingredients'] = df['ingredients'].apply(lambda x: re.sub('(\d+)(\/)(\s+)(\d+)', r'\1\2\4', x)) #remove spaces in between fractions

#convert to list
df['ingredients'] = df['ingredients'].apply(lambda x: eval(x))


Feature Engineering

In [5]:
#function for converting to same measurement
def convert_to_cups(measurement, type):
    
    '''Data pre-processing to extract relevant text for measurements 
    followed by feature engineering/standardization of measurements to cups'''
    
    #start with cleaning
    measurement = ''.join([measurement[measurement.find(' replace_this ')+14:] if measurement.find(' replace_this ')!=-1 else measurement])
    #remove brackets now that everything is a string
    measurement = re.sub("[\[].*?[\]]", "", measurement)
    #keep just the relevant measurement 
    measurement = ''.join([measurement[:measurement.find('cup')] if measurement.find('cup')!=-1 else 
                                                       measurement[:measurement.find('teaspoon')+8] if measurement.find('teaspoon')!=-1 else 
                                                       measurement[:measurement.find('box')+3] if measurement.find('box')!=-1 else 
                                                       measurement[:measurement.find('tablespoon')+10] if measurement.find('tablespoon')!=-1 else 
                                                       measurement[:measurement.find('package')+7] if measurement.find('package')!=-1 else 
                                                       measurement[:measurement.find('carton')+6] if measurement.find('carton')!=-1 else 
                                                       measurement[:measurement.find('pinch')+5] if measurement.find('pinch')!=-1 else 
                                                       measurement[:measurement.find('dash')+4] if measurement.find('dash')!=-1 else 
                                                       measurement[:measurement.find('spoon')+5] if measurement.find('spoon')!=-1 else 
                                                       measurement[:measurement.find('ounce')+5] if measurement.find('ounce')!=-1 else 
                                                       measurement[:measurement.find('gram')+4] if measurement.find('gram')!=-1 else 
                                                       measurement[:measurement.find('egg')+3] if measurement.find('egg')!=-1 else 
                                                       measurement[:measurement.find('stick')+5] if measurement.find('stick')!=-1 else 
                                                       measurement[:measurement.find('pound')+5] if measurement.find('pound')!=-1 else measurement])
    #for numbered/vague instructions
    measurement = ''.join([measurement[measurement.find(')')+1:] if measurement.find(')')!=-1 else measurement])
    measurement = ''.join([measurement[measurement.find('or')+2:] if measurement.find('or')!=-1 else measurement])
    measurement = ''.join([measurement[measurement.find('to')+2:] if measurement.find('to')!=-1 else measurement])

    #now conversions! add up fractions first
    if (measurement.find('gram')!=-1) & (any(c.isdigit() for c in measurement)):
        grams = sum([float(Fraction(c)) for c in measurement[:measurement.find('gram')].split(' ') if c.isdigit()])
        density = np.where(type == 'sugar', 0.8453506, np.where(type=='flour', 0.529, np.where(type == 'butter', 0.911, np.where(type =='salt', 1.28, 1))))
        measurement = grams/(236.5882 * density)
    elif ('tablespoon' in measurement) & (any(c.isdigit() for c in measurement)):
        measurement = sum([float(Fraction(c)) for c in measurement[:measurement.find('tablespoon')].split(' ') if c.isdigit()])
        measurement *= 0.0625
    elif ('teaspoon' in measurement) & (any(c.isdigit() for c in measurement)):
        measurement = sum([float(Fraction(c)) for c in measurement[:measurement.find('teaspoon')].split(' ') if c.isdigit()])
        measurement *= 0.0208333
    elif (('pinch' in measurement) | ('dash' in measurement)) & (any(c.isdigit() for c in measurement)):
        measurement = sum([float(Fraction(c)) for c in measurement.split(' ') if c.isdigit()])
        measurement *= 0.005
    elif (('packets' in measurement) | ('spoon' in measurement)) & (any(c.isdigit() for c in measurement)):
        #a packet or spoon of sugar is equivalent to 3-4 grams
        measurement = sum([float(Fraction(c)) for c in measurement.split(' ') if c.isdigit()])
        measurement *= 4
        #grams conversion
        measurement = measurement/(236.5882 *  0.8453506)
    elif ('stick' in measurement) & (any(c.isdigit() for c in measurement)):
        measurement = sum([float(Fraction(c)) for c in measurement[:measurement.find('stick')].split(' ') if c.isdigit()])
        measurement *= 0.5
    elif ('egg' in measurement) & (any(c.isdigit() for c in measurement)):
        measurement = sum([float(Fraction(c)) for c in measurement[:measurement.find('egg')].split(' ') if c.isdigit()])
        measurement *= 0.25
    elif ('ounce' in measurement) & (any(c.isdigit() for c in measurement)):
        measurement = sum([float(Fraction(c)) for c in measurement[:measurement.find('ounce')].split(' ') if c.isdigit()])
        measurement *= 0.125
    elif (' ml' in measurement) & (any(c.isdigit() for c in measurement)):
        measurement = sum([float(Fraction(c)) for c in measurement[:measurement.find(' ml')].split(' ') if c.isdigit()])
        measurement *= 0.00422675
    elif ('pound' in measurement) | ('box' in measurement) | ('package' in measurement)| (('carton' in measurement) & (type=='sugar')):
        measurement = sum([float(Fraction(c)) for c in measurement.split(' ') if c.isdigit()])
        measurement = np.where(measurement==0, 1.0, measurement)
        measurement = float(measurement)
        measurement *= 2.0
    else:
        measurement = sum([float(Fraction(c)) for c in measurement.split(' ') if any(char.isdigit() for char in c) and (
            ":" not in c) and ("%" not in c) and (u'\N{DEGREE SIGN}' not in c)])
    return measurement


In [6]:
#isolate ingredients we need
df['sugar'] = df['ingredients'].apply(lambda x: next((s for s in x if 'sugar' in s), '0'))
df['flour'] = df['ingredients'].apply(lambda x: next((s for s in x if 'flour' in s), '0')) 
df['butter'] = df['ingredients'].apply(lambda x: next((s for s in x if 'butter' in s), '0')) 
df['egg'] = df['ingredients'].apply(lambda x: next((s for s in x if 'egg' in s), '0')) 
df['salt'] = df['ingredients'].apply(lambda x: next((s for s in x if 'salt' in s), '0')) 
df['baking_soda'] = df['ingredients'].apply(lambda x: next((s for s in x if 'baking soda' in s), '0')) 
df['vanilla_extract'] = df['ingredients'].apply(lambda x: next((s for s in x if 'vanilla extract' in s), '0')) 

#standardize all measurements to cups 
df['sugar'] = df['sugar'].apply(lambda x: convert_to_cups(x, 'sugar'))  
df['flour'] = df['flour'].apply(lambda x: convert_to_cups(x, 'flour'))
df['butter'] = df['butter'].apply(lambda x: convert_to_cups(x, 'butter'))
df['egg'] = df['egg'].apply(lambda x: convert_to_cups(x, 'egg'))
df['salt'] = df['salt'].apply(lambda x: convert_to_cups(x, 'salt'))
df['baking_soda'] = df['baking_soda'].apply(lambda x: convert_to_cups(x, 'baking soda'))
df['vanilla_extract'] = df['vanilla_extract'].apply(lambda x: convert_to_cups(x, 'vanilla extract'))

#print out df head
df.head()


Unnamed: 0,title,ingredients,directions,link,source,NER,site,class,sugar,flour,butter,egg,salt,baking_soda,vanilla_extract
0,No-Bake Nut Cookies,"[1 cup firmly packed brown sugar, 1/2 cup ev...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com,Cookies,1.0,0.0,0.125,0.0,0.0,0.0,0.0
1,Rhubarb Coffee Cake,"[1 1/2 cup sugar, 1/2 cup butter, 1 egg, 1...","[""Cream sugar and butter."", ""Add egg and beat ...",www.cookbooks.com/Recipe-Details.aspx?id=210288,Gathered,"[""buttermilk"", ""egg"", ""sugar"", ""vanilla"", ""sod...",www.cookbooks.com,Cake,1.5,2.0,0.5,0.25,0.0,0.0,0.0
2,Easy German Chocolate Cake,[1/2 package chocolate fudge cake mix without...,"[""Mix according to directions and add oil."", ""...",www.cookbooks.com/Recipe-Details.aspx?id=983179,Gathered,"[""chocolate fudge cake"", ""white cake"", ""wesson...",www.cookbooks.com,Cake,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Eggless Milkless Applesauce Cake,"[3/4 cup sugar, 1/2 cup shortening, 1 1/2 ...","[""Mix Crisco with applesauce, nuts and raisins...",www.cookbooks.com/Recipe-Details.aspx?id=343158,Gathered,"[""sugar"", ""shortening"", ""cinnamon"", ""soda"", ""a...",www.cookbooks.com,Cake,0.75,2.0,0.0,0.0,0.0,0.0,0.0
4,Quick Coffee Cake(6 Servings),"[3 tablespoon softened butter or margarine, 1...","[""Mix 3 tablespoons fat with granulated sugar....",www.cookbooks.com/Recipe-Details.aspx?id=532166,Gathered,"[""egg"", ""baking powder"", ""sugar"", ""vanilla"", ""...",www.cookbooks.com,Cake,0.5,1.0,0.1875,0.25,0.0,0.0,0.0


In [7]:
#get rid of any obvious outliers - 99th percentile seems a reasonable cut off for most ingredients
df['egg'] = np.where(df['egg']>4, 4, df['egg']) #4 cups is about 16 eggs, which does appear in some recipes
df['sugar'] = np.where(df['sugar']>df['sugar'].quantile(.99), df['sugar'].quantile(.99), df['sugar'])
df['flour'] = np.where(df['flour']>df['flour'].quantile(.99), df['flour'].quantile(.99), df['flour'])
df['butter'] = np.where(df['butter']>df['butter'].quantile(.99), df['butter'].quantile(.99), df['butter'])
df['salt'] = np.where(df['salt']>df['salt'].quantile(.99), df['salt'].quantile(.99), df['salt'])
df['baking_soda'] = np.where(df['baking_soda']>df['baking_soda'].quantile(.99), df['baking_soda'].quantile(.99), df['baking_soda'])
df['vanilla_extract'] = np.where(df['vanilla_extract']>df['vanilla_extract'].quantile(.99), df['vanilla_extract'].quantile(.99), df['vanilla_extract'])

#get the total quantity of the recipe and scale for ratios   
df['total'] = df['sugar'] + df['flour'] + df['butter'] + df['egg'] + df['salt'] + df['baking_soda'] + df['vanilla_extract']
#drop recipes that don't have any of the ingredients we're using as features
df = df[df['total'] != 0].reset_index(drop=True)
df['egg'] = df['egg'] / df['total']
df['salt'] = df['salt'] / df['total']
df['sugar'] = df['sugar'] / df['total']
df['flour'] = df['flour'] / df['total']
df['butter'] = df['butter'] / df['total']
df['baking_soda'] = df['baking_soda'] / df['total']
df['vanilla_extract'] = df['vanilla_extract'] / df['total']



Training and validatation

In [8]:
#split data into train, val, test sets with 60-20-20 split
X_train, X_test, y_train, y_test = train_test_split(df[['sugar', 'flour', 'egg', 'butter', 'salt', 'baking_soda', 'vanilla_extract']], df[['class']], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

#multinomial regression 
log_model = LogisticRegression()
log_model.fit(X_train, y_train.values.ravel())
y_pred = log_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Logistic Regression Accuracy:", accuracy)

#KNN
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train.values.ravel())
y_pred = knn_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("KNN Classifier Accuracy:", accuracy)

#decision tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train.values.ravel())
y_pred = dt_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Decision Tree Classifier Accuracy:", accuracy)

#ensemble forest
forest_model = RandomForestClassifier()
forest_model.fit(X_train, y_train.values.ravel())
y_pred = forest_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Random Forest Classifier Accuracy:", accuracy)

Logistic Regression Accuracy: 0.6229581229581229
KNN Classifier Accuracy: 0.7761360261360262
Decision Tree Classifier Accuracy: 0.7757400257400258
Random Forest Classifier Accuracy: 0.7927432927432927


Hyperparameter Tuning

In [9]:
#test hyperparameters for our best model - random forest

#try another criterion
entropy_model = RandomForestClassifier(criterion='entropy')
entropy_model.fit(X_train, y_train.values.ravel())
y_pred = entropy_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Entropy Model Accuracy:", accuracy)

#shorten max depth
shorter_tree_model = RandomForestClassifier(max_depth=5)
shorter_tree_model.fit(X_train, y_train.values.ravel())
y_pred = shorter_tree_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Shorter Tree Model Accuracy:", accuracy)

#try different weighting to account for class imbalance
weighted_model = RandomForestClassifier(class_weight={"Cake":1, "Cookies":2, "Brownies":12})
weighted_model.fit(X_train, y_train.values.ravel())
y_pred = weighted_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Weighted Accuracy:", accuracy)

#increase estimators & add warm start + entropy 
final_model = RandomForestClassifier(n_estimators = 500, warm_start=True, criterion='entropy')
final_model.fit(X_train, y_train.values.ravel())
y_pred = final_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("More Trees Accuracy:", accuracy)


Entropy Model Accuracy: 0.7927432927432927
Shorter Tree Model Accuracy: 0.7446292446292446
Weighted Accuracy: 0.7285912285912286
More Trees Accuracy: 0.7931145431145431


Final Model Accuracy

In [10]:
#best model has more trees, warm start & entropy criterion
y_pred = final_model.predict(X_test)
#get final accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Classifier Accuracy:", accuracy)


Random Forest Classifier Accuracy: 0.7868280368280368
