In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy
from scipy.sparse import csr_matrix



In [2]:
#import the csv

raw = pd.read_csv('recipes.csv', index_col=0)
raw

Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."
...,...,...,...,...,...
13496,Brownie Pudding Cake,"['1 cup all-purpose flour', '2/3 cup unsweeten...",Preheat the oven to 350°F. Into a bowl sift to...,brownie-pudding-cake-14408,"['1 cup all-purpose flour', '2/3 cup unsweeten..."
13497,Israeli Couscous with Roasted Butternut Squash...,"['1 preserved lemon', '1 1/2 pound butternut s...",Preheat oven to 475°F.\nHalve lemons and scoop...,israeli-couscous-with-roasted-butternut-squash...,"['1 preserved lemon', '1 1/2 pound butternut s..."
13498,Rice with Soy-Glazed Bonito Flakes and Sesame ...,['Leftover katsuo bushi (dried bonito flakes) ...,"If using katsuo bushi flakes from package, moi...",rice-with-soy-glazed-bonito-flakes-and-sesame-...,['Leftover katsuo bushi (dried bonito flakes) ...
13499,Spanakopita,['1 stick (1/2 cup) plus 1 tablespoon unsalted...,Melt 1 tablespoon butter in a 12-inch heavy sk...,spanakopita-107344,['1 stick (1/2 cup) plus 1 tablespoon unsalted...


In [3]:
#A few of the columns are unnecessary - just taking the two we need and making everything lowercase for consistency
df = raw[['Title', 'Cleaned_Ingredients']]
df = df.rename(columns={'Title': 'name', 'Cleaned_Ingredients': 'ingredients'})
df = df.apply(lambda x: x.str.lower())
df

Unnamed: 0,name,ingredients
0,miso-butter roast chicken with acorn squash pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,crispy salt and pepper potatoes,"['2 large egg whites', '1 pound new potatoes (..."
2,thanksgiving mac and cheese,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,italian sausage and bread stuffing,"['1 (¾- to 1-pound) round italian loaf, cut in..."
4,newton's law,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."
...,...,...
13496,brownie pudding cake,"['1 cup all-purpose flour', '2/3 cup unsweeten..."
13497,israeli couscous with roasted butternut squash...,"['1 preserved lemon', '1 1/2 pound butternut s..."
13498,rice with soy-glazed bonito flakes and sesame ...,['leftover katsuo bushi (dried bonito flakes) ...
13499,spanakopita,['1 stick (1/2 cup) plus 1 tablespoon unsalted...


In [4]:
# I define a list of nuts to which I am allergic and a simple function to determine if any of them are
# present in a string 

allergens = ['walnut', 'pecan', 'macadamia', 'hazelnut', 'brazil nut', 'wal nut']

def find_allergens(string):
    return any(word in string for word in allergens)


In [5]:
# Label the data in new column 'allergen' that shows True when an allergen is present and False otherwise
# On initial observation is that only 825 of 13501 entries contain an allergen (~6%) so the dataset is somewhat skewed
# That may present issues later on that we could try to address with resampling or other methods

df['allergen'] = df.ingredients.apply(lambda x: find_allergens(x))
df.allergen.value_counts()

False    12676
True       825
Name: allergen, dtype: int64

In [6]:
# Quick eyeball of the data listed as containing allergens - looks reasonable
df[df['allergen']]

Unnamed: 0,name,ingredients,allergen
62,nut butter granola bars,"['2 cups raw nuts (such as almonds, walnuts, p...",True
69,chocolate zucchini cake,"['2 1/4 cups sifted all purpose flour', '1/2 c...",True
70,swiss chard pasta with toasted hazelnuts and p...,"['¼ cup hazelnuts', '1 pound bow tie pasta (fa...",True
81,pear and hazelnut frangipane tart,"['1 cup hazelnuts, toasted, loose skins rubbed...",True
103,tahini-walnut magic shell,"['¼ cup raw walnuts', '3 oz. white chocolate, ...",True
...,...,...,...
13477,frisée and endive salad with warm brussels spr...,"['3 tablespoons white-wine vinegar', '2 tables...",True
13480,hazelnut-butter cookies with mini chocolate chips,"['1 1/2 cups all purpose flour', '3/4 teaspoon...",True
13492,cornmeal pancakes with honey-pecan butter,['1/2 cup (1 stick) unsalted european-style bu...,True
13494,ginger-pecan roulade with honey-glazed pecans,"['1/2 stick (1/4 cup) unsalted butter, melted,...",True


In [7]:
# Now I define our variables for the model. Since I will be using a Random Forest first, I start by vectorizing
# the names of the recipes and then split them into train, validation, and test sets in a 60/20/20 ratio.
# The training data will be used to train the model, the validation data will be used to evaluate the model and
# subsequently tweak the parameters, and the test data will be used to evaluate the final model

vectorizer = CountVectorizer()
X = [str(x) for x in df.name]
X = vectorizer.fit_transform(X)
y = df.allergen

X_train, X_remaining, y_train, y_remaining = train_test_split(X, y, test_size=0.4, random_state=3)
X_validation, X_test, y_validation, y_test = train_test_split(X_remaining, y_remaining, test_size=0.5, random_state=3)


In [8]:
# Check the sizes of X_train, X_validation, X_test, y_train, y_validation, and y_test

print(X_train.shape[0], X_validation.shape[0], X_test.shape[0], y_train.shape[0], y_validation.shape[0], y_test.shape[0])

8100 2700 2701 8100 2700 2701


In [9]:
# Check that the proportion of True in each y set is roughly similar to the population proportion of ~6%

y_train_pct = y_train.sum() / y_train.count()
y_validation_pct = y_validation.sum() / y_validation.count()
y_test_pct = y_test.sum() / y_test.count()

print("Training Set Pct True %.2f%%" % (y_train_pct*100))
print("Validation Set Pct True %.2f%%" % (y_validation_pct*100))
print("Test Set Pct True %.2f%%" % (y_test_pct*100))

Training Set Pct True 5.98%
Validation Set Pct True 6.63%
Test Set Pct True 6.00%


In [10]:
# Create and train the Random Forest Classifier

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

In [11]:
# Before we test the model, I define a function to output some key model evaluation statistics
# After writing this line I decided to instead just use sklearn's "classification_report"

def test_model(y_actual, y_predicted):
    accuracy = np.mean(y_actual == y_predicted)
    precision = precision_score(y_actual, y_predicted)
    recall = recall_score(y_actual, y_predicted)
    f1 = f1_score(y_actual, y_predicted)
    print(f"Model Accuracy: {accuracy}")
    print(f"Model Precision: {precision}")
    print(f"Model Recall: {recall}")
    print(f"Model F1 Score: {f1}")
    

In [12]:
# Evaluate the model on the validation set

val_predictions = rf_classifier.predict(X_validation)
print(classification_report(y_validation, val_predictions))


              precision    recall  f1-score   support

       False       0.97      1.00      0.98      2521
        True       0.93      0.52      0.67       179

    accuracy                           0.97      2700
   macro avg       0.95      0.76      0.82      2700
weighted avg       0.96      0.97      0.96      2700



In [13]:
# The model has low recall, which may be a symptom of the skewed nature of the sample.
# To attempt to correct this, I apply class weights in a 1:15 ratio since the Trues made up 6% of the population

class_weights = {0:1.0, 1: 15}
rf_classifier = RandomForestClassifier(class_weight=class_weights)
rf_classifier.fit(X_train, y_train)
val_predictions = rf_classifier.predict(X_validation)
print(classification_report(y_validation, val_predictions))


              precision    recall  f1-score   support

       False       0.97      1.00      0.98      2521
        True       0.93      0.50      0.65       179

    accuracy                           0.96      2700
   macro avg       0.95      0.75      0.82      2700
weighted avg       0.96      0.96      0.96      2700



In [14]:
# Since the class weights actually made the Recall problem worse, now I will try resampling

# Start by defining the combination resampling pipeline
resampling_pipeline = Pipeline([
    ('over_sampler', RandomOverSampler()),
    ('under_sampler', RandomUnderSampler()),
])

# Apply combination resampling to the training data
X_resampled, y_resampled = resampling_pipeline.fit_resample(X_train, y_train)

# Create and train the Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_resampled, y_resampled)
val_predictions = rf_classifier.predict(X_validation)
print(classification_report(y_validation, val_predictions))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98      2521
        True       0.85      0.54      0.66       179

    accuracy                           0.96      2700
   macro avg       0.91      0.76      0.82      2700
weighted avg       0.96      0.96      0.96      2700



In [15]:
# Since the resampling only improved recall slightly, let's try using Deep Learning via TensorFlow for a more complex model

# Since the output of the vectorizer I used earlier is a sparse matrix, I convert to a dense matrix.
# This consumes a lot of memory but it should be fine for this amount of data

X_train_dense = X_train.toarray()

# Now I create and compile the model
nn_model = Sequential([
    Dense(units = 128, activation = 'relu'),
    Dense(units = 64, activation = 'relu'),
    Dense(units = 32, activation = 'relu'),
    Dense(units = 16, activation = 'relu'),
    Dense(units = 8, activation = 'relu'),
    Dense(units = 1, activation = 'sigmoid')
])

nn_model.compile(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
)

nn_model.fit(X_train_dense, y_train, epochs=20)


Epoch 1/20


2023-07-14 16:18:07.321678: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x17c6cd1f0>

In [16]:
# Now let's evaluate the model using the validation set

X_validation_dense = X_validation.toarray()

val_predictions = nn_model.predict(X_validation_dense)
val_predictions = (val_predictions >0.5)
print(classification_report(y_validation, val_predictions))

              precision    recall  f1-score   support

       False       0.97      0.98      0.97      2521
        True       0.68      0.51      0.59       179

    accuracy                           0.95      2700
   macro avg       0.82      0.75      0.78      2700
weighted avg       0.95      0.95      0.95      2700



The recall is still not where I want it to be. As a next step, I will try to source more data so that we can have more "True" samples. The below code is draft form and will be altered later once the model is completed


In [17]:
# Evaluate the model on the test set

test_predictions = rf_classifier.predict(X_test)
test_accuracy = np.mean(test_predictions == y_test)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.965938541281007


In [18]:
# Define a function to use our model to predict whether a single recipe contains allergens

def test_recipe(recipe):
    recipe_lowercase = recipe.lower()
    vector = vectorizer.transform([recipe_lowercase])
    prediction = rf_classifier.predict(vector)    
    return recipe + ' likely contains allergens' if prediction[0] else recipe + ' is likely allergen-free!'


In [20]:
tester = 'Farro Salad with Beets, Greens and Feta'
print(test_recipe(tester))

Farro Salad with Beets, Greens and Feta is likely allergen-free!
