In [7]:
# pip install transformers torch

In [8]:
# pip install tensorflow

In [9]:
# pip install tensorflow transformers

In [10]:
# pip install tf-keras

In [190]:
import re
import numpy as np
import pandas as pd
from collections import Counter

import keras
import transformers
import tensorflow as tf
from tensorflow.keras.models import load_model
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling1D

# Data Importing

In [12]:
# Import dataset
recipe_link = "https://raw.githubusercontent.com/jade-y-liang/pic16b-project/refs/heads/main/Recipes.csv"
recipes = pd.read_csv(recipe_link)

In [13]:
# Check if the dataset is loaded correctly
recipes.head()

Unnamed: 0,recipe_name,category_name,rating,prep_time,cook_time,total_time,num_servings_per_recipe,ingredients_list,direction_list,calories_per_serving,...,protein (g),carbs (g),fiber (g),sugar (g),cholesterol (mg),vitamin_c (mg),calcium (mg),iron (mg),potassium (mg),recipe_link
0,Air Fryer Spicy Onion Rings,Air Fryer Recipes\n,5.0,20 mins,10 mins,1 hr,4,"sweet onions, sliced 1/2 inch thick,buttermilk...","Whisk together buttermilk, egg, flour, chile ...",230.0,...,10g,53g,2g,,48mg,,,,197mg,https://www.allrecipes.com/recipe/8465728/air-...
1,Stuffed Chicken Cordon Bleu,Chicken Cordon Bleu\n,5.0,20 mins,40 mins,1 hr 15 mins,4,"skinless, boneless chicken breast halves,bacon...",Preheat the oven to 400 degrees F (200 degree...,877.0,...,66g,44g,3g,8g,291mg,3mg,636mg,4mg,969mg,https://www.allrecipes.com/recipe/283793/stuff...
2,Hearty Chicken Cacciatore Soup with Rice,Chicken Cacciatore\n,,10 mins,1 hr 45 mins,1 hr 55 mins,10,"chicken broth,condensed tomato soup,water,dice...","Combine chicken broth, condensed soup, 2 cans...",250.0,...,15g,35g,2g,,38mg,,,,406mg,https://www.allrecipes.com/recipe/8300735/hear...
3,Chicken and Dumplings with Biscuits,Chicken and Dumplings\n,4.1,,,,8,"whole chicken, cut into pieces,salt,freshly gr...",Put chicken pieces in a large pot over medium...,696.0,...,37g,86g,6g,5g,106mg,49mg,64mg,5mg,1398mg,https://www.allrecipes.com/recipe/8810/chicken...
4,Margo's Chicken Adobo,Chicken Adobo\n,4.5,10 mins,1 hr,1 hr 10 mins,8,"canola oil,chicken drumsticks and thighs,onion...",Heat canola oil in a large Dutch oven over me...,323.0,...,30g,7g,1g,2g,96mg,3mg,40mg,3mg,347mg,https://www.allrecipes.com/recipe/218510/margo...


# Data Cleaning

In [14]:
recipes.shape

(15198, 23)

In [15]:
# Check for NaN values
round(recipes.isna().sum() / len(recipes) * 100, 2)

recipe_name                 0.00
category_name               0.00
rating                     11.05
prep_time                   7.69
cook_time                  20.16
total_time                  7.18
num_servings_per_recipe     4.78
ingredients_list            4.70
direction_list              4.72
calories_per_serving        5.96
total_fat (g)               7.05
saturated_fat (g)           9.88
sodium (mg)                 5.98
protein (g)                 6.28
carbs (g)                   6.05
fiber (g)                   8.47
sugar (g)                  11.51
cholesterol (mg)           19.50
vitamin_c (mg)             17.81
calcium (mg)                8.65
iron (mg)                   9.59
potassium (mg)              6.00
recipe_link                 0.00
dtype: float64

In [16]:
# Drop rows where 'ingredients_list' and 'direction_list' columns are NaN
recipes= recipes.dropna(subset=['ingredients_list', 'direction_list'], how = 'any').reset_index(drop=True)


In [17]:
# Check for duplicates
recipes.duplicated().sum()

0

### Modify Category Names

In [18]:
# Remove newline characters from all strings in a specific column
recipes['category_name'] = recipes['category_name'].str.replace('\n', '', regex=False)

### Modify Direction Column

In [19]:
recipes['direction_list'] = recipes['direction_list'].astype(str)

In [20]:
# Apply the replacement of \\n across the direction_list column
recipes['direction_list'] = recipes['direction_list'].str.replace("\n,", '\n')

### Modify Nutrition Columns

In [21]:
nutrition_cols = [ 'total_fat (g)', 'saturated_fat (g)', 'sodium (mg)', 'protein (g)', 
                  'carbs (g)', 'fiber (g)', 'sugar (g)', 'cholesterol (mg)', 
                  'vitamin_c (mg)', 'calcium (mg)', 'iron (mg)', 'potassium (mg)']


In [22]:
# Handle missing values 
# Fill NaN values in the specified columns with 0
recipes[nutrition_cols] = recipes[nutrition_cols].fillna(0)

In [23]:
# Function to remove units and convert to float
def remove_units(value):
    if isinstance(value, str):
        # Extract numeric part using regex
        numeric_part = re.search(r'[\d.]+', value)
        if numeric_part:
            return float(numeric_part.group(0)) 
    return float('nan') 


In [24]:
# Apply the function to each relevant column
for col in nutrition_cols:
    recipes[col] = recipes[col].apply(remove_units)

### Extracting Main Ingredients from Ingredients List by NLP

In [25]:
# Understand the structure of ingredients list
# Stop words to ignore
stop_words = {'and', 'or', 'with', 'in', 'on', 'to', 'of', 'for', 'as', 'the', 'a', 'an'}

# Function to count all word occurrences
def count_word_occurrences(ingredient_strings):
    all_words = []
    for ingredient_string in ingredient_strings:
        if isinstance(ingredient_string, str):
            # Normalize text
            # Remove punctuation
            ingredient_string = re.sub(r'[^\w\s]', '', ingredient_string.lower()) 
            # Tokenize words
            words = ingredient_string.split() 
            # Remove stop words
            filtered_words = [word for word in words if word not in stop_words] 
            all_words.extend(filtered_words) 
    # Count frequencies
    return Counter(all_words)

# Count word occurrences for all recipes
ingredients_counts = count_word_occurrences(recipes['ingredients_list'])

# Convert to a DataFrame for better visualization
ingredients_counts = (
    pd.DataFrame(ingredients_counts.items(), columns=['Word', 'Count'])
    .sort_values(by='Count', ascending=False)  # Sort the DataFrame by 'Count'
    .reset_index(drop=True)  # Reset the index and drop the old one
)

In [26]:
ingredients_counts.head(10)

Unnamed: 0,Word,Count
0,pepper,5230
1,black,4190
2,ground,2741
3,fresh,2672
4,cheese,2627
5,into,2625
6,cut,2398
7,taste,1972
8,more,1796
9,peeled,1758


In [27]:
# Define keywords (ingredients to include) and stop words (to exclude)
keywords = ['chicken', 'onion', 'tomatoes', 'bread', 'bean', 'bell pepper', 'red pepper', 'sweet pepper' 
            'lemon', 'beef', 'pork', 'mushroom', 'rice', 'basil', 'cilantro',
            'egg', 'milk', 'spinach', 'shrimp', 'orange', 'lime', 'sausage', 'bacon', 'pineapple',
            'peanut', 'strawberr', 'coconut', 'pecan', 'apple', 'potato', 'squash', 
            'jalapeno', 'lettuce', 'tortilla', 'pea', 'amaranth', 'apricot', 'avocado',
            'banana', 'barley', 'brisket', 'wheat', 'duck', 'fish', 'flax seed', 'goat',
            'turkey', 'lamb', 'mango', 'oat', 'peach', 'pear', 'plum', 'pomegranate', 
            'salmon', 'shrimp', 'lobster', 'sardine', 'catfish', 'tuna', 'eel ', 
            'anchovy', 'cucumber', 'eggplant', 'kale', 'lemongrass', 'leek', 'radish', 
            'cauliflower', 'cabbage', 'asparagus', 'broccoli', 'endive', 'okra', 'sweet potato',
            'brussels sprouts', 'leek', 'carrot', 'green beans', 'beet', 'bok choy',
            'spinach', 'pumpkin', 'cranberr', 'parsnip', 'grape', 'grapefruit', 'turnip', 'honeydew melon',
            'rhubarb', 'blackberr', 'cantaloupe', 'cherr', 'kiwi', 'plum', 'zucchini', 
            'corn', 'cheese', 'chocolate', 'flour']
# keywords = ['chicken', 'onion', 'tomato', 'bread', 'bean', 'bell pepper',
#             'lemon', 'beef', 'pork', 'mushroom', 'rice', 'basil', 'cilantro',
#             'egg', 'milk', 'spinach', 'shrimp', 'orange', 'lime', 'sausage', 'bacon', 'pineapple',
#             'peanut', 'strawberr', 'coconut', 'pecan', 'apple', 'potato', 'squash', 
#             'jalapeno', 'lettuce', 'tortilla', 'pea', 'amaranth', 'apricot', 'avocado',
#             'banana', 'barley', 'brisket', 'wheat', 'bulgur', 'cherr', 'chia seed', 'duck',
#             'fish', 'flax seed', 'goat', 'turkey', 'lamb', 'mango', 'millet', 'nectarine',
#             'oat', 'peach', 'pear', 'plum', 'pomegranate', 'quinoa', 'seafood', 'shellfish',
#             'shell', 'sirloin', 'spelt', 'veal', 'venison', 'octopus', 'sturgeon', 
#             'squid', 'salmon', 'oyster', 'shrimp', 'lobster', 'caviar', 'carp', 'flounder',
#             'crab', 'sardine', 'catfish', 'tuna', 'eel ', 'anchovy', 'collard', 'coriander',
#             'cucumber', 'eggplant', 'endive', 'escarole', 'daikon', 'delicata', 'eddoe', 'fennel',
#             'kale', 'jicama', 'ginger', 'lemongrass', 'leek', 'habanero', 'radish', 'parsnip',
#             'artichoke', 'cauliflower', 'cabbage', 'asparagus', 'kohlrabi',
#             'broccoli', 'fiddlehead', 'melon', 'endive', 'okra', 'sweet potato',
#             'brussels sprouts', 'leek', 'carrot', 'green beans', 'beet', 'bok choy',
#             'spinach', 'pumpkin', 'cranberr', 'parsnip', 'grape', 'grapefruit', 'turnip', 'honeydew melon',
#             'rhubarb', 'blackberr', 'cantaloupe', 'cherr', 'kiwi', 'plum', 'peppers', 'zucchini', 'shallot',
#             ]
# stop_words = ['pepper', 'black', 'ground', 'fresh', 'into', 'cut', 'taste', 'more', 'peeled', 'such', 
#               'chopped', 'white', 'oil', 'green', 'brown', 'purpose', 'sugar', 'sliced',
#               'parmesan', 'red', 'cheddar', 'finely', 'baking', 'sauce', 'optional',
#               'thinly', 'salt', 'needed', 'boneless', 'cooking', 'drained', 'seeded',
#               'extract', 'cooked', 'room', 'powder', 'juice', 'mozzarella', 'cored',
#               'freshly', 'chile', 'at', 'diced', 'grated', 'unsalted', 'rinse', 'inch',
#               'plus', 'extract', 'degrees', 'leaves', 'halve', 'whip', 'italian',
#               'dried', 'less', 'shred', 'divide', 'sweet', 'whole', 'condense', 'thaw',
#               'pitted', 'jack', 'chop', 'lightly', 'frozen', 'minced', 'garnish', 'melted',
#               'coarsely', 'mix', 'small', '1', 'yellow', 'flakes', 'frying', 'pack', 'soup',
#               'hot', 'light', 'thin', 'crush', 'spray', 'topping', 'squeeze', 'dill', 'sharpe',
#               'thyme', 'steak', 'trim', 'large', 'crack', 'food', 'bouillon', 'size', 'sour', 'breast',
#               'clove', 'beat']
# maybe = ['pepper', 'butter', 'olive', 'cream', 'wine', 'cinnamon', 'cocoa', 'seeds', 'ginger', 'garlic',
#          'soy', 'vanilla', 'walnut', 'cider', 'broth', 'salad', 'lean', 'olive', 'pastry', 'sesame', 'vinegar',
#          'mustard', 'cream']

In [28]:
# Function to extract keywords from ingredients_list
def extract_keywords(ingredient_list, keywords):
    # Check if ingredient_list is a string; if not, treat it as empty
    if not isinstance(ingredient_list, str):
        return ''
    
    # Normalize text and check for keywords
    ingredient_list = ingredient_list.lower()
    main_ingredients = [keyword for keyword in keywords if keyword in ingredient_list]
    
    return sorted(set(main_ingredients))


In [29]:
# Apply function to DataFrame
recipes['main_ingredients'] = recipes['ingredients_list'].apply(lambda x: extract_keywords(x, keywords))


In [30]:
# Replace 'err' with 'erries' in one line
recipes['main_ingredients'] = recipes['main_ingredients'].apply(lambda ingredients: [ingredient.replace('err', 'erries') for ingredient in ingredients])

In [31]:
recipes['main_ingredients'].iloc[0]

['bread', 'egg', 'flour', 'lime', 'milk', 'onion']

### Main Ingredients Frequency

In [243]:
# Flatten all words from the 'main_ingredient' column
all_ingredients = [word for ingredients_list in recipes['main_ingredients'] for word in ingredients_list]

In [244]:
# Count the occurrences of each word
word_counts = Counter(all_ingredients)

In [245]:
# Convert the Counter object to a DataFrame
main_ingredient_freq = pd.DataFrame(word_counts.items(), columns=['Ingredient', 'Count'])

In [250]:
main_ingredient_freq = main_ingredient_freq.sort_values(by='Count', ascending=False).reset_index(drop=True)

In [251]:
main_ingredient_freq.head()

Unnamed: 0,Ingredient,Count
0,onion,5160
1,egg,4789
2,flour,4327
3,cheese,3930
4,milk,2961


In [252]:
# Save the result to a CSV file
main_ingredient_freq.to_csv("ingredient_frequency.csv", index=False)

### Convert Time Columns

In [32]:
time_cols = ['prep_time', 'cook_time', 'total_time']

In [33]:
# Function to convert time string to total minutes
def convert_to_minutes(time_str):
    # Handle NaN or non-string values
    if not isinstance(time_str, str):
        return np.nan 
    
    # Initialize hours and minutes
    hours = 0
    minutes = 0
    
    # Extract hours and minutes using regex
    if 'hr' in time_str:
        hours_match = re.search(r'(\d+)\s*hr', time_str)
        if hours_match:
            hours = int(hours_match.group(1))
    
    if 'min' in time_str:
        minutes_match = re.search(r'(\d+)\s*min', time_str)
        if minutes_match:
            minutes = int(minutes_match.group(1))
    
    # Convert total time to minutes
    total_minutes = hours * 60 + minutes
    return int(total_minutes)

In [34]:
# Apply the function to each time column
for col in time_cols:
    recipes[col] = recipes[col].apply(convert_to_minutes)

In [35]:
# Calculate the additional time column
recipes['additional_time'] = recipes['total_time'] - recipes['cook_time'] - recipes['prep_time']

# Reorder the columns to place 'additional_time' before 'total_time'
columns = recipes.columns.tolist()
columns.insert(columns.index('total_time'), columns.pop(columns.index('additional_time')))
recipes = recipes[columns]

### Estimate Empty Time Columns

In [124]:
# Find all rows with NaN in 'total_time' column
nan_time_test = recipes[recipes['total_time'].isna()].reset_index(drop = True)

In [125]:
# Display directions for reference
for i in range(5):
    print(nan_time_test.iloc[i, 9])

 Put chicken pieces in a large pot over medium heat, and add enough water to cover 3 inches over the chicken. Add the salt, celery, pepper, carrots, celery and potatoes. Bring to a boil and let simmer for 40 minutes.
 In a mixing bowl, prepare biscuit mix according to package directions. After simmering soup for 40 minutes, drop rounded tablespoonfuls of the biscuit mixture into the pot and let them cook a bit between additions of dumplings, so that they do not stick together.
 With a wire whisk, mix together about 1 1/2 cups of water with the flour, then add flour mixture to the soup pot until the broth has the consistency of gravy. Dig in and enjoy the meal!

 Whisk together flour, baking soda, salt, cinnamon, and baking powder.
 In a large bowl, beat the eggs. Gradually beat in sugar, then oil. Add flour mixture alternately with zucchini into the egg mixture. Stir in the raisins, walnuts, and vanilla. Pour batter into two 9 x 5 inch greased and lightly floured loaf pans.
 Bake on lo

In [None]:
cleaned_recipes = recipes.dropna(subset=['total_time']).reset_index(drop=True)

In [171]:
# Split training and validation dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    cleaned_recipes['direction_list'], cleaned_recipes['total_time'], test_size=0.2, random_state=42
)

In [172]:
# Load the BERT tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased', trainable=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [173]:
# Tokenize the text data with numpy arrays
train_encodings = tokenizer(train_texts.tolist(), padding=True, truncation=True, max_length=256, return_tensors="np")
val_encodings = tokenizer(val_texts.tolist(), padding=True, truncation=True, max_length=256, return_tensors="np")

In [None]:
# Extract input_ids and attention_mask from the tokenized data
train_input_ids = train_encodings['input_ids']
train_attention_mask = train_encodings['attention_mask']
val_input_ids = val_encodings['input_ids']
val_attention_mask = val_encodings['attention_mask']

# Convert labels to NumPy arrays
train_labels = np.array(train_labels)  
val_labels = np.array(val_labels) 


In [None]:
# train_input_ids = np.array(train_input_ids)
# train_attention_mask = np.array(train_attention_mask)
# train_labels = np.array(train_labels)

In [None]:
# Create the TensorFlow Dataset for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": train_input_ids, "attention_mask": train_attention_mask},
                                                    train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices(({"input_ids": val_input_ids, "attention_mask": val_attention_mask},
                                                  val_labels))

In [181]:
# Shuffle and batch the data
train_dataset = train_dataset.batch(32)
val_dataset = val_dataset.batch(32)

In [None]:
# Define input and attention_mask placeholders
input_ids = tf.keras.layers.Input(shape=(256,), dtype=tf.int64, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(256,), dtype=tf.int64, name="attention_mask")

In [None]:
# Define a custom model to wrap BERT
class BERTWrapper(tf.keras.Model):
    def __init__(self, bert_model, **kwargs):
        super(BERTWrapper, self).__init__(**kwargs)
        self.bert_model = bert_model
        self.pooling_layer = GlobalAveragePooling1D()  
        self.dropout = Dropout(0.5)
        self.dense1 = Dense(64, activation='relu')
        self.output_layer = Dense(1)

    def call(self, inputs, **kwargs):
        # Extracting the last hidden state from the BERT output
        output = self.bert_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        last_hidden_state = output.last_hidden_state 
        pooled_output = self.pooling_layer(last_hidden_state)
        x = self.dropout(pooled_output)
        x = self.dense1(x)
        output = self.output_layer(x) 

        return output

In [184]:
# Create an instance of wrapped BERT model
model = BERTWrapper(bert_model)

In [185]:
# Create a Keras Model using the functional API
outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask})
final_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=outputs)

In [None]:
# Compile the model
final_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Train the model
history = final_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
    batch_size=32
)

Epoch 1/5




[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2392s[0m 7s/step - loss: 22267.6953 - mae: 82.9380 - val_loss: 20606.7812 - val_mae: 82.4884
Epoch 2/5
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2548s[0m 7s/step - loss: 19719.8203 - mae: 83.8771 - val_loss: 20582.8984 - val_mae: 82.6142
Epoch 3/5
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2412s[0m 7s/step - loss: 19657.0840 - mae: 83.6883 - val_loss: 20595.3965 - val_mae: 82.1871
Epoch 4/5
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2401s[0m 7s/step - loss: 19665.3770 - mae: 83.5996 - val_loss: 20588.9902 - val_mae: 82.3901
Epoch 5/5
[1m353/353[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2960s[0m 8s/step - loss: 19650.1699 - mae: 83.6804 - val_loss: 20575.9258 - val_mae: 82.6043


In [201]:
# Save the model to a file
final_model.save('time_prediction_model.keras')

In [253]:
final_model.summary()

In [None]:
# Preprocess the testing data
test_encodings = tokenizer(nan_time_test['direction_list'].tolist(), padding=True, truncation=True, max_length=256, return_tensors="np")
test_inputs = {
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask']
}


In [208]:
# Making predictions on the test set
predictions = final_model.predict(test_inputs)



[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 5s/step


In [None]:
# Input prediction into the dataset
nan_time_test['total_time'] = predictions.flatten().astype(int)

In [None]:
# Check the values of prediction
sorted_unique_predictions = np.sort(nan_time_test['total_time'].unique())
sorted_unique_predictions

array([ 52,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 129, 130, 133])

In [230]:
# Merge the datasets on the 'id' column (or any other relevant column)
recipes_updated = recipes.set_index('recipe_name').combine_first(nan_time_test.set_index('recipe_name'))

In [None]:
# Check successful input
recipes_updated['total_time'].isna().sum()

0

In [241]:
# Check number of short cooking recipes
count = recipes_updated[recipes_updated['total_time'] < 60].shape[0]
print(f"Number of recipes where total_time is less than 1 hour: {count}")

Number of recipes where total_time is less than 1 hour: 7316


In [237]:
recipes_updated.to_csv('recipes_updated.csv', index=False)