In [36]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


In [37]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from transformers import RobertaTokenizer, RobertaModel, get_scheduler
import torch
from torch import nn
from tqdm import tqdm
from transformers import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Data Importing

In [38]:
# Import dataset
recipe_link = "https://raw.githubusercontent.com/jade-y-liang/pic16b-project/refs/heads/main/Recipes.csv"
recipes = pd.read_csv(recipe_link)

In [39]:
# Check if the dataset is loaded correctly
recipes.head()

Unnamed: 0,recipe_name,category_name,rating,prep_time,cook_time,total_time,num_servings_per_recipe,ingredients_list,direction_list,calories_per_serving,...,protein (g),carbs (g),fiber (g),sugar (g),cholesterol (mg),vitamin_c (mg),calcium (mg),iron (mg),potassium (mg),recipe_link
0,Air Fryer Spicy Onion Rings,Air Fryer Recipes\n,5.0,20 mins,10 mins,1 hr,4,"sweet onions, sliced 1/2 inch thick,buttermilk...","Whisk together buttermilk, egg, flour, chile ...",230.0,...,10g,53g,2g,,48mg,,,,197mg,https://www.allrecipes.com/recipe/8465728/air-...
1,Stuffed Chicken Cordon Bleu,Chicken Cordon Bleu\n,5.0,20 mins,40 mins,1 hr 15 mins,4,"skinless, boneless chicken breast halves,bacon...",Preheat the oven to 400 degrees F (200 degree...,877.0,...,66g,44g,3g,8g,291mg,3mg,636mg,4mg,969mg,https://www.allrecipes.com/recipe/283793/stuff...
2,Hearty Chicken Cacciatore Soup with Rice,Chicken Cacciatore\n,,10 mins,1 hr 45 mins,1 hr 55 mins,10,"chicken broth,condensed tomato soup,water,dice...","Combine chicken broth, condensed soup, 2 cans...",250.0,...,15g,35g,2g,,38mg,,,,406mg,https://www.allrecipes.com/recipe/8300735/hear...
3,Chicken and Dumplings with Biscuits,Chicken and Dumplings\n,4.1,,,,8,"whole chicken, cut into pieces,salt,freshly gr...",Put chicken pieces in a large pot over medium...,696.0,...,37g,86g,6g,5g,106mg,49mg,64mg,5mg,1398mg,https://www.allrecipes.com/recipe/8810/chicken...
4,Margo's Chicken Adobo,Chicken Adobo\n,4.5,10 mins,1 hr,1 hr 10 mins,8,"canola oil,chicken drumsticks and thighs,onion...",Heat canola oil in a large Dutch oven over me...,323.0,...,30g,7g,1g,2g,96mg,3mg,40mg,3mg,347mg,https://www.allrecipes.com/recipe/218510/margo...


# Data Cleaning

In [40]:
recipes.shape

(15198, 23)

In [41]:
# Check for NaN values
round(recipes.isna().sum() / len(recipes) * 100, 2)

recipe_name                 0.00
category_name               0.00
rating                     11.05
prep_time                   7.69
cook_time                  20.16
total_time                  7.18
num_servings_per_recipe     4.78
ingredients_list            4.70
direction_list              4.72
calories_per_serving        5.96
total_fat (g)               7.05
saturated_fat (g)           9.88
sodium (mg)                 5.98
protein (g)                 6.28
carbs (g)                   6.05
fiber (g)                   8.47
sugar (g)                  11.51
cholesterol (mg)           19.50
vitamin_c (mg)             17.81
calcium (mg)                8.65
iron (mg)                   9.59
potassium (mg)              6.00
recipe_link                 0.00
dtype: float64

In [42]:
# Drop rows where 'ingredients_list' and 'direction_list' columns are NaN
recipes= recipes.dropna(subset=['ingredients_list', 'direction_list'], how = 'any').reset_index(drop=True)


In [43]:
# Check for duplicates
recipes.duplicated().sum()

0

### Modify Category Names

In [44]:
# Remove newline characters from all strings in a specific column
recipes['category_name'] = recipes['category_name'].str.replace('\n', '', regex=False)

### Modify Direction Column

In [45]:
recipes['direction_list'] = recipes['direction_list'].astype(str)

In [46]:
# Apply the replacement of \\n across the direction_list column
recipes['direction_list'] = recipes['direction_list'].str.replace("\n,", '\n')

### Modify Nutrition Columns

In [47]:
nutrition_cols = [ 'total_fat (g)', 'saturated_fat (g)', 'sodium (mg)', 'protein (g)', 
                  'carbs (g)', 'fiber (g)', 'sugar (g)', 'cholesterol (mg)', 
                  'vitamin_c (mg)', 'calcium (mg)', 'iron (mg)', 'potassium (mg)']


In [48]:
# Handle missing values 
# Fill NaN values in the specified columns with 0
recipes[nutrition_cols] = recipes[nutrition_cols].fillna(0)

In [49]:
# Function to remove units and convert to float
def remove_units(value):
    if isinstance(value, str):
        # Extract numeric part using regex
        numeric_part = re.search(r'[\d.]+', value)
        if numeric_part:
            return float(numeric_part.group(0)) 
    return float('nan') 


In [50]:
# Apply the function to each relevant column
for col in nutrition_cols:
    recipes[col] = recipes[col].apply(remove_units)

### Extracting Main Ingredients from Ingredients List by NLP

In [51]:
# Understand the structure of ingredients list
# Stop words to ignore
stop_words = {'and', 'or', 'with', 'in', 'on', 'to', 'of', 'for', 'as', 'the', 'a', 'an'}

# Function to count all word occurrences
def count_word_occurrences(ingredient_strings):
    all_words = []
    for ingredient_string in ingredient_strings:
        if isinstance(ingredient_string, str):
            # Normalize text
            # Remove punctuation
            ingredient_string = re.sub(r'[^\w\s]', '', ingredient_string.lower()) 
            # Tokenize words
            words = ingredient_string.split() 
            # Remove stop words
            filtered_words = [word for word in words if word not in stop_words] 
            all_words.extend(filtered_words) 
    # Count frequencies
    return Counter(all_words)

# Count word occurrences for all recipes
ingredients_counts = count_word_occurrences(recipes['ingredients_list'])

# Convert to a DataFrame for better visualization
ingredients_counts = (
    pd.DataFrame(ingredients_counts.items(), columns=['Word', 'Count'])
    .sort_values(by='Count', ascending=False)  # Sort the DataFrame by 'Count'
    .reset_index(drop=True)  # Reset the index and drop the old one
)

In [52]:
ingredients_counts.head(10)

Unnamed: 0,Word,Count
0,pepper,5230
1,black,4190
2,ground,2741
3,fresh,2672
4,cheese,2627
5,into,2625
6,cut,2398
7,taste,1972
8,more,1796
9,peeled,1758


In [53]:
# Define keywords (ingredients to include) and stop words (to exclude)
keywords = ['chicken', 'onion', 'tomatoes', 'bread', 'bean', 'bell pepper', 'red pepper', 'sweet pepper' 
            'lemon', 'beef', 'pork', 'mushroom', 'rice', 'basil', 'cilantro',
            'egg', 'milk', 'spinach', 'shrimp', 'orange', 'lime', 'sausage', 'bacon', 'pineapple',
            'peanut', 'strawberr', 'coconut', 'pecan', 'apple', 'potato', 'squash', 
            'jalapeno', 'lettuce', 'tortilla', 'pea', 'amaranth', 'apricot', 'avocado',
            'banana', 'barley', 'brisket', 'wheat', 'duck', 'fish', 'flax seed', 'goat',
            'turkey', 'lamb', 'mango', 'oat', 'peach', 'pear', 'plum', 'pomegranate', 
            'salmon', 'shrimp', 'lobster', 'sardine', 'catfish', 'tuna', 'eel ', 
            'anchovy', 'cucumber', 'eggplant', 'kale', 'lemongrass', 'leek', 'radish', 
            'cauliflower', 'cabbage', 'asparagus', 'broccoli', 'endive', 'okra', 'sweet potato',
            'brussels sprouts', 'leek', 'carrot', 'green beans', 'beet', 'bok choy',
            'spinach', 'pumpkin', 'cranberr', 'parsnip', 'grape', 'grapefruit', 'turnip', 'honeydew melon',
            'rhubarb', 'blackberr', 'cantaloupe', 'cherr', 'kiwi', 'plum', 'zucchini', 
            'corn', 'cheese', 'chocolate', 'flour']
# keywords = ['chicken', 'onion', 'tomato', 'bread', 'bean', 'bell pepper',
#             'lemon', 'beef', 'pork', 'mushroom', 'rice', 'basil', 'cilantro',
#             'egg', 'milk', 'spinach', 'shrimp', 'orange', 'lime', 'sausage', 'bacon', 'pineapple',
#             'peanut', 'strawberr', 'coconut', 'pecan', 'apple', 'potato', 'squash', 
#             'jalapeno', 'lettuce', 'tortilla', 'pea', 'amaranth', 'apricot', 'avocado',
#             'banana', 'barley', 'brisket', 'wheat', 'bulgur', 'cherr', 'chia seed', 'duck',
#             'fish', 'flax seed', 'goat', 'turkey', 'lamb', 'mango', 'millet', 'nectarine',
#             'oat', 'peach', 'pear', 'plum', 'pomegranate', 'quinoa', 'seafood', 'shellfish',
#             'shell', 'sirloin', 'spelt', 'veal', 'venison', 'octopus', 'sturgeon', 
#             'squid', 'salmon', 'oyster', 'shrimp', 'lobster', 'caviar', 'carp', 'flounder',
#             'crab', 'sardine', 'catfish', 'tuna', 'eel ', 'anchovy', 'collard', 'coriander',
#             'cucumber', 'eggplant', 'endive', 'escarole', 'daikon', 'delicata', 'eddoe', 'fennel',
#             'kale', 'jicama', 'ginger', 'lemongrass', 'leek', 'habanero', 'radish', 'parsnip',
#             'artichoke', 'cauliflower', 'cabbage', 'asparagus', 'kohlrabi',
#             'broccoli', 'fiddlehead', 'melon', 'endive', 'okra', 'sweet potato',
#             'brussels sprouts', 'leek', 'carrot', 'green beans', 'beet', 'bok choy',
#             'spinach', 'pumpkin', 'cranberr', 'parsnip', 'grape', 'grapefruit', 'turnip', 'honeydew melon',
#             'rhubarb', 'blackberr', 'cantaloupe', 'cherr', 'kiwi', 'plum', 'peppers', 'zucchini', 'shallot',
#             ]
# stop_words = ['pepper', 'black', 'ground', 'fresh', 'into', 'cut', 'taste', 'more', 'peeled', 'such', 
#               'chopped', 'white', 'oil', 'green', 'brown', 'purpose', 'sugar', 'sliced',
#               'parmesan', 'red', 'cheddar', 'finely', 'baking', 'sauce', 'optional',
#               'thinly', 'salt', 'needed', 'boneless', 'cooking', 'drained', 'seeded',
#               'extract', 'cooked', 'room', 'powder', 'juice', 'mozzarella', 'cored',
#               'freshly', 'chile', 'at', 'diced', 'grated', 'unsalted', 'rinse', 'inch',
#               'plus', 'extract', 'degrees', 'leaves', 'halve', 'whip', 'italian',
#               'dried', 'less', 'shred', 'divide', 'sweet', 'whole', 'condense', 'thaw',
#               'pitted', 'jack', 'chop', 'lightly', 'frozen', 'minced', 'garnish', 'melted',
#               'coarsely', 'mix', 'small', '1', 'yellow', 'flakes', 'frying', 'pack', 'soup',
#               'hot', 'light', 'thin', 'crush', 'spray', 'topping', 'squeeze', 'dill', 'sharpe',
#               'thyme', 'steak', 'trim', 'large', 'crack', 'food', 'bouillon', 'size', 'sour', 'breast',
#               'clove', 'beat']
# maybe = ['pepper', 'butter', 'olive', 'cream', 'wine', 'cinnamon', 'cocoa', 'seeds', 'ginger', 'garlic',
#          'soy', 'vanilla', 'walnut', 'cider', 'broth', 'salad', 'lean', 'olive', 'pastry', 'sesame', 'vinegar',
#          'mustard', 'cream']

In [54]:
# Function to extract keywords from ingredients_list
def extract_keywords(ingredient_list, keywords):
    # Check if ingredient_list is a string; if not, treat it as empty
    if not isinstance(ingredient_list, str):
        return ''
    
    # Normalize text and check for keywords
    ingredient_list = ingredient_list.lower()
    main_ingredients = [keyword for keyword in keywords if keyword in ingredient_list]
    
    return sorted(set(main_ingredients))


In [55]:
# Apply function to DataFrame
recipes['main_ingredients'] = recipes['ingredients_list'].apply(lambda x: extract_keywords(x, keywords))


In [56]:
# Replace 'err' with 'erries' in one line
recipes['main_ingredients'] = recipes['main_ingredients'].apply(lambda ingredients: [ingredient.replace('err', 'erries') for ingredient in ingredients])

In [57]:
recipes['main_ingredients'].iloc[0]

['bread', 'egg', 'flour', 'lime', 'milk', 'onion']

### Convert Time Columns

In [58]:
time_cols = ['prep_time', 'cook_time', 'total_time']

In [59]:
# Function to convert time string to total minutes
def convert_to_minutes(time_str):
    # Handle NaN or non-string values
    if not isinstance(time_str, str):
        return np.nan 
    
    # Initialize hours and minutes
    hours = 0
    minutes = 0
    
    # Extract hours and minutes using regex
    if 'hr' in time_str:
        hours_match = re.search(r'(\d+)\s*hr', time_str)
        if hours_match:
            hours = int(hours_match.group(1))
    
    if 'min' in time_str:
        minutes_match = re.search(r'(\d+)\s*min', time_str)
        if minutes_match:
            minutes = int(minutes_match.group(1))
    
    # Convert total time to minutes
    total_minutes = hours * 60 + minutes
    return int(total_minutes)

In [60]:
# Apply the function to each time column
for col in time_cols:
    recipes[col] = recipes[col].apply(convert_to_minutes)

In [61]:
# Calculate the additional time column
recipes['additional_time'] = recipes['total_time'] - recipes['cook_time'] - recipes['prep_time']

# Reorder the columns to place 'additional_time' before 'total_time'
columns = recipes.columns.tolist()
columns.insert(columns.index('total_time'), columns.pop(columns.index('additional_time')))
recipes = recipes[columns]

### Estimate Empty Time Columns

In [62]:
# Find all rows with NaN in 'total_time' column
nan_time = recipes[recipes['total_time'].isna()].reset_index(drop = True)

In [63]:
# Display directions for reference
for i in range(5):
    print(nan_time.iloc[i, 9])

 Put chicken pieces in a large pot over medium heat, and add enough water to cover 3 inches over the chicken. Add the salt, celery, pepper, carrots, celery and potatoes. Bring to a boil and let simmer for 40 minutes.
 In a mixing bowl, prepare biscuit mix according to package directions. After simmering soup for 40 minutes, drop rounded tablespoonfuls of the biscuit mixture into the pot and let them cook a bit between additions of dumplings, so that they do not stick together.
 With a wire whisk, mix together about 1 1/2 cups of water with the flour, then add flour mixture to the soup pot until the broth has the consistency of gravy. Dig in and enjoy the meal!

 Whisk together flour, baking soda, salt, cinnamon, and baking powder.
 In a large bowl, beat the eggs. Gradually beat in sugar, then oil. Add flour mixture alternately with zucchini into the egg mixture. Stir in the raisins, walnuts, and vanilla. Pour batter into two 9 x 5 inch greased and lightly floured loaf pans.
 Bake on lo

In [64]:
# Define a function to preprocess text data
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    return text

In [65]:
# Define a function to normalize target 
def normalize_target(targets):
    scaler = MinMaxScaler() 
    normalized_targets = scaler.fit_transform(targets.reshape(-1, 1))
    return normalized_targets, scaler

In [66]:
# Extract a cleaned dataset without NaN values inside total_time
recipes_clean = recipes.dropna(subset=['total_time']).reset_index(drop=True)
recipes_clean['processed_directions'] = recipes_clean['direction_list'].apply(preprocess_text)
normalized_targets, target_scaler = normalize_target(recipes_clean['total_time'].values)

##### Tokenization

In [67]:
# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [68]:
# Tokenize the data with a function
def tokenize_function(texts):
    return tokenizer(
        texts,
        padding = "max_length",
        truncation = True,
        max_length = 500,
        return_tensors = "pt",
    )

In [69]:
# Tokenize the processed text using the transformer tokenizer
encoded_inputs = tokenizer(
    recipes_clean['processed_directions'].tolist(), 
    padding = True, 
    truncation = True, 
    return_tensors = 'pt'
)

In [70]:
train_texts, val_texts, train_targets, val_targets = train_test_split(
    encoded_inputs['input_ids'],
    normalized_targets,
    test_size = 0.2,
    random_state = 123
)

train_texts = train_texts.tolist()
val_texts = val_texts.tolist()

# # Reset the index for train and validation targets
# train_targets = train_targets.reset_index(drop=True)
# val_targets = val_targets.reset_index(drop=True)

##### Prepare Training & Validation Dataset

In [71]:
# Create a PyTorch dataset class
class RecipeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.targets[idx], dtype=torch.float)
        return item

In [72]:
train_df = RecipeDataset(train_encodings, train_targets)
val_df = RecipeDataset(val_encodings, val_targets)

NameError: name 'train_encodings' is not defined

In [None]:
train_loader = DataLoader(train_df, batch_size = 8, shuffle = True)
val_loader = DataLoader(val_df, batch_size = 9)

##### Define RoBERTa Model with Regression

In [None]:
class RobertaForRegression(nn.Module):
    def __init__(self):
        super(RobertaForRegression, self).__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.regressor = nn.Linear(self.roberta.config.hidden_size, 1)  # Output: single value for regression

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # CLS token output
        return self.regressor(pooled_output)

##### Training Process

In [None]:
# Initialize the Model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = RobertaForRegression().to(device)
loss_fn = nn.MSELoss()
optimizer = AdamW(model.parameters(), lr = 1e-5)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for epoch in range(epochs):
    model.train()  
    train_loss = 0  

    with tqdm(train_loader, desc=f"Epoch {epoch + 1}", unit="batch") as pbar:
        for step, batch in enumerate(pbar):
            optimizer.zero_grad()  
            input_ids = batch["input_ids"].to(device) 
            attention_mask = batch["attention_mask"].to(device) 
            targets = batch["labels"].to(device) 

            # Forward pass through the model
            outputs = model(input_ids, attention_mask)  
            loss = loss_fn(outputs.squeeze(), targets)  
            train_loss += loss.item()
            # Backpropagation
            loss.backward()  # Backpropagation
            # Update the model parameters
            optimizer.step()  
            # Adjust the learning rate
            scheduler.step()  

            current_lr = scheduler.get_last_lr()[0] 
            pbar.set_postfix(loss=train_loss / (step + 1), lr=current_lr)  

    avg_train_loss = train_loss / len(train_loader)  
    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}")
    current_lr = scheduler.get_last_lr()[0] 
    print(f"Epoch {epoch + 1}, Final Learning Rate: {current_lr}")

Epoch 1:   4%|▎         | 52/1411 [08:43<3:47:55, 10.06s/batch, loss=3.98e+4, lr=0.000491]


KeyboardInterrupt: 

###

###