In [None]:
# Import standard libraries
import os               # Provides functions for interacting with the operating system
import time             # Provides time-related functions
import string           # Contains string constants and utilities
import unicodedata      # Unicode character database

# Import scientific computing libraries
import numpy as np      # Provides support for arrays and mathematical functions
import pandas as pd     # Provides data structures and data analysis tools

# Import PyTorch libraries
import torch                            # PyTorch library for tensor computation and deep learning
import torch.nn as nn                   # Neural network module of PyTorch
import torch.nn.functional as F         # Functional interface of PyTorch neural network module
from torch.nn.utils.rnn import pad_sequence     # Function for padding sequences
from torch.utils.data import DataLoader, TensorDataset, random_split  # Utilities for data loading and manipulation

# Import machine learning libraries
from sklearn.model_selection import train_test_split  # Function for splitting data into training and test sets
from nltk.translate.bleu_score import sentence_bleu   # Function for computing BLEU score
 

# Import visualization libraries
import matplotlib.pyplot as plt  # Provides a MATLAB-like plotting framework

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available

In [None]:
# Specify the relative file path to the CSV dataset
file_path = '../Datasets/Food Ingredients and Recipe Dataset with Image Name Mapping.csv'

# Use pandas' read_csv function to load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first 5 rows of the DataFrame using the head() function
df.head()

In [None]:
# Extract the 'Title' column from the DataFrame 'df' and assign it to the variable 'titles'
titles = df['Title']

# Extract the 'Ingredients' column from the DataFrame 'df' and assign it to the variable 'ingredients'
ingredients = df['Ingredients']

# Print the first 5 rows of 'titles'
print(titles.head(), '\n')

# Print the first 5 rows of 'ingredients'
print(ingredients.head(), '\n')

In [None]:
# Remove any rows in 'titles' that contain missing values
titles = titles.dropna()

# Remove any duplicate rows in 'titles'
titles = titles.drop_duplicates()

# Initialize an empty set to store the unique characters in the titles
chars = set()

# Enumerate all the characters in the dataset
for title in titles:
    for char in title:
        # Add each character to the 'chars' set. Since sets only contain unique elements, any duplicate characters will be ignored.
        chars.add(char)

# Sort the characters in alphabetical order
chars = sorted(chars)

# Print the number of unique characters in the dataset
print('Unique characters in the dataset:', len(chars))

# Print the sorted list of unique characters
print(chars, '\n')

# Define a function to filter out unexpected characters from the titles
def filter_unexpected_chars(data):
    # Define the set of expected characters as all ASCII letters, digits, spaces, and some punctuation marks
    expected_chars = set(string.ascii_letters + string.digits + ' ' + '!"#%&\'(),-.:?')
    
    # Initialize an empty list to store the filtered titles
    filtered_data = []
    
    for title in data:
        # For each title, remove any characters that are not in the set of expected characters and are not control or separator characters
        filtered_title = ''.join(char for char in title if unicodedata.category(char)[0] not in ['C', 'Z'] or char in expected_chars)
        
        # Add the filtered title to the list of filtered titles
        filtered_data.append(filtered_title)
    
    # Return the list of filtered titles
    return filtered_data

# Apply the filter function to the titles
filtered_titles = filter_unexpected_chars(titles)

# Print the first 5 filtered titles
for title in filtered_titles[:5]:
    print(title)

In [None]:
# Convert each title in 'filtered_titles' into a list of individual characters
titles_split = [list(title) for title in filtered_titles]

# Print the first 5 titles in 'titles_split'. Each title is a list of characters.
for title in titles_split[:5]:
    print(title)

In [None]:
# Define a class for special tokens. These tokens have special meanings in the context of sequence-to-sequence models.
class SpecialTokens:
    PAD = '<PAD>'  # Padding token. Used to fill in sequences to make them all the same length.
    SOS = '<SOS>'  # Start-of-sequence token. Indicates the start of a sequence.
    EOS = '<EOS>'  # End-of-sequence token. Indicates the end of a sequence.
    UNK = '<UNK>'  # Unknown token. Used to represent characters not in the vocabulary.
    Tokens = [PAD, SOS, EOS, UNK]  # List of all special tokens.

# Extend the vocabulary with the special tokens.
vocab = SpecialTokens.Tokens + list(chars)

# Print the vocabulary, which now includes the special tokens.
print(vocab)

# Add the start-of-sequence and end-of-sequence tokens to each title. 
# Each title is now a list that starts with the SOS token, ends with the EOS token, and has the original characters in between.
titles_split_with_special_tokens = [[SpecialTokens.SOS] + title + [SpecialTokens.EOS] for title in titles_split]

# Print the first 5 titles, which now include the special tokens.
print(titles_split_with_special_tokens[:5])

In [None]:
# Create a dictionary that maps each character in the vocabulary to a unique integer index
char_to_idx = {char: idx for idx, char in enumerate(vocab)}

# Create a dictionary that maps each unique integer index to the corresponding character in the vocabulary
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# Convert each title into a sequence of integer indices. If a character is not in the vocabulary, use the index of the special UNK (unknown) token
titles_ints = [[char_to_idx.get(char, char_to_idx[SpecialTokens.UNK]) for char in title] for title in titles_split_with_special_tokens]

# Pad each sequence of integer indices to the same length, using the index of the special PAD (padding) token for padding. Convert each sequence to a PyTorch tensor of type long
titles_ints_padded = pad_sequence([torch.tensor(title, dtype=torch.long) for title in titles_ints], batch_first=True, padding_value=char_to_idx[SpecialTokens.PAD])

# Convert the list of padded sequences into a 2D PyTorch tensor (optional)
titles_ints_tensor = torch.tensor(titles_ints_padded, dtype=torch.long)

# Print the shape of the titles_ints tensor. This should be (n, m), where n is the number of titles and m is the length of the longest title
print(titles_ints_tensor.shape)

# Print the first 5 sequences of integer indices
for title in titles_ints_tensor[:5]:
    print(title)

In [None]:
# Define a function to convert sequences of integer indices back to text
def text_from_ids(ids, idx_to_char):
    # For each sequence of indices, map each index to its corresponding character using the 'idx_to_char' dictionary
    # Join the characters together to form a string, and add this string to a list
    # Return the list of strings
    return [''.join(idx_to_char[idx.item()] for idx in seq) for seq in ids]

# Use the 'text_from_ids' function to convert the 'titles_ints_tensor' tensor back to text
# The 'idx_to_char' dictionary is used to map each index to its corresponding character
titles_text = text_from_ids(titles_ints_tensor, idx_to_char)

# Print the first 5 titles after conversion
for title in titles_text[:5]:
    print(title, '\n')

In [None]:
# Convert the 'titles_ints_tensor' tensor to a NumPy array and assign it to 'titles_ints_array'
titles_ints_array = titles_ints_tensor.numpy()

# Calculate the sizes of the train, validation, and test sets
# The train set will contain 60% of the data, the validation set will contain 20%, and the test set will contain the remaining 20%
train_size = int(0.6 * len(titles_ints_array))
val_size = int(0.2 * len(titles_ints_array))
test_size = len(titles_ints_array) - train_size - val_size

# Use the 'random_split' function to split 'titles_ints_array' into a train set and a combined validation/test set
train_ids, val_test_ids = random_split(titles_ints_array, [train_size, val_size + test_size])

# Further split the combined validation/test set into separate validation and test sets
val_ids, test_ids = random_split(val_test_ids, [val_size, test_size])

In [None]:
# Convert the 'train_ids', 'val_ids', and 'test_ids' lists to NumPy arrays of 64-bit integers
train_ids_array = np.array(train_ids, dtype=np.int64)
val_ids_array = np.array(val_ids, dtype=np.int64)
test_ids_array = np.array(test_ids, dtype=np.int64)

# Create PyTorch TensorDataset objects from the NumPy arrays
# TensorDataset is a utility that wraps tensors into a dataset
train_ids_ds = TensorDataset(torch.from_numpy(train_ids_array))
val_ids_ds = TensorDataset(torch.from_numpy(val_ids_array))
test_ids_ds = TensorDataset(torch.from_numpy(test_ids_array))

# Print the number of examples in the training, validation, and test sets
print('Number of training examples:', len(train_ids_ds))
print('Number of validation examples:', len(val_ids_ds))
print('Number of test examples:', len(test_ids_ds))

# Loop over the first 5 elements in the 'train_ids_array' array
for ids in train_ids_array[:5]:
    # Convert the scalar integer to a list containing a single integer
    ids_list = [ids]
    # Use the 'text_from_ids' function to convert the list of integer indices back to text
    # The 'idx_to_char' dictionary is used to map each index to its corresponding character
    print('\n', text_from_ids(ids_list, idx_to_char))

In [None]:
# Define a function to split a sequence into input and target sequences
def split_input_target(sequence):
    # The input sequence consists of all but the last character in the sequence
    input_text = sequence[:-1]
    # The target sequence consists of all but the first character in the sequence
    target_text = sequence[1:]
    # Return the input and target sequences
    return input_text, target_text

# Use a list comprehension to apply the 'split_input_target' function to each sequence in the training, validation, and test sets
# This creates a pair of input-target sequences for each sequence
train_ds = [(split_input_target(ids)) for ids, in train_ids_ds]
val_ds = [(split_input_target(ids)) for ids, in val_ids_ds]
test_ds = [(split_input_target(ids)) for ids, in test_ids_ds]

In [None]:
# Define the batch size, which is the number of examples to process in a single pass
BATCH_SIZE = 64

# Define the buffer size, which is the number of examples from which to randomly sample
BUFFER_SIZE = 10000

# Create a DataLoader for the training dataset
# The DataLoader batches the data, shuffles it, and drops the last batch if it's not full
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# Create a DataLoader for the validation dataset
# The DataLoader batches the data, shuffles it, and drops the last batch if it's not full
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# Create a DataLoader for the test dataset
# The DataLoader batches the data, shuffles it, and drops the last batch if it's not full
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# Print the number of batches in the training, validation, and test loaders
print('Number of training batches:', len(train_loader))
print('Number of validation batches:', len(val_loader))
print('Number of test batches:', len(test_loader))

# Print the shape of the first batch of input sequences in the training loader
# The shape should be (batch_size, sequence_length)
for input_seq, target_seq in train_loader:
    print('\nInput sequence shape:', input_seq.shape)
    break

In [None]:
# Define a custom PyTorch module for a recurrent neural network (RNN) model
class RecipeRNN(nn.Module):
    # The constructor takes the vocabulary size, embedding dimension, and number of RNN units as arguments
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        # Call the constructor of the parent class
        super(RecipeRNN, self).__init__()
        # Store the parameters
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.rnn_units = rnn_units
        
        # Define an embedding layer that maps words to vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Define a GRU layer with the specified number of units
        self.gru = nn.GRU(embedding_dim, rnn_units, batch_first=True)
        # Define a dense (fully connected) layer that maps the RNN output to the vocabulary size
        self.dense = nn.Linear(rnn_units, vocab_size)

    # Define the forward pass of the model
    def forward(self, inputs, state=None):
        # Pass the inputs through the embedding layer
        x = self.embedding(inputs)
        # If no initial state is provided, initialize it
        if state is None:
            state = self.init_hidden(x.size(0))
        # Pass the embeddings and the state through the GRU layer
        x, state = self.gru(x, state)
        # Pass the GRU output through the dense layer
        x = self.dense(x)
        # Return the dense layer output and the state
        return x, state

    # Define a method to initialize the hidden state of the GRU layer
    def init_hidden(self, batch_size):
        # The hidden state is a tensor of zeros with a size of (1, batch size, number of RNN units)
        # It's placed on the same device as the model
        return torch.zeros(1, batch_size, self.rnn_units, device=device)  # Assuming single layer GRU

In [None]:
# Define the size of the vocabulary, which is the number of unique characters in the text
# This is used in the StringLookup layer, which converts strings to integer indices
vocab_size = len(vocab)

# Define the dimensionality of the embedding space
# This is the size of the vector space in which words will be embedded
# It defines the size of the output vectors from this layer for each word
embedding_dim = 256

# Define the number of recurrent units in the RNN layer
# This is the dimensionality of the output space of the RNN layer
rnn_units = 512

# Instantiate the RecipeRNN model with the defined vocabulary size, embedding dimension, and number of RNN units
# RecipeRNN is a custom model class that presumably implements a recurrent neural network for generating recipe names
model = RecipeRNN(vocab_size, embedding_dim, rnn_units).to(device)

# Assuming `model` is your RecipeRNN instance and `train_loader` is your DataLoader
for input_example_batch, target_example_batch in train_loader:
    input_example_batch, target_example_batch = input_example_batch.to(device), target_example_batch.to(device)
    example_batch_predictions, _ = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    break  # Only take predictions for the first batch

In [None]:
# Define the loss function for the model
# The loss function is the negative log-likelihood loss with logits
# It is suitable for training a classification problem with multiple classes
loss_fn = nn.CrossEntropyLoss()

# Define the optimizer for the model
# The Adam optimizer is a popular optimizer that adapts the learning rate during training
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Define the number of epochs, which is the number of times the model will iterate over the entire training dataset
EPOCHS = 10

In [None]:
# Initialize lists to store the training and validation losses
train_losses = []
val_losses = []

# Start the training loop
for epoch in range(EPOCHS):
    # Set the model to training mode
    model.train()
    
    # Initialize a variable to accumulate the training loss
    running_loss = 0.0
    
    # Iterate over the batches in the training data loader
    for input_batch, target_batch in train_loader:
        # Move the input and target batches to the GPU
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)

        # Zero the gradients
        optimizer.zero_grad()
        
        # Perform a forward pass through the model
        predictions, _ = model(input_batch)
        
        # Compute the loss
        loss = loss_fn(predictions.transpose(1, 2), target_batch)
        
        # Perform a backward pass to compute gradients
        loss.backward()
        
        # Update the model parameters
        optimizer.step()
        
        # Accumulate the training loss
        running_loss += loss.item()
        
    # Compute the average training loss for the epoch
    train_loss = running_loss / len(train_loader)
    train_losses.append(train_loss)
    
    # Switch the model to evaluation mode for validation
    model.eval()
    
    # Initialize a variable to accumulate the validation loss
    running_loss = 0.0
    
    # Iterate over the batches in the validation data loader
    for input_batch, target_batch in val_loader:
        # Move the input and target batches to the GPU
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)
        
        # Perform a forward pass through the model
        predictions, _ = model(input_batch)
        
        # Compute the loss
        loss = loss_fn(predictions.transpose(1, 2), target_batch)
        
        # Accumulate the validation loss
        running_loss += loss.item()

    # Compute the average validation loss for the epoch
    val_loss = running_loss / len(val_loader)
    val_losses.append(val_loss)

    # Print the training and validation losses for this epoch
    print(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {format(train_loss, '.3f')}, Validation Loss: {format(val_loss, '.3f')}")

In [None]:
# Save the model checkpoint
# torch.save(model.state_dict(), 'RecipeRNN.pth')

In [None]:
# Plotting the training and validation loss curves
plt.plot(range(1, EPOCHS+1), train_losses, label='Training Loss')
plt.plot(range(1, EPOCHS+1), val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.legend()
plt.show()

In [None]:
# Define a class for generating recipe titles
class RecipeTitleGenerator:
    # The constructor takes a model, mappings from indices to characters and vice versa, and a temperature
    def __init__(self, model, idx_to_char, char_to_idx, temperature=1.0):
        # Store the parameters
        self.temperature = temperature
        self.model = model.to(device)  # Move the model to the device
        self.idx_to_char = idx_to_char
        self.char_to_idx = char_to_idx
        self.start_token_idx = char_to_idx['<SOS>']  # Start of sequence token
        self.end_token_idx = char_to_idx['<EOS>']  # End of sequence token
        self.device = device  # Device to use for computation (CPU or GPU)
        # Move end_token_idx to the device
        self.end_token_idx = torch.tensor(self.end_token_idx, dtype=torch.long).to(device)

    # Define a method to generate one step of the sequence
    def generate_one_step(self, input_ids, state=None):
        # Ensure that gradients are not computed
        with torch.no_grad():
            # Move the input IDs to the device
            input_ids = input_ids.to(self.device)
            # Perform a forward pass through the model
            logits, state = self.model(input_ids, state)
            # Scale the logits by the temperature and apply a softmax to get probabilities
            logits = logits[:, -1, :] / self.temperature
            logits = F.log_softmax(logits, dim=-1)
            # Sample from the probabilities to get the predicted ID
            predicted_id = torch.multinomial(logits.exp(), num_samples=1)
        # Return the predicted ID and the state
        return predicted_id, state

    # Define a method to generate a sequence
    def generate(self, prompt=None, max_tokens=200, prepend_start_token=True, repeats=1):
        # Initialize the next ID and the state
        next_id = None
        state = None

        # If the start token should be prepended, generate one step with the start token as input
        if prepend_start_token:
            next_id = torch.tensor([[self.start_token_idx]], dtype=torch.long, device=self.device)
            next_id, state = self.generate_one_step(next_id, state)

        # If a prompt is provided, convert it to IDs and use it as the next ID
        if prompt is not None:
            prompt_ids = [self.char_to_idx[char] for char in prompt]
            prompt_tensor = torch.tensor(prompt_ids, dtype=torch.long, device=self.device).unsqueeze(0).repeat(repeats, 1)
            next_id = prompt_tensor

        # Initialize a list to store the result IDs
        result_ids = [next_id]

        # Initialize a tensor to track which sequences are done
        done = torch.zeros_like(next_id, dtype=torch.bool, device=self.device)
        # Generate up to max_tokens steps
        for _ in range(max_tokens):
            # Generate one step
            next_id, state = self.generate_one_step(next_id, state)
            # Add the predicted ID to the result IDs
            result_ids.append(next_id)
            # Update the done tensor
            done |= (next_id == self.end_token_idx.to(self.device))  # Move self.end_token_idx to the same device as next_id
            # If all sequences are done, break the loop
            if done.all():
                break

        # Concatenate the result IDs into a tensor
        result_ids = torch.cat(result_ids, dim=-1)
        # Convert the result IDs to strings
        result = ["".join([self.idx_to_char[idx.item()] for idx in seq]) for seq in result_ids]
        # Return the result
        return result

In [None]:
generated_titles = RecipeTitleGenerator(model, idx_to_char, char_to_idx, temperature=0.9).generate(prompt=None, max_tokens=100, prepend_start_token=True, repeats=10)
for title in generated_titles:
    print(title)

In [None]:
start_time = time.time()

# Define the prompt
prompt = 'Polenta with'

# Convert the prompt to a tensor
prompt_tensor = torch.tensor([char_to_idx[char] for char in prompt], dtype=torch.long).unsqueeze(0)

# Create an instance of RecipeTitleGenerator
recipe_title_generator = RecipeTitleGenerator(model, idx_to_char, char_to_idx,)

# Generate recipe titles
result = recipe_title_generator.generate(prompt=prompt, repeats=1)

# End the timer
end_time = time.time()

# Print the generated results and runtime
print(result, '\n\n' + '_' * 80)
print('\nRun time:', end_time - start_time)

In [750]:
# Define the reference titles from the training dataset
reference_titles = []

# Loop over the training dataset
for input_seq, target_seq in test_ds:
    # Convert the target sequence to text
    target_text = ''.join([idx_to_char[idx.item()] for idx in target_seq if idx != char_to_idx['<PAD>']])
    # Append the target text to the reference titles list
    reference_titles.append(target_text)

# Generate recipe titles
result = recipe_title_generator.generate(prompt=None, repeats=1)

# Compute BLEU score for each generated title
bleu_scores = []
for generated_title in result:
    bleu_score = sentence_bleu(reference_titles, generated_title)
    bleu_scores.append(bleu_score)

# Calculate the average BLEU score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)

# Print the generated results, BLEU scores, and runtime
print("Reference titles:", reference_titles)
print("Generated titles:", result)
print("BLEU scores:", bleu_scores)
print("Average BLEU score:", average_bleu_score)
print("Runtime:", end_time - start_time)

Reference titles: ['Sforzando (Whiskey and Mezcal Cocktail)<EOS>', 'Cashew Nut Nog<EOS>', 'Popped Amaranth and Toasted Wheat Berry Fool<EOS>', 'Walnut-Thyme Honey<EOS>', 'Pork Volcánes al Pastor<EOS>', 'Beef Stock<EOS>', 'Fruit Juice "Gummies"<EOS>', 'Strawberry Coconut Cake<EOS>', 'Cassava Bacon<EOS>', 'Polenta and Sausage Stuffing<EOS>', 'Spiced Pumpkin Cheesecake with Caramel-Bourbon Sauce<EOS>', 'Campanelle with Tomatoes and Feta<EOS>', 'Grilled Poussins with Lemon Herb Butter<EOS>', 'Chocolate Malted Ice Cream<EOS>', 'Cornbread Muffins With Whipped Sweet Corn<EOS>', 'Shrimp Bisque<EOS>', 'Crispy Apple-Oat Fritters<EOS>', 'Chicken Pizza<EOS>', 'Frozen Yogurt Bark<EOS>', 'Christmas Coconut Cake<EOS>', 'Lamb Chops with Everything-Bagel Yogurt and Chickpeas<EOS>', 'Fennel- and Dill-Rubbed Grilled Salmon<EOS>', 'Fresh Coconut Layer Cake<EOS>', 'Pastrami and Potato Hash with Fried Eggs<EOS>', 'Pork Banh Mi with Quick-Pickled Radishes and Carrots<EOS>', 'Dry-Rubbed Flank Steak with Grill