# Data cleaning
- adapted from https://www.kaggle.com/code/omkarlone/final-code

1.Verify Image Folder Content:

In [1]:
import os
import pandas as pd
from PIL import Image

# Path to the CSV and image directory
csv_file = 'archive/Food Ingredients and Recipe Dataset with Image Name Mapping.csv'
image_folder = 'archive/Food Images/Food Images'

# Load the CSV file
df = pd.read_csv(csv_file)

# Get the list of image files in the folder
image_files = os.listdir(image_folder)

# Ensure all images are in the same format (e.g., .jpg or .png)
image_formats = set([os.path.splitext(img)[1].lower() for img in image_files])

# Check if all images have the same format
if len(image_formats) == 1:
    print(f"All images are in the same format: {image_formats.pop()}")
else:
    print(f"Different formats found: {image_formats}")

# Check if all image names in the CSV are present in*  /home/chelse the image folder
missing_images = []
for img_name in df['Image_Name']:
    img_file = img_name + '.jpg'  # or '.png' depending on the format
    if img_file not in image_files:
        missing_images.append(img_name)

if missing_images:
    print(f"Missing images: {missing_images}")
else:
    print("All images in the CSV are present in the folder.")


All images are in the same format: .jpg
Missing images: ['#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?', '#NAME?']


Remove rows with invalid image names (#NAME?)

In [2]:
# Remove rows with invalid 'Image_Name' entries (e.g., '#NAME?')
df_cleaned = df[df['Image_Name'] != '#NAME?']

# Check the remaining missing images
missing_images_cleaned = []
for img_name in df_cleaned['Image_Name']:
    img_file = img_name + '.jpg'  # Assuming all images are .jpg
    if img_file not in image_files:
        missing_images_cleaned.append(img_name)

if missing_images_cleaned:
    print(f"Missing images after cleaning: {missing_images_cleaned}")
else:
    print("No missing images after cleaning.")


No missing images after cleaning.


In [3]:
# Remove null titles
print(len(df))
df_cleaned = df[df['Title'].notnull()]
print(len(df_cleaned))


13501
13496


drop the Ingredients column from the CSV file  Contains the ingredients as they were scraped from the website.

In [4]:
# Drop the 'Ingredients' column from the dataframe
df_cleaned = df_cleaned.drop(columns=['Ingredients'])

# Save the updated dataframe back to a CSV file
output_csv = 'archive/cleaned_data.csv'  # Specify the desired output file path
df_cleaned.to_csv(output_csv, index=False)

# Check the result
print("Ingredients column dropped. Here's the updated CSV:")
df_cleaned.head()


Ingredients column dropped. Here's the updated CSV:


Unnamed: 0.1,Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


check for consistent image sizes and resolution:

In [5]:
from PIL import Image
import os

# Path to the image folder
image_folder = 'archive/Food Images/Food Images'
image_folder = os.path.abspath(image_folder)

# List all image files in the folder
image_files = os.listdir(image_folder)

# Function to get image size
def get_image_size(image_path):
    try:
        with Image.open(image_path) as img:
            return img.size  # Returns (width, height)
    except Image.UnidentifiedImageError as e:
        print(f"cannot identify image file: {image_path}")

# Initialize a list to store the sizes of the images
image_sizes = []

broken = []
# Check the size of each image
for img_file in image_files:
    if img_file.endswith('.jpg'):  # Ensures only .jpg images are considered
        img_path = os.path.join(image_folder, img_file)
        size = get_image_size(img_path)
        if not size:
            broken.append(img_file)
        image_sizes.append(size)

# Find unique sizes
unique_sizes = set(image_sizes)

# Check if all images have the same size
if len(unique_sizes) == 1:
    print(f"All images are of the same size: {unique_sizes.pop()}")
else:
    print(f"Different image sizes found: {unique_sizes}")


cannot identify image file: /home/chelsea/Documents/GATech/DL_group_project/archive/Food Images/Food Images/pan-seared-salmon-on-baby-arugula-242445.jpg
Different image sizes found: {None, (274, 169)}


In [6]:
for broke in broken:
    print (f"removed {broke}")
    df_cleaned=df_cleaned[df_cleaned['Image_Name'] != broke ]
df_cleaned.to_csv(output_csv, index=False)

removed pan-seared-salmon-on-baby-arugula-242445.jpg


frequency count of the different image sizes

In [7]:
from PIL import Image
import os
from collections import Counter

# Path to the image folder
# image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'

# List all image files in the folder
image_files = os.listdir(image_folder)

# # Function to get image size
# def get_image_size(image_path):
#     with Image.open(image_path) as img:
#         return img.size  # Returns (width, height)

# Initialize a list to store the sizes of the images
image_sizes = []

# Check the size of each image
for img_file in image_files:
    if img_file.endswith('.jpg'):  # Ensures only .jpg images are considered
        img_path = os.path.join(image_folder, img_file)
        size = get_image_size(img_path)
        image_sizes.append(size)

# Count the frequency of each unique size
size_counts = Counter(image_sizes)

# Print the counts of each size
for size, count in size_counts.items():
    print(f"Size {size}: {count} images")


cannot identify image file: /home/chelsea/Documents/GATech/DL_group_project/archive/Food Images/Food Images/pan-seared-salmon-on-baby-arugula-242445.jpg
Size (274, 169): 13581 images
Size None: 1 images


resize the outlier image to match the standard size (274, 169):

In [8]:
# from PIL import Image
# import os

# # Path to the image folder (input directory)
# # image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'

# # Output directory where resized images will be saved
# output_folder = '/kaggle/working/resized_images'

# # Create the output folder if it doesn't exist
# os.makedirs(output_folder, exist_ok=True)

# # Function to resize an image to a specific size and save as .jpg
# def resize_image(image_path, target_size=(274, 169), output_path=None):
#     with Image.open(image_path) as img:
#         # Convert any non-RGB images (including RGBA) to RGB for saving as .jpg
#         if img.mode != 'RGB':
#             img = img.convert('RGB')
        
#         # Resize the image
#         resized_img = img.resize(target_size)
        
#         # Save the image as JPG
#         resized_img.save(output_path, format='JPEG')  # Save as JPEG

# # List all image files in the folder
# # image_files = os.listdir(image_folder)

# # Resize the single outlier image (with different size)
# for img_file in image_files:
#     if img_file.endswith('.jpg'):
#         img_path = os.path.join(image_folder, img_file)
#         size = get_image_size(img_path)
#         if size == (702, 722):  # The outlier image
#             print(f"Resizing image: {img_file}")
#             output_path = os.path.join(output_folder, img_file)  # Save resized image in the output folder
#             resize_image(img_path, output_path=output_path)  # Resize to (274, 169)
#             print(f"Image {img_file} resized and saved to {output_path}.")


text fields for Title, Instructions, and Cleaned_Ingredients are appropriately handled for language

In [9]:
# # Install langid if it's not already installed
# !pip install langid


In [9]:

import pandas as pd
import langid  # Language detection library

# Load the CSV file
# csv_file = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Ingredients and Recipe Dataset with Image Name Mapping.csv'
df = pd.read_csv(output_csv)

# Function to detect the language of a text field
def detect_language(text):
    try:
        # langid returns a tuple (language, confidence)
        return langid.classify(text)[0]
    except:
        return "unknown"  # If language detection fails, mark as "unknown"

# Apply language detection to Title, Instructions, and Cleaned_Ingredients columns
df['Title_Language'] = df['Title'].apply(detect_language)
df['Instructions_Language'] = df['Instructions'].apply(detect_language)
df['Cleaned_Ingredients_Language'] = df['Cleaned_Ingredients'].apply(detect_language)

# Save the updated DataFrame with language information and Image_Name column
# output_csv_path = '/kaggle/working/cleaned_data.csv'
output_csv_path = output_csv
df.to_csv(output_csv_path, index=False)

print(f"Updated CSV with language information saved to {output_csv_path}")


Updated CSV with language information saved to archive/cleaned_data.csv


In [10]:
# Load the cleaned CSV file to display its contents
updated_df = pd.read_csv(output_csv)

# Display the first few rows of the CSV
updated_df.head()


Unnamed: 0.1,Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients,Title_Language,Instructions_Language,Cleaned_Ingredients_Language
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...",en,en,en
1,1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (...",en,en,en
2,2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ...",en,en,en
3,3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in...",en,en,en
4,4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",en,en,en


check if the Title, Instructions, and Cleaned_Ingredients columns contain any special characters or non-standard symbols

In [11]:
import pandas as pd
import re

# Load the existing DataFrame
output_csv = 'archive/cleaned_data.csv'
updated_df = pd.read_csv(output_csv)

# Function to check for special characters or non-standard symbols
def contains_special_chars(text):
    if pd.isna(text):  # Check if the text is NaN
        return False  # No special chars in NaN, treat as False
    # Regex to match any character that is not a letter, number, or basic punctuation
    pattern = r'[^a-zA-Z0-9\s,\.!?:;\'"()-]'
    return bool(re.search(pattern, text))

# Apply the function to check for special characters in the Title, Instructions, and Cleaned_Ingredients columns
updated_df['Title_Special_Chars'] = updated_df['Title'].apply(contains_special_chars)
updated_df['Instructions_Special_Chars'] = updated_df['Instructions'].apply(contains_special_chars)
updated_df['Cleaned_Ingredients_Special_Chars'] = updated_df['Cleaned_Ingredients'].apply(contains_special_chars)

# Save the updated DataFrame to a new CSV
output_csv_path = 'archive/updated_data_with_special_chars.csv'
updated_df.to_csv(output_csv_path, index=False)

# Show the first few rows of the updated DataFrame
updated_df.head()


Unnamed: 0.1,Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients,Title_Language,Instructions_Language,Cleaned_Ingredients_Language,Title_Special_Chars,Instructions_Special_Chars,Cleaned_Ingredients_Special_Chars
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...",en,en,en,False,True,True
1,1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (...",en,en,en,False,True,True
2,2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ...",en,en,en,False,True,True
3,3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in...",en,en,en,False,True,True
4,4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",en,en,en,False,False,True


Convert Ingredients to List Format

In [19]:
import pandas as pd
import ast

# Load the cleaned DataFrame
output_csv = 'archive/cleaned_data.csv'
updated_df = pd.read_csv(output_csv)

# Function to convert ingredients into a list format
def convert_to_list(ingredient):
    if isinstance(ingredient, str):
        try:
            # Check if the string is a list-like string (e.g., "['1 cup sugar', '2 eggs']")
            return ast.literal_eval(ingredient)  # Safely convert string to list
        except (ValueError, SyntaxError):
            # If not a valid list string, split by common separators like commas
            return [item.strip() for item in ingredient.split(',')]
    return []

# Apply the function to the Cleaned_Ingredients column to convert to list format
updated_df['Cleaned_Ingredients'] = updated_df['Cleaned_Ingredients'].apply(convert_to_list)

# Save the updated DataFrame with all columns except 'Ingredients' in the new format
output_csv_path = 'archive/updated_data_with_list_ingredients.csv'
updated_df.to_csv(output_csv_path, index=False)

# Show the first few rows of the updated DataFrame
updated_df.head()


Unnamed: 0.1,Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients,Title_Language,Instructions_Language,Cleaned_Ingredients_Language
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa...",en,en,en
1,1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo...",en,en,en
2,2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts...",en,en,en
3,3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"[1 (¾- to 1-pound) round Italian loaf, cut int...",en,en,en
4,4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"[1 teaspoon dark brown sugar, 1 teaspoon hot w...",en,en,en


In [12]:
import pandas as pd
import ast

# Load the cleaned DataFrame
output_csv = 'archive/cleaned_data.csv'
updated_df = pd.read_csv(output_csv)

# Function to convert ingredients into a list format
def convert_to_list(ingredient):
    if isinstance(ingredient, str):
        try:
            # Check if the string is a list-like string (e.g., "['1 cup sugar', '2 eggs']")
            return ast.literal_eval(ingredient)  # Safely convert string to list
        except (ValueError, SyntaxError):
            # If not a valid list string, split by common separators like commas
            return [item.strip() for item in ingredient.split(',')]
    return []

# Apply the function to the Cleaned_Ingredients column to convert to list format
updated_df['Cleaned_Ingredients'] = updated_df['Cleaned_Ingredients'].apply(convert_to_list)
updated_df['Instructions'] = updated_df['Instructions'].apply(convert_to_list)

# Save the updated DataFrame with all columns except 'Ingredients' in the new format
output_csv_path = 'archive/updated_data_with_lists.csv'
updated_df.to_csv(output_csv_path, index=False)

# Show the first few rows of the updated DataFrame
updated_df.head()


Unnamed: 0.1,Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"[Pat chicken dry with paper towels, season all...",miso-butter-roast-chicken-acorn-squash-panzanella,"[1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher sa..."
1,1,Crispy Salt and Pepper Potatoes,[Preheat oven to 400°F and line a rimmed bakin...,crispy-salt-and-pepper-potatoes-dan-kluger,"[2 large egg whites, 1 pound new potatoes (abo..."
2,2,Thanksgiving Mac and Cheese,[Place a rack in middle of oven; preheat to 40...,thanksgiving-mac-and-cheese-erick-williams,"[1 cup evaporated milk, 1 cup whole milk, 1 ts..."
3,3,Italian Sausage and Bread Stuffing,[Preheat oven to 350°F with rack in middle. Ge...,italian-sausage-and-bread-stuffing-240559,"[1 (¾- to 1-pound) round Italian loaf, cut int..."
4,4,Newton's Law,[Stir together brown sugar and hot water in a ...,newtons-law-apple-bourbon-cocktail,"[1 teaspoon dark brown sugar, 1 teaspoon hot w..."


Updated resize_image function:

In [13]:
# def resize_image(image_path, output_folder, target_size=(274, 169)):
#     with Image.open(image_path) as img:
#         # Convert the image to 'RGB' if it's in 'P' mode or any other non-RGB mode
#         if img.mode != 'RGB':
#             img = img.convert('RGB')
        
#         resized_img = img.resize(target_size)
#         # Save resized image in the output folder
#         output_path = os.path.join(output_folder, os.path.basename(image_path))
#         resized_img.save(output_path, format='JPEG')  # Explicitly save as JPEG
#         return output_path


In [14]:
# import os
# import pandas as pd

# # Load the CSV into updated_df
# csv_file_path = 'archive/cleaned_data.csv'  # Path to your cleaned CSV file
# updated_df = pd.read_csv(csv_file_path)

# # Set image folder and output folder paths
# # image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'
# output_folder = 'archive/resized_images'

# # Create output folder if it doesn't exist
# os.makedirs(output_folder, exist_ok=True)

# # Function to resize images
# def resize_image(image_path, target_size=(274, 169)):
#     from PIL import Image
#     with Image.open(image_path) as img:
#         # Convert the image to 'RGB' if it's not already in that mode
#         if img.mode != 'RGB':
#             img = img.convert('RGB')
        
#         resized_img = img.resize(target_size)
#         output_path = os.path.join(output_folder, os.path.basename(image_path))
#         resized_img.save(output_path, format='JPEG')  # Save as JPEG
#         return output_path

# # List to hold resized image paths
# resized_image_paths = []

# # Loop through the rows in updated_df to check images
# for _, row in updated_df.iterrows():
#     image_name = row['Image_Name'] + '.jpg'  # Assuming images are in .jpg format
#     image_path = os.path.join(image_folder, image_name)
    
#     if os.path.exists(image_path):
#         # Resize and save the image
#         resized_image_path = resize_image(image_path)
#         resized_image_paths.append(resized_image_path)  # Add to list of resized paths
#     else:
#         # If image is missing, mark as NaN in resized_image_paths
#         resized_image_paths.append(None)

# # Add resized image paths to the DataFrame
# updated_df['Resized_Image_Path'] = resized_image_paths

# # Save the updated DataFrame to a new CSV file
# updated_csv_path = 'archive/updated_data_with_images.csv'
# updated_df.to_csv(updated_csv_path, index=False)

# # Display a snippet of the updated DataFrame
# updated_df.head()


To match the images with the rows in the CSV file based on the Image_Name column and check for coverage (count of matching images)

In [15]:
# import os
# import pandas as pd
# from PIL import Image

# # Load the CSV into updated_df
# csv_file_path =  'archive/cleaned_data.csv' # Path to your cleaned CSV file
# updated_df = pd.read_csv(csv_file_path)

# # Set image folder and output folder paths
# # image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'
# # output_folder = '/kaggle/working/resized_images'

# # Create output folder if it doesn't exist
# os.makedirs(output_folder, exist_ok=True)

# # Function to resize images
# def resize_image(image_path, target_size=(274, 169)):
#     with Image.open(image_path) as img:
#         # Convert the image to 'RGB' if it's not already in that mode
#         if img.mode != 'RGB':
#             img = img.convert('RGB')
        
#         resized_img = img.resize(target_size)
#         output_path = os.path.join(output_folder, os.path.basename(image_path))
#         resized_img.save(output_path, format='JPEG')  # Save as JPEG
#         return output_path

# # List to hold resized image paths and log missing images
# resized_image_paths = []
# missing_images = []

# # Loop through the rows in updated_df to check images
# for _, row in updated_df.iterrows():
#     image_name = row['Image_Name'] + '.jpg'  # Assuming images are in .jpg format
#     image_path = os.path.join(image_folder, image_name)
    
#     # Check if the image exists in the folder
#     if os.path.exists(image_path):
#         # Resize and save the image
#         resized_image_path = resize_image(image_path)
#         resized_image_paths.append(resized_image_path)  # Add to list of resized paths
#     else:
#         # If image is missing, mark as NaN in resized_image_paths and log missing image
#         resized_image_paths.append(None)
#         missing_images.append(image_name)  # Log missing image

# # Add resized image paths to the DataFrame
# updated_df['Resized_Image_Path'] = resized_image_paths

# # Save the updated DataFrame to a new CSV file
# updated_csv_path = 'archive/updated_data_with_images.csv'
# updated_df.to_csv(updated_csv_path, index=False)

# # Print out missing images for troubleshooting
# if missing_images:
#     print(f"Missing images: {missing_images}")
# else:
#     print("All images found and resized successfully.")

# # Display a snippet of the updated DataFrame
# updated_df.head()


In [16]:
import os
import pandas as pd

# Load the CSV into updated_df
csv_file_path = 'archive/cleaned_data.csv'  # Path to your cleaned CSV file
updated_df = pd.read_csv(csv_file_path)

# Set image folder path
# image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'

# Initialize counters
matching_images_count = 0
total_images_in_csv = len(updated_df)

# Loop through the rows in updated_df to check images
for _, row in updated_df.iterrows():
    image_name = row['Image_Name'] + '.jpg'  # Assuming images are in .jpg format
    image_path = os.path.join(image_folder, image_name)
    
    if os.path.exists(image_path):
        matching_images_count += 1

# Calculate coverage
coverage_percentage = (matching_images_count / total_images_in_csv) * 100

# Output the results
print(f"Number of matching images: {matching_images_count}")
print(f"Total number of images in CSV: {total_images_in_csv}")
print(f"Coverage: {coverage_percentage:.2f}%")


Number of matching images: 13471
Total number of images in CSV: 13471
Coverage: 100.00%


remove any rows in the CSV where information about Title, Instructions, Image_Name, or Cleaned_Ingredients is missing, and also delete the corresponding images

In [17]:
import os
import pandas as pd

# Load the CSV into updated_df
csv_file_path = 'archive/cleaned_data.csv'  # Path to your cleaned CSV file
updated_df = pd.read_csv(csv_file_path)

# Set image folder path
# image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'

# Step 1: Remove rows with missing values in Title, Instructions, Image_Name, or Cleaned_Ingredients
updated_df = updated_df.dropna(subset=['Title', 'Instructions', 'Image_Name', 'Cleaned_Ingredients'])

# Step 2: Filter out rows where images don't exist in the image folder
valid_image_rows = []
deleted_images_count = 0

for _, row in updated_df.iterrows():
    image_name = row['Image_Name'] + '.jpg'  # Assuming images are in .jpg format
    image_path = os.path.join(image_folder, image_name)
    
    if os.path.exists(image_path):
        valid_image_rows.append(row)  # Keep the row if image exists
    else:
        deleted_images_count += 1  # Count missing images

# Create a DataFrame with only valid image rows
updated_df = pd.DataFrame(valid_image_rows)

# Step 3: Match remaining images with the CSV
matching_images_count = len(updated_df)
total_images_in_csv = len(updated_df)

# Calculate coverage
coverage_percentage = (matching_images_count / total_images_in_csv) * 100

# Output the results
print(f"Rows after cleaning: {len(updated_df)}")
print(f"Images deleted (missing): {deleted_images_count}")
print(f"Number of matching images: {matching_images_count}")
print(f"Total number of images in CSV: {total_images_in_csv}")
print(f"Coverage: {coverage_percentage:.2f}%")

# Save the cleaned CSV file
cleaned_csv_path = 'archive/cleaned_data_after_removal.csv'
updated_df.to_csv(cleaned_csv_path, index=False)
updated_df.head()



Rows after cleaning: 13463
Images deleted (missing): 0
Number of matching images: 13463
Total number of images in CSV: 13463
Coverage: 100.00%


Unnamed: 0.1,Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients,Title_Language,Instructions_Language,Cleaned_Ingredients_Language
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...",en,en,en
1,1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (...",en,en,en
2,2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ...",en,en,en
3,3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in...",en,en,en
4,4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",en,en,en


## Preprocess the Image Data

In [None]:
import os
import numpy as np
from PIL import Image
import pandas as pd

# Image preprocessing function
def preprocess_image(image_path, target_size=(224, 224)):
    with Image.open(image_path) as img:
        img = img.convert('RGB')  # Convert to RGB if not already
        img = img.resize(target_size)  # Resize image to the target size
        img_array = np.array(img)  # Convert to numpy array
        img_array = img_array / 255.0  # Normalize pixel values to [0, 1]
    return img_array

# Generator function to process images in batches
def image_batch_generator(image_paths, batch_size=32, target_size=(224, 224)):
    batch_images = []
    for image_path in image_paths:
        img = preprocess_image(image_path, target_size)
        batch_images.append(img)
        
        if len(batch_images) == batch_size:
            yield np.array(batch_images)
            batch_images = []  # Reset for next batch
    
    # Yield remaining images if there are any
    if batch_images:
        yield np.array(batch_images)

# Load the CSV
csv_file_path = 'archive/cleaned_data_after_removal.csv'
updated_df = pd.read_csv(csv_file_path)

# Set the image folder path
# image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'

# Prepare image paths list
image_paths = [os.path.join(image_folder, row['Image_Name'] + '.jpg') for _, row in updated_df.iterrows()]

# Example usage: Process the images in batches
batch_size = 32
for image_batch in image_batch_generator(image_paths, batch_size):
    # Here you can feed image_batch to your model for training or inference
    print(f"Processed batch of {len(image_batch)} images.")


Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed 

UnidentifiedImageError: cannot identify image file '/home/chelsea/Documents/GATech/DL_group_project/archive/Food Images/Food Images/pan-seared-salmon-on-baby-arugula-242445.jpg'

## Batch Processing for Model Training

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras import layers, models

# Image preprocessing function
def preprocess_image(image_path, target_size=(224, 224)):
    with Image.open(image_path) as img:
        img = img.convert('RGB')  # Convert to RGB if not already
        img = img.resize(target_size)  # Resize image to the target size
        img_array = np.array(img)  # Convert to numpy array
        img_array = img_array / 255.0  # Normalize pixel values to [0, 1]
    return img_array

# Generator function to process images in batches
def image_batch_generator(image_paths, batch_size=32, target_size=(224, 224)):
    batch_images = []
    for image_path in image_paths:
        img = preprocess_image(image_path, target_size)
        batch_images.append(img)
        
        if len(batch_images) == batch_size:
            yield np.array(batch_images)
            batch_images = []  # Reset for next batch
    
    # Yield remaining images if there are any
    if batch_images:
        yield np.array(batch_images)

# Load the CSV
csv_file_path = 'archive/cleaned_data_after_removal.csv'
updated_df = pd.read_csv(csv_file_path)

# Set the image folder path
# image_folder = '/kaggle/input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images'

# Prepare image paths list
image_paths = [os.path.join(image_folder, row['Image_Name'] + '.jpg') for _, row in updated_df.iterrows()]

# Multi-GPU setup
strategy = tf.distribute.MirroredStrategy()

print(f"Number of devices: {strategy.num_replicas_in_sync}")

# Now, within the strategy scope, you can create and compile your model
with strategy.scope():
    # Build a simple model (You can replace this with your actual model)
    model = models.Sequential([
        layers.InputLayer(input_shape=(224, 224, 3)),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(10, activation='softmax')  # Assuming 10 classes, modify as per your case
    ])
    
    model.compile(optimizer='adam', 
                  loss='sparse_categorical_crossentropy', 
                  metrics=['accuracy'])

# Example usage: Process the images in batches using the batch generator
batch_size = 32
for image_batch in image_batch_generator(image_paths, batch_size):
    # You can now feed image_batch to the model in the distributed context
    # Here, you would typically call model.fit or model.predict
    print(f"Processed batch of {len(image_batch)} images.")

    # This is where you would handle the batch with your model
    # Example: model.fit(image_batch, labels) or model.predict(image_batch)


Number of devices: 2




Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed batch of 32 images.
Processed 