In [1]:
import pandas as pd
import ast

In [2]:
# Read the ingredient label table from disk
labels_df = pd.read_csv('../labels/ingredients.csv')

# Build a lookup from each lowercased ingredient name to its category.
# Per row, the '; '-separated synonyms are registered first and the aliased
# ingredient name last, so the alias wins if it collides with a synonym.
ingredient_categories = {}
label_columns = zip(
    labels_df['Ingredient Synonyms'],
    labels_df['Aliased Ingredient Name'],
    labels_df['Category'],
)
for synonym_field, alias_name, category in label_columns:
    ingredient_categories.update(
        (name.lower(), category) for name in synonym_field.split('; ')
    )
    ingredient_categories[alias_name.lower()] = category

# Function to categorize ingredients in a recipe based on NER
def categorize_ingredients(ner_list):
    """Return the sorted categories matched by a recipe's NER ingredients.

    Each ingredient is lowercased and compared against every key of the
    module-level ``ingredient_categories`` mapping; a key matches when it
    occurs as a substring of the lowercased ingredient.

    Args:
        ner_list: Iterable of ingredient name strings for one recipe.

    Returns:
        Alphabetically sorted list of the distinct matched categories.
        Keys equal to 'dish' or 'cereal', and categories whose lowercase
        form is 'dish' or 'cereal', are never reported.
    """
    excluded = ('dish', 'cereal')
    categories = set()
    for ingredient in ner_list:
        normalized_ingredient = ingredient.lower()
        for key, category in ingredient_categories.items():
            # An empty key is a substring of every string, so a blank synonym
            # would otherwise tag every ingredient with its category.
            if not key or key in excluded:
                continue
            if key in normalized_ingredient and category.lower() not in excluded:
                categories.add(category)
    # Set iteration order for strings can differ between interpreter runs;
    # sorting keeps the result reproducible.
    return sorted(categories)

# Process the dataset in chunks of 10,000 rows instead of loading the whole
# recipes file at once.
chunk_size = 10000
output_file = '../recipes_with_categories.csv'
first_chunk = True
rows_processed = 0  # running total of rows actually read so far

for chunk in pd.read_csv('../recipes_data.csv', chunksize=chunk_size):
    # Parse each NER cell (a Python-literal string) into a list
    chunk['NER'] = chunk['NER'].apply(ast.literal_eval)
    chunk['categories'] = chunk['NER'].apply(categorize_ingredients)

    # The first chunk overwrites the output file and writes the header;
    # every later chunk is appended without a header.
    chunk.to_csv(output_file, mode='w' if first_chunk else 'a', header=first_chunk, index=False)
    first_chunk = False

    # Count the rows really present: the last chunk may hold fewer than
    # chunk_size rows, so multiplying the chunk index would over-report.
    rows_processed += len(chunk)
    print(f'Processed {rows_processed:,} rows')

print("Processing complete.")

Processed 10,000 rows
Processed 20,000 rows
Processed 30,000 rows
Processed 40,000 rows
Processed 50,000 rows
Processed 60,000 rows
Processed 70,000 rows
Processed 80,000 rows
Processed 90,000 rows
Processed 100,000 rows
Processed 110,000 rows
Processed 120,000 rows
Processed 130,000 rows
Processed 140,000 rows
Processed 150,000 rows
Processed 160,000 rows
Processed 170,000 rows
Processed 180,000 rows
Processed 190,000 rows
Processed 200,000 rows
Processed 210,000 rows
Processed 220,000 rows
Processed 230,000 rows
Processed 240,000 rows
Processed 250,000 rows
Processed 260,000 rows
Processed 270,000 rows
Processed 280,000 rows
Processed 290,000 rows
Processed 300,000 rows
Processed 310,000 rows
Processed 320,000 rows
Processed 330,000 rows
Processed 340,000 rows
Processed 350,000 rows
Processed 360,000 rows
Processed 370,000 rows
Processed 380,000 rows
Processed 390,000 rows
Processed 400,000 rows
Processed 410,000 rows
Processed 420,000 rows
Processed 430,000 rows
Processed 440,000 ro