In [98]:
import pandas as pd

df_raw = pd.read_csv('recipes.csv')

print(df_raw.columns)  # Print all column names
print(df_raw.head(2).to_dict())  # Convert the first two rows to a dictionary for easier readability



Index(['RecipeId', 'Name', 'AuthorId', 'AuthorName', 'CookTime', 'PrepTime',
       'TotalTime', 'DatePublished', 'Description', 'Images', 'RecipeCategory',
       'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts',
       'AggregatedRating', 'ReviewCount', 'Calories', 'FatContent',
       'SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
       'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
       'RecipeServings', 'RecipeYield', 'RecipeInstructions'],
      dtype='object')
{'RecipeId': {0: 38, 1: 39}, 'Name': {0: 'Low-Fat Berry Blue Frozen Dessert', 1: 'Biryani'}, 'AuthorId': {0: 1533, 1: 1567}, 'AuthorName': {0: 'Dancer', 1: 'elly9812'}, 'CookTime': {0: 'PT24H', 1: 'PT25M'}, 'PrepTime': {0: 'PT45M', 1: 'PT4H'}, 'TotalTime': {0: 'PT24H45M', 1: 'PT4H25M'}, 'DatePublished': {0: '1999-08-09T21:46:00Z', 1: '1999-08-29T13:12:00Z'}, 'Description': {0: 'Make and share this Low-Fat Berry Blue Frozen Dessert recipe from Food.com.', 1: 'Make

In [167]:
# Pick the columns to keep and rename them
columns_to_keep = {
    'Name': 'title',
    'RecipeCategory': 'category',
    'Keywords': 'keywords',
    'RecipeServings': 'servings',
    'RecipeIngredientParts': 'ingredients',
    'RecipeIngredientQuantities': 'ingredient_quantities',
    'RecipeInstructions': 'steps',
    'AggregatedRating': 'rating',
    'ReviewCount': 'review_count',
#    'Description': 'description',
#    'CookTime': 'cook_time',
#    'PrepTime': 'prep_time',
#    'TotalTime': 'total_time',
#    'RecipeId': 'recipe_id',
#    'AuthorId': 'author_id',
#    'AuthorName': 'author_name',
#    'DatePublished': 'date_published',
#    'Images': 'images',
#    'Calories': 'calories',
#    'FatContent': 'fat_content',
#    'SaturatedFatContent': 'saturated_fat_content',
#    'CholesterolContent': 'cholesterol_content',
#    'SodiumContent': 'sodium_content',
#    'CarbohydrateContent': 'carbohydrate_content',
#    'FiberContent': 'fiber_content',
#    'SugarContent': 'sugar_content',
#    'ProteinContent': 'protein_content',
#    'RecipeYield': 'recipe_yield'
}

# Select and rename the columns
df = df_raw[list(columns_to_keep.keys())].rename(columns=columns_to_keep)

df.info()
#df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   title                  522517 non-null  object 
 1   category               521766 non-null  object 
 2   keywords               505280 non-null  object 
 3   servings               339606 non-null  float64
 4   ingredients            522517 non-null  object 
 5   ingredient_quantities  522514 non-null  object 
 6   steps                  522517 non-null  object 
 7   rating                 269294 non-null  float64
 8   review_count           275028 non-null  float64
dtypes: float64(3), object(6)
memory usage: 35.9+ MB


In [168]:
def calculate_missing_percentage(df):
    # Calculate the percentage of missing values for each column and print them
    missing_percentage = df.isna().mean() * 100
    missing_info = missing_percentage.sort_values(ascending=False).reset_index()
    missing_info.columns = ['Column', 'Percentage of Missing Values']
    print(missing_info)

calculate_missing_percentage(df)


                  Column  Percentage of Missing Values
0                 rating                     48.462155
1           review_count                     47.364775
2               servings                     35.005751
3               keywords                      3.298840
4               category                      0.143727
5  ingredient_quantities                      0.000574
6                  title                      0.000000
7            ingredients                      0.000000
8                  steps                      0.000000


In [169]:
# Normalize text columns
df['title'] = df['title'].str.strip().str.lower()
df['category'] = df['category'].str.strip().str.lower()
df['keywords'] = df['keywords'].str.strip().str.lower()
df['ingredients'] = df['ingredients'].str.strip().str.lower()
df['ingredient_quantities'] = df['ingredient_quantities'].str.strip().str.lower()
df['steps'] = df['steps'].str.strip().str.lower()

# Ensure 'servings', 'rating', 'review_count' are numeric
df['servings'] = pd.to_numeric(df['servings'], errors='coerce')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['review_count'] = pd.to_numeric(df['review_count'], errors='coerce')

df = df.dropna(subset=['title', 'category', 'keywords', 'servings', 'ingredients', 'ingredient_quantities', 'steps'])
df.info()
calculate_missing_percentage(df)

<class 'pandas.core.frame.DataFrame'>
Index: 327969 entries, 0 to 522514
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   title                  327969 non-null  object 
 1   category               327969 non-null  object 
 2   keywords               327969 non-null  object 
 3   servings               327969 non-null  float64
 4   ingredients            327969 non-null  object 
 5   ingredient_quantities  327969 non-null  object 
 6   steps                  327969 non-null  object 
 7   rating                 168216 non-null  float64
 8   review_count           171601 non-null  float64
dtypes: float64(3), object(6)
memory usage: 25.0+ MB
                  Column  Percentage of Missing Values
0                 rating                     48.709787
1           review_count                     47.677677
2                  title                      0.000000
3               category               

In [170]:
import ast

# Format the list to a python list - return None if list is empty
def convert_to_list(item_str):
    try: # Remove 'c(' and ')' and convert the string into a list using ast.literal_eval for safety
        return ast.literal_eval(item_str.replace('c(', '[').replace(')', ']').replace('NA', '"to taste"'))
    # converst the cvc format to list -> replacing NA with "placeholder" (in to taste)
    except (ValueError, SyntaxError):
        # If there's an error (invalid format), return an empty list or handle as needed
        return None
# Apply this function directly to the relevant columns
df['keywords'] = df['keywords'].apply(convert_to_list)
df['ingredients'] = df['ingredients'].apply(convert_to_list)
df['ingredient_quantities'] = df['ingredient_quantities'].apply(convert_to_list)
df['steps'] = df['steps'].apply(convert_to_list)

In [171]:
df = df.dropna(subset=['keywords','ingredients', 'ingredient_quantities', 'steps'])
calculate_missing_percentage(df)
df

                  Column  Percentage of Missing Values
0                 rating                     49.044200
1           review_count                     47.977734
2                  title                      0.000000
3               category                      0.000000
4               keywords                      0.000000
5               servings                      0.000000
6            ingredients                      0.000000
7  ingredient_quantities                      0.000000
8                  steps                      0.000000


Unnamed: 0,title,category,keywords,servings,ingredients,ingredient_quantities,steps,rating,review_count
0,low-fat berry blue frozen dessert,frozen desserts,"[dessert, low protein, low cholesterol, health...",4.0,"[blueberries, granulated sugar, vanilla yogurt...","[4, 1/4, 1, 1]","[toss 2 cups berries with sugar., let stand fo...",4.5,4.0
3,carina's tofu-vegetable kebabs,soy/tofu,"[beans, vegetable, low cholesterol, weeknight,...",2.0,"[extra firm tofu, eggplant, zucchini, mushroom...","[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1/2, 1/...","[drain the tofu, carefully squeezing out exces...",4.5,2.0
4,cabbage soup,vegetable,"[low protein, vegan, low cholesterol, healthy,...",4.0,"[plain tomato juice, cabbage, onion, carrots, ...","[46, 4, 1, 2, 1]","[mix everything together and bring to a boil.,...",4.5,11.0
7,buttermilk pie with gingersnap crumb crust,pie,"[dessert, healthy, weeknight, oven, < 4 hours]",8.0,"[sugar, margarine, egg, flour, salt, buttermil...","[3/4, 1, 1, 2, 3, 1/4, 1, 1/2, 1/2, 2]","[preheat oven to 350°f., make pie crust, using...",4.0,3.0
10,boston cream pie,pie,"[dessert, weeknight, oven, < 4 hours]",8.0,"[margarine, cake flour, baking powder, salt, s...","[1/2, 2 1/4, 3, 1, 1 1/2, 1/3, 1 1/2, 2, 1, 1/...","[beat egg whites until soft peaks form., gradu...",2.0,2.0
...,...,...,...,...,...,...,...,...,...
522509,spanish coffee with tia maria,beverages,"[< 15 mins, easy, from scratch]",1.0,"[lemon wedge, granulated sugar, cognac, brandy...","[1, 1, 1, 1 1/2, 6, 3, 1, 1]",[cut a small slit in the lemon wedge and slide...,,
522510,slow-cooker classic coffee cake,breads,< 4 hours,12.0,"[all-purpose flour, brown sugar, butter, groun...","[1, 1/2, 4, 2, 1/8, 1, 1, 1/2, 4, 1/2, 2 -3, 1/4]",[line bottom and sides of 5-quart oval slow co...,,
522512,meg's fresh ginger gingerbread,dessert,< 4 hours,8.0,"[fresh ginger, unsalted butter, dark brown sug...","[3, 1/2, 1/2, 1/4, 1/4, 1, 1/4, 1 1/2, 1, 1/2,...",[preheat oven to 350&deg;f grease an 8x8 cake ...,,
522513,roast prime rib au poivre with mixed peppercorns,very low carbs,"[high protein, high in..., < 4 hours]",8.0,"[dijon mustard, garlic, peppercorns, shallot, ...","[9, 2, 4, 2, 1/3, 3 1/2, 1/3]",[position rack in center of oven and preheat t...,,


In [172]:
# Flatten the ingredients lists
all_ingredients = []
for ingredients_list in df['ingredients']:
    # Directly flatten and append the ingredients
    all_ingredients.extend(ingredients_list)

# all_ingredients_normalized = [ingredient.lower().strip() for ingredient in all_ingredients]

# Get unique ingredients
unique_ingredients = sorted(set(all_ingredients_normalized))
unique_ingredients_Notnormal = sorted(set(all_ingredients))


print (len(unique_ingredients_Notnormal))
print (len(unique_ingredients))
# Output the unique ingredients
for ingredient in unique_ingredients:
    print(ingredient)

# Convert the list to a pandas DataFrame and save it to a CSV file

unique_ingredients_df = pd.DataFrame(unique_ingredients, columns=['Ingredients'])
#unique_ingredients_df.to_csv('unique_ingredients_list.csv', index=False)



5850
5850

%
&
'
,
-
1
1% fat buttermilk
1% fat cottage cheese
1% low-fat chocolate milk
1% low-fat milk
1-1/2 ingredient fiber crust
10-inch corn tortillas
10-inch flour tortilla
10-inch flour tortillas
10-inch whole wheat  tortillas
10-minute success rice
100 proof vodka
12-inch flour tortilla
12-inch flour tortillas
2
2 texans craving salsa far from home
2% buttermilk
2% cheddar cheese
2% evaporated milk
2% fat cottage cheese
2% low-fat chocolate milk
2% low-fat milk
2% milk
2% mozzarella cheese
20% sour cream
2bleu's 2 minute 2 easy pizza sauce
2bleu's sweet mustard sauce for pretzels and more!
3
3 legume butter
5
5% fat ricotta cheese
6
6-inch corn tortillas
6-inch flour tortillas
6-inch tortillas
7-inch corn tortillas
7-inch flour tortillas
8-inch 97% fat free flour tortillas
8-inch fat-free flour tortillas
8-inch flour tortillas
8-inch low-carb whole wheat tortilla
8-inch ready-made graham cracker crust
9-inch flour tortillas
9-inch graham cracker crust
9-inch graham cracker cru

In [173]:
def assign_unit(ingredient):
    ingredient = ingredient.lower()
    # Liquid ingredients
    if any(word in ingredient for word in ['water', 'milk', 'oil', 'syrup', 'juice', 'broth', 'extract', 'vinegar', 'wine']):
        return 'cups'
    # Granular dry ingredients
    elif any(word in ingredient for word in ['flour', 'sugar', 'salt', 'spice', 'powder', 'baking', 'cornmeal', 'cinnamon', 'nutmeg']):
        return 'tablespoons'
    # Solid fats
    elif any(word in ingredient for word in ['butter', 'cream cheese', 'shortening', 'cheese']):
        return 'ounces'
    # Large solid food items
    elif any(word in ingredient for word in ['vegetable', 'fruit', 'meat', 'chicken', 'fish', 'beef', 'pork', 'tofu', 'turkey']):
        return 'pounds'
    # Eggs
    elif any(word in ingredient for word in ['eggs', 'egg']):
        return 'units'
    # Fresh herbs and leafy greens
    elif any(word in ingredient for word in ['herbs', 'leaves', 'leaf', 'cilantro', 'parsley', 'mint', 'basil', 'spinach']):
        return 'teaspoons'
    # Common vegetables
    elif any(word in ingredient for word in ['onion', 'garlic', 'potato', 'carrot', 'tomato', 'pepper', 'celery']):
        return 'units'
    # Nuts and seeds
    elif any(word in ingredient for word in ['nuts', 'almonds', 'pecans', 'walnuts', 'cashews', 'peanuts', 'seeds']):
        return 'cups'
    # Yeast and thickening agents
    elif any(word in ingredient for word in ['yeast', 'gelatin', 'cornstarch', 'starch', 'pectin']):
        return 'teaspoons'
    # Zest and peels
    elif any(word in ingredient for word in ['zest', 'rind', 'peel']):
        return 'teaspoons'
    # Chocolate and cocoa
    elif any(word in ingredient for word in ['chocolate', 'cocoa', 'chips']):
        return 'cups'
    # Beverages
    elif any(word in ingredient for word in ['coffee', 'tea', 'espresso']):
        return 'teaspoons'
    # Canned or packaged items
    elif any(word in ingredient for word in ['canned', 'can', 'jar', 'package', 'box']):
        return 'units'
    # Miscellaneous common items
    elif any(word in ingredient for word in ['breadcrumbs', 'crackers', 'pasta', 'rice', 'quinoa']):
        return 'cups'
    # Dairy and milk products
    elif any(word in ingredient for word in ['yogurt', 'sour cream', 'cream', 'whipping cream']):
        return 'cups'
    else:
        return 'unspecified'

# Example usage


In [177]:
# Apply the classifier to the unique ingredients list
unique_ingredients_df['unit'] = unique_ingredients_df['Ingredients'].apply(assign_unit)

# Count the number of "unspecified" units
unspecified_count = unique_ingredients_df['unit'].value_counts().get('unspecified', 0)
unspecified_count
unique_ingredients_df

Unnamed: 0,Ingredients,unit
0,,unspecified
1,%,unspecified
2,&,unspecified
3,',unspecified
4,",",unspecified
...,...,...
5845,zinfandel,unspecified
5846,ziti pasta,cups
5847,zucchini,unspecified
5848,zucchini with italian-style tomato sauce,units


In [None]:
# Assuming your DataFrame is named df and has a 'RecipeCategory' column
unique_categories = df['RecipeCategory'].dropna().unique()

# To convert this to a sorted list
unique_categories = sorted(unique_categories)

# Display the unique categories
print(unique_categories)

['< 15 Mins', '< 30 Mins', '< 4 Hours', '< 60 Mins', 'African', 'Apple', 'Apple Pie', 'Artichoke', 'Asian', 'Australian', 'Austrian', 'Avocado', 'Baking', 'Bar Cookie', 'Bass', 'Bath/Beauty', 'Bean Soup', 'Beans', 'Bear', 'Beef Liver', 'Beef Organ Meats', 'Beginner Cook', 'Belgian', 'Berries', 'Beverages', 'Birthday', 'Black Bean Soup', 'Black Beans', 'Brazilian', 'Bread Machine', 'Bread Pudding', 'Breads', 'Breakfast', 'Breakfast Casseroles', 'Breakfast Eggs', 'Broccoli Soup', 'Broil/Grill', 'Brown Rice', 'Brunch', 'Buttermilk Biscuits', 'Cajun', 'Cambodian', 'Camping', 'Canadian', 'Candy', 'Canning', 'Cantonese', 'Caribbean', 'Catfish', 'Cauliflower', 'Chard', 'Cheese', 'Cheesecake', 'Cherries', 'Chicken', 'Chicken Breast', 'Chicken Crock Pot', 'Chicken Livers', 'Chicken Thigh & Leg', 'Chilean', 'Chinese', 'Chocolate Chip Cookies', 'Chowders', 'Christmas', 'Chutneys', 'Citrus', 'Clear Soup', 'Coconut', 'Coconut Cream Pie', 'Collard Greens', 'Colombian', 'Corn', 'Costa Rican', 'Crab',

In [None]:
import pandas as pd
import pickle
import re
from collections import Counter

# Load datasets
#raw_recipes = pd.read_csv('/content/drive/MyDrive/id2223 Project/data/RAW_recipes.csv')
#raw_interactions = pd.read_csv('/content/drive/MyDrive/id2223 Project/data/RAW_interactions.csv')
#pp_recipes = pd.read_csv('/content/drive/MyDrive/id2223 Project/data/PP_recipes.csv')
#pp_users = pd.read_csv('/content/drive/MyDrive/id2223 Project/data/PP_users.csv')

# Load ingredient mapping
with open('/content/drive/MyDrive/id2223 Project/data/ingr_map.pkl', 'rb') as f:
    ingr_map = pickle.load(f)

# Helper function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# 1. Clean and standardize recipes
def clean_recipes(df, ingr_map):
    # Handle missing values
    df = df.dropna(subset=['name', 'ingredients', 'steps'])  # Drop rows missing critical data
    df.fillna('', inplace=True)  # Fill other NaNs with empty strings

    # Normalize text fields
    df['name'] = df['name'].apply(clean_text)
    df['steps'] = df['steps'].apply(clean_text)
    df['ingredients'] = df['ingredients'].apply(clean_text)

    # Standardize ingredient names
    df['ingredients'] = df['ingredients'].apply(
        lambda x: ', '.join([ingr_map.get(ing.strip(), ing.strip()) for ing in x.split(',')])
    )

    # Remove duplicates
    df = df.drop_duplicates(subset=['name', 'ingredients'])

    return df

cleaned_recipes = clean_recipes(raw_recipes, ingr_map)

# 2. Process interactions for feedback
def clean_interactions(df, recipe_ids):
    df = df[df['recipe_id'].isin(recipe_ids)]  # Filter valid recipe interactions
    df = df.dropna(subset=['user_id', 'recipe_id'])  # Drop rows missing critical IDs

    # Aggregate duplicate interactions (e.g., average ratings)
    df = df.groupby(['user_id', 'recipe_id']).agg({'rating': 'mean'}).reset_index()
    return df

cleaned_interactions = clean_interactions(raw_interactions, cleaned_recipes['id'].unique())

# 3. Validate preprocessed recipes and users
def validate_users_and_recipes(pp_users, pp_recipes, recipe_ids):
    pp_recipes = pp_recipes[pp_recipes['id'].isin(recipe_ids)]  # Ensure valid recipes
    pp_users.fillna('unknown', inplace=True)  # Fill missing dietary preferences
    pp_recipes = pp_recipes.drop_duplicates()
    return pp_users, pp_recipes

cleaned_users, cleaned_pp_recipes = validate_users_and_recipes(pp_users, pp_recipes, cleaned_recipes['id'].unique())

# 4. Generate training data input-output pairs
def create_training_data(recipes_df):
    training_data = []
    for _, row in recipes_df.iterrows():
        input_text = f"Create a recipe for a {row.get('cuisine', 'general')} dish for {row.get('servings', '4')} servings."
        output_text = f"Recipe name: {row['name']}\nIngredients: {row['ingredients']}\nInstructions: {row['steps']}"
        training_data.append({"input": input_text, "output": output_text})
    return pd.DataFrame(training_data)

training_pairs = create_training_data(cleaned_recipes)

# Save training data
training_pairs.to_csv('/content/drive/MyDrive/id2223 Project/data/training_pairs.csv', index=False)

# 5. Generate shopping lists for multiple recipes
def generate_shopping_list(recipes_df):
    shopping_list = Counter()
    for ingredients in recipes_df['ingredients']:
        for item in ingredients.split(','):
            # Extract ingredient name and quantity
            name, *quantity = item.strip().rsplit(' ', 1)
            if quantity:
                try:
                    shopping_list[name] += float(quantity[0])  # Add quantities
                except ValueError:
                    shopping_list[name] += 1  # Add item if quantity parsing fails
            else:
                shopping_list[name] += 1
    return shopping_list

# Example: Generate a shopping list for the cleaned recipes
shopping_list = generate_shopping_list(cleaned_recipes)
print("Shopping List:", shopping_list)

# Save shopping list
shopping_list_df = pd.DataFrame(shopping_list.items(), columns=['Ingredient', 'Quantity'])
shopping_list_df.to_csv('/content/drive/MyDrive/id2223 Project/data/shopping_list.csv', index=False)

# 6. Split data for training and testing
def split_data(training_pairs):
    train = training_pairs.sample(frac=0.8, random_state=42)
    test = training_pairs.drop(train.index)
    return train, test

train_data, test_data = split_data(training_pairs)

# Save train and test data
train_data.to_csv('/content/drive/MyDrive/id2223 Project/data/train_data.csv', index=False)
test_data.to_csv('/content/drive/MyDrive/id2223 Project/data/test_data.csv', index=False)

print("Data processing and training preparation complete.")

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/id2223 Project/data/ingr_map.pkl'