In [1]:
import pandas as pd
import numpy as np

In [2]:
path = "../data/raw/recipes.csv"
data = pd.read_csv(path)

In [3]:
df = data.copy()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   RecipeId                    522517 non-null  int64  
 1   Name                        522517 non-null  object 
 2   AuthorId                    522517 non-null  int64  
 3   AuthorName                  522517 non-null  object 
 4   CookTime                    439972 non-null  object 
 5   PrepTime                    522517 non-null  object 
 6   TotalTime                   522517 non-null  object 
 7   DatePublished               522517 non-null  object 
 8   Description                 522512 non-null  object 
 9   Images                      522516 non-null  object 
 10  RecipeCategory              521766 non-null  object 
 11  Keywords                    505280 non-null  object 
 12  RecipeIngredientQuantities  522514 non-null  object 
 13  RecipeIngredie

In [5]:
# sütun isimleri PascalCase formatında.
# bunu snake_case formatına çevirelim.
def to_snake_case(s: str) -> str:
    import re

    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", s)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
df.columns = [to_snake_case(col) for col in df.columns]

In [6]:
df.columns

Index(['recipe_id', 'name', 'author_id', 'author_name', 'cook_time',
       'prep_time', 'total_time', 'date_published', 'description', 'images',
       'recipe_category', 'keywords', 'recipe_ingredient_quantities',
       'recipe_ingredient_parts', 'aggregated_rating', 'review_count',
       'calories', 'fat_content', 'saturated_fat_content',
       'cholesterol_content', 'sodium_content', 'carbohydrate_content',
       'fiber_content', 'sugar_content', 'protein_content', 'recipe_servings',
       'recipe_yield', 'recipe_instructions'],
      dtype='object')

In [7]:
# silinecek sütunlar:
# author_id, author_name, cook_time, prep_time, date_published, images,
# review_count, saturated_fat_content, cholesterol_content, sodium_content,
# fiber_content, recipe_servings, recipe_yield

cols_to_drop = [
    "author_id",
    "author_name",
    "cook_time",
    "prep_time",
    "date_published",
    "images",
    "review_count",
    "saturated_fat_content",
    "cholesterol_content",
    "sodium_content",
    "fiber_content",
    "recipe_servings",
    "recipe_yield",
]
df = df.drop(columns=cols_to_drop)

In [8]:
# bütün eksik verileri satır bazında silelim
df = df.dropna()

In [9]:
df.reset_index(drop=True, inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262954 entries, 0 to 262953
Data columns (total 15 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   recipe_id                     262954 non-null  int64  
 1   name                          262954 non-null  object 
 2   total_time                    262954 non-null  object 
 3   description                   262954 non-null  object 
 4   recipe_category               262954 non-null  object 
 5   keywords                      262954 non-null  object 
 6   recipe_ingredient_quantities  262954 non-null  object 
 7   recipe_ingredient_parts       262954 non-null  object 
 8   aggregated_rating             262954 non-null  float64
 9   calories                      262954 non-null  float64
 10  fat_content                   262954 non-null  float64
 11  carbohydrate_content          262954 non-null  float64
 12  sugar_content                 262954 non-nul

In [11]:
df.head()

Unnamed: 0,recipe_id,name,total_time,description,recipe_category,keywords,recipe_ingredient_quantities,recipe_ingredient_parts,aggregated_rating,calories,fat_content,carbohydrate_content,sugar_content,protein_content,recipe_instructions
0,38,Low-Fat Berry Blue Frozen Dessert,PT24H45M,Make and share this Low-Fat Berry Blue Frozen ...,Frozen Desserts,"c(""Dessert"", ""Low Protein"", ""Low Cholesterol"",...","c(""4"", ""1/4"", ""1"", ""1"")","c(""blueberries"", ""granulated sugar"", ""vanilla ...",4.5,170.9,2.5,37.1,30.2,3.2,"c(""Toss 2 cups berries with sugar."", ""Let stan..."
1,39,Biryani,PT4H25M,Make and share this Biryani recipe from Food.com.,Chicken Breast,"c(""Chicken Thigh & Leg"", ""Chicken"", ""Poultry"",...","c(""1"", ""4"", ""2"", ""2"", ""8"", ""1/4"", ""8"", ""1/2"", ...","c(""saffron"", ""milk"", ""hot green chili peppers""...",3.0,1110.7,58.8,84.4,20.4,63.4,"c(""Soak saffron in warm milk for 5 minutes and..."
2,40,Best Lemonade,PT35M,This is from one of my first Good House Keepi...,Beverages,"c(""Low Protein"", ""Low Cholesterol"", ""Healthy"",...","c(""1 1/2"", ""1"", NA, ""1 1/2"", NA, ""3/4"")","c(""sugar"", ""lemons, rind of"", ""lemon, zest of""...",4.5,311.1,0.2,81.5,77.2,0.3,"c(""Into a 1 quart Jar with tight fitting lid, ..."
3,41,Carina's Tofu-Vegetable Kebabs,PT24H20M,This dish is best prepared a day in advance to...,Soy/Tofu,"c(""Beans"", ""Vegetable"", ""Low Cholesterol"", ""We...","c(""12"", ""1"", ""2"", ""1"", ""10"", ""1"", ""3"", ""2"", ""2...","c(""extra firm tofu"", ""eggplant"", ""zucchini"", ""...",4.5,536.1,24.0,64.2,32.1,29.3,"c(""Drain the tofu, carefully squeezing out exc..."
4,42,Cabbage Soup,PT50M,Make and share this Cabbage Soup recipe from F...,Vegetable,"c(""Low Protein"", ""Vegan"", ""Low Cholesterol"", ""...","c(""46"", ""4"", ""1"", ""2"", ""1"")","c(""plain tomato juice"", ""cabbage"", ""onion"", ""c...",4.5,103.6,0.4,25.1,17.7,4.3,"c(""Mix everything together and bring to a boil..."


In [12]:
# total time'dan total time minutes sütununu çıkaralım
# PT24H45M -> 24*60 + 45 = 1485
def parse_total_time_minutes(s: str) -> int:
    import re

    pattern = r"PT(?:(\d+)H)?(?:(\d+)M)?"
    match = re.match(pattern, s)
    if not match:
        return 0
    hours = int(match.group(1)) if match.group(1) else 0
    minutes = int(match.group(2)) if match.group(2) else 0
    return hours * 60 + minutes

df["total_time_minutes"] = df["total_time"].apply(parse_total_time_minutes)
df.drop(columns=["total_time"], inplace=True)

In [13]:
# keywords, recipe_ingredient_quantities, recipe_ingredient_parts, recipe_instructions
# sütunları r matrisi formunda c("...", "...") şeklinde veriler içeriyor.
# bunları python liste formatına çevirelim.
def r_vector_to_list(s: str) -> list:
    import re

    s = s.strip()
    if s.startswith("c(") and s.endswith(")"):
        s = s[2:-1]
    items = re.findall(r'"(.*?)"', s)
    return items
df["keywords"] = df["keywords"].apply(r_vector_to_list)
df["recipe_ingredient_quantities"] = df["recipe_ingredient_quantities"].apply(r_vector_to_list)
df["recipe_ingredient_parts"] = df["recipe_ingredient_parts"].apply(r_vector_to_list)
df["recipe_instructions"] = df["recipe_instructions"].apply(r_vector_to_list)

In [14]:
df.head()

Unnamed: 0,recipe_id,name,description,recipe_category,keywords,recipe_ingredient_quantities,recipe_ingredient_parts,aggregated_rating,calories,fat_content,carbohydrate_content,sugar_content,protein_content,recipe_instructions,total_time_minutes
0,38,Low-Fat Berry Blue Frozen Dessert,Make and share this Low-Fat Berry Blue Frozen ...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...","[4, 1/4, 1, 1]","[blueberries, granulated sugar, vanilla yogurt...",4.5,170.9,2.5,37.1,30.2,3.2,"[Toss 2 cups berries with sugar., Let stand fo...",1485
1,39,Biryani,Make and share this Biryani recipe from Food.com.,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[1, 4, 2, 2, 8, 1/4, 8, 1/2, 1, 1, 1/4, 1/4, 1...","[saffron, milk, hot green chili peppers, onion...",3.0,1110.7,58.8,84.4,20.4,63.4,[Soak saffron in warm milk for 5 minutes and p...,265
2,40,Best Lemonade,This is from one of my first Good House Keepi...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...","[1 1/2, 1, 1 1/2, 3/4]","[sugar, lemons, rind of, lemon, zest of, fresh...",4.5,311.1,0.2,81.5,77.2,0.3,"[Into a 1 quart Jar with tight fitting lid, pu...",35
3,41,Carina's Tofu-Vegetable Kebabs,This dish is best prepared a day in advance to...,Soy/Tofu,"[Beans, Vegetable, Low Cholesterol, Weeknight,...","[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1/2, 1/...","[extra firm tofu, eggplant, zucchini, mushroom...",4.5,536.1,24.0,64.2,32.1,29.3,"[Drain the tofu, carefully squeezing out exces...",1460
4,42,Cabbage Soup,Make and share this Cabbage Soup recipe from F...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...","[46, 4, 1, 2, 1]","[plain tomato juice, cabbage, onion, carrots, ...",4.5,103.6,0.4,25.1,17.7,4.3,"[Mix everything together and bring to a boil.,...",50


In [15]:
# recipe_ingredient_quantities ve recipe_ingredient_parts sütunlarının
# her bir ögesini birleştirip yeni bir sütun oluşturalım: ingredients
def combine_ingredients(quantities: list, parts: list) -> list:
    combined = []
    for q, p in zip(quantities, parts):
        combined.append(f"{q} {p}".strip())
    return combined
df["ingredients"] = df.apply(lambda row: combine_ingredients(row["recipe_ingredient_quantities"], row["recipe_ingredient_parts"]), axis=1)
df.drop(columns=["recipe_ingredient_quantities"], inplace=True)

In [16]:
# n_ingredients sütununu ekleyelim
df["n_ingredients"] = df["ingredients"].apply(len)

In [17]:
df.head()

Unnamed: 0,recipe_id,name,description,recipe_category,keywords,recipe_ingredient_parts,aggregated_rating,calories,fat_content,carbohydrate_content,sugar_content,protein_content,recipe_instructions,total_time_minutes,ingredients,n_ingredients
0,38,Low-Fat Berry Blue Frozen Dessert,Make and share this Low-Fat Berry Blue Frozen ...,Frozen Desserts,"[Dessert, Low Protein, Low Cholesterol, Health...","[blueberries, granulated sugar, vanilla yogurt...",4.5,170.9,2.5,37.1,30.2,3.2,"[Toss 2 cups berries with sugar., Let stand fo...",1485,"[4 blueberries, 1/4 granulated sugar, 1 vanill...",4
1,39,Biryani,Make and share this Biryani recipe from Food.com.,Chicken Breast,"[Chicken Thigh & Leg, Chicken, Poultry, Meat, ...","[saffron, milk, hot green chili peppers, onion...",3.0,1110.7,58.8,84.4,20.4,63.4,[Soak saffron in warm milk for 5 minutes and p...,265,"[1 saffron, 4 milk, 2 hot green chili peppers,...",25
2,40,Best Lemonade,This is from one of my first Good House Keepi...,Beverages,"[Low Protein, Low Cholesterol, Healthy, Summer...","[sugar, lemons, rind of, lemon, zest of, fresh...",4.5,311.1,0.2,81.5,77.2,0.3,"[Into a 1 quart Jar with tight fitting lid, pu...",35,"[1 1/2 sugar, 1 lemons, rind of, 1 1/2 lemon, ...",4
3,41,Carina's Tofu-Vegetable Kebabs,This dish is best prepared a day in advance to...,Soy/Tofu,"[Beans, Vegetable, Low Cholesterol, Weeknight,...","[extra firm tofu, eggplant, zucchini, mushroom...",4.5,536.1,24.0,64.2,32.1,29.3,"[Drain the tofu, carefully squeezing out exces...",1460,"[12 extra firm tofu, 1 eggplant, 2 zucchini, 1...",14
4,42,Cabbage Soup,Make and share this Cabbage Soup recipe from F...,Vegetable,"[Low Protein, Vegan, Low Cholesterol, Healthy,...","[plain tomato juice, cabbage, onion, carrots, ...",4.5,103.6,0.4,25.1,17.7,4.3,"[Mix everything together and bring to a boil.,...",50,"[46 plain tomato juice, 4 cabbage, 1 onion, 2 ...",5


In [18]:
# fat_content, carbohydrate_content, protein_content, sugar_content
# sütunlarından percentage değerleri çıkaralım
# örneğin:
# fat_content = (fat_content) / (fat_content + carbohydrate_content + protein_content + sugar_content)
def calculate_percentage(row, column):
    total = (
        row["fat_content"]
        + row["carbohydrate_content"]
        + row["protein_content"]
        + row["sugar_content"]
    )
    if total == 0:
        return 0
    return (row[column]*100) / total
df["fat_content_perc"] = df.apply(lambda row: calculate_percentage(row, "fat_content"), axis=1)
df["carbohydrate_content_perc"] = df.apply(lambda row: calculate_percentage(row, "carbohydrate_content"), axis=1)
df["protein_content_perc"] = df.apply(lambda row: calculate_percentage(row, "protein_content"), axis=1)
df["sugar_content_perc"] = df.apply(lambda row: calculate_percentage(row, "sugar_content"), axis=1)

In [19]:
# calories, fat_content, carbohydrate_content, protein_content, sugar_content ve percentage
# sütunlarını maks 2 ondalıklı yapalım
df["calories"] = df["calories"].round(2)
df["fat_content"] = df["fat_content"].round(2)
df["carbohydrate_content"] = df["carbohydrate_content"].round(2)
df["protein_content"] = df["protein_content"].round(2)
df["sugar_content"] = df["sugar_content"].round(2)
df["carbohydrate_content_perc"] = df["carbohydrate_content_perc"].round(2)
df["fat_content_perc"] = df["fat_content_perc"].round(2)
df["protein_content_perc"] = df["protein_content_perc"].round(2)
df["sugar_content_perc"] = df["sugar_content_perc"].round(2)

In [20]:
df["recipe_id"] = np.arange(len(df))
df.to_parquet("../data/raw/recipes.parquet", index=False, engine="pyarrow")

In [21]:
df.to_parquet("../data/raw/recipes.parquet", index=False, engine="pyarrow")

In [22]:
# uygulamada 10k kayıt kullanmak için veriyi küçültelim
df_final = df.sample(n=10000, random_state=42).reset_index(drop=True)

In [23]:
# elimizdeki recipe_id'leri 0'dan başlayarak yeniden numaralandıralım
df_final["recipe_id"] = np.arange(len(df_final))

In [24]:
# vektörize metin sütunları
text_columns = [
    "name",
    "description",
    "recipe_category",
    "recipe_ingredient_parts",
    "recipe_instructions",
    "keywords",
]

def create_text_vector(row):
    text = " ".join([col + ": " + str(row[col]) for col in text_columns])
    return text

df_final["text_vector"] = df_final.apply(create_text_vector, axis=1)

In [25]:
df_final.head()

Unnamed: 0,recipe_id,name,description,recipe_category,keywords,recipe_ingredient_parts,aggregated_rating,calories,fat_content,carbohydrate_content,...,protein_content,recipe_instructions,total_time_minutes,ingredients,n_ingredients,fat_content_perc,carbohydrate_content_perc,protein_content_perc,sugar_content_perc,text_vector
0,0,Chicken Pom Pom Pie,This came from an old magazine i found. I know...,Savory Pies,"[Chicken, Poultry, Meat, Kid Friendly, < 60 Mi...","[onion, garlic clove, chicken thigh fillets, b...",4.5,465.3,16.1,43.0,...,39.3,[Heat oil in a large frying pan. Add onion and...,40,"[1 onion, 1 garlic clove, 1 chicken thigh fill...",6,15.51,41.43,37.86,5.2,name: Chicken Pom Pom Pie description: This ca...
1,1,Triple Almond Cookies (Vegan),"These have almond butter, almond extract and c...",Dessert,"[Cookie & Brownie, Nuts, Vegan, < 30 Mins, For...","[almond butter, pure maple syrup, canola oil, ...",5.0,134.1,8.5,12.9,...,3.2,"[Preheat the oven to 350*F., In a large bowl, ...",30,"[1/2 almond butter, 1/2 pure maple syrup, 3 ca...",6,27.87,42.3,10.49,19.34,name: Triple Almond Cookies (Vegan) descriptio...
2,2,Green Onion Low Fat Drop Biscuits,I wanted a quick bread recipe to dip into soup...,Breads,"[Winter, Savory, < 30 Mins, For Large Groups, ...","[whole wheat pastry flour, whole wheat flour, ...",4.0,79.3,2.5,12.3,...,3.0,"[Preheat your oven to 400 degrees., Spray a ba...",25,"[1 whole wheat pastry flour, 1 whole wheat flo...",8,13.09,64.4,15.71,6.81,name: Green Onion Low Fat Drop Biscuits descri...
3,3,Beef and Curry Pie,"From Gourmet magazine, November 2006: &quot;Th...",Lunch/Snacks,"[Potato, Vegetable, Meat, < 4 Hours]","[soy sauce, sugar, salt, onion, curry powder, ...",4.0,782.6,50.4,66.1,...,16.8,"[Mix together beef, soy sauce, sugar, and salt...",105,"[1/2 soy sauce, 1 sugar, 1/2 salt, 1/4 onion, ...",9,37.11,48.67,12.37,1.84,name: Beef and Curry Pie description: From Gou...
4,4,Grilled Flank Steak,This is a recipe that I got from my oldest son...,Steak,"[Meat, Lactose Free, Egg Free, Free Of..., < 3...","[fresh lime juice, water, garlic, sugar, salt]",5.0,194.0,9.4,2.0,...,24.2,[Combine all ingredients except steak in a sma...,25,"[1/4 fresh lime juice, 1/4 water, 2 garlic, 1 ...",5,25.82,5.49,66.48,2.2,name: Grilled Flank Steak description: This is...


In [26]:
df_final[["recipe_id", "text_vector"]].to_parquet("../data/processed/metadata.parquet", index=False, engine="pyarrow")

In [27]:
df_final.drop(columns=["recipe_ingredient_parts", "text_vector"]).to_parquet("../data/processed/master_df.parquet", index=False, engine="pyarrow")