## Introductory steps & Data preprocessing

### Importing necessary libraries

In [3]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType, VectorDistances

import os
import pandas as pd
import ast

### Loading the dataset

Load the dataset into a pandas DataFrame. The dataset used for this workflow is the [Recipe Dataset (over 2M) Food](https://www.kaggle.com/datasets/wilmerarltstrmberg/recipe-dataset-over-2m/data) from Kaggle.

In [2]:
# Location of the recipe CSV; it is read in full into a DataFrame.
filename = "recipes_data.csv"
df = pd.read_csv(filepath_or_buffer=filename)
# Bare expression so the notebook renders the frame.
df

Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""bite size shredded rice biscuits"", ""vanilla""...",www.cookbooks.com
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""cream of mushroom soup"", ""beef"", ""sour cream...",www.cookbooks.com
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""pepper"", ""cream cheese"", ""gar...",www.cookbooks.com
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken gravy"", ""cream of mushroom soup"", ""c...",www.cookbooks.com
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""graham cracker crumbs"", ""powdered sugar"", ""p...",www.cookbooks.com
...,...,...,...,...,...,...,...
2231137,Sunny's Fake Crepes,"[""1/2 cup chocolate hazelnut spread (recommend...","[""Spread hazelnut spread on 1 side of each tor...",www.foodnetwork.com/recipes/sunny-anderson/sun...,Recipes1M,"[""chocolate hazelnut spread"", ""marshmallows"", ...",www.foodnetwork.com
2231138,Devil Eggs,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp...","[""Boil eggs on medium for 30mins."", ""Then cool...",cookpad.com/us/recipes/355411-devil-eggs,Recipes1M,"[""choice"", ""miracle whip"", ""eggs"", ""relish"", ""...",cookpad.com
2231139,Extremely Easy and Quick - Namul Daikon Salad,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil...","[""Julienne the daikon and squeeze out the exce...",cookpad.com/us/recipes/153324-extremely-easy-a...,Recipes1M,"[""soy sauce"", ""radish"", ""white sesame seeds"", ...",cookpad.com
2231140,Pan-Roasted Pork Chops With Apple Fritters,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""...","[""In a large bowl, mix the apple cider with 4 ...",cooking.nytimes.com/recipes/1015164,Recipes1M,"[""apple cider"", ""egg"", ""sugar"", ""freshly groun...",cooking.nytimes.com


### Preprocessing the dataset

Let's check whether the dataset is up to date and whether the source websites still exist. The unique websites in our dataset are listed below.

In [3]:
# List every distinct source site, in order of first appearance.
print(pd.unique(df['site']))

['www.cookbooks.com' 'www.allrecipes.com' 'www.food.com'
 'recipes-plus.com' 'www.epicurious.com' 'food52.com' 'www.myrecipes.com'
 'www.seriouseats.com' 'www.tasteofhome.com' 'tastykitchen.com'
 'www.yummly.com' 'cookeatshare.com' 'www.foodnetwork.com' 'cookpad.com'
 'www.kraftrecipes.com' 'online-cookbook.com' 'www.lovefood.com'
 'www.landolakes.com' 'cooking.nytimes.com' 'allrecipes.com'
 'www.foodgeeks.com' 'www.cookstr.com' 'recipeland.com'
 'www.vegetariantimes.com' 'www.delish.com' 'www.foodandwine.com'
 'www.chowhound.com' 'www.foodrepublic.com']


After visiting them, we found that 'www.cookbooks.com', 'recipes-plus.com', 'www.kraftrecipes.com' and 'www.cookstr.com' no longer exist. Since we want to return working links to users, we remove the recipes that come from these sites.

In [4]:
# Sites whose recipes are dropped from the dataset.
excluded_sites = [
    'www.cookbooks.com',
    'recipes-plus.com',
    'www.kraftrecipes.com',
    'www.cookstr.com',
]

# Keep only the rows whose site is not excluded and renumber the index from 0.
is_excluded = df['site'].isin(excluded_sites)
df_new = df.loc[~is_excluded].reset_index(drop=True)
df_new

Unnamed: 0,title,ingredients,directions,link,source,NER,site
0,Deconstructed Screwdriver (The Raw Egg),"[""1/4 cup orange juice, or as desired"", ""2 (1....","[""Fill a food-safe silicon-based round ice mol...",www.allrecipes.com/recipe/241895/deconstructed...,Gathered,"[""jiggers vodka"", ""orange juice""]",www.allrecipes.com
1,Kettle Corn,"[""1/4 cup vegetable oil"", ""1/4 cup white sugar...","[""Heat the vegetable oil in a large pot over m...",www.allrecipes.com/recipe/20808/kettle-corn/,Gathered,"[""vegetable oil"", ""popcorn kernels"", ""white su...",www.allrecipes.com
2,Pop'S Fabulous Ajvar,"[""2 1/2 pounds firm, shiny eggplants"", ""3 larg...","[""Preheat oven to 350 degrees F (175 degrees C...",www.allrecipes.com/recipe/238733/pops-fabulous...,Gathered,"[""eggplants"", ""green bell peppers"", ""red bell ...",www.allrecipes.com
3,Chipotle-Mango Guacamole,"[""4 avocados, mashed"", ""2 Roma (plum) tomatoes...","[""Stir avocados, tomatoes, mango, cilantro, re...",www.allrecipes.com/recipe/241254/chipotle-mang...,Gathered,"[""sour cream"", ""ground cumin"", ""tomatoes"", ""pe...",www.allrecipes.com
4,Basic Homemade Ricotta Cheese,"[""7 cups whole milk"", ""1/2 cup heavy whipping ...","[""Place milk, cream, and vinegar in a saucepan...",www.allrecipes.com/recipe/261696/basic-homemad...,Gathered,"[""heavy whipping cream"", ""white vinegar"", ""sal...",www.allrecipes.com
...,...,...,...,...,...,...,...
1263465,Sunny's Fake Crepes,"[""1/2 cup chocolate hazelnut spread (recommend...","[""Spread hazelnut spread on 1 side of each tor...",www.foodnetwork.com/recipes/sunny-anderson/sun...,Recipes1M,"[""chocolate hazelnut spread"", ""marshmallows"", ...",www.foodnetwork.com
1263466,Devil Eggs,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp...","[""Boil eggs on medium for 30mins."", ""Then cool...",cookpad.com/us/recipes/355411-devil-eggs,Recipes1M,"[""choice"", ""miracle whip"", ""eggs"", ""relish"", ""...",cookpad.com
1263467,Extremely Easy and Quick - Namul Daikon Salad,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil...","[""Julienne the daikon and squeeze out the exce...",cookpad.com/us/recipes/153324-extremely-easy-a...,Recipes1M,"[""soy sauce"", ""radish"", ""white sesame seeds"", ...",cookpad.com
1263468,Pan-Roasted Pork Chops With Apple Fritters,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""...","[""In a large bowl, mix the apple cider with 4 ...",cooking.nytimes.com/recipes/1015164,Recipes1M,"[""apple cider"", ""egg"", ""sugar"", ""freshly groun...",cooking.nytimes.com


Because the dataset is too large to import in full, we sample 6000 random rows to add to our database.

In [5]:
# Draw 6000 rows at random; the fixed seed makes re-runs pick the same rows.
df_sample = df_new.sample(
    n=6000,
    random_state=42,
)

Note that each value in the NER column is a string representation of a list of strings. Let's parse it and convert it to a single string with comma-separated elements.

In [6]:
# Preview the raw NER values before they are converted.
df_sample.loc[:, "NER"]

536743     ["white cranberry juice", "blueberries", "water"]
641258     ["lemons", "\u00bc", "parmesan cheese", "fresh...
1233476    ["scallions", "cheddar cheese", "water", "shre...
482320     ["extra virgin olive oil", "t", "sweet cherry ...
321919     ["baking powder", "slivered almonds", "vanilla...
                                 ...                        
288769                           ["flour", "natural yogurt"]
807749     ["mustard seeds", "brown mustard", "pretzels",...
330830     ["italian seasoning", "italian seasoned breadc...
74593      ["tomatoes", "red bell peppers", "basil", "bal...
83332      ["oil", "oregano", "hot pepper", "misc", "eggs...
Name: NER, Length: 6000, dtype: object

In [7]:
# Parse each NER value (a string holding a list literal) and join its elements
# into one comma-separated string, stored in the new "ner_new" column.
df_sample["ner_new"] = df_sample["NER"].apply(
    lambda raw_ner: ", ".join(ast.literal_eval(raw_ner))
)

# Drop the "Unnamed: 0" column that came in with the CSV, if it is still there.
index_column = 'Unnamed: 0'
if index_column in df_sample.columns:
    df_sample = df_sample.drop(columns=[index_column])

In [8]:
# Save the prepared sample to disk, leaving out the DataFrame index.
df_sample.to_csv(path_or_buf='sampled_data.csv', index=False)

## Collection creation and data import

Before running the code below, you need to start a local Weaviate instance in Docker and set OPENAI_API_KEY as an environment variable.

In [4]:
# If you are running the code in a virtual environment, uncomment the line below and add your OPENAI_API_KEY. Otherwise, you can set it from the terminal.
# os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"

### Connect to the Weaviate instance

In [5]:
# Read the OpenAI key first and stop with a clear message if it is missing.
# os.getenv returns None for an unset variable, which would otherwise be
# passed to the client as the header value.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your shell or set "
        "os.environ['OPENAI_API_KEY'] before connecting."
    )

# Connect to the Weaviate instance on localhost, forwarding the OpenAI key as
# a request header.
client = weaviate.connect_to_local(
    host="localhost",
    port=8080,
    grpc_port=50051,
    headers={"X-OpenAI-API-Key": openai_api_key},
)

### Collection creation

Create a collection named "Recipes". It has 5 properties, but only the "ner_new" property is vectorized, using the "text-embedding-3-small" embedding model from OpenAI.

In [7]:
# Create the "Recipes" collection only when it does not exist yet, so this
# cell can be re-run against an instance that already holds the collection.
if not client.collections.exists("Recipes"):
    client.collections.create(
        "Recipes",
        # Embeddings come from OpenAI's text-embedding-3-small model.
        vectorizer_config=Configure.Vectorizer.text2vec_openai(model="text-embedding-3-small"),
        # HNSW index compared with cosine distance.
        vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
        properties=[
            # Stored for retrieval only; excluded from vectorization.
            Property(name="title", data_type=DataType.TEXT, skip_vectorization=True),
            Property(name="ingredients", data_type=DataType.TEXT, skip_vectorization=True),
            Property(name="directions", data_type=DataType.TEXT, skip_vectorization=True),
            Property(name="link", data_type=DataType.TEXT, skip_vectorization=True),
            # The comma-separated ingredient names are the text that gets vectorized.
            Property(name="ner_new", data_type=DataType.TEXT, skip_vectorization=False),
        ],
    )

# Show the collection handle, whether it was just created or already present.
client.collections.get("Recipes")

<weaviate.collections.collection.Collection at 0x758a1977e950>

### Import data

Let's populate our database. We read the rows from the "sampled_data.csv" file that we created earlier and add them in batches.

In [8]:
collection = client.collections.get("Recipes")

# Columns of sampled_data.csv that map one-to-one onto the collection properties.
recipe_columns = ["title", "ingredients", "directions", "link", "ner_new"]

# Stream the CSV in small chunks and send the rows to Weaviate in batches of 50.
with client.batch.fixed_size(batch_size=50) as batch:
    with pd.read_csv(
        "sampled_data.csv",
        usecols=recipe_columns,
        chunksize=5,
    ) as csv_iterator:
        for chunk in csv_iterator:
            # Each record is a {column: value} dict, which is exactly the
            # shape of the properties payload.
            for record in chunk.to_dict(orient="records"):
                batch.add_object(
                    collection="Recipes",
                    properties=record,
                )

# Per-object errors do not raise inside the batch context, so surface them
# here instead of letting a partial import pass unnoticed.
failed_objects = client.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} objects failed to import. First failure:")
    print(failed_objects[0])

### Closing the client

Once we are done with our experiments, we can close the client.

In [16]:
# Release the connection to the Weaviate instance now that the work is done.
client.close()