In [10]:
import pandas as pd
# The data for each recipe in the dataset is essentially tabular
# given its key-value pairings. We begin by visualizing its rows. 
chunks = pd.read_csv('./recipes.csv', chunksize=1000)
chunk = next(chunks)
print(chunk.columns.tolist())

['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER']


In [11]:
# We now look to analyze the dataset's shape - how many training
# examples are we going to have access to?
training_examples = 0
for chunk in chunks:
    training_examples += len(chunk)
print(training_examples)

2230142


In [13]:
# Wow, that's a lot of data! Let's take a more focused look.
# Now, we visualize the first 3 rows of 5 chunks' title, ingredients, and
# directions rows. Based on the above processing and the goals for
# fine tuning, these are the only rows we're going to keep in the dataset.
chunks = pd.read_csv('./recipes.csv', chunksize=1000)
for i in range(5):
    chunk = next(chunks)[['title', 'ingredients', 'directions']]
    display(chunk.head(3))

Unnamed: 0,title,ingredients,directions
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish...."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C..."


Unnamed: 0,title,ingredients,directions
1000,Hidden Valley Ranch Oyster Crackers,"[""12 to 16 oz. plain oyster crackers"", ""1 pkg....","[""Combine salad dressing mix and oil."", ""Add d..."
1001,Pepperoni Loaf,"[""1 frozen loaf bread, let thaw overnight"", ""1...","[""Take thawed bread and roll out onto cookie s..."
1002,Animal Crackers,"[""1/2 c. oatmeal"", ""2 tsp. honey"", ""1/4 plus 1...","[""Grind oatmeal in blender until fine."", ""Add ..."


Unnamed: 0,title,ingredients,directions
2000,Carrot Casserole,"[""2 c. grated raw carrots"", ""2 eggs, beaten"", ...","[""Mix together well."", ""Bake in buttered casse..."
2001,Melt In Your Mouth Chicken Casserole,"[""2 1/2 to 3 lb. chicken, cooked and deboned (...","[""Preheat oven to 425\u00b0."", ""In oblong baki..."
2002,Autumn Apple Salad,"[""1 (20 oz.) can crushed pineapple (undrained)...","[""In a saucepan, combine pineapple and sugar; ..."


Unnamed: 0,title,ingredients,directions
3000,Lentil Loaf,"[""1 1/2 c. lentils, mashed"", ""2 stalks celery,...","[""You may use a little cottage cheese and cere..."
3001,Dieter'S Delight,"[""1 medium cabbage"", ""2 ribs celery"", ""1 small...","[""Boil chicken breasts about 30 minutes."", ""Ad..."
3002,Carrot Casserole,"[""2 bunches carrots"", ""8 oz. Velveeta cheese"",...","[""Slice and cook carrots; drain."", ""Place in b..."


Unnamed: 0,title,ingredients,directions
4000,Ham And Cheese Sandwiches,"[""1 stick oleo"", ""1 Tbsp. grated onion"", ""1 Tb...","[""Mix oleo with grated onion and mustard."", ""S..."
4001,World'S-Best Cookies,"[""2 sticks butter"", ""1 c. white sugar"", ""1 c. ...","[""Cream softened butter and sugars."", ""Add egg..."
4002,Taco Salad,"[""1 lb. ground beef"", ""1 c. French dressing"", ...","[""Brown meat and add French dressing."", ""Simme..."


In [21]:
# This looks suitable. We now get rid of the extra rows (NER,
# link, etc) which aren't needed for fine-tuning, clean data,
# and save it to a new CSV file. 


# As seen above, data in each column is either a string or
# list of strings. this function handles both cases of cleaning
# *These lists, in jupyter, are stored as strings. We need to convert
# them into a list in order to process.

import ast
import os

chunks = pd.read_csv('./recipes.csv', chunksize=1000)

def clean_column(column):
    if isinstance(column, list):
        return [str(item).strip().lower() for item in column]
    elif isinstance(column, str):
        try:
            parsed_list = ast.literal_eval(column)
            if isinstance(parsed_list, list):
                return [str(item).strip().lower() for item in parsed_list]
        except: 
            pass
        return column.strip().lower()
    return column

for i, chunk in enumerate(chunks):
    # For each chunk, keeps only desired columns
    chunk = chunk[['title', 'ingredients', 'directions']]
    # Removes examples that have NaN values
    chunk = chunk.dropna(subset=['title','ingredients','directions'])
    # Strips all whitespace and makes lowercase 
    for col in chunk:
        chunk[col] = chunk[col].apply(clean_column)
    chunk.to_csv("recipes_cleaned.csv",
                mode="a",
                index=False,
                header=not os.path.exists("recipes_cleaned.csv")  # write header only once
            )
    print(f"Processed chunk {i + 1}")
print("All chunks have been processed")

Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10
Processed chunk 11
Processed chunk 12
Processed chunk 13
Processed chunk 14
Processed chunk 15
Processed chunk 16
Processed chunk 17
Processed chunk 18
Processed chunk 19
Processed chunk 20
Processed chunk 21
Processed chunk 22
Processed chunk 23
Processed chunk 24
Processed chunk 25
Processed chunk 26
Processed chunk 27
Processed chunk 28
Processed chunk 29
Processed chunk 30
Processed chunk 31
Processed chunk 32
Processed chunk 33
Processed chunk 34
Processed chunk 35
Processed chunk 36
Processed chunk 37
Processed chunk 38
Processed chunk 39
Processed chunk 40
Processed chunk 41
Processed chunk 42
Processed chunk 43
Processed chunk 44
Processed chunk 45
Processed chunk 46
Processed chunk 47
Processed chunk 48
Processed chunk 49
Processed chunk 50
Processed chunk 51
Processed chunk 52
Processed chunk 53
Pr



Processed chunk 2177
Processed chunk 2178
Processed chunk 2179
Processed chunk 2180
Processed chunk 2181
Processed chunk 2182
Processed chunk 2183
Processed chunk 2184
Processed chunk 2185
Processed chunk 2186
Processed chunk 2187
Processed chunk 2188
Processed chunk 2189
Processed chunk 2190
Processed chunk 2191
Processed chunk 2192
Processed chunk 2193
Processed chunk 2194
Processed chunk 2195
Processed chunk 2196
Processed chunk 2197
Processed chunk 2198
Processed chunk 2199
Processed chunk 2200
Processed chunk 2201
Processed chunk 2202
Processed chunk 2203
Processed chunk 2204
Processed chunk 2205
Processed chunk 2206
Processed chunk 2207
Processed chunk 2208
Processed chunk 2209
Processed chunk 2210
Processed chunk 2211
Processed chunk 2212
Processed chunk 2213
Processed chunk 2214
Processed chunk 2215
Processed chunk 2216
Processed chunk 2217
Processed chunk 2218
Processed chunk 2219
Processed chunk 2220
Processed chunk 2221
Processed chunk 2222
Processed chunk 2223
Processed chu