# Data Ingest Notebook

We store our BigQuery (BQ) tables in the `us` region so that we can query them with the built-in `%%bigquery` cell magic.

In [1]:
from google.cloud import storage
from google.cloud import bigquery

# Project that owns the BigQuery dataset; used to build fully qualified table ids.
project_id = "shidcs329e"
# Cloud Storage bucket and folder that hold the source CSV files
# (combined as "gs://<bucket_name>/<folder_name>/<file_name>" when loading).
bucket_name = "cookbook_data113"
folder_name = "raw"
# BigQuery dataset that receives the loaded tables.
dataset_name = "magazine_recipes_raw"
# Location passed to the query job in the verification cell.
region = "us"

# NOTE(review): storage_client is not referenced by any cell visible in this
# notebook -- confirm it is still needed.
storage_client = storage.Client()
# Client used for table creation, load jobs, and the verification query.
bq_client = bigquery.Client()

In the next sections, we load the CSV files into BQ tables. Each section loads a different file.

Refer to the [SDK documentation](https://cloud.google.com/python/docs/reference/bigquery/latest/index.html) for details on `LoadJobConfig`.

In [2]:
def create_load_table(file_name, table_name, schema, delimiter=","):
  """Create a BigQuery table if it does not exist and load one CSV file into it.

  The file is read from "gs://<bucket_name>/<folder_name>/<file_name>" and
  written to "<project_id>.<dataset_name>.<table_name>", replacing any rows
  already in the table (WRITE_TRUNCATE).

  Args:
    file_name: Name of the CSV object inside the bucket folder.
    table_name: Name of the destination table inside the dataset.
    schema: List of bigquery.SchemaField describing the full table, including
      the load_time column. The list is not modified.
    delimiter: Field delimiter used by the CSV file.

  Returns:
    None. Progress is reported with print().
  """
  uri = "gs://{}/{}/{}".format(bucket_name, folder_name, file_name)
  table_id = "{}.{}.{}".format(project_id, dataset_name, table_name)

  # The table is created with the full schema (load_time included) so the
  # column and its default expression exist on the destination.
  table = bigquery.Table(table_id, schema=schema)
  table = bq_client.create_table(table, exists_ok=True)
  print("Created table {}".format(table.table_id))

  # The load job must not expect load_time in the file: its value is
  # auto-generated. Build a separate list instead of deleting from the
  # caller's schema, and match the field by name rather than by position.
  load_schema = [field for field in schema if field.name != "load_time"]

  job_config = bigquery.LoadJobConfig(
        schema=load_schema,
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV,
        write_disposition="WRITE_TRUNCATE",
        field_delimiter=delimiter,
        allow_jagged_rows = True,
        allow_quoted_newlines = True,
        ignore_unknown_values = True,
        quote_character='"'
      )

  load_job = bq_client.load_table_from_uri(uri, table_id, job_config=job_config)
  # Block until the load finishes; result() raises if the job failed.
  load_job.result()

  destination_table = bq_client.get_table(table_id)
  print("Loaded {} rows.".format(destination_table.num_rows))


In [3]:
file_name = 'Recipes.csv'
table_name = 'bird_recipes'

# recipe_id is the only REQUIRED data column; the rest are NULLABLE and are
# declared as (name, type) pairs.
schema = [bigquery.SchemaField("recipe_id", "INTEGER", mode="REQUIRED")]
schema.extend(
  bigquery.SchemaField(column, column_type, mode="NULLABLE")
  for column, column_type in (
    ("title", "STRING"),
    ("subtitle", "STRING"),
    ("servings", "INTEGER"),
    ("yield_unit", "STRING"),
    ("prep_min", "INTEGER"),
    ("cook_min", "INTEGER"),
    ("stnd_min", "INTEGER"),
    ("source", "STRING"),
    ("intro", "STRING"),
    ("directions", "STRING"),
  )
)
# load_time is left out of the load job by create_load_table and filled in
# from its default expression.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table bird_recipes
Loaded 1031 rows.


In [None]:
file_name = 'Ingredients.csv'
table_name = 'ingredients'

# Data columns as (name, type, mode) triples.
schema = [
  bigquery.SchemaField(column, column_type, mode=column_mode)
  for column, column_type, column_mode in (
    ("ingredient_id", "INTEGER", "REQUIRED"),
    ("category", "STRING", "NULLABLE"),
    ("name", "STRING", "REQUIRED"),
    ("plural", "STRING", "NULLABLE"),
  )
]
# load_time is left out of the load job by create_load_table and filled in
# from its default expression.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table ingredients
Loaded 3346 rows.


In [11]:
file_name = 'Quantity.csv'
table_name = 'quantity'

# quantity_id is the only REQUIRED data column; the rest are NULLABLE and are
# declared as (name, type) pairs.
schema = [bigquery.SchemaField("quantity_id", "INTEGER", mode="REQUIRED")]
schema.extend(
  bigquery.SchemaField(column, column_type, mode="NULLABLE")
  for column, column_type in (
    ("recipe_id", "INTEGER"),
    ("ingredient_id", "INTEGER"),
    ("max_qty", "FLOAT"),
    ("min_qty", "FLOAT"),
    ("unit", "STRING"),
    ("preparation", "STRING"),
    ("optional", "BOOLEAN"),
  )
)
# load_time is left out of the load job by create_load_table and filled in
# from its default expression.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table quantity
Loaded 5116 rows.


In [None]:
file_name = 'Nutrition.csv'
table_name = 'nutrition'

# Every nutrition measure is a NULLABLE FLOAT64, so only the column names vary.
# The spellings "protien" and "cholestrl" are the table's actual column names
# and are kept as-is.
schema = (
  [bigquery.SchemaField("recipe_id", "INTEGER", mode="REQUIRED")]
  + [
    bigquery.SchemaField(column, "FLOAT64", mode="NULLABLE")
    for column in (
      "protien",
      "carbo",
      "alcohol",
      "total_fat",
      "sat_fat",
      "cholestrl",
      "sodium",
      "iron",
      "vitamin_c",
      "vitamin_a",
      "fiber",
      "pcnt_cal_carb",
      "pcnt_cal_fat",
      "pcnt_cal_prot",
      "calories",
    )
  ]
  # load_time is left out of the load job by create_load_table and filled in
  # from its default expression.
  + [
    bigquery.SchemaField(
      "load_time",
      "TIMESTAMP",
      mode="REQUIRED",
      default_value_expression="CURRENT_TIMESTAMP",
    )
  ]
)
create_load_table(file_name, table_name, schema)

Created table nutrition
Loaded 878 rows.


In [None]:
file_name = 'faker_recipe_journalists.csv'
table_name = 'faker_journalists'

# author_id is the only REQUIRED data column; the rest are NULLABLE and are
# declared as (name, type) pairs.
schema = [bigquery.SchemaField("author_id", "INTEGER", mode="REQUIRED")]
schema.extend(
  bigquery.SchemaField(column, column_type, mode="NULLABLE")
  for column, column_type in (
    ("name", "STRING"),
    ("age", "INTEGER"),
    ("phone_number", "STRING"),
    ("state", "STRING"),
  )
)
# load_time is left out of the load job by create_load_table and filled in
# from its default expression.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)
create_load_table(file_name, table_name, schema)

Created table faker_journalists
Loaded 90 rows.


In [None]:
file_name = 'recipe_at.csv'
table_name = 'recipe_at'

# Every data column in this file is loaded as a NULLABLE STRING, so only the
# column names vary.
schema = [
  bigquery.SchemaField(column, "STRING", mode="NULLABLE")
  for column in (
    "name",
    "rating",
    "ease_of_prep",
    "note",
    "type",
    "prep_time",
    "cookbook",
    "page",
    "ingredients",
    "slowcooker",
    "link",
    "last_made",
  )
]
# NOTE(review): load_time is NULLABLE here but REQUIRED in every other load
# cell of this notebook -- confirm whether the difference is intentional.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="NULLABLE",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table recipe_at
Loaded 145 rows.


# Verify loads

In [4]:
# List the tables present in the dataset to confirm that each load created one.
sql = "select table_name from {}.INFORMATION_SCHEMA.TABLES order by table_name".format(dataset_name)

query_job = bq_client.query(
    sql,
    location=region,
)

# result() waits for the query to finish and returns the rows.
results = query_job.result()

# Read the column by name, and use a loop variable that does not overwrite the
# notebook-level table_name set by the load cells.
for row in results:
    print("table:", row["table_name"])

table: bird_recipes
table: faker_journalists
table: ingredients
table: nutrition
table: quantity
table: recipe_at


In [5]:
%%bigquery
SELECT *
FROM magazine_recipes_raw.bird_recipes
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,recipe_id,title,subtitle,servings,yield_unit,prep_min,cook_min,stnd_min,source,intro,directions,load_time
0,559,Citrus Beef Ribbons,,48,48 each,120,3,0,California Beef Council,,Partially freeze tenderloin for ease in handli...,2024-01-30 01:08:47.612652+00:00
1,698,Hazelnut Stuffed French Bread,,20,2 loaves,155,30,0,Hazelnut Marketing Board,A delicious crunchy bread with an unexpected f...,"For the filling, thoroughly mix together all i...",2024-01-30 01:08:47.612652+00:00
2,1213,Seafood Seasoning,,36,makes 3/4 cup,0,5,0,The Texas Department of Agriculture,This is an excellent seafood season-all.,Mix all ingredients well. Store in sealed cont...,2024-01-30 01:08:47.612652+00:00
3,449,Fruit Soup,Sotsuppe (Scandinavian),18,,5,60,720,Wisconsin Department of Agriculture,,Soak dried fruits and tapioca overnight in wat...,2024-01-30 01:08:47.612652+00:00
4,515,Tote-Along Dressing,,20,1 1/4 cups,5,7,60,The American Cancer Society,,Combine all ingredients in screw-top jar; shak...,2024-01-30 01:08:47.612652+00:00


In [6]:
%%bigquery
SELECT *
FROM magazine_recipes_raw.faker_journalists
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,author_id,name,age,phone_number,state,load_time
0,13,Heather Roberts,25,(499)524-6610x935,IN,2024-01-27 00:25:41.566545+00:00
1,22,Christina Walker,25,(701)568-8477x9361,KS,2024-01-27 00:25:41.566545+00:00
2,40,David Chen,25,+1-380-466-0657x3547,WY,2024-01-27 00:25:41.566545+00:00
3,15,Joseph Freeman,26,+1-890-507-5470,OH,2024-01-27 00:25:41.566545+00:00
4,37,Gregory Haley,26,(703)455-7448,OR,2024-01-27 00:25:41.566545+00:00


In [7]:
%%bigquery
SELECT *
FROM magazine_recipes_raw.ingredients
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ingredient_id,category,name,plural,load_time
0,1017,deli,deli coleslaw,,2024-01-26 23:50:31.637778+00:00
1,1748,deli,head cheese,,2024-01-26 23:50:31.637778+00:00
2,2732,deli,prepared deli coleslaw,,2024-01-26 23:50:31.637778+00:00
3,4278,deli,tortellini deli salad,,2024-01-26 23:50:31.637778+00:00
4,742,candy,cinnamon red hots,,2024-01-26 23:50:31.637778+00:00


In [8]:
%%bigquery
SELECT *
FROM magazine_recipes_raw.nutrition
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,recipe_id,protien,carbo,alcohol,total_fat,sat_fat,cholestrl,sodium,iron,vitamin_c,vitamin_a,fiber,pcnt_cal_carb,pcnt_cal_fat,pcnt_cal_prot,calories,load_time
0,559,8.1,0.78,0.0,2.85,1.07,23.86,58.48,1.03,0.92,0.32,0.0,5.11,41.98,52.91,61.2,2024-01-27 00:11:11.060078+00:00
1,838,4.29,20.0,0.0,2.13,0.6,0.0,136.5,0.0,0.0,0.0,0.0,68.77,16.48,14.75,116.33,2024-01-27 00:11:11.060078+00:00
2,858,0.31,7.56,0.0,1.49,1.29,0.0,69.42,0.1,0.0,0.0,0.0,67.31,29.93,2.77,44.95,2024-01-27 00:11:11.060078+00:00
3,873,8.08,11.71,0.0,2.15,1.34,8.54,255.96,0.12,2.39,78.08,0.0,47.57,19.62,32.8,98.48,2024-01-27 00:11:11.060078+00:00
4,874,8.03,11.37,0.0,8.15,5.07,33.16,119.56,0.12,2.32,309.88,0.0,30.13,48.59,21.27,150.94,2024-01-27 00:11:11.060078+00:00


In [14]:
%%bigquery
SELECT *
FROM magazine_recipes_raw.quantity
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,quantity_id,recipe_id,ingredient_id,max_qty,min_qty,unit,preparation,optional,load_time
0,1,214,1613,2.0,2.0,cup(s),,False,2024-01-30 01:16:24.616985+00:00
1,2,214,3334,0.25,0.25,cup(s),,False,2024-01-30 01:16:24.616985+00:00
2,3,214,2222,0.5,0.5,cup(s),melted,False,2024-01-30 01:16:24.616985+00:00
3,4,214,2797,0.25,0.25,cup(s),or water,False,2024-01-30 01:16:24.616985+00:00
4,5,214,3567,3.0,3.0,teaspoon(s),,False,2024-01-30 01:16:24.616985+00:00


In [10]:
%%bigquery
SELECT *
FROM magazine_recipes_raw.recipe_at
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,name,rating,ease_of_prep,note,type,prep_time,cookbook,page,ingredients,slowcooker,link,last_made,load_time
0,Chive Butter Radishes,4,,,,,,,,,,6/16/2018,2024-01-30 01:04:46.085678+00:00
1,Sweet Potato Breakfast Burritos,4,,,Main Dish,,,,Sweet potato,,https://www.ambitiouskitchen.com/healthy-sweet...,11/1/2018,2024-01-30 01:04:46.085678+00:00
2,Spicy Black Bean Nachos,2,,,Main Dish,,,,"Beans,Adobo Chile",,https://www.mexicanplease.com/spicy-black-bean...,9/17/2018,2024-01-30 01:04:46.085678+00:00
3,Balsamic Pork Chops,1,,,,,,,,,,2/14/2019,2024-01-30 01:04:46.085678+00:00
4,Chocolate Raspberry Torte,5,Hard,So good,Dessert,60.0,,,"Eggs,Milk",,,10/11/2016,2024-01-30 01:04:46.085678+00:00
