# Gousto Recipe Scraper

***This notebook scrapes recipes from gousto.co.uk***

This script grabs 25 recipes and all their information, including the recipe title, nutritional information and ingredients. To grab the recipes for a certain diet type, the category must be changed to fish/vegan/vegetarian/beef/pork/chicken...; the code makes clear where to make these changes. At the end, the recipe data is saved as a CSV file.

In [13]:
#Make requests to webpages and print the response text
import requests
#Used to parse json, treats json as a python dictionary
import json
#package used to work with regular expressions
import re

import pandas as pd
import numpy as np

In [14]:
# Base URL of the Gousto API; the request URLs in this notebook are built on top of it
api_root = 'https://production-api.gousto.co.uk/cmsreadbroker/v1'

In [15]:
# One shared HTTP session, reused for the requests made in this notebook
s = requests.Session()

***In the code chunk below the category must be changed from 'chicken-recipes' to the diet type you wish to scrape recipes for***

In [16]:
#The below code chunk loads the json object containing the recipe listing
# (one page of entries for the chosen category; each entry carries the recipe's url)
limit = 400
offset = 100

catagory = "chicken-recipes" #'vegetarian-recipes' #vegan-recipes #pork-recipes #beef-recipes #fish-recipes #chicken-recipes

# TODO: put this in a loop, to iterate through increasing offset until you hit the data->count value (currently 3008)
r = s.get(
    f'{api_root}/recipes',
    params={'category': catagory, 'limit': limit, 'offset': offset},
)
# Stop here with a clear HTTP error rather than failing later with a confusing
# JSON decoding error or KeyError when the listing could not be fetched
r.raise_for_status()
# API returns a JSON object, so turn that into a parsable python dictionary object
j = json.loads(r.content)

***In the code chunk below change 'chicken-recipes' to the diet type you wish to scrape recipes for***

In [17]:
#In the code chunk below information for each recipe is loaded, parsed and saved to the appropriate list
# One list per output column; position i in every list belongs to the same recipe.
titles = []
ratings = []
preptimes = []
cusines = []
calories = []
fat_portions = []
fat_100gs = []
saturated_fat_portions = []
saturated_fat_100gs = []
carbs_portions = []
carbs_100gs = []
fibre_portions = []
fibre_100gs = []
protein_portions = []
protein_100gs = []
salt_portions = []
salt_100gs = []
final_ingredients = []

# Number of recipes stored so far; the loop stops once max_recipes have been collected
count_limit = 0
max_recipes = 25

# Matches "/<category>/<recipe-slug>" and captures the slug, e.g.
# /vegan-recipes/sri-lankan-coconut-dal-with-aubergine-pickle -> sri-lankan-coconut-dal-with-aubergine-pickle
# The pattern is built from `catagory`, so the filter always follows the category
# that was requested instead of a second hard-coded 'chicken-recipes' to edit by hand.
recipe_url_pattern = re.compile(rf'/{re.escape(catagory)}/([a-z0-9-]+)')

#The j['data']['entries'] accesses all the data for each recipe and iterates through them using the loop
print(f"len {len(j['data']['entries'])}")
for entry in j['data']['entries']:

    if count_limit == max_recipes:
        break

    #This grabs the url of each recipe entry
    u = entry['url']

    #Skip entries whose url does not belong to the chosen category (or has no usable slug)
    url_match = recipe_url_pattern.search(u)
    if not url_match:
        continue
    recipe_name = url_match[1]

    #Creates the api address for each individual recipe by connecting the root api and the recipe name
    #Eg Request URL: https://production-api.gousto.co.uk/cmsreadbroker/v1/recipe/goan-fish-spinach-curry-coriander-rice
    r = s.get(f'{api_root}/recipe/{recipe_name}')
    if not r.ok:
        # Report the failed recipe and move on to the next entry
        print(f"Couldn't get {recipe_name}")
        print(r.status_code)
        print(r.reason)
        continue

    #Loads all the json content at the URL and saves to the variable recipe
    recipe = json.loads(r.content)
    details = recipe["data"]["entry"]
    per_portion = details["nutritional_information"]["per_portion"]
    per_100g = details["nutritional_information"]["per_hundred_grams"]

    # NOTE(review): the first element of the ingredients list is skipped, exactly as
    # the original range(1, len(...)) loop did - confirm whether that is intended
    # or an off-by-one that drops one ingredient per recipe.
    recipe_ingredients = [ingredient["label"] for ingredient in details["ingredients"][1:]]

    # Look up every value before appending anything, so a missing field can
    # never leave the column lists with different lengths.
    row = [
        (titles, details["title"]),
        (ratings, details["rating"]["average"]),
        (preptimes, details["prep_times"]["for_2"]),
        (cusines, details["cuisine"]["slug"]),
        (calories, per_portion["energy_kcal"]),
        (fat_portions, per_portion["fat_mg"]),
        (fat_100gs, per_100g["fat_mg"]),
        (saturated_fat_portions, per_portion["fat_saturates_mg"]),
        (saturated_fat_100gs, per_100g["fat_saturates_mg"]),
        (carbs_portions, per_portion["carbs_mg"]),
        (carbs_100gs, per_100g["carbs_mg"]),
        (fibre_portions, per_portion["fibre_mg"]),
        (fibre_100gs, per_100g["fibre_mg"]),
        (protein_portions, per_portion["protein_mg"]),
        (protein_100gs, per_100g["protein_mg"]),
        (salt_portions, per_portion["salt_mg"]),
        (salt_100gs, per_100g["salt_mg"]),
        (final_ingredients, recipe_ingredients),
    ]
    for column, value in row:
        column.append(value)

    count_limit += 1


len 244


In [18]:
#The code chunk below sews together the lists to create a dataframe of the recipe information
# Maps each output column name to the list holding that column's values
d = {
    'title': titles,
    'rating': ratings,
    'prep_time': preptimes,
    'cuisine': cusines,
    'calories': calories,
    'fat_portion': fat_portions,
    'fat_100g': fat_100gs,
    'saturated_fat_portion': saturated_fat_portions,
    'saturated_fat_100g': saturated_fat_100gs,
    'carbs_portion': carbs_portions,
    'carbs_100g': carbs_100gs,
    'fibre_portion': fibre_portions,
    'fibre_100g': fibre_100gs,
    'protein_portions': protein_portions,
    'protein_100g': protein_100gs,
    'salt_portion': salt_portions,
    'salt_100g': salt_100gs,
    'ingredients': final_ingredients,
}
df = pd.DataFrame(d)
# Show the first rows as a quick sanity check
df.head()

Unnamed: 0,title,rating,prep_time,cuisine,calories,fat_portion,fat_100g,saturated_fat_portion,saturated_fat_100g,carbs_portion,carbs_100g,fibre_portion,fibre_100g,protein_portions,protein_100g,salt_portion,salt_100g,ingredients
0,Baharat Chicken & Pepper Skewers With Spiced Rice,4.5,35,middle-eastern,497,4613,734,1513,240,71589,11399,8355,1330,44817,7136,1486,236,"[1 tsp baharat, 1 tsp ground cumin, 1 tsp smok..."
1,Spiced Chicken Mujaddara,4.5,35,middle-eastern,701,22169,4904,6237,1379,80081,17717,8194,1812,51162,11319,249,55,"[2 tsp ground cumin seeds , 130g wholegrain ri..."
2,10-Min Peri Peri Chicken Rice,4.5,10,mediterranean,411,9280,2401,2282,590,49398,12780,4785,1238,36046,9326,3052,789,"[1 tsp smoked paprika, 1 tsp ground turmeric, ..."
3,Ginger Chicken & Broccoli Tray Bake,4.5,40,indonesian,574,13023,3153,3063,741,73727,17851,5770,1397,43102,10436,2219,537,"[30ml soy sauce, 130g basmati rice, 25g honey,..."
4,Cheaty Chicken Kiev,4.5,30,ukrainian,591,7098,1112,1892,296,71869,11264,11000,1724,67302,10548,469,73,"[400g potatoes, 60g panko breadcrumbs, 2 Briti..."


***Change 'chicken-recipes' to the diet type you wish to save the csv file as***

In [9]:
# Expand each recipe's ingredient list into one column per list position
df_v1 = df["ingredients"].apply(pd.Series)
# Attach the expanded columns to the recipe table, matching rows by index
recipes = pd.merge(df, df_v1, left_index=True, right_index=True)
recipes.to_csv('chicken-recipes.csv', index=False)

***Change 'chicken-recipes' to the diet type you wish to read the csv file of***

In [10]:
#This code chunk cleans the data file and gets it into the format Nommm asks for
# Re-load the recipe table that was written to disk earlier in the notebook
df = pd.read_csv('chicken-recipes.csv')

# Unit words that split_ingredients counts as part of the quantity when they
# appear as a standalone word in an ingredient label
quantvars=['g', 'tsp', 'tbsp', 'pot', 'sachet']

def split_ingredients(string, quantvars):
    """Split an ingredient label into its quantity part and its name part.

    A word belongs to the quantity when it contains at least one digit
    (e.g. ``400g``, ``2``) or is listed in ``quantvars`` (e.g. ``tsp``).
    Every other word belongs to the ingredient name. Words keep the order
    they have in the label, so a quantity stays next to its unit
    (``"1 tsp salt and 2 tbsp oil"`` gives ``"1 tsp 2 tbsp"``).

    Args:
        string: The ingredient label, or a non-string placeholder such as
            NaN for a recipe that has no ingredient in this position.
        quantvars: Collection of unit words treated as quantity words.

    Returns:
        A ``(quantities, ingredients)`` tuple of space-joined strings, or
        ``(nan, nan)`` when ``string`` is not a ``str``.
    """
    # Missing cells are not strings; pass them through as NaN
    if not isinstance(string, str):
        return np.nan, np.nan

    quantities = []
    ingredients = []
    for word in string.split():
        is_quantity = word in quantvars or any(letter.isdigit() for letter in word)
        (quantities if is_quantity else ingredients).append(word)

    return ' '.join(quantities), ' '.join(ingredients)

# Work on a copy so the table loaded from disk stays untouched
df_recipes = df.copy()
# The digit-named columns are the ones that get split below
cols = [name for name in df_recipes.columns if name.isdigit()]
for col in cols:
    # Each cell becomes a (quantities, ingredients) pair; unzip the pairs into two columns
    split_pairs = df_recipes[col].apply(split_ingredients, args=(quantvars,))
    quantity_values, ingredient_values = zip(*split_pairs)
    df_recipes[f'quantities_{col}'] = quantity_values
    df_recipes[f'ingredients_{col}'] = ingredient_values

In [11]:
# Drop the combined 'ingredients' column and every raw, digit-named ingredient column.
# The digit-named columns are looked up instead of being hard-coded as '0'..'11':
# a fixed list raises a KeyError when the recipes produced fewer columns and
# leaves raw columns behind when they produced more.
raw_ingredient_cols = [col for col in df_recipes.columns if col.isdigit()]
df_recipes.drop(['ingredients'] + raw_ingredient_cols, axis=1, inplace=True)

***Change 'chicken-recipes' to the file name you wish to save the cleaned data as***

In [12]:
# Write the cleaned table to disk; with the file name left unchanged this
# overwrites the raw CSV that was saved earlier under the same name
df_recipes.to_csv('chicken-recipes.csv', index = False)