# 3.0 Generate clean recipe table

## Update the old recipe table with the new (faster) image URLs, and remove/replace broken links

In [15]:
import os
import pandas as pd
import numpy as np

In [16]:
# Load the OLD recipe table, whose image links have not been updated yet
df_old = pd.read_csv('../data/recipe_table.csv', sep=';')

# Load the updated table. After moving the index into the frame, the first
# row carries the column names, so promote it to the header and discard it.
raw_new = pd.read_csv('../data/recipe_table_updated_UAT.csv', sep=',').reset_index()
header_row = raw_new.iloc[0].tolist()
df_new = raw_new.drop(0)
df_new.columns = header_row

# Values were read alongside the header row, so cast the ids to integers
df_new['id'] = df_new['id'].astype(int)

In [17]:
# Lookup tables mapping recipe id -> image URL for the old and the new table
old_images = df_old.set_index('id')['image_url'].to_dict()
new_images = df_new.set_index('id')['image_url'].to_dict()

In [18]:
# For every recipe id in the old table, take the link from the new table
# when one exists there; otherwise keep the old link.
new_dict = {}
for recipe_id, old_url in old_images.items():
    if recipe_id in new_images:
        new_dict[recipe_id] = new_images[recipe_id]
    else:
        new_dict[recipe_id] = old_url

In [19]:
# Make a new dataframe with the updated images: one row per recipe id
# (as the index) and a single 'image_url' column
updated_images = pd.DataFrame(
    list(new_dict.values()),
    index=list(new_dict.keys()),
    columns=['image_url'],
)

In [20]:
# Query string appended to every link so images are requested at a smaller
# size, to further optimize speed
resize_query = '?auto=format&fit=crop&w=320&h=218'

# Move the recipe ids out of the index into an 'id' column
updated_images = updated_images.rename_axis('id').reset_index()
updated_images['image_url'] = updated_images['image_url'] + resize_query

In [21]:
# Update old recipe table with new images.
# The links are matched on recipe id rather than on row position: a plain
# column assignment aligns on the row index, which silently attaches the
# wrong image to a recipe whenever df_old has a repeated id or a
# non-default index.
df_updated = df_old.copy()
url_by_id = updated_images.set_index('id')['image_url']
df_updated['image_url'] = df_updated['id'].map(url_by_id)

# Links that are still broken on the new table (217 was checked and is good)
broken_links = [741, 808, 824, 880, 889, 938, 945]
is_broken = df_updated['id'].isin(broken_links)

# Remove the broken recipes and any recipe without a food group in one pass.
# The trailing copy() makes df_updated an independent frame so the cell-wise
# edits further down do not trigger chained-assignment warnings.
df_updated = df_updated.loc[~is_broken].dropna(subset=['food_group']).copy()

In [22]:
# These recipes have bad links in the new recipe table but working ones in
# the old table, so copy the old link back for each of them.
id_list = [320, 335, 349, 495, 647, 764]

for recipe_id in id_list:
    # Row label of the first match in each table
    target_label = df_updated[df_updated['id'] == recipe_id].index[0]
    source_label = df_old[df_old['id'] == recipe_id].index[0]
    df_updated.at[target_label, 'image_url'] = df_old.at[source_label, 'image_url']

In [23]:
# Check that the link was replaced: recipe 495 should print the same URL
# from the old table and from the updated table
check_id = 495
for frame in (df_old, df_updated):
    row_label = frame[frame['id'] == check_id].index[0]
    print(frame.at[row_label, 'image_url'])

https://d3ce0k5v3uorqv.cloudfront.net/uploads/recipe_image/643/image/d8e83d976ac20ff159db6191e92278b5.jpg
https://d3ce0k5v3uorqv.cloudfront.net/uploads/recipe_image/643/image/d8e83d976ac20ff159db6191e92278b5.jpg


In [24]:
# Reassign the ginger salmon recipe to the 'Fish' food group (instead of chicken)
salmon_title = 'Ginger salmon with buckwheat noodles & peanuts'
is_ginger_salmon = df_updated['title'] == salmon_title
salmon_label = df_updated[is_ginger_salmon].index[0]
df_updated.at[salmon_label, 'food_group'] = 'Fish'

# Display the row to confirm the change
df_updated[is_ginger_salmon]

Unnamed: 0,allergens,calories,carbs,cuisine,fat,food_group,protein,season,specials,id,cooking_time,image_url,instructions,key_ingredient,title,description,price_1p_pence,price_2p_pence,price_4p_pence
444,"{Fish,Peanuts,Sesame,Soya}",618,48,asian,32,Fish,42,spring,,537,15,https://mindfulchef-uat.imgix.net/recipes/537/...,1. Boil a kettle. Heat a frying pan with half ...,Freshly caught Scottish salmon,Ginger salmon with buckwheat noodles & peanuts,This quick sesame and ginger stir-fry features...,1175,1700,3000


In [25]:
# Change 4 shellfish entries to their correct food groups (recipe id -> group)
food_group_fixes = {
    31: 'Beef',
    102: 'Chicken',
    366: 'Chicken',
    407: 'Fish',
}

for recipe_id, food_group in food_group_fixes.items():
    row_label = df_updated[df_updated['id'] == recipe_id].index[0]
    df_updated.at[row_label, 'food_group'] = food_group

In [26]:
# Check the change: show the row for recipe 407
df_updated.loc[df_updated['id'] == 407]

Unnamed: 0,allergens,calories,carbs,cuisine,fat,food_group,protein,season,specials,id,cooking_time,image_url,instructions,key_ingredient,title,description,price_1p_pence,price_2p_pence,price_4p_pence
327,"{Crustaceans,Mustard,Sulphites}",636,56,asian,30,Fish,36,winter,"{'post workout'""}""",407,30,https://mindfulchef-uat.imgix.net/recipes/407/...,1. Boil a kettle. Rinse the black rice and add...,Lincolnshire prawns,Sri Lankan coconut prawn curry & black rice,Our Lincolnshire prawns are grown sustainably ...,1200,2000,3800


In [30]:
# Write the cleaned recipe table to disk, semicolon-separated and without
# the row index
output_path = '../data/recipe_table_new.csv'
df_updated.to_csv(output_path, sep=';', index=False)

In [31]:
# Image link of the first recipe whose title contains 'Matcha salmon'
is_matcha_salmon = df_updated['title'].str.contains('Matcha salmon')
df_updated.loc[is_matcha_salmon, 'image_url'].iloc[0]

'https://d3ce0k5v3uorqv.cloudfront.net/uploads/recipe_image/643/image/d8e83d976ac20ff159db6191e92278b5.jpg'