In [1]:
import pandas as pd

## Create New Ingredients Table

In [2]:
# Load the ingredient table (cleanIngredientID, name, name_cleaned, groupings)
# from CSV and preview its first rows.
cleanIngredients = pd.read_csv(
    filepath_or_buffer="Clean Recipe Data/cleanIngredients_v2.csv",
)
cleanIngredients.head()

Unnamed: 0,cleanIngredientID,name,name_cleaned,groupings
0,4308,lettuce,lettuce,lettuce
1,2744,french vanilla pudding and pie filling mix,french vanilla pudding and pie filling mix,vanilla
2,6843,stove top stuffing mix,stove top stuffing mix,stove top stuffing mix
3,1910,cream cheese,cream cheese,cream cheese
4,1168,cheddar,cheddar,cheddar


In [3]:
# Build the grouped-ingredient table: one row per distinct value of the
# 'groupings' column, in order of first appearance.
distinctGroupNames = cleanIngredients['groupings'].unique()

# The fresh 0..n-1 row index doubles as the ID: reset_index() exposes it as an
# 'index' column, which is then renamed to 'ingredientID'.
groupedIngredients = (
    pd.DataFrame(distinctGroupNames, columns=['name'])
    .reset_index()
    .rename(columns={'index': 'ingredientID'})
)

groupedIngredients.head()

Unnamed: 0,ingredientID,name
0,0,lettuce
1,1,vanilla
2,2,stove top stuffing mix
3,3,cream cheese
4,4,cheddar


## Create New Join Table

In [4]:
# Attach the grouped ingredientID to every row of the clean-ingredient table by
# matching its 'groupings' value against the grouped table's 'name' column.
ingredientsWithGroupID = cleanIngredients.merge(
    groupedIngredients,
    how='left',
    left_on='groupings',
    right_on='name',
)

# Keep only the old-ID -> new-ID mapping.
newIngredientTable = ingredientsWithGroupID[['cleanIngredientID', 'ingredientID']]
newIngredientTable.head()

Unnamed: 0,cleanIngredientID,ingredientID
0,4308,0
1,2744,1
2,6843,2
3,1910,3
4,1168,4


In [5]:
# Load the existing join table (recipeID, cleanIngredientID) and preview it.
recipeCleanJoin = pd.read_csv(
    filepath_or_buffer="Clean Recipe Data/recipeCleanJoin.csv",
)
recipeCleanJoin.head()

Unnamed: 0,recipeID,cleanIngredientID
0,424415,389
1,146223,2683
2,312329,1257
3,74301,7940
4,76272,3484


In [6]:
# Swap each cleanIngredientID for its grouped ingredientID, keep only the
# (recipeID, ingredientID) pair, and drop rows that end up identical
# (such as when two clean ingredients of one recipe map to the same group).
recipeIngredientJoin = (
    recipeCleanJoin
    .merge(newIngredientTable, how='left', on='cleanIngredientID')
    .loc[:, ['recipeID', 'ingredientID']]
    .drop_duplicates()
)
recipeIngredientJoin.head()

Unnamed: 0,recipeID,ingredientID
0,424415,4806
1,146223,1737
2,312329,72
3,74301,4029
4,76272,295


## Add Column with New Ingredient IDs to Recipe Table

In [7]:
# Collapse the join table to one row per recipe, gathering that recipe's
# ingredientIDs into a Python list stored in 'ingredientIDList'.
ingredientIDsByRecipe = recipeIngredientJoin.groupby('recipeID')['ingredientID']
recipeIngredientList = (
    ingredientIDsByRecipe
    .apply(list)
    .reset_index(name='ingredientIDList')
)
recipeIngredientList.head()

Unnamed: 0,recipeID,ingredientIDList
0,38,"[3433, 4556, 379, 4632]"
1,40,"[3956, 231, 4048, 2539, 4442, 4747]"
2,45,"[3956, 1758, 5892, 2294, 1737, 103, 2889, 172,..."
3,46,[660]
4,49,"[580, 2930, 72, 1737, 6296, 103, 9, 930, 525]"


## Save Files

In [9]:
# Write each result table to its CSV file, leaving out the DataFrame index.
for outputFrame, outputPath in (
    (groupedIngredients, "Clean Recipe Data/ingredients.csv"),
    (recipeIngredientJoin, "Clean Recipe Data/recipeIngredientJoin.csv"),
    (recipeIngredientList, "Clean Recipe Data/RecipeIngredientList.csv"),
):
    outputFrame.to_csv(outputPath, index=False)