# Eat Your Books Project

## Data Processing

In [1]:
import pandas as pd

In [2]:
data_folder_path = "data/2023-02-05"

### Examine Bookmarks

In [3]:
recipe_bookmark = pd.read_csv(data_folder_path + '/recipe_bookmark.csv')
print("----------------------------------------------------------------------")
print("Recipe bookmark")
print(recipe_bookmark.shape)
print(recipe_bookmark.head())
print(recipe_bookmark.bookmark_name.unique())
print("----------------------------------------------------------------------")


----------------------------------------------------------------------
Recipe bookmark
(3188, 2)
   recipe_id     bookmark_name
0    1855630  I've cooked this
1    2601496  I've cooked this
2    2601194  I've cooked this
3    2602365  I've cooked this
4    2866117  I've cooked this
["I've cooked this" 'Favorite Recipes' '2017-01' '2017-02' '2017-03'
 '2017-08' '2017-09' '2017-10' '2017-11' '2017-12' '2018-01' '2018-02'
 '2018-03' '2018-04' '2018-05' '2018-06' '2018-07' '2018-08' '2018-09'
 '2018-10' '2018-11' '2018-12' '2019-01' '2019-02' '2019-03' '2019-04'
 '2019-05' '2019-06' '2019-07' '2019-08' '2019-09' '2019-10' '2019-11'
 '2019-12' '2020-01' '2020-02' '2020-03' '2020-04' '2020-05' '2020-06'
 '2020-07' '2020-08' '2020-09' '2020-10' '2020-11' '2020-12' '2021-01'
 '2021-02' '2021-03' '2021-04' '2021-05' '2021-06' '2021-07' '2021-08'
 '2021-09' '2021-10' '2021-11' '2021-12' '2022-01' '2022-02' '2022-03'
 '2022-04' '2022-05' '2022-06' '2022-07' '2022-08' '2022-09' '2022-10'
 '2022-11

### Convert Date Column to Date Types

In [4]:
no_dates = ["I've cooked this"
            , 'Favorite Recipes' 
            , 'Ottolenghi Guardian Book'
            ]

recipe_dates = recipe_bookmark.loc[~recipe_bookmark["bookmark_name"].isin(no_dates)]
recipe_dates['bookmark_name'].replace({'pre-2017': '2016-01'},inplace=True)
recipe_dates['bookmark_name'] = pd.to_datetime(recipe_dates["bookmark_name"])
recipe_dates.rename(columns={"bookmark_name": "date"},inplace=True)
print("----------------------------------------------------------------------")
print("Recipe Dates")
print(recipe_dates.head())
print(recipe_dates.shape)
print("----------------------------------------------------------------------")

----------------------------------------------------------------------
Recipe Dates
      recipe_id       date
1346    1714414 2017-01-01
1347    2159890 2017-01-01
1348     909867 2017-01-01
1349     769307 2017-01-01
1350    1714997 2017-01-01
(1637, 2)
----------------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipe_dates['bookmark_name'] = pd.to_datetime(recipe_dates["bookmark_name"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


### How Many Recipes Cooked

In [5]:
## first get the recipes that have a bookmark either "I've cooked this" or "Favourite Recipes"

recipe_cooked = recipe_bookmark.loc[recipe_bookmark['bookmark_name'].isin(["I've cooked this", "Favorite Recipes"])]
print("----------------------------------------------------------------------")
print("Recipe Cooked")
print(recipe_cooked.head())
print(recipe_cooked.shape)
print("----------------------------------------------------------------------")

----------------------------------------------------------------------
Recipe Cooked
   recipe_id     bookmark_name
0    1855630  I've cooked this
1    2601496  I've cooked this
2    2601194  I've cooked this
3    2602365  I've cooked this
4    2866117  I've cooked this
(1346, 2)
----------------------------------------------------------------------


In [6]:
# then merge those from above with those with a data, all with a date means they've been cooked

recipe_dates_cooked = recipe_dates.merge(recipe_cooked, on='recipe_id', how='outer')

print("----------------------------------------------------------------------")
print("Recipe Dates Cooked")
print(recipe_dates_cooked.head())
print(recipe_dates_cooked.shape)
print("----------------------------------------------------------------------")

----------------------------------------------------------------------
Recipe Dates Cooked
   recipe_id       date     bookmark_name
0    1714414 2017-01-01  I've cooked this
1    2159890 2017-01-01  I've cooked this
2     909867 2017-01-01  I've cooked this
3     769307 2017-01-01  I've cooked this
4    1714997 2017-01-01  I've cooked this
(1851, 3)
----------------------------------------------------------------------


In [7]:
# examine empty values
nulls = recipe_dates_cooked[recipe_dates_cooked.isna().any(axis=1)]
print("----------------------------------------------------------------------")
print("Recipe Dates Cooked Nulls")
print(nulls.head())
print(nulls.shape)
print("----------------------------------------------------------------------")


----------------------------------------------------------------------
Recipe Dates Cooked Nulls
      recipe_id       date     bookmark_name
1746    2865862 2023-01-01               NaN
1801    2023128        NaT  I've cooked this
1802    1855782        NaT  I've cooked this
1803    1715016        NaT  I've cooked this
1804    2175183        NaT  I've cooked this
(51, 3)
----------------------------------------------------------------------


In [8]:
nulls.to_csv(data_folder_path + '/nulls.csv',index=False)

In [9]:
recipe_dates_cooked.to_csv(data_folder_path + '/recipe_dates_cooked.csv',index=False)

recipes_cooked = recipe_dates_cooked.recipe_id.values.tolist()
print("----------------------------------------------------------------------")
print("Number of recipes cooked", len(recipes_cooked))
print("----------------------------------------------------------------------")

recipes = pd.read_csv(data_folder_path + '/recipe.csv')
recipes_total = recipes.id.values.tolist()
print("----------------------------------------------------------------------")
print("Number of total ", len(recipes_total))
print("----------------------------------------------------------------------")

----------------------------------------------------------------------
Number of recipes cooked 1851
----------------------------------------------------------------------
----------------------------------------------------------------------
Number of total  13834
----------------------------------------------------------------------


In [10]:
unfound_count = 0
unfounds = list()

for recipe in recipes_cooked:
    if recipe not in recipes_total:
        unfound_count += 1
        unfounds.append(recipe)
print("----------------------------------------------------------------------")
print("Recipes in cooked not in total", unfound_count)
print(unfounds)
print("----------------------------------------------------------------------")


----------------------------------------------------------------------
Recipes in cooked not in total 0
[]
----------------------------------------------------------------------


In [11]:
!jupyter nbconvert --to python data_processing.ipynb 

[NbConvertApp] Converting notebook data_processing.ipynb to python
[NbConvertApp] Writing 3752 bytes to data_processing.py
