## Library

In [None]:
import numpy as np
import pandas as pd
import ast

## Load Data

In [None]:
! pip install -q kaggle
! mkdir ~/.kaggle #creating folder
! cp kaggle.json ~/.kaggle/ #copying kaggle.json
! chmod 600 ~/.kaggle/kaggle.json #reading the file with full access
! kaggle datasets download 'shuyangli94/food-com-recipes-and-user-interactions'
! mkdir data
! unzip food-com-recipes-and-user-interactions.zip -d data

Downloading food-com-recipes-and-user-interactions.zip to /content
 99% 266M/267M [00:09<00:00, 31.9MB/s]
100% 267M/267M [00:09<00:00, 29.7MB/s]
Archive:  food-com-recipes-and-user-interactions.zip
  inflating: data/PP_recipes.csv     
  inflating: data/PP_users.csv       
  inflating: data/RAW_interactions.csv  
  inflating: data/RAW_recipes.csv    
  inflating: data/ingr_map.pkl       
  inflating: data/interactions_test.csv  
  inflating: data/interactions_train.csv  
  inflating: data/interactions_validation.csv  


In [None]:
df_recipe_raw = pd.read_csv('data/RAW_recipes.csv')

# These columns are stored in the CSV as stringified Python literals;
# parse every cell back into a real Python object.
literal_columns = ('tags', 'nutrition', 'steps', 'ingredients')
for col in literal_columns:
  parsed_cells = df_recipe_raw[col].map(ast.literal_eval)
  df_recipe_raw[col] = parsed_cells

# Convert the submission date strings to datetimes.
submitted_at = pd.to_datetime(df_recipe_raw['submitted'])
df_recipe_raw['submitted'] = submitted_at

# Order the recipes by id.
df_recipe_raw = df_recipe_raw.sort_values('id')

## Data Preprocessing

### Recipe

In [None]:
# Columns carried over as-is into the recipe table; the list-valued columns
# (tags, nutrition, steps, ingredients) are handled in their own sections.
recipe_columns = [
    'id', 'name', 'contributor_id', 'submitted', 'description',
    'minutes', 'n_steps', 'n_ingredients',
]

# `.copy()` makes df_recipe an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
df_recipe = df_recipe_raw[recipe_columns].copy()

In [None]:
# Use the same column names as the rating table ('user_id', 'date').
recipe_column_renames = {'contributor_id': 'user_id', 'submitted': 'date'}
df_recipe = df_recipe.rename(columns=recipe_column_renames)

In [None]:
# `nutrition` is a positional list per recipe; copy its first six entries
# into individually named columns.
nutrition_columns = ['calories', 'total_fat', 'sugar',
                     'sodium', 'protein', 'saturated_fat']
nutrition_values = df_recipe_raw['nutrition']

for i, col in enumerate(nutrition_columns):
  # Bind the position as a default argument so each lambda reads its own slot.
  df_recipe[col] = nutrition_values.apply(lambda values, position=i: values[position])

In [None]:
# Display the column names to confirm the rename and the added nutrition columns.
df_recipe.columns

Index(['id', 'name', 'user_id', 'date', 'description', 'minutes', 'n_steps',
       'n_ingredients', 'calories', 'total_fat', 'sugar', 'sodium', 'protein',
       'saturated_fat'],
      dtype='object')

In [None]:
# Drop recipes that have no name. `.copy()` detaches the filtered rows from
# the frame they were selected from, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
has_name = df_recipe['name'].notnull()
df_recipe = df_recipe.loc[has_name].copy()

In [None]:
# Replace missing descriptions with an empty string (only this column is filled).
df_recipe = df_recipe.fillna({'description': ''})

In [None]:
def _capitalize_tokens(raw_name):
  """Trim the name and capitalize each space-separated token."""
  words = raw_name.strip().split(' ')
  return ' '.join(word.capitalize() for word in words)

df_recipe['name'] = df_recipe['name'].map(_capitalize_tokens)

In [None]:
# Renumber the rows from 0 after the filtering above, discarding the old index.
df_recipe = df_recipe.reset_index(drop=True)

In [None]:
# Keep only recipes that take at most 600 minutes.
within_time_limit = df_recipe['minutes'] <= 600
df_recipe = df_recipe[within_time_limit]

In [None]:
# Write the cleaned recipe table without the row index.
df_recipe.to_csv('recipe.csv', index=False)

### Rating

In [None]:
# Load the raw user/recipe interactions and convert the date strings to datetimes.
df_rating_raw = pd.read_csv('data/RAW_interactions.csv')
df_rating_raw = df_rating_raw.assign(date=pd.to_datetime(df_rating_raw['date']))

In [None]:
# Keep only interactions whose recipe is still present in the cleaned recipe
# table. `.copy()` makes df_rating an independent frame, so filling the
# 'review' column in the next cell no longer raises SettingWithCopyWarning.
rated_recipe_kept = df_rating_raw['recipe_id'].isin(df_recipe['id'])
df_rating = df_rating_raw.loc[rated_recipe_kept].copy()

In [None]:
# Replace missing review text with an empty string (only this column is filled).
df_rating = df_rating.fillna({'review': ''})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rating['review'] = df_rating['review'].fillna('')


In [None]:
# Renumber the rows from 0 after the recipe filter, discarding the old index.
df_rating = df_rating.reset_index(drop=True)

In [None]:
# Write the cleaned rating table without the row index.
df_rating.to_csv('rating.csv', index=False)

### Ingredients

In [None]:
import nltk
# Fetch the WordNet corpus; NLTK reports it as up to date when already present.
nltk.download('wordnet')

# Lemmatizer class instantiated in the next cell.
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Single lemmatizer instance, reused for every ingredient token below.
lemmatizer = WordNetLemmatizer()

In [None]:
# Flatten the per-recipe ingredient lists and collect the distinct names, sorted.
unique_ingredient_names = df_recipe_raw['ingredients'].explode().unique()
ingredients = sorted(unique_ingredient_names.tolist())

In [None]:
# Map every raw ingredient name to its lemmatized form (lemmatized token by
# token, split on single spaces).
ingredients_clean_map = {
    raw_name: ' '.join(lemmatizer.lemmatize(word) for word in raw_name.split(' '))
    for raw_name in ingredients
}

# `ingredients` holds distinct names, so the map's values line up one-to-one
# with it, in the same order.
ingredients_clean = list(ingredients_clean_map.values())

In [None]:
# De-duplicate the lemmatized names (several raw names may share one) and sort them.
distinct_clean_names = set(ingredients_clean)
ingredients_clean = sorted(distinct_clean_names)

In [None]:
# Number of distinct lemmatized ingredient names.
len(ingredients_clean)

13755

In [None]:
# Assign each lemmatized ingredient its position in the sorted list as its id.
ingredients_map = {
    clean_name: position
    for position, clean_name in enumerate(ingredients_clean)
}

In [None]:
# Lookup table: one row per ingredient, with its integer id and its lemmatized name.
ingredient_ids = list(ingredients_map.values())
ingredient_names = list(ingredients_map.keys())
df_ingredient = pd.DataFrame({'id': ingredient_ids, 'ingredient': ingredient_names})

In [None]:
# Write the ingredient lookup table without the row index.
df_ingredient.to_csv('ingredient.csv', index=False)

### Recipe Ingredient

In [None]:
# One row per (recipe id, ingredient name): explode the per-recipe lists, then
# turn the id index back into an ordinary column.
ingredients_by_recipe = df_recipe_raw.set_index('id')['ingredients']
df_recipe_ingredient = ingredients_by_recipe.explode().reset_index()

In [None]:
# Translate raw ingredient text -> lemmatized text -> integer ingredient id.
ingredient_id_column = (
    df_recipe_ingredient['ingredients']
    .map(ingredients_clean_map)
    .map(ingredients_map)
)
df_recipe_ingredient['ingredients'] = ingredient_id_column

In [None]:
# Name the columns after what they now hold: a recipe id and an ingredient id.
recipe_ingredient_renames = {'id': 'recipe_id', 'ingredients': 'ingredient_id'}
df_recipe_ingredient = df_recipe_ingredient.rename(columns=recipe_ingredient_renames)

In [None]:
# Keep only rows whose recipe is present in the cleaned recipe table.
ingredient_recipe_kept = df_recipe_ingredient['recipe_id'].isin(df_recipe['id'])
df_recipe_ingredient = df_recipe_ingredient[ingredient_recipe_kept]

In [None]:
# Renumber the rows from 0 after the recipe filter, discarding the old index.
df_recipe_ingredient = df_recipe_ingredient.reset_index(drop=True)

In [None]:
# Write the recipe-to-ingredient link table without the row index.
df_recipe_ingredient.to_csv('recipe_ingredient.csv', index=False)

### Tags

In [None]:
# Flatten the per-recipe tag lists and collect the distinct tags, sorted.
unique_tag_names = df_recipe_raw['tags'].explode().unique()
tags = sorted(unique_tag_names.tolist())

In [None]:
# Remove the empty-string tag (presumably what sorts first in the list; the
# recipe-tag table below drops '' as well). Filtering by value instead of by
# position keeps this cell idempotent: re-running it never removes a real tag,
# and nothing is lost if '' is not present.
tags = [tag for tag in tags if tag != '']

In [None]:
# Assign each tag its position in the sorted list as its id.
tags_map = {tag_name: position for position, tag_name in enumerate(tags)}

In [None]:
# Lookup table: one row per tag, with its integer id and its text.
tag_ids = list(tags_map.values())
tag_names = list(tags_map.keys())
df_tag = pd.DataFrame({'id': tag_ids, 'tag': tag_names})

In [None]:
# Write the tag lookup table without the row index.
df_tag.to_csv('tag.csv', index=False)

### Recipe Tag

In [None]:
# Series indexed by recipe id with one entry per tag of that recipe.
tags_by_recipe = df_recipe_raw.set_index('id')['tags']
df_recipe_tag = tags_by_recipe.explode()

In [None]:
# Turn the recipe-id index into an ordinary column, giving a frame with
# columns 'id' and 'tags'.
df_recipe_tag = df_recipe_tag.reset_index()

In [None]:
# Drop rows whose tag is the empty string. `.copy()` makes the result an
# independent frame, so overwriting the 'tags' column in the next cell does
# not raise SettingWithCopyWarning.
tag_not_empty = df_recipe_tag['tags'] != ''
df_recipe_tag = df_recipe_tag.loc[tag_not_empty].copy()

In [None]:
# Replace each tag's text with its integer id from tags_map.
df_recipe_tag = df_recipe_tag.assign(tags=df_recipe_tag['tags'].map(tags_map))

In [None]:
# Name the columns after what they now hold: a recipe id and a tag id.
recipe_tag_renames = {'id': 'recipe_id', 'tags': 'tag_id'}
df_recipe_tag = df_recipe_tag.rename(columns=recipe_tag_renames)

In [None]:
# Keep only rows whose recipe is present in the cleaned recipe table.
tag_recipe_kept = df_recipe_tag['recipe_id'].isin(df_recipe['id'])
df_recipe_tag = df_recipe_tag[tag_recipe_kept]

In [None]:
# Renumber the rows from 0 after the filters above, discarding the old index.
df_recipe_tag = df_recipe_tag.reset_index(drop=True)

In [None]:
# Write the recipe-to-tag link table without the row index.
df_recipe_tag.to_csv('recipe_tag.csv', index=False)

### Step

In [None]:
# Series indexed by recipe id with one entry per step of that recipe.
steps_by_recipe = df_recipe_raw.set_index('id')['steps']
df_recipe_step = steps_by_recipe.explode()

In [None]:
# Turn the recipe-id index into an ordinary column, giving a frame with
# columns 'id' and 'steps'.
df_recipe_step = df_recipe_step.reset_index()

In [None]:
# Each row holds a single step, so use the singular column name.
step_renames = {'steps': 'step'}
df_recipe_step = df_recipe_step.rename(columns=step_renames)

In [None]:
# Drop rows with any missing value ('any' is dropna's default mode).
df_recipe_step = df_recipe_step.dropna()

In [None]:
# Keep only steps whose text is longer than two characters.
step_has_text = df_recipe_step['step'].str.len() > 2

# Also keep only steps of recipes present in the cleaned recipe table, the
# same restriction the recipe-ingredient and recipe-tag tables apply, so that
# recipe_step.csv does not reference recipes missing from recipe.csv.
step_recipe_kept = df_recipe_step['id'].isin(df_recipe['id'])

# `.copy()` makes the result an independent frame, so adding the 'step_n'
# column in the next cell does not raise SettingWithCopyWarning.
df_recipe_step = df_recipe_step.loc[step_has_text & step_recipe_kept].copy()

In [None]:
# Number the steps within each recipe, starting at 1, in their current row order.
step_position = df_recipe_step.groupby('id').cumcount() + 1
df_recipe_step = df_recipe_step.assign(step_n=step_position)

In [None]:
# Write the recipe step table without the row index.
df_recipe_step.to_csv('recipe_step.csv', index=False)

### User

In [None]:
# Every user id found in the raw data: recipe contributors plus the users in
# the raw interactions, de-duplicated and sorted.
contributor_ids = set(df_recipe_raw['contributor_id'].unique().tolist())
interaction_user_ids = set(df_rating_raw['user_id'].unique().tolist())
user_ids = sorted(contributor_ids | interaction_user_ids)

In [None]:
# Single-column user table holding the sorted user ids.
df_user = pd.DataFrame(user_ids, columns=['id'])

In [None]:
# Write the user table without the row index.
df_user.to_csv('user.csv', index=False)