# Feature Engineering

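This notebook builds model-ready item features from the preprocessed recipe data: cyclical encodings of the submission date, one-hot indicators for frequent ingredients and tags, and log-transformed, standardized numeric fields. It then splits the recipes into train/validation/test sets and aligns the user–recipe interactions with those splits.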

### Load Data

#### Import Libraries

In [21]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed random seed passed to train_test_split so the data splits are reproducible
SEED = 42

In [2]:
# Directory and file names of the preprocessed parquet inputs.
# The directory is assembled from its components so the relative path is valid
# on every OS; a literal "..\data\preprocess" only resolves on Windows.
PREPROCESS_DATA_PATH = os.path.join("..", "data", "preprocess")
RECIPES_FILE = "recipes_eda.parquet"
INTERACTIONS_FILE = "interactions_eda.parquet"

path_recipes = os.path.join(PREPROCESS_DATA_PATH, RECIPES_FILE)
path_interactions = os.path.join(PREPROCESS_DATA_PATH, INTERACTIONS_FILE)

# Read both tables with the pyarrow engine
recipes = pd.read_parquet(path_recipes, engine="pyarrow")
interactions = pd.read_parquet(path_interactions, engine="pyarrow")

Before proceeding, inspect both dataframes (schema, sample rows, summary statistics, and null counts) to catch anything unexpected introduced during import.

In [3]:
# Check schemas: column names, dtypes and non-null counts of both dataframes
recipes.info()
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231636 entries, 0 to 231635
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   name                    231635 non-null  object        
 1   id                      231636 non-null  int64         
 2   minutes                 231636 non-null  int64         
 3   contributor_id          231636 non-null  int64         
 4   submitted               231636 non-null  datetime64[ns]
 5   tags                    231636 non-null  object        
 6   n_steps                 231636 non-null  int64         
 7   steps                   231636 non-null  object        
 8   description             226657 non-null  object        
 9   ingredients             231636 non-null  object        
 10  n_ingredients           231636 non-null  int64         
 11  calories                231636 non-null  float64       
 12  total_fat_pdv           231636

In [6]:
# Preview the first recipe row to sanity-check the column contents
recipes.head(1)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,ingredients,...,saturated_fat_pdv,carbohydrates_pdv,average_rating,interaction_count,description_word_count,steps_word_count,avg_step_word_count,review_word_count,review_sentiment,description_sentiment
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",...,0.0,4.0,5.0,3,32.0,135,12.272727,44.0,0.333095,0.53125


In [5]:
# Preview the first interaction row to sanity-check the column contents
interactions.head(1)

Unnamed: 0,user_id,recipe_id,date,rating,review,review_word_count,review_sentiment
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,27.0,0.27


In [7]:
# Summary statistics and non-null counts for the recipes dataframe
recipes.describe()

Unnamed: 0,id,minutes,contributor_id,submitted,n_steps,n_ingredients,calories,total_fat_pdv,sugar_pdv,sodium_pdv,...,saturated_fat_pdv,carbohydrates_pdv,average_rating,interaction_count,description_word_count,steps_word_count,avg_step_word_count,review_word_count,review_sentiment,description_sentiment
count,231636.0,231636.0,231636.0,231636,231636.0,231636.0,231636.0,231636.0,231636.0,231636.0,...,231636.0,231636.0,231636.0,231636.0,226657.0,231636.0,231635.0,231629.0,231629.0,226657.0
mean,222013.733539,123.107721,5534907.0,2006-11-14 01:49:00.537740288,9.765503,9.051184,473.942712,36.080609,84.297013,30.147611,...,45.588851,15.560448,4.346243,4.888554,36.385689,102.081455,10.422034,53.680937,0.323601,0.281319
min,38.0,0.0,27.0,1999-08-06 00:00:00,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,-1.0,-1.0
25%,99943.75,20.0,56905.0,2004-09-16 00:00:00,6.0,6.0,174.375,8.0,9.0,5.0,...,7.0,4.0,4.0,1.0,15.0,54.0,8.0,35.272727,0.226481,0.074242
50%,207248.5,40.0,173614.0,2007-01-23 00:00:00,9.0,9.0,313.4,20.0,25.0,14.0,...,23.0,9.0,4.714286,2.0,28.0,86.0,10.0,49.0,0.321094,0.266667
75%,333815.25,65.0,398275.0,2008-10-29 00:00:00,12.0,11.0,519.725,41.0,68.0,33.0,...,52.0,16.0,5.0,4.0,48.0,130.0,12.25,65.5,0.413244,0.448611
max,537716.0,288000.0,2002290000.0,2018-12-04 00:00:00,145.0,43.0,434360.2,17183.0,362729.0,29338.0,...,10395.0,36098.0,5.0,1613.0,1168.0,2246.0,150.0,784.0,1.0,1.0
std,141206.160005,1977.763646,99791620.0,,5.995141,3.734775,1189.713934,77.798996,800.082621,131.96186,...,98.235864,81.824733,0.990808,17.532518,32.226114,73.948741,3.526212,30.019055,0.173436,0.271146


In [8]:
# Summary statistics and non-null counts for the interactions dataframe
interactions.describe()

Unnamed: 0,user_id,recipe_id,date,rating,review_word_count,review_sentiment
count,1132365.0,1132365.0,1132365,1132365.0,1132196.0,1132196.0
mean,138429300.0,160896.7,2009-05-13 02:55:54.951450880,4.411015,52.02219,0.3329295
min,1533.0,38.0,2000-01-25 00:00:00,0.0,0.0,-1.0
25%,135470.0,54257.0,2007-03-11 00:00:00,4.0,27.0,0.1919227
50%,330937.0,120547.0,2008-12-29 00:00:00,5.0,44.0,0.3185185
75%,804550.0,243850.0,2011-03-07 00:00:00,5.0,67.0,0.4611827
max,2002373000.0,537716.0,2018-12-20 00:00:00,5.0,1182.0,1.0
std,501427300.0,130398.3,,1.264753,37.14006,0.2296309


In [13]:
# Count missing values per column in recipes
recipes.isnull().sum()

name                         1
id                           0
minutes                      0
contributor_id               0
submitted                    0
tags                         0
n_steps                      0
steps                        0
description               4979
ingredients                  0
n_ingredients                0
calories                     0
total_fat_pdv                0
sugar_pdv                    0
sodium_pdv                   0
protein_pdv                  0
saturated_fat_pdv            0
carbohydrates_pdv            0
average_rating               0
interaction_count            0
description_word_count    4979
steps_word_count             0
avg_step_word_count          1
review_word_count            7
review_sentiment             7
description_sentiment     4979
dtype: int64

In [14]:
# Count missing values per column in interactions
interactions.isnull().sum()

user_id                0
recipe_id              0
date                   0
rating                 0
review               169
review_word_count    169
review_sentiment     169
dtype: int64

### Baseline Features and Models

In [3]:
# Columns carried over from the recipes table as the baseline item features
baseline_columns = [
    "minutes",
    "submitted",
    "tags",
    "n_steps",
    "ingredients",
    "n_ingredients",
    "calories",
    "total_fat_pdv",
    "sugar_pdv",
    "sodium_pdv",
    "protein_pdv",
    "saturated_fat_pdv",
    "carbohydrates_pdv",
]

# Work on an independent copy so later column edits never touch `recipes`
item_features = recipes.loc[:, baseline_columns].copy()

In [16]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231636 entries, 0 to 231635
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   id                 231636 non-null  int64         
 1   minutes            231636 non-null  int64         
 2   submitted          231636 non-null  datetime64[ns]
 3   tags               231636 non-null  object        
 4   n_steps            231636 non-null  int64         
 5   ingredients        231636 non-null  object        
 6   n_ingredients      231636 non-null  int64         
 7   calories           231636 non-null  float64       
 8   total_fat_pdv      231636 non-null  float64       
 9   sugar_pdv          231636 non-null  float64       
 10  sodium_pdv         231636 non-null  float64       
 11  protein_pdv        231636 non-null  float64       
 12  saturated_fat_pdv  231636 non-null  float64       
 13  carbohydrates_pdv  231636 non-null  float64 

#### Temporal Features

In [4]:
# Break the submission timestamp into its calendar components
submitted_dt = item_features["submitted"].dt
for date_part in ("year", "month", "day"):
    item_features[date_part] = getattr(submitted_dt, date_part)

In [5]:
# Encode month and day-of-month as sine/cosine pairs so that the ends of each
# cycle (e.g. December and January) end up close together in feature space.
# Each part is first scaled to an angle using its period (12 months, 31 days).
for date_part, period in (("month", 12), ("day", 31)):
    angle_col = f"{date_part}_norm"
    item_features[angle_col] = (2 * np.pi / period) * item_features[date_part]
    item_features[f"{date_part}_sin"] = np.sin(item_features[angle_col])
    item_features[f"{date_part}_cos"] = np.cos(item_features[angle_col])

# Keep only the sin/cos encodings (and year); drop the raw and intermediate columns
item_features = item_features.drop(
    columns=["submitted", "month", "month_norm", "day", "day_norm"]
)

#### One-Hot Encoded Features

In [6]:
# Count ingredients
# Explode the per-recipe ingredient lists into one row per (recipe, ingredient);
# each exploded row keeps the row label of the recipe it came from.
recipe_ingredient = item_features[["ingredients"]].explode("ingredients")

# groupby(...).size() with an explicit name= guarantees the "count" column that the
# frequency filter reads, regardless of pandas version (the auto-generated column
# name produced by value_counts() changed in pandas 2.0).
ingredient_counts = recipe_ingredient.groupby("ingredients").size().reset_index(name="count")

In [7]:
# Keep only ingredients that appear at least INGREDIENT_THRESHOLD times,
# ordered from most to least frequent
INGREDIENT_THRESHOLD = 50
ingredient_is_frequent = ingredient_counts["count"] >= INGREDIENT_THRESHOLD
filtered_ingredients = (
    ingredient_counts.loc[ingredient_is_frequent]
    .sort_values("count", ascending=False)["ingredients"]
)
print(f"Number of ingredient features: {filtered_ingredients.size}")

Number of ingredient features: 2717


In [8]:
# Restrict the exploded (recipe, ingredient) rows to ingredients that passed the frequency filter
ingredient_kept = recipe_ingredient["ingredients"].isin(filtered_ingredients)
recipe_ingredient_filtered = recipe_ingredient.loc[ingredient_kept]

In [9]:
# Create a binary column for each ingredient (one-hot encoding).
# Rows are keyed by the exploded frame's index (the originating recipe's row label) and
# columns by ingredient; the constant aggfunc marks every present pair with 1 and
# fill_value=0 fills the absent ones. Recipes with no retained ingredient get no row here.
# NOTE(review): no `values` column is passed to pivot_table — confirm the result has flat
# ingredient column labels on the pandas version in use.
recipe_ingredient_pivot = pd.pivot_table(recipe_ingredient_filtered, index=recipe_ingredient_filtered.index, columns="ingredients", aggfunc=lambda x: 1, fill_value=0)

In [10]:
# Join the indicator columns back onto item_features by row label. The left join keeps
# recipes that have none of the retained ingredients; their indicator cells are NaN
# after the join and are turned into 0 below (only the ingredient columns are filled).
item_features = item_features.merge(
    recipe_ingredient_pivot,
    how="left",
    left_index=True,
    right_index=True,
)
item_features[filtered_ingredients] = (
    item_features[filtered_ingredients].fillna(0).astype(int)
)

# The raw ingredient lists are no longer needed once encoded
item_features = item_features.drop(columns="ingredients")

Repeat the same process (count, filter by frequency, one-hot encode, merge) for tags.

In [12]:
# Explode the per-recipe tag lists into one row per (recipe, tag);
# each exploded row keeps the row label of the recipe it came from.
recipe_tag = item_features[["tags"]].explode("tags")

# groupby(...).size() with an explicit name= guarantees the "count" column that the
# frequency filter reads, regardless of pandas version (the auto-generated column
# name produced by value_counts() changed in pandas 2.0).
tag_counts = recipe_tag.groupby("tags").size().reset_index(name="count")

In [13]:
# Keep only tags that appear at least TAG_THRESHOLD times,
# ordered from most to least frequent
TAG_THRESHOLD = 25
tag_is_frequent = tag_counts["count"] >= TAG_THRESHOLD
filtered_tags = (
    tag_counts.loc[tag_is_frequent]
    .sort_values("count", ascending=False)["tags"]
)
print(f"Number of tag features: {filtered_tags.size}")

Number of tag features: 470


In [14]:
# Restrict the exploded (recipe, tag) rows to tags that passed the frequency filter
tag_kept = recipe_tag["tags"].isin(filtered_tags)
recipe_tag_filtered = recipe_tag.loc[tag_kept]

In [15]:
# One-hot encode the retained tags: rows are keyed by the exploded frame's index (the
# originating recipe's row label), columns by tag; the constant aggfunc marks every
# present pair with 1 and fill_value=0 fills the absent ones.
# NOTE(review): no `values` column is passed to pivot_table — confirm the result has flat
# tag column labels on the pandas version in use.
recipe_tag_pivot = pd.pivot_table(recipe_tag_filtered, index=recipe_tag_filtered.index, columns="tags", aggfunc=lambda x: 1, fill_value=0)

In [16]:
# Left-join the tag indicators onto item_features by row label. Any column name that
# exists on both sides (e.g. a word that is both an ingredient and a tag) is
# disambiguated with "_ing" on the existing column and "_tag" on the new one.
item_features = item_features.merge(
    recipe_tag_pivot,
    how="left",
    left_index=True,
    right_index=True,
    suffixes=("_ing", "_tag"),
)

In [17]:
def _resolve_merged_names(names, suffix, columns):
    """Return, for each name, `name + suffix` if that label exists in `columns`, else `name`."""
    resolved = []
    for name in names:
        suffixed = name + suffix
        resolved.append(suffixed if suffixed in columns else name)
    return resolved


# After the merge, names shared by both sides carry a suffix; look up the label
# each retained tag / ingredient actually has in item_features now.
tag_columns = _resolve_merged_names(filtered_tags, "_tag", item_features.columns)
ingredient_columns = _resolve_merged_names(filtered_ingredients, "_ing", item_features.columns)

In [18]:
# Release the tag pivot now that its columns have been merged into item_features
del recipe_tag_pivot

In [19]:
# Recipes without any retained tag have NaN in the tag columns after the left join;
# treat that as "tag absent" (0) and store the indicators as integers.
item_features[tag_columns] = item_features[tag_columns].fillna(0).astype(int)

# The raw tag lists are no longer needed once encoded
item_features = item_features.drop(columns="tags")

#### Scaled Numeric Features

In [20]:
# log(1 + x) transform for the heavily skewed features (identified in EDA)
outlier_fields = ["minutes", "calories", "total_fat_pdv", "sugar_pdv", "sodium_pdv", "protein_pdv", "saturated_fat_pdv", "carbohydrates_pdv"]

# np.log1p is the numerically accurate form of log(x + 1), especially for values near 0.
# NOTE: the columns are overwritten in place, so re-running this cell without
# rebuilding item_features applies the transform a second time.
item_features[outlier_fields] = np.log1p(item_features[outlier_fields])

    minutes  calories  total_fat_pdv  sugar_pdv  sodium_pdv  protein_pdv  \
0  4.025352  3.960813       0.000000   2.639057    0.000000     1.098612   
1  3.433987  5.161352       2.944439   0.000000    2.890372     3.135494   
2  4.875197  5.601381       3.135494   3.496508    3.891820     3.688879   
3  3.828641  5.911068       2.890372   2.397895    1.098612     2.708050   
4  5.252273  5.869014       0.693147   5.823046    3.178054     1.386294   

   saturated_fat_pdv  carbohydrates_pdv  
0           0.000000           1.609438  
1           3.583519           0.693147  
2           3.332205           1.791759  
3           2.197225           3.044522  
4           0.000000           3.367296  


In [22]:
# Preview the engineered feature table
item_features.head()

Unnamed: 0,id,minutes,n_steps,n_ingredients,calories,total_fat_pdv,sugar_pdv,sodium_pdv,protein_pdv,saturated_fat_pdv,...,whitefish,whole-chicken,whole-duck,whole-turkey,wild-game,wings,winter,yams-sweet-potatoes,yeast_tag,zucchini_tag
0,137739,4.025352,11,7,3.960813,0.0,2.639057,0.0,1.098612,0.0,...,0,0,0,0,0,0,1,0,0,0
1,31490,3.433987,9,6,5.161352,2.944439,0.0,2.890372,3.135494,3.583519,...,0,0,0,0,0,0,0,0,0,0
2,112140,4.875197,6,13,5.601381,3.135494,3.496508,3.89182,3.688879,3.332205,...,0,0,0,0,0,0,0,0,0,0
3,59389,3.828641,11,11,5.911068,2.890372,2.397895,1.098612,2.70805,2.197225,...,0,0,0,0,0,0,0,0,0,0
4,44061,5.252273,5,8,5.869014,0.693147,5.823046,3.178054,1.386294,0.0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Hold out 30 % of the recipes, then halve that hold-out into validation and test
# (70 / 15 / 15 overall). Both calls share SEED so the split is reproducible.
item_features_train, item_features_temp = train_test_split(
    item_features, test_size=0.3, random_state=SEED
)
item_features_val, item_features_test = train_test_split(
    item_features_temp, test_size=0.5, random_state=SEED
)

# Remember which recipe row labels ended up in each split
indices_train, indices_val, indices_test = (
    part.index
    for part in (item_features_train, item_features_val, item_features_test)
)

In [27]:
# Standardize the numeric features. The scaler is fitted on the training split only
# and then applied unchanged to all three splits, so validation/test statistics
# never influence the scaling parameters.
numeric_fields = outlier_fields + ["n_steps", "n_ingredients", "year"]

scaler = StandardScaler().fit(item_features_train[numeric_fields])
for features_split in (item_features_train, item_features_val, item_features_test):
    features_split[numeric_fields] = scaler.transform(features_split[numeric_fields])

#### Obtain the corresponding interactions data

In [0]:
# Create the user_items frame and store, alongside each interaction, the row label
# of the corresponding recipe in `recipes`.
# .copy() makes user_items independent of `interactions`; without it, adding the
# column below assigns into a column-subset of another frame and raises
# pandas' SettingWithCopyWarning.
user_items = interactions[["user_id", "recipe_id", "rating"]].copy()

# Series of recipe ids indexed by recipe row label, and its inverse (id -> row label)
recipe_id_index = recipes["id"]
recipe_index_map = {recipe_id: index for index, recipe_id in recipe_id_index.items()}

# Interactions whose recipe_id is not present in `recipes` map to NaN
user_items["recipe_idx"] = user_items["recipe_id"].map(recipe_index_map)

In [39]:
# Look up the recipe ids that belong to each item-level split via the stored row labels
train_ids, val_ids, test_ids = (
    recipe_id_index.loc[split_indices]
    for split_indices in (indices_train, indices_val, indices_test)
)

In [37]:
# Partition the interactions with the same item-level split used for item_features:
# an interaction goes wherever its recipe went.
interaction_recipe_ids = user_items["recipe_id"]
user_items_train = user_items.loc[interaction_recipe_ids.isin(train_ids)]
user_items_val = user_items.loc[interaction_recipe_ids.isin(val_ids)]
user_items_test = user_items.loc[interaction_recipe_ids.isin(test_ids)]

0    137739
1     31490
2    112140
3     59389
4     44061
Name: id, dtype: int64

In [None]:
# Assign every distinct user a dense integer index (in order of first appearance)
# and attach it to user_items.
user_id_index = user_items["user_id"].unique()
user_index_map = {user_id: index for index, user_id in enumerate(user_id_index)}

# assign() builds a new frame instead of writing into user_items in place, which
# avoids pandas' SettingWithCopyWarning when user_items is a subset of another frame.
user_items = user_items.assign(user_idx=user_items["user_id"].map(user_index_map))

# NOTE(review): user_items_train / user_items_val / user_items_test are created in an
# earlier cell, before this column exists, so they do not carry "user_idx".
# Confirm whether the splits should be rebuilt after this cell.