# Feature Engineering

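This notebook loads the preprocessed recipe and interaction data (`recipes_eda.parquet` and `interactions_eda.parquet`), sanity-checks the import, and then builds baseline features and models.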

### Load Data

#### Import Libraries

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, when

In [2]:
# Location of the preprocessed parquet files, relative to this notebook.
# Built with os.path.join so the separator matches the host OS; a literal
# backslash path only resolves on Windows.
PREPROCESS_DATA_PATH = os.path.join("..", "data", "preprocess")
RECIPES_FILE = "recipes_eda.parquet"
INTERACTIONS_FILE = "interactions_eda.parquet"

path_recipes = os.path.join(PREPROCESS_DATA_PATH, RECIPES_FILE)
path_interactions = os.path.join(PREPROCESS_DATA_PATH, INTERACTIONS_FILE)

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Load both datasets as Spark dataframes.
recipes = spark.read.parquet(path_recipes)
interactions = spark.read.parquet(path_interactions)

Before proceeding, check both dataframes for anything unexpected introduced during import: schemas, sample rows, summary statistics, and null counts.

In [3]:
# Print the column names and types of each dataframe to verify the import
for frame in (recipes, interactions):
    frame.printSchema()

root
 |-- name: string (nullable = true)
 |-- id: long (nullable = true)
 |-- minutes: long (nullable = true)
 |-- contributor_id: long (nullable = true)
 |-- submitted: timestamp_ntz (nullable = true)
 |-- tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- n_steps: long (nullable = true)
 |-- steps: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- n_ingredients: long (nullable = true)
 |-- calories: double (nullable = true)
 |-- total_fat_pdv: double (nullable = true)
 |-- sugar_pdv: double (nullable = true)
 |-- sodium_pdv: double (nullable = true)
 |-- protein_pdv: double (nullable = true)
 |-- saturated_fat_pdv: double (nullable = true)
 |-- carbohydrates_pdv: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- interaction_count: long (nullable = true)
 |-- description

In [4]:
# Preview a single row from each dataframe.
# The recipes row is printed vertically (one field per line).
recipes.show(1, vertical=True)
interactions.show(1)

-RECORD 0--------------------------------------
 name                   | arriba   baked wi... 
 id                     | 137739               
 minutes                | 55                   
 contributor_id         | 47892                
 submitted              | 2005-09-16 00:00:00  
 tags                   | [60-minutes-or-le... 
 n_steps                | 11                   
 steps                  | [make a choice an... 
 description            | autumn is my favo... 
 ingredients            | [winter squash, m... 
 n_ingredients          | 7                    
 calories               | 51.5                 
 total_fat_pdv          | 0.0                  
 sugar_pdv              | 13.0                 
 sodium_pdv             | 0.0                  
 protein_pdv            | 2.0                  
 saturated_fat_pdv      | 0.0                  
 carbohydrates_pdv      | 4.0                  
 average_rating         | 5.0                  
 interaction_count      | 3             

In [6]:
# Compute the summary statistics table for the recipes dataframe, then display it
recipes_stats = recipes.summary()
recipes_stats.show()

+-------+--------------------+------------------+------------------+-------------------+-----------------+--------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+----------------------+------------------+-------------------+------------------+-------------------+---------------------+
|summary|                name|                id|           minutes|     contributor_id|          n_steps|         description|     n_ingredients|         calories|    total_fat_pdv|        sugar_pdv|        sodium_pdv|       protein_pdv|saturated_fat_pdv| carbohydrates_pdv|    average_rating| interaction_count|description_word_count|  steps_word_count|avg_step_word_count| review_word_count|   review_sentiment|description_sentiment|
+-------+--------------------+------------------+------------------+-------------------+-----------------+--------------------

In [7]:
# Compute the summary statistics table for the interactions dataframe, then display it
interactions_stats = interactions.summary()
interactions_stats.show()

+-------+--------------------+------------------+------------------+--------------------+------------------+-------------------+-----------------+
|summary|             user_id|         recipe_id|            rating|              review| review_word_count|   review_sentiment|__index_level_0__|
+-------+--------------------+------------------+------------------+--------------------+------------------+-------------------+-----------------+
|  count|             1132365|           1132365|           1132365|             1132196|           1132196|            1132196|          1132365|
|   mean|1.3842931051382372E8|160896.70114406574| 4.411014999580524|                20.0| 52.02219492031415| 0.3329295339209602|566183.4841716231|
| stddev|5.0142727313740426E8|130398.27670415681|1.2647527152948626|  40.195356282370064|37.140061463398574|0.22963089862414507|326886.4261336108|
|    min|                1533|                38|                 0|\t\tThis is good,...|               0.0|          

In [9]:
# Count null values in every column of the recipes dataframe.
# Each column is mapped to 1 where null and 0 otherwise, then summed;
# `sum` here is the Spark aggregate imported from pyspark.sql.functions.
recipes.select(
    *[
        sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in recipes.columns
    ]
).show()

+----+---+-------+--------------+---------+----+-------+-----+-----------+-----------+-------------+--------+-------------+---------+----------+-----------+-----------------+-----------------+--------------+-----------------+----------------------+----------------+-------------------+-----------------+----------------+---------------------+
|name| id|minutes|contributor_id|submitted|tags|n_steps|steps|description|ingredients|n_ingredients|calories|total_fat_pdv|sugar_pdv|sodium_pdv|protein_pdv|saturated_fat_pdv|carbohydrates_pdv|average_rating|interaction_count|description_word_count|steps_word_count|avg_step_word_count|review_word_count|review_sentiment|description_sentiment|
+----+---+-------+--------------+---------+----+-------+-----+-----------+-----------+-------------+--------+-------------+---------+----------+-----------+-----------------+-----------------+--------------+-----------------+----------------------+----------------+-------------------+-----------------+-----------

In [10]:
# Count null values in every column of the interactions dataframe.
# Each column is mapped to 1 where null and 0 otherwise, then summed;
# `sum` here is the Spark aggregate imported from pyspark.sql.functions.
interactions.select(
    *[
        sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in interactions.columns
    ]
).show()

+-------+---------+----+------+------+-----------------+----------------+-----------------+
|user_id|recipe_id|date|rating|review|review_word_count|review_sentiment|__index_level_0__|
+-------+---------+----+------+------+-----------------+----------------+-----------------+
|      0|        0|   0|     0|   169|              169|             169|                0|
+-------+---------+----+------+------+-----------------+----------------+-----------------+


### Baseline Features and Models

#### Baseline Features