In [48]:
from ydata_profiling import ProfileReport
import polars as pl
import os 

In [49]:
# Polars display setting: table column count for rendered frames is set to 10.
pl.Config.set_tbl_cols(10)

polars.config.Config

In [127]:
# Dataset root, relative to the notebook's working directory; the CSVs are
# read from its "raw" subfolder.
DATA_PATH = "../data/food_recsys"

In [51]:
# Read the recipe table, the full interaction log and the three rating
# split files (train / valid / test) from the "raw" folder.
raw_dir = os.path.join(DATA_PATH, "raw")

df_recipe = pl.read_csv(os.path.join(raw_dir, "raw-data_recipe.csv"))
df_interaction = pl.read_csv(os.path.join(raw_dir, "raw-data_interaction.csv"))
df_test_rating = pl.read_csv(os.path.join(raw_dir, "core-data-test_rating.csv"))
df_train_rating = pl.read_csv(os.path.join(raw_dir, "core-data-train_rating.csv"))
df_valid_rating = pl.read_csv(os.path.join(raw_dir, "core-data-valid_rating.csv"))

In [52]:
# Preview the first rows of the recipe table as loaded (ids still integers).
df_recipe.head()

recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,cooking_directions,nutritions,reviews
i64,str,f64,str,i64,str,str,str,str
222388,"""Homemade Bacon""",5.0,"""https://images.media-allrecipe…",3,"""pork belly^smoked paprika^kosh…","""{'directions': u'Prep\n5 m\nCo…","""{u'niacin': {u'hasCompleteData…","""{8542392: {'rating': 5, 'follo…"
240488,"""Pork Loin, Apples, and Sauerkr…",4.764706,"""https://images.media-allrecipe…",29,"""sauerkraut drained^Granny Smit…","""{'directions': u'Prep\n15 m\nC…","""{u'niacin': {u'hasCompleteData…","""{3574785: {'rating': 5, 'follo…"
218939,"""Foolproof Rosemary Chicken Win…",4.571429,"""https://images.media-allrecipe…",12,"""chicken wings^sprigs rosemary^…","""{'directions': u""Prep\n20 m\nC…","""{u'niacin': {u'hasCompleteData…","""{13774946: {'rating': 5, 'foll…"
87211,"""Chicken Pesto Paninis""",4.625,"""https://images.media-allrecipe…",163,"""focaccia bread quartered^prepa…","""{'directions': u'Prep\n15 m\nC…","""{u'niacin': {u'hasCompleteData…","""{1563136: {'rating': 5, 'follo…"
245714,"""Potato Bacon Pizza""",4.5,"""https://images.media-allrecipe…",2,"""red potatoes^strips bacon^Sauc…","""{'directions': u'Prep\n20 m\nC…","""{u'niacin': {u'hasCompleteData…","""{2945555: {'rating': 5, 'follo…"


In [53]:
# Preview the first rows of the interaction log as loaded (dates still strings).
df_interaction.head()

user_id,recipe_id,rating,dateLastModified
i64,i64,i64,str
8542392,222388,5,"""2017-04-22T12:46:43.663 """
11174581,222388,5,"""2013-06-20T15:50:25.96 """
8262477,222388,5,"""2015-02-14T07:27:51.307 """
3574785,240488,5,"""2017-10-07T18:20:08.973 """
12145410,240488,2,"""2018-01-06T00:06:09.563 """


In [54]:
# Normalise dtypes: ids become strings and dateLastModified becomes a
# Datetime in every frame that carries those columns.
recipe_id_as_str = pl.col("recipe_id").cast(pl.Utf8)
rating_casts = [
    pl.col("user_id").cast(pl.Utf8),
    recipe_id_as_str,
    pl.col("dateLastModified").cast(pl.Datetime),
]

df_recipe = df_recipe.with_columns(recipe_id_as_str)
df_interaction = df_interaction.with_columns(rating_casts)
df_test_rating = df_test_rating.with_columns(rating_casts)
df_train_rating = df_train_rating.with_columns(rating_casts)
df_valid_rating = df_valid_rating.with_columns(rating_casts)

In [55]:
# Earliest / latest dateLastModified and the shape of each split and of the
# full interaction log.
date_span = [
    pl.col("dateLastModified").min().alias("min"),
    pl.col("dateLastModified").max().alias("max"),
]
for label, frame in (
    ("TRAIN", df_train_rating),
    ("VALID", df_valid_rating),
    ("TEST", df_test_rating),
    ("INTERACTION", df_interaction),
):
    print(label)
    print(frame.select(date_span))
    print(frame.shape)

TRAIN
shape: (1, 2)
┌─────────────────────────┬─────────────────────────┐
│ min                     ┆ max                     │
│ ---                     ┆ ---                     │
│ datetime[μs]            ┆ datetime[μs]            │
╞═════════════════════════╪═════════════════════════╡
│ 2000-02-08 12:09:11.987 ┆ 2011-10-14 17:43:08.433 │
└─────────────────────────┴─────────────────────────┘
(676946, 4)
VALID
shape: (1, 2)
┌─────────────────────────┬─────────────────────────┐
│ min                     ┆ max                     │
│ ---                     ┆ ---                     │
│ datetime[μs]            ┆ datetime[μs]            │
╞═════════════════════════╪═════════════════════════╡
│ 2011-10-14 17:46:56.443 ┆ 2013-01-31 22:23:27.613 │
└─────────────────────────┴─────────────────────────┘
(133459, 4)
TEST
shape: (1, 2)
┌─────────────────────────┬─────────────────────────┐
│ min                     ┆ max                     │
│ ---                     ┆ ---                     │

In [56]:
# Compare the combined row count of the three splits with the full log.
n_split_rows = sum(
    frame.shape[0] for frame in (df_train_rating, df_valid_rating, df_test_rating)
)
n_interactions = df_interaction.shape[0]
print(f" train valid test {n_split_rows} VS total interactions {n_interactions}")

 train valid test 1093845 VS total interactions 3794003


In [57]:
# Number of distinct recipes in each split and in the full interaction log.
unique_recipes = pl.col("recipe_id").n_unique().alias("n_unique_recipes")
for label, frame in (
    ("TRAIN", df_train_rating),
    ("VALID", df_valid_rating),
    ("TEST", df_test_rating),
    ("INTERACTION", df_interaction),
):
    print(label)
    print(frame.select(unique_recipes))

TRAIN
shape: (1, 1)
┌──────────────────┐
│ n_unique_recipes │
│ ---              │
│ u32              │
╞══════════════════╡
│ 29093            │
└──────────────────┘
VALID
shape: (1, 1)
┌──────────────────┐
│ n_unique_recipes │
│ ---              │
│ u32              │
╞══════════════════╡
│ 22684            │
└──────────────────┘
TEST
shape: (1, 1)
┌──────────────────┐
│ n_unique_recipes │
│ ---              │
│ u32              │
╞══════════════════╡
│ 37342            │
└──────────────────┘
INTERACTION
shape: (1, 1)
┌──────────────────┐
│ n_unique_recipes │
│ ---              │
│ u32              │
╞══════════════════╡
│ 49698            │
└──────────────────┘


In [58]:
# Preview the interaction log again, now that ids and dates have been cast.
df_interaction.head()

user_id,recipe_id,rating,dateLastModified
str,str,i64,datetime[μs]
"""8542392""","""222388""",5,2017-04-22 12:46:43.663
"""11174581""","""222388""",5,2013-06-20 15:50:25.960
"""8262477""","""222388""",5,2015-02-14 07:27:51.307
"""3574785""","""240488""",5,2017-10-07 18:20:08.973
"""12145410""","""240488""",2,2018-01-06 00:06:09.563


In [59]:
# All interactions of one user (the user of the first row shown by head()).
df_interaction.filter(pl.col("user_id") == '8542392')

user_id,recipe_id,rating,dateLastModified
str,str,i64,datetime[μs]
"""8542392""","""222388""",5,2017-04-22 12:46:43.663
"""8542392""","""84270""",5,2014-01-04 12:25:35.080
"""8542392""","""218070""",5,2015-05-01 18:28:47.667
"""8542392""","""245062""",5,2017-02-20 02:36:27.923
"""8542392""","""213243""",5,2015-09-11 17:34:46.550
…,…,…,…
"""8542392""","""213029""",5,2016-01-01 04:11:05.970
"""8542392""","""204478""",5,2017-01-22 04:20:05.610
"""8542392""","""13096""",5,2018-02-18 21:05:54.630
"""8542392""","""234181""",5,2015-02-12 18:07:34.940


Checking whether a (user, recipe) pair sampled from **interactions** appears in any of the splits

In [97]:
# Spot check: draw one random interaction that matches no split row on
# (user_id, recipe_id, dateLastModified), then look its (user, recipe) pair
# up in every split.
#
# The unmatched rows are derived here with an anti-join instead of filtering
# on a "split" column: that column is only added to df_interaction by a cell
# further down the notebook, so relying on it breaks Restart & Run All.
split_keys = ["user_id", "recipe_id", "dateLastModified"]
rated_keys = pl.concat([
    df_train_rating.select(split_keys),
    df_valid_rating.select(split_keys),
    df_test_rating.select(split_keys),
])
unmatched = df_interaction.join(rated_keys, on=split_keys, how="anti")

sample = unmatched.sample(n=1, with_replacement=False).select(pl.col("recipe_id", "user_id"))
recipe = sample["recipe_id"][0]
user = sample["user_id"][0]
print(recipe, user)

# Same lookup against each split, in the order train, test, valid.
for split_df in (df_train_rating, df_test_rating, df_valid_rating):
    print(split_df.filter(
        (pl.col("user_id") == user) & (pl.col("recipe_id") == recipe)
    ))


51283 8374422
shape: (0, 4)
┌─────────┬───────────┬────────┬──────────────────┐
│ user_id ┆ recipe_id ┆ rating ┆ dateLastModified │
│ ---     ┆ ---       ┆ ---    ┆ ---              │
│ str     ┆ str       ┆ i64    ┆ datetime[μs]     │
╞═════════╪═══════════╪════════╪══════════════════╡
└─────────┴───────────┴────────┴──────────────────┘
shape: (0, 4)
┌─────────┬───────────┬────────┬──────────────────┐
│ user_id ┆ recipe_id ┆ rating ┆ dateLastModified │
│ ---     ┆ ---       ┆ ---    ┆ ---              │
│ str     ┆ str       ┆ i64    ┆ datetime[μs]     │
╞═════════╪═══════════╪════════╪══════════════════╡
└─────────┴───────────┴────────┴──────────────────┘
shape: (0, 4)
┌─────────┬───────────┬────────┬──────────────────┐
│ user_id ┆ recipe_id ┆ rating ┆ dateLastModified │
│ ---     ┆ ---       ┆ ---    ┆ ---              │
│ str     ┆ str       ┆ i64    ┆ datetime[μs]     │
╞═════════╪═══════════╪════════╪══════════════════╡
└─────────┴───────────┴────────┴──────────────────┘


Checking how the reviews of a randomly chosen recipe are distributed across the splits

In [117]:
# Pick one recipe at random, show its row, and count how many rows it has in
# the interaction log and in each split.
random_recipe = df_recipe.sample(n=1, with_replacement=False)["recipe_id"][0]
print(f"Random recipe id {random_recipe}")

is_random_recipe = pl.col("recipe_id") == random_recipe

print("Recipe info")
print(df_recipe.filter(is_random_recipe))

for heading, frame in (
    ("Interactions for that recipe", df_interaction),
    ("Train ratings for that recipe", df_train_rating),
    ("Valid ratings for that recipe", df_valid_rating),
    ("Test ratings for that recipe", df_test_rating),
):
    print(heading)
    print(frame.filter(is_random_recipe).shape)

Random recipe id 7916
Recipe info
shape: (1, 9)
┌──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┐
│ recipe_i ┆ recipe_n ┆ aver_rat ┆ image_ur ┆ review_n ┆ ingredie ┆ cooking_ ┆ nutritio ┆ reviews  │
│ d        ┆ ame      ┆ e        ┆ l        ┆ ums      ┆ nts      ┆ directio ┆ ns       ┆ ---      │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ns       ┆ ---      ┆ str      │
│ str      ┆ str      ┆ f64      ┆ str      ┆ i64      ┆ str      ┆ ---      ┆ str      ┆          │
│          ┆          ┆          ┆          ┆          ┆          ┆ str      ┆          ┆          │
╞══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════╡
│ 7916     ┆ Mom's    ┆ 4.72     ┆ http://i ┆ 163      ┆ yellow   ┆ {'direct ┆ {u'niaci ┆ {693504: │
│          ┆ Rum Cake ┆          ┆ mages.me ┆          ┆ cake mix ┆ ions':   ┆ n': {u'h ┆ {'rating │
│          ┆          ┆          ┆ dia-allr

Adding a split label to each review and analysing the result

In [60]:

# Label every interaction with the split it belongs to ("train", "valid" or
# "test"), or "none" when it matches no split row on the key columns, then
# summarise how the interactions are distributed.
key_cols = ["user_id", "recipe_id", "dateLastModified"]

# Stack the key columns of the three splits, tagging each row with its split.
ratings_all = pl.concat([
    split_df.select(key_cols).with_columns(pl.lit(split_name).alias("split"))
    for split_name, split_df in (
        ("train", df_train_rating),
        ("valid", df_valid_rating),
        ("test", df_test_rating),
    )
])

# A key that occurs more than once would duplicate interaction rows in the
# join below. agg(pl.len()) replaces the deprecated GroupBy.count(); the
# alias keeps the original "count" column name.
duplicados = (
    ratings_all
    .group_by(key_cols)
    .agg(pl.len().alias("count"))
    .filter(pl.col("count") > 1)
)
if duplicados.height > 0:
    print("duplicados en el join:", duplicados.height)
else:
    print("no se detectaron llaves duplicadas entre splits")

ratings_all = ratings_all.unique(key_cols)

# Make the cell re-runnable: a "split" column left by a previous execution
# would otherwise collide with the joined one.
if "split" in df_interaction.columns:
    df_interaction = df_interaction.drop("split")

n_interactions_before = df_interaction.height
df_interaction = df_interaction.join(ratings_all, on=key_cols, how="left")
# ratings_all is unique on key_cols, so the left join must not add rows.
assert df_interaction.height == n_interactions_before, "left join changed the number of interactions"

df_interaction = df_interaction.with_columns(pl.col("split").fill_null("none"))

split_summary = (
    df_interaction
    .group_by("split")
    .agg(pl.len().alias("count"))
    .with_columns((pl.col("count") / df_interaction.height).alias("fraction"))
    .sort("count", descending=True)
)

print("Resumen interacciones por split:")
print(split_summary)

matched = df_interaction.filter(pl.col("split") != "none").height
print(f"Interacciones que matchean un split: {matched} / {df_interaction.height} ({matched/df_interaction.height:.2%})")


  .count()


no se detectaron llaves duplicadas entre splits
Resumen interacciones por split:
shape: (4, 3)
┌───────┬─────────┬──────────┐
│ split ┆ count   ┆ fraction │
│ ---   ┆ ---     ┆ ---      │
│ str   ┆ u32     ┆ f64      │
╞═══════╪═════════╪══════════╡
│ none  ┆ 2700158 ┆ 0.711691 │
│ train ┆ 676946  ┆ 0.178425 │
│ test  ┆ 283440  ┆ 0.074707 │
│ valid ┆ 133459  ┆ 0.035176 │
└───────┴─────────┴──────────┘
Interacciones que matchean un split: 1093845 / 3794003 (28.83%)
Resumen interacciones por split:
shape: (4, 3)
┌───────┬─────────┬──────────┐
│ split ┆ count   ┆ fraction │
│ ---   ┆ ---     ┆ ---      │
│ str   ┆ u32     ┆ f64      │
╞═══════╪═════════╪══════════╡
│ none  ┆ 2700158 ┆ 0.711691 │
│ train ┆ 676946  ┆ 0.178425 │
│ test  ┆ 283440  ┆ 0.074707 │
│ valid ┆ 133459  ┆ 0.035176 │
└───────┴─────────┴──────────┘
Interacciones que matchean un split: 1093845 / 3794003 (28.83%)


  .count()


Cross-checking with MealRec+ data

In [130]:
# (rows, columns) of the interaction log at this point in the notebook.
df_interaction.shape

(3794003, 5)

In [131]:
# A (user, recipe) id pair looked up in df_interaction by the cells below.
# NOTE(review): presumably copied from the MealRec+ dataset — confirm the source.
mealrec_user = "39"
mealrec_recipe = "61727"

In [134]:
# All interactions of mealrec_user; the recipe condition is left disabled
# here, so every recipe of that user is shown.
df_interaction.filter(
    (pl.col("user_id") == mealrec_user) 
    #(pl.col("recipe_id") == mealrec_recipe)
)

user_id,recipe_id,rating,dateLastModified,split
str,str,i64,datetime[μs],str
"""39""","""14521""",5,2000-05-08 21:43:50.913,"""train"""
"""39""","""222350""",5,2012-05-28 18:35:40.320,"""valid"""
"""39""","""216981""",5,2014-03-16 14:10:20.457,"""test"""
"""39""","""14731""",4,2001-01-15 07:35:02.200,"""train"""
"""39""","""21430""",5,2000-12-14 09:58:13.467,"""train"""
…,…,…,…,…
"""39""","""18322""",5,2000-07-07 10:02:19.833,"""train"""
"""39""","""12974""",4,2001-01-03 08:07:26.950,"""train"""
"""39""","""34357""",5,2005-08-02 13:17:54.437,"""train"""
"""39""","""24887""",5,2001-11-12 12:24:13.013,"""train"""


In [137]:
# Preview the recipe table (recipe_id has been cast to string by now).
df_recipe.head()

recipe_id,recipe_name,aver_rate,image_url,review_nums,ingredients,cooking_directions,nutritions,reviews
str,str,f64,str,i64,str,str,str,str
"""222388""","""Homemade Bacon""",5.0,"""https://images.media-allrecipe…",3,"""pork belly^smoked paprika^kosh…","""{'directions': u'Prep\n5 m\nCo…","""{u'niacin': {u'hasCompleteData…","""{8542392: {'rating': 5, 'follo…"
"""240488""","""Pork Loin, Apples, and Sauerkr…",4.764706,"""https://images.media-allrecipe…",29,"""sauerkraut drained^Granny Smit…","""{'directions': u'Prep\n15 m\nC…","""{u'niacin': {u'hasCompleteData…","""{3574785: {'rating': 5, 'follo…"
"""218939""","""Foolproof Rosemary Chicken Win…",4.571429,"""https://images.media-allrecipe…",12,"""chicken wings^sprigs rosemary^…","""{'directions': u""Prep\n20 m\nC…","""{u'niacin': {u'hasCompleteData…","""{13774946: {'rating': 5, 'foll…"
"""87211""","""Chicken Pesto Paninis""",4.625,"""https://images.media-allrecipe…",163,"""focaccia bread quartered^prepa…","""{'directions': u'Prep\n15 m\nC…","""{u'niacin': {u'hasCompleteData…","""{1563136: {'rating': 5, 'follo…"
"""245714""","""Potato Bacon Pizza""",4.5,"""https://images.media-allrecipe…",2,"""red potatoes^strips bacon^Sauc…","""{'directions': u'Prep\n20 m\nC…","""{u'niacin': {u'hasCompleteData…","""{2945555: {'rating': 5, 'follo…"


In [132]:
# Interactions where mealrec_user rated mealrec_recipe (both conditions must hold).
df_interaction.filter(
    (pl.col("user_id") == mealrec_user) & (pl.col("recipe_id") == mealrec_recipe)
)

user_id,recipe_id,rating,dateLastModified,split
str,str,i64,datetime[μs],str
