In [29]:
!pip install pandas-gbq google-cloud-bigquery pyarrow



In [30]:
import polars as pl
import os 

import pandas as pd
from pandas_gbq import to_gbq
from slugify import slugify
import re

In [31]:
DATA_PATH = "../data/food_recsys"

In [32]:
df_recipe = pl.read_csv(os.path.join(DATA_PATH, "raw","raw-data_recipe.csv"))
df_interaction = pl.read_csv(os.path.join(DATA_PATH, "raw", "raw-data_interaction.csv"))
df_test_rating = pl.read_csv(os.path.join(DATA_PATH, "raw","core-data-test_rating.csv"))
df_train_rating = pl.read_csv(os.path.join(DATA_PATH, "raw","core-data-train_rating.csv"))
df_valid_rating = pl.read_csv(os.path.join(DATA_PATH, "raw","core-data-valid_rating.csv"))

In [33]:
def normalize_colnames(cols):
    norm = []
    for c in cols:
        c2 = re.sub(r"[^0-9a-zA-Z]+", "_", c).strip("_").lower()
        norm.append(c2)
    return norm

def pick_first(df: pl.DataFrame, candidates, default=None):
    cols = set(df.columns)
    for c in candidates:
        if c in cols:
            return c
    return default

def normalize_df(df: pl.DataFrame) -> pl.DataFrame:
    return df.rename({old: new for old,new in zip(df.columns, normalize_colnames(df.columns))})

df_recipe       = normalize_df(df_recipe)
df_interaction  = normalize_df(df_interaction)
df_train_rating = normalize_df(df_train_rating)
df_valid_rating = normalize_df(df_valid_rating)
df_test_rating  = normalize_df(df_test_rating)

In [34]:
print(df_recipe.columns)
print(df_interaction.columns)
print(df_train_rating.columns)
print(df_valid_rating.columns)
print(df_test_rating.columns)

['recipe_id', 'recipe_name', 'aver_rate', 'image_url', 'review_nums', 'ingredients', 'cooking_directions', 'nutritions', 'reviews']
['user_id', 'recipe_id', 'rating', 'datelastmodified']
['user_id', 'recipe_id', 'rating', 'datelastmodified']
['user_id', 'recipe_id', 'rating', 'datelastmodified']
['user_id', 'recipe_id', 'rating', 'datelastmodified']


In [35]:
def clean_df_interaction(core: pl.DataFrame) -> pl.DataFrame:
    date_col = pick_first(core, ["datelastmodified", "dateLastModified", "date_last_modified", "timestamp"], None)

    cast_exprs = []
    if "user_id" in core.columns:
        cast_exprs.append(pl.col("user_id").cast(pl.Utf8))
    if "recipe_id" in core.columns:
        cast_exprs.append(pl.col("recipe_id").cast(pl.Utf8))
    if "rating" in core.columns:
        cast_exprs.append(pl.col("rating").cast(pl.Float64))
    if date_col:
        cast_exprs.append(
            pl.col(date_col)
              .cast(pl.Utf8)
              .str.replace("\n", "")
              .str.to_datetime(strict=False)
              .alias(date_col)
        )

    core = core.with_columns(cast_exprs)

    if date_col and date_col in core.columns:
        core = core.with_columns([
            pl.col(date_col).dt.month().alias("month"),
            pl.col(date_col).dt.quarter().alias("quarter"),
        ])

    core = core.drop_nulls()

    if date_col and date_col in core.columns:
        core = core.sort(date_col)

    core = core.unique()

    return core

df_recipe = df_recipe.with_columns([
    pl.col("recipe_id").cast(pl.Utf8),
    pl.col("recipe_name").cast(pl.Utf8),
    pl.col("aver_rate").cast(pl.Float64),
    pl.col("image_url").cast(pl.Utf8),
    pl.col("review_nums").cast(pl.Int64),
    pl.col("ingredients").cast(pl.Utf8),
    pl.col("cooking_directions").cast(pl.Utf8),
    pl.col("nutritions").cast(pl.Utf8),
    pl.col("reviews").cast(pl.Utf8),
])

df_interaction = clean_df_interaction(df_interaction)
df_train_rating = clean_df_interaction(df_train_rating)
df_valid_rating = clean_df_interaction(df_valid_rating)
df_test_rating  = clean_df_interaction(df_test_rating)

Consolidating features of recipes in 1 columns

In [36]:
rid  = pick_first(df_recipe, ["recipe_id","id","rid"], None)
ttl  = pick_first(df_recipe, ["title","recipe_name", "name"], None)
tags = pick_first(df_recipe, ["tags","tag_list"], None)
ing  = pick_first(df_recipe, ["ingredients","ingredient","ingr","ing"], None)
img  = pick_first(df_recipe, ["image_url","img_url","image","picture"], None)
desc = pick_first(df_recipe, ["description","summary","text"], None)
cook = pick_first(df_recipe, ["cooking_directions","directions","steps"], None)
nut  = pick_first(df_recipe, ["nutritions","nutrition","nutrients"], None)
rev  = pick_first(df_recipe, ["reviews","review","comments"], None)
# aver_rate, review_nums


In [37]:
if rid is None:
    raise ValueError("No encontré columna de ID de receta en df_recipe (candidatos: recipe_id/id/rid).")

def safe_str(col): 
    return pl.when(pl.col(col).is_not_null()).then(pl.col(col).cast(pl.Utf8)).otherwise(pl.lit(""))


In [38]:

recipe_cols = [c for c in [rid, ttl, tags, ing, desc, img, cook, nut, rev] if c]
recipes_canon = pl.DataFrame({c: df_recipe[c] for c in recipe_cols})

combined_parts = []
if ttl:  combined_parts.append(pl.lit("Title: ")       + safe_str(ttl))
if tags: combined_parts.append(pl.lit(" | Tags: ")     + safe_str(tags))
if ing:  combined_parts.append(pl.lit(" | Ingredients: ")+ safe_str(ing))
if desc: combined_parts.append(pl.lit(" | Desc: ")     + safe_str(desc))
if cook: combined_parts.append(pl.lit(" | Cooking: ")  + safe_str(cook))
if nut:  combined_parts.append(pl.lit(" | Nutrition: ")+ safe_str(nut))
if rev:  combined_parts.append(pl.lit(" | Reviews: ")  + safe_str(rev))

combined_text = combined_parts[0]
for p in combined_parts[1:]:
    combined_text = combined_text + p

recipes_canon = recipes_canon.with_columns([
    combined_text.alias("combined_text")
])

In [39]:
recipes_canon = recipes_canon.rename({rid: "recipe_id"})
if ttl:  recipes_canon = recipes_canon.rename({ttl: "title"})
if tags: recipes_canon = recipes_canon.rename({tags: "tags"})
if ing:  recipes_canon = recipes_canon.rename({ing: "ingredients"})
if desc: recipes_canon = recipes_canon.rename({desc: "description"})
if img:  recipes_canon = recipes_canon.rename({img: "image_url"})
if cook: recipes_canon = recipes_canon.rename({cook: "cooking_directions"})
if nut:  recipes_canon = recipes_canon.rename({nut: "nutritions"})
if rev:  recipes_canon = recipes_canon.rename({rev: "reviews"})


In [40]:
def canon_interactions(df: pl.DataFrame, split_name: str) -> pl.DataFrame:
    df = df.clone()
    df = normalize_df(df)
    uid = pick_first(df, ["user_id","uid","user"], None)
    rid = pick_first(df, ["recipe_id","rid","item_id","iid"], None)
    rat = pick_first(df, ["rating","score","stars","y"], None)
    ts  = pick_first(df, ["timestamp","ts","time"], None)

    # Columns month / quarter may already come from clean_df_interaction
    has_month = "month" in df.columns
    has_quarter = "quarter" in df.columns

    cols = {}
    if uid is None or rid is None:
        raise ValueError(f"Faltan columnas en interacciones ({split_name}): user_id y/o recipe_id.")
    cols["user_id"] = df[uid]
    cols["recipe_id"] = df[rid]

    if rat: cols["rating"] = df[rat].cast(pl.Float64)
    if ts:  cols["timestamp"] = df[ts]
    if has_month: cols["month"] = df["month"].cast(pl.Int64)
    if has_quarter: cols["quarter"] = df["quarter"].cast(pl.Int64)

    out = pl.DataFrame(cols).with_columns([pl.lit(split_name).alias("split")])
    return out

df_inter_train = canon_interactions(df_train_rating, "train")
df_inter_valid = canon_interactions(df_valid_rating, "valid")
df_inter_test  = canon_interactions(df_test_rating, "test")

In [None]:
try:
    df_inter_full = canon_interactions(df_interaction, "raw")
    interactions_canon = pl.concat([df_inter_train, df_inter_valid, df_inter_test, df_inter_full], how="diagonal_relaxed")
except Exception:
    interactions_canon = pl.concat([df_inter_train, df_inter_valid, df_inter_test], how="diagonal_relaxed")


# Cast core id columns
interactions_canon = interactions_canon.with_columns([
    pl.col("user_id").cast(pl.Utf8),
    pl.col("recipe_id").cast(pl.Utf8)
])

subset_keys = [c for c in ["user_id","recipe_id"] if c in interactions_canon.columns]
interactions_canon = interactions_canon.unique(subset=subset_keys, keep="first")

recipes_canon = recipes_canon.with_columns([
    pl.col("recipe_id").cast(pl.Utf8)
]).unique(subset=["recipe_id"], keep="first")

print("Tamaños:")
print("recipes:", recipes_canon.height)
print("interactions:", interactions_canon.height)
print("interactions columns:", interactions_canon.columns)

"""
Tamaños:
recipes: 49698
interactions: 4887848
interactions columns: ['user_id', 'recipe_id', 'rating', 'month', 'quarter', 'split']
"""

Tamaños:
recipes: 49698
interactions: 3794003
interactions columns: ['user_id', 'recipe_id', 'rating', 'month', 'quarter', 'split']


"\nTamaños:\nrecipes: 49698\ninteractions: 4887848\ninteractions columns: ['user_id', 'recipe_id', 'rating', 'month', 'quarter', 'split']\n"

Subida Bigquery

In [50]:
PROJECT_ID = "kaggle-bigquery-471522"
BQ_DATASET = "foodrecsys"        # cambia si quieres
IF_EXISTS  = "replace"   

In [None]:
p_rec = recipes_canon.to_pandas()
p_int = interactions_canon.to_pandas()

# Tipos para BQ
p_rec = p_rec.astype({
    "recipe_id": "string",
    "title": "string",
    #"tags": "string",
    "ingredients": "string",
    #"description": "string",
    "combined_text": "string",
    "image_url": "string",
    "cooking_directions": "string",
    "nutritions": "string",
    "reviews": "string"
}, errors="ignore")

# timestamp a datetime si existe
if "timestamp" in p_int.columns:
    p_int["timestamp"] = pd.to_datetime(p_int["timestamp"], errors="coerce")

cast_int = {
    "user_id": "string",
    "recipe_id": "string",
    "rating": "float64",
    "split": "string"
}
if "month" in p_int.columns:
    cast_int["month"] = "Int64"  # nullable int
if "quarter" in p_int.columns:
    cast_int["quarter"] = "Int64"

p_int = p_int.astype(cast_int, errors="ignore")

# Subida
to_gbq(p_rec, f"{BQ_DATASET}.recipes", project_id=PROJECT_ID, if_exists=IF_EXISTS)
to_gbq(p_int, f"{BQ_DATASET}.interactions", project_id=PROJECT_ID, if_exists=IF_EXISTS)

print("✓ Subido a BigQuery: "
      f"{BQ_DATASET}.recipes (con combined_text) y {BQ_DATASET}.interactions (con split + month/quarter si existen)")

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-3425ks60hkk80fssi9vpohv88g6q1iqd.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=pWeiqrfW549KRkTpCSTdBMlX1hvmEg&prompt=consent&access_type=offline


100%|██████████| 1/1 [00:00<00:00, 1001.98it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]

✓ Subido a BigQuery: foodrecsys.recipes (con combined_text) y foodrecsys.interactions (con split)





jeremy filter

In [51]:
# -----------------------------------------------------------------------------
# Train / Validation windowing + enriched rating (rating_date) and upload
# -----------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Assumptions:
# - df_train_rating / df_valid_rating came from the same raw distribution as in main.py
# - Date column after normalization is 'datelastmodified'
# - We keep only users & recipes meeting min interactions criteria across splits
# - We create 4 BQ tables: train_interactions_windowed, valid_interactions_windowed,
#   final_users, final_recipes.
# If you need different names, adjust TABLE_* variables below.

DATE_COL = "datelastmodified"
PREV_WEEKS = 48
POST_WEEKS = 2
MIN_INTERACTIONS = 5
MIN_INTERACTIONS_VAL = 3

print(f"Using last {PREV_WEEKS} weeks of training data to predict next {POST_WEEKS} weeks of ratings")

# Convert Polars => Pandas (lightweight here)
core_train_rating = df_train_rating.to_pandas()
core_val_rating   = df_valid_rating.to_pandas()

# Ensure dtypes
for c in ["user_id","recipe_id"]:
    if c in core_train_rating.columns:
        core_train_rating[c] = core_train_rating[c].astype(str)
        core_val_rating[c] = core_val_rating[c].astype(str)

# Date parsing (should already be datetime, but enforce)
for df_ in [core_train_rating, core_val_rating]:
    if DATE_COL in df_.columns and not np.issubdtype(df_[DATE_COL].dtype, np.datetime64):
        df_[DATE_COL] = pd.to_datetime(df_[DATE_COL], errors='coerce')

# Drop rows with missing date or rating
core_train_rating = core_train_rating.dropna(subset=[DATE_COL, "rating"])\
    .sort_values(DATE_COL)
core_val_rating = core_val_rating.dropna(subset=[DATE_COL, "rating"])\
    .sort_values(DATE_COL)

# Window training period (last PREV_WEEKS relative to its max date)
max_train_date = core_train_rating[DATE_COL].max()
min_date_val = max_train_date - pd.Timedelta(weeks=PREV_WEEKS)
core_train_rating = core_train_rating.loc[lambda df: df[DATE_COL] >= min_date_val]

# Limit validation horizon (first POST_WEEKS from its min date)
min_val_date = core_val_rating[DATE_COL].min()
max_date_val = min_val_date + pd.Timedelta(weeks=POST_WEEKS)
core_val_rating = core_val_rating.loc[lambda df: df[DATE_COL] <= max_date_val]

# Common users & recipes
common_users = set(core_train_rating['user_id']).intersection(core_val_rating['user_id'])
common_recipes = set(core_train_rating['recipe_id']).intersection(core_val_rating['recipe_id'])

train_users = core_train_rating[
    core_train_rating['user_id'].isin(common_users) &
    core_train_rating['recipe_id'].isin(common_recipes)
]
val_users = core_val_rating[
    core_val_rating['user_id'].isin(common_users) &
    core_val_rating['recipe_id'].isin(common_recipes)
]

# Interaction counts post-initial filter
train_user_counts = train_users['user_id'].value_counts()
val_user_counts   = val_users['user_id'].value_counts()
train_recipe_counts = train_users['recipe_id'].value_counts()
val_recipe_counts   = val_users['recipe_id'].value_counts()

users_min_it_train = set(train_user_counts[train_user_counts >= MIN_INTERACTIONS].index)
users_min_it_val   = set(val_user_counts[val_user_counts >= MIN_INTERACTIONS_VAL].index)
recipes_min_it_train = set(train_recipe_counts[train_recipe_counts >= MIN_INTERACTIONS].index)
recipes_min_it_val   = set(val_recipe_counts[val_recipe_counts >= MIN_INTERACTIONS_VAL].index)

final_users   = users_min_it_train.intersection(users_min_it_val)
final_recipes = recipes_min_it_train.intersection(recipes_min_it_val)

train_users = train_users[
    train_users['user_id'].isin(final_users) &
    train_users['recipe_id'].isin(final_recipes)
]
val_users = val_users[
    val_users['user_id'].isin(final_users) &
    val_users['recipe_id'].isin(final_recipes)
]

# ---- Antiquity bonus (modify_rating analogue) ----
def modify_rating(df: pd.DataFrame, date_col: str = DATE_COL, alpha: float = 0.2) -> pd.DataFrame:
    if df.empty:
        df['rating_date'] = df.get('rating', pd.Series([], dtype=float))
        return df
    min_d = df[date_col].min()
    max_d = df[date_col].max()
    denom = (max_d - min_d).total_seconds() or 1
    age_norm = 1.0 - (df[date_col] - min_d).dt.total_seconds() / denom  # oldest => 1
    df = df.copy()
    df['rating'] = df['rating'].astype(float)
    df['rating_date'] = df['rating'] + alpha * age_norm
    return df

train_users = modify_rating(train_users)
val_users   = modify_rating(val_users)

print(f"Final datasets: {len(train_users)} train interactions, {len(val_users)} val interactions")
print(f"Users: {len(final_users)}, Recipes: {len(final_recipes)}")
print(f"Min interactions per user (train): {train_users['user_id'].value_counts().min()}")
print(f"Min interactions per user (val):   {val_users['user_id'].value_counts().min()}")

# Basic sanity stats
print(train_users[['rating','rating_date']].describe())
print(val_users[['rating','rating_date']].describe())

# Tables to upload
TABLE_TRAIN = f"{BQ_DATASET}.train_interactions_windowed"
TABLE_VALID = f"{BQ_DATASET}.valid_interactions_windowed"
TABLE_USERS = f"{BQ_DATASET}.final_users"
TABLE_RECIPES = f"{BQ_DATASET}.final_recipes"

# Prepare small dimension tables
final_users_df = pd.DataFrame({'user_id': sorted(final_users)})
final_recipes_df = pd.DataFrame({'recipe_id': sorted(final_recipes)})

# Cast for BQ
for df_, name in [(train_users, 'train'), (val_users, 'val')]:
    for c in ['user_id','recipe_id']:
        df_[c] = df_[c].astype(str)
    if DATE_COL in df_.columns:
        df_[DATE_COL] = pd.to_datetime(df_[DATE_COL], errors='coerce')

# Upload (idempotent replace by IF_EXISTS policy from earlier cell)
print("Subiendo tablas filtradas a BigQuery...")
to_gbq(train_users, TABLE_TRAIN, project_id=PROJECT_ID, if_exists=IF_EXISTS)
to_gbq(val_users,   TABLE_VALID, project_id=PROJECT_ID, if_exists=IF_EXISTS)
to_gbq(final_users_df,   TABLE_USERS,   project_id=PROJECT_ID, if_exists=IF_EXISTS)
to_gbq(final_recipes_df, TABLE_RECIPES, project_id=PROJECT_ID, if_exists=IF_EXISTS)
print("✓ Tablas subidas:")
print(f" - {TABLE_TRAIN}")
print(f" - {TABLE_VALID}")
print(f" - {TABLE_USERS}")
print(f" - {TABLE_RECIPES}")

Using last 48 weeks of training data to predict next 2 weeks of ratings
Final datasets: 1161 train interactions, 229 val interactions
Users: 199, Recipes: 250
Min interactions per user (train): 1
Min interactions per user (val):   1
            rating  rating_date
count  1161.000000  1161.000000
mean      4.537468     4.630851
std       0.762607     0.765950
min       1.000000     1.005784
25%       4.000000     4.129266
50%       5.000000     5.034451
75%       5.000000     5.119908
max       5.000000     5.200000
           rating  rating_date
count  229.000000   229.000000
mean     4.650655     4.752999
std      0.701041     0.699552
min      1.000000     1.061886
25%      4.000000     4.184215
50%      5.000000     5.063263
75%      5.000000     5.136036
max      5.000000     5.200000
Subiendo tablas filtradas a BigQuery...
Final datasets: 1161 train interactions, 229 val interactions
Users: 199, Recipes: 250
Min interactions per user (train): 1
Min interactions per user (val):   1

100%|██████████| 1/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<?, ?it/s]

✓ Tablas subidas:
 - foodrecsys.train_interactions_windowed
 - foodrecsys.valid_interactions_windowed
 - foodrecsys.final_users
 - foodrecsys.final_recipes



