In [1]:
!pip install pandas-gbq google-cloud-bigquery pyarrow

Collecting pandas-gbq
  Downloading pandas_gbq-0.29.2-py3-none-any.whl.metadata (3.6 kB)
Collecting google-cloud-bigquery
  Downloading google_cloud_bigquery-3.36.0-py3-none-any.whl.metadata (8.0 kB)
Collecting db-dtypes<2.0.0,>=1.0.4 (from pandas-gbq)
  Downloading db_dtypes-1.4.3-py3-none-any.whl.metadata (3.0 kB)
Collecting pydata-google-auth>=1.5.0 (from pandas-gbq)
  Downloading pydata_google_auth-1.9.1-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting google-api-core<3.0.0,>=2.10.2 (from pandas-gbq)
  Downloading google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-auth-oauthlib>=0.7.0 (from pandas-gbq)
  Downloading google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)
Collecting google-cloud-core<3.0.0,>=2.4.1 (from google-cloud-bigquery)
  Downloading google_cloud_core-2.4.3-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting google-resumable-media<3.0.0,>=2.0.0 (from google-cloud-bigquery)
  Downloading google_resumable_media-2.7.2-py2.py3-none-an

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlflow 3.3.1 requires cryptography<46,>=43.0.0, but you have cryptography 42.0.8 which is incompatible.
streamlit 1.32.0 requires packaging<24,>=16.8, but you have packaging 25.0 which is incompatible.
streamlit 1.32.0 requires protobuf<5,>=3.20, but you have protobuf 6.32.0 which is incompatible.
tensorboard 2.17.0 requires protobuf!=4.24.0,<5.0.0,>=3.19.6, but you have protobuf 6.32.0 which is incompatible.


In [7]:
import polars as pl
import os 

import pandas as pd
from pandas_gbq import to_gbq
from slugify import slugify
import re

In [3]:
# Root folder of the food-recsys dataset (relative path); the raw CSV files
# are read from its "raw" sub-folder.
DATA_PATH = "../data/food_recsys"

In [4]:
# All raw CSV exports live side by side in <DATA_PATH>/raw.
raw_dir = os.path.join(DATA_PATH, "raw")

# Recipe catalogue and the full interaction dump.
df_recipe = pl.read_csv(os.path.join(raw_dir, "raw-data_recipe.csv"))
df_interaction = pl.read_csv(os.path.join(raw_dir, "raw-data_interaction.csv"))

# Pre-split rating files (test / train / valid).
df_test_rating = pl.read_csv(os.path.join(raw_dir, "core-data-test_rating.csv"))
df_train_rating = pl.read_csv(os.path.join(raw_dir, "core-data-train_rating.csv"))
df_valid_rating = pl.read_csv(os.path.join(raw_dir, "core-data-valid_rating.csv"))

In [None]:
def normalize_colnames(cols):
    """Return lower-case, underscore-separated versions of column names.

    Every run of characters outside ``[0-9a-zA-Z]`` is collapsed into a single
    underscore, leading/trailing underscores are stripped and the result is
    lower-cased.

    Args:
        cols: Iterable of column-name strings.

    Returns:
        A list with one normalised name per input name, in the same order.
    """
    non_alnum_run = re.compile(r"[^0-9a-zA-Z]+")
    return [non_alnum_run.sub("_", name).strip("_").lower() for name in cols]

def pick_first(df: pl.DataFrame, candidates, default=None):
    """Return the first candidate name that is a column of ``df``.

    Args:
        df: Frame whose ``columns`` are searched.
        candidates: Column names to try, in priority order.
        default: Value returned when no candidate is present.

    Returns:
        The first matching name from ``candidates``, or ``default``.
    """
    available = set(df.columns)
    return next((name for name in candidates if name in available), default)

def normalize_df(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with every column renamed through ``normalize_colnames``."""
    current_names = df.columns
    renames = dict(zip(current_names, normalize_colnames(current_names)))
    return df.rename(renames)

# Normalise the column names of every loaded frame; the same variable names
# are rebound to the renamed frames.
df_recipe, df_interaction, df_train_rating, df_valid_rating, df_test_rating = map(
    normalize_df,
    (df_recipe, df_interaction, df_train_rating, df_valid_rating, df_test_rating),
)


In [18]:
# Resolve which df_recipe column plays each role. Candidates are tried in
# order; a role stays None when none of its candidates is present.
rid  = pick_first(df_recipe, ["recipe_id", "id", "rid"])
ttl  = pick_first(df_recipe, ["title", "recipe_name", "name"])
tags = pick_first(df_recipe, ["tags", "tag_list"])
ing  = pick_first(df_recipe, ["ingredients", "ingredient", "ingr", "ing"])
img  = pick_first(df_recipe, ["image_url", "img_url", "image", "picture"])
desc = pick_first(df_recipe, ["description", "summary", "text"])
cook = pick_first(df_recipe, ["cooking_directions", "directions", "steps"])
nut  = pick_first(df_recipe, ["nutritions", "nutrition", "nutrients"])
rev  = pick_first(df_recipe, ["reviews", "review", "comments"])
# NOTE: aver_rate and review_nums are not mapped to any role here.


In [19]:
# The recipe ID column is mandatory: abort when none of the candidate names
# (recipe_id / id / rid) was found in df_recipe.
if rid is None:
    raise ValueError("No encontré columna de ID de receta en df_recipe (candidatos: recipe_id/id/rid).")

def safe_str(col):
    """Build a polars expression giving column ``col`` as text, with nulls as ''.

    Args:
        col: Name of the column to read.

    Returns:
        An expression that casts non-null values to ``pl.Utf8`` and yields the
        empty string where the column is null.
    """
    column = pl.col(col)
    return (
        pl.when(column.is_not_null())
        .then(column.cast(pl.Utf8))
        .otherwise(pl.lit(""))
    )


In [20]:

# Keep only the recipe columns that were actually resolved (None roles are dropped).
recipe_cols = [c for c in [rid, ttl, tags, ing, desc, img, cook, nut, rev] if c]
recipes_canon = pl.DataFrame({c: df_recipe[c] for c in recipe_cols})

# (label, column) pairs in the order they appear in the combined text.
# The image URL column is kept in recipes_canon but is not part of the text.
labelled_cols = [
    ("Title: ", ttl),
    ("Tags: ", tags),
    ("Ingredients: ", ing),
    ("Desc: ", desc),
    ("Cooking: ", cook),
    ("Nutrition: ", nut),
    ("Reviews: ", rev),
]
combined_parts = [pl.lit(label) + safe_str(col) for label, col in labelled_cols if col]

# concat_str puts " | " only *between* parts, so the text never starts with a
# dangling separator when the title column is missing. With no text column at
# all, fall back to an empty string instead of failing on combined_parts[0].
if combined_parts:
    combined_text = pl.concat_str(combined_parts, separator=" | ")
else:
    combined_text = pl.lit("")

recipes_canon = recipes_canon.with_columns([
    combined_text.alias("combined_text")
])

In [21]:
# Preview the first rows; columns still carry their source names here because
# the canonical renaming happens in a later cell.
recipes_canon.head()

recipe_id,recipe_name,ingredients,image_url,cooking_directions,nutritions,reviews,combined_text
i64,str,str,str,str,str,str,str
222388,"""Homemade Bacon""","""pork belly^smoked paprika^kosh…","""https://images.media-allrecipe…","""{'directions': u'Prep\n5 m\nCo…","""{u'niacin': {u'hasCompleteData…","""{8542392: {'rating': 5, 'follo…","""Title: Homemade Bacon | Ingred…"
240488,"""Pork Loin, Apples, and Sauerkr…","""sauerkraut drained^Granny Smit…","""https://images.media-allrecipe…","""{'directions': u'Prep\n15 m\nC…","""{u'niacin': {u'hasCompleteData…","""{3574785: {'rating': 5, 'follo…","""Title: Pork Loin, Apples, and …"
218939,"""Foolproof Rosemary Chicken Win…","""chicken wings^sprigs rosemary^…","""https://images.media-allrecipe…","""{'directions': u""Prep\n20 m\nC…","""{u'niacin': {u'hasCompleteData…","""{13774946: {'rating': 5, 'foll…","""Title: Foolproof Rosemary Chic…"
87211,"""Chicken Pesto Paninis""","""focaccia bread quartered^prepa…","""https://images.media-allrecipe…","""{'directions': u'Prep\n15 m\nC…","""{u'niacin': {u'hasCompleteData…","""{1563136: {'rating': 5, 'follo…","""Title: Chicken Pesto Paninis |…"
245714,"""Potato Bacon Pizza""","""red potatoes^strips bacon^Sauc…","""https://images.media-allrecipe…","""{'directions': u'Prep\n20 m\nC…","""{u'niacin': {u'hasCompleteData…","""{2945555: {'rating': 5, 'follo…","""Title: Potato Bacon Pizza | In…"


In [22]:
# Source column (as resolved earlier) -> canonical column name.
canonical_pairs = [
    (rid, "recipe_id"),
    (ttl, "title"),
    (tags, "tags"),
    (ing, "ingredients"),
    (desc, "description"),
    (img, "image_url"),
    (cook, "cooking_directions"),
    (nut, "nutritions"),
    (rev, "reviews"),
]

# Only rename columns that were resolved, still exist under their source name
# and actually change. This makes the cell safe to re-run: on a second run the
# source names are already gone and the mapping is simply empty.
rename_map = {
    src: dst
    for src, dst in canonical_pairs
    if src and src != dst and src in recipes_canon.columns
}
recipes_canon = recipes_canon.rename(rename_map)


In [24]:
def canon_interactions(df: pl.DataFrame, split_name: str) -> pl.DataFrame:
    """Project an interactions frame onto the canonical column layout.

    Column names are normalised first, then the user, recipe, rating and
    timestamp columns are located among a fixed list of candidate names.

    Args:
        df: Interactions frame; a cloned, renamed copy is what gets read.
        split_name: Label written to the ``split`` column of every row.

    Returns:
        A frame with ``user_id`` and ``recipe_id``, plus ``rating`` (cast to
        Float64) and ``timestamp`` when such columns were found, plus ``split``.

    Raises:
        ValueError: If no user or no recipe column can be found.
    """
    frame = normalize_df(df.clone())

    user_col = pick_first(frame, ["user_id","uid","user"], None)
    recipe_col = pick_first(frame, ["recipe_id","rid","item_id","iid"], None)
    rating_col = pick_first(frame, ["rating","score","stars","y"], None)
    time_col = pick_first(frame, ["timestamp","ts","time"], None)

    # Both identifier columns are mandatory.
    if user_col is None or recipe_col is None:
        raise ValueError(f"Faltan columnas en interacciones ({split_name}): user_id y/o recipe_id.")

    selected = {
        "user_id": frame[user_col],
        "recipe_id": frame[recipe_col],
    }
    # Optional columns are added only when a matching source column exists.
    if rating_col:
        selected["rating"] = frame[rating_col].cast(pl.Float64)
    if time_col:
        selected["timestamp"] = frame[time_col]

    split_label = pl.lit(split_name).alias("split")
    return pl.DataFrame(selected).with_columns([split_label])

# Canonicalise each predefined split, tagging its rows with the split label.
df_inter_train, df_inter_valid, df_inter_test = (
    canon_interactions(frame, label)
    for frame, label in (
        (df_train_rating, "train"),
        (df_valid_rating, "valid"),
        (df_test_rating, "test"),
    )
)

In [None]:
# The three predefined splits are always part of the result.
split_frames = [df_inter_train, df_inter_valid, df_inter_test]

# The raw interaction dump is added on a best-effort basis: if it cannot be
# canonicalised or concatenated, fall back to the splits alone, but say so
# instead of silently dropping it.
try:
    df_inter_full = canon_interactions(df_interaction, "raw")
    interactions_canon = pl.concat([*split_frames, df_inter_full], how="diagonal_relaxed")
except Exception as exc:
    print(f"[warn] raw interactions skipped ({type(exc).__name__}: {exc}); using train/valid/test only.")
    interactions_canon = pl.concat(split_frames, how="diagonal_relaxed")

# IDs are stored as text; keep one row per (user, recipe, split).
interactions_canon = interactions_canon.with_columns([
    pl.col("user_id").cast(pl.Utf8),
    pl.col("recipe_id").cast(pl.Utf8)
]).unique(subset=["user_id","recipe_id","split"], keep="first")

# Same ID type on the recipe side; keep one row per recipe.
recipes_canon = recipes_canon.with_columns([
    pl.col("recipe_id").cast(pl.Utf8)
]).unique(subset=["recipe_id"], keep="first")

print("Tamaños:")
print("recipes:", recipes_canon.height)
print("interactions:", interactions_canon.height)

Tamaños:
recipes: 49698
interactions: 4887848


In [29]:
# Show the final column names of the canonical recipes frame.
recipes_canon.columns

['recipe_id',
 'title',
 'ingredients',
 'image_url',
 'cooking_directions',
 'nutritions',
 'reviews',
 'combined_text']

Subida a BigQuery

In [30]:
# Upload settings: project and dataset the tables are written to, and the
# if_exists policy passed to to_gbq.
PROJECT_ID = "kaggle-bigquery-471522"
BQ_DATASET = "foodrecsys"        # change if you like
IF_EXISTS  = "replace"   # forwarded as to_gbq(..., if_exists=IF_EXISTS)

In [33]:
# to_gbq works on pandas frames, so convert the polars frames first.
p_rec = recipes_canon.to_pandas()
p_int = interactions_canon.to_pandas()

# Desired pandas dtypes for BigQuery. DataFrame.astype raises KeyError for a
# mapping key that is not a column (errors="ignore" does not cover that case),
# so each mapping is restricted to the columns that actually exist. Optional
# columns (tags, description, rating) can therefore be listed safely.
rec_dtypes = {
    "recipe_id": "string",
    "title": "string",
    "tags": "string",
    "ingredients": "string",
    "description": "string",
    "combined_text": "string",
    "image_url": "string",
    "cooking_directions": "string",
    "nutritions": "string",
    "reviews": "string",
}
int_dtypes = {
    "user_id": "string",
    "recipe_id": "string",
    "rating": "float64",
    "split": "string",
}

p_rec = p_rec.astype(
    {col: dtype for col, dtype in rec_dtypes.items() if col in p_rec.columns},
    errors="ignore",
)

# Convert timestamp to datetime when the column exists; unparseable values
# become NaT.
# NOTE(review): numeric values are interpreted as nanoseconds since the epoch
# by default — confirm the unit if this column is ever populated.
if "timestamp" in p_int.columns:
    p_int["timestamp"] = pd.to_datetime(p_int["timestamp"], errors="coerce")

p_int = p_int.astype(
    {col: dtype for col, dtype in int_dtypes.items() if col in p_int.columns},
    errors="ignore",
)

# Upload both tables to <BQ_DATASET> in project PROJECT_ID.
to_gbq(p_rec, f"{BQ_DATASET}.recipes", project_id=PROJECT_ID, if_exists=IF_EXISTS)
to_gbq(p_int, f"{BQ_DATASET}.interactions", project_id=PROJECT_ID, if_exists=IF_EXISTS)

print("✓ Subido a BigQuery: "
      f"{BQ_DATASET}.recipes (con combined_text) y {BQ_DATASET}.interactions (con split)")

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-3425ks60hkk80fssi9vpohv88g6q1iqd.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fbigquery&state=pWeiqrfW549KRkTpCSTdBMlX1hvmEg&prompt=consent&access_type=offline


100%|██████████| 1/1 [00:00<00:00, 1001.98it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]

✓ Subido a BigQuery: foodrecsys.recipes (con combined_text) y foodrecsys.interactions (con split)



