# Data Loading & Quality Utils

Notebook utilitaire contenant toutes les fonctions de chargement, nettoyage et transformation des données.
À importer dans les notebooks d'analyse spécialisés.

In [1]:
# Imports de base
from pathlib import Path
import os
import duckdb
import polars as pl
import numpy as np
import pandas as pd
from typing import Optional, List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

## 🔌 Connexion DuckDB

In [2]:
def get_db_path() -> Path:
    """Localise automatiquement la base DuckDB dans la hiérarchie de dossiers."""
    anchors = [Path.cwd().resolve(), *Path.cwd().resolve().parents]
    db_candidate = next(
        (anchor / "00_preprod" / "data" / "mangetamain.duckdb"
         for anchor in anchors
         if (anchor / "00_preprod" / "data" / "mangetamain.duckdb").exists()),
        None,
    )
    if db_candidate is None:
        raise FileNotFoundError("Impossible de localiser 00_preprod/data/mangetamain.duckdb")
    return db_candidate

def get_table_overview(db_path: Path) -> pl.DataFrame:
    """Retourne un aperçu de toutes les tables avec leurs tailles."""
    with duckdb.connect(database=str(db_path), read_only=True) as conn:
        tables = conn.execute("SHOW TABLES").pl()
        table_list = [row[0] for row in tables.iter_rows()]
        
        row_counts = []
        for table in table_list:
            count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
            row_counts.append(count)
    
    return pl.DataFrame({
        "table": table_list,
        "row_count": row_counts,
    }).sort("row_count", descending=True)

## 📥 Chargement des données

In [None]:
def load_table(table_name: str, db_path: Optional[Path] = None, limit: Optional[int] = None) -> pl.DataFrame:
    """Charge une table depuis DuckDB."""
    if db_path is None:
        db_path = get_db_path()
    
    sql = f"SELECT * FROM {table_name}"
    if limit is not None:
        sql += f" LIMIT {limit}"
    
    with duckdb.connect(database=str(db_path), read_only=True) as conn:
        return conn.execute(sql).pl()

def load_interactions_raw(db_path: Optional[Path] = None) -> pl.DataFrame:
    """Charge la table RAW_interactions avec filtrage de base."""
    if db_path is None:
        db_path = get_db_path()
    
    sql = """
    SELECT *
    FROM RAW_interactions
    WHERE rating BETWEEN 1 AND 5
      AND date IS NOT NULL
    """
    
    with duckdb.connect(database=str(db_path), read_only=True) as conn:
        return conn.execute(sql).pl()

def load_enriched_interactions(db_path: Optional[Path] = None) -> pl.DataFrame:
    """Charge interactions enrichies avec les données de recettes."""
    if db_path is None:
        db_path = get_db_path()
    
    sql = """
    SELECT 
        i.user_id,
        i.recipe_id,
        i.date,
        i.rating,
        i.review,
        r.name as recipe_name,
        r.ingredients,
        r.tags,
        r.nutrition,
        r.n_steps,
        r.n_ingredients
    FROM RAW_interactions i
    LEFT JOIN RAW_recipes r ON i.recipe_id = r.id
    WHERE i.rating BETWEEN 1 AND 5
      AND i.date IS NOT NULL
    """
    
    with duckdb.connect(database=str(db_path), read_only=True) as conn:
        return conn.execute(sql).pl()

## 🧹 Data Quality & Cleaning

In [4]:
def analyze_data_quality(df: pl.DataFrame, name: str = "DataFrame") -> Dict[str, any]:
    """Analyse complète de la qualité des données."""
    report = {
        "name": name,
        "shape": df.shape,
        "schema": dict(df.schema),
        "null_counts": df.null_count().to_dict(),
        "duplicate_count": df.is_duplicated().sum(),
    }
    
    # Analyse spécifique ratings si présent
    if "rating" in df.columns:
        ratings = df["rating"]
        report["rating_stats"] = {
            "mean": ratings.mean(),
            "std": ratings.std(),
            "min": ratings.min(),
            "max": ratings.max(),
            "n_zero": df.filter(pl.col("rating") == 0).height,
            "n_invalid": df.filter((pl.col("rating") < 0) | (pl.col("rating") > 5)).height,
        }
    
    # Analyse dates si présent
    if "date" in df.columns:
        date_col = df["date"]
        report["date_stats"] = {
            "min_date": date_col.min(),
            "max_date": date_col.max(),
            "null_dates": date_col.null_count(),
        }
    
    return report

def print_quality_report(report: Dict[str, any]):
    """Affiche un rapport de qualité formaté."""
    print(f"📊 Rapport de qualité : {report['name']}")
    print(f"Shape: {report['shape']}")
    print(f"Duplicatas: {report['duplicate_count']}")
    
    if "rating_stats" in report:
        rs = report["rating_stats"]
        print(f"\n🌟 Ratings:")
        print(f"  Moyenne: {rs['mean']:.3f} ± {rs['std']:.3f}")
        print(f"  Range: [{rs['min']}, {rs['max']}]")
        print(f"  Zéros: {rs['n_zero']}, Invalides: {rs['n_invalid']}")
    
    if "date_stats" in report:
        ds = report["date_stats"]
        print(f"\n📅 Dates: {ds['min_date']} → {ds['max_date']}")

## 🔄 Transformations

In [5]:
def add_calendar_features(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    """Ajoute les features calendaires (année, mois, saison, weekend)."""
    return df.with_columns([
        pl.col(date_col).dt.year().alias("year"),
        pl.col(date_col).dt.month().alias("month"),
        pl.col(date_col).dt.weekday().alias("weekday"),
        (pl.col(date_col).dt.weekday() >= 5).cast(pl.Int8).alias("is_weekend"),
        pl.when(pl.col(date_col).dt.month().is_in([12, 1, 2])).then(pl.lit("Winter"))
          .when(pl.col(date_col).dt.month().is_in([3, 4, 5])).then(pl.lit("Spring"))
          .when(pl.col(date_col).dt.month().is_in([6, 7, 8])).then(pl.lit("Summer"))
          .otherwise(pl.lit("Autumn")).alias("season"),
    ])

def add_rating_features(df: pl.DataFrame, rating_col: str = "rating") -> pl.DataFrame:
    """Ajoute les features dérivées du rating (z-score normalisé)."""
    rating_mean = df[rating_col].mean()
    rating_std = df[rating_col].std()
    
    return df.with_columns([
        ((pl.col(rating_col) - rating_mean) / rating_std).alias("normalized_rating"),
    ])

def clean_and_enrich_interactions(df: pl.DataFrame) -> pl.DataFrame:
    """Pipeline complet de nettoyage et enrichissement - retourne une copie transformée."""
    # Nettoyage de base (copie pour ne pas modifier l'original)
    cleaned = df.filter(
        (pl.col("rating").is_between(0, 5, closed="both")) &
        (pl.col("date").is_not_null())
    )
    
    # Suppression des duplicatas exacts
    cleaned = cleaned.unique()
    
    # Ajout des features
    enriched = add_calendar_features(cleaned)
    enriched = add_rating_features(enriched)
    
    return enriched

def load_clean_interactions(db_path: Optional[Path] = None) -> pl.DataFrame:
    """Charge et nettoie les interactions en une seule fois - version transformée prête à l'emploi."""
    raw_interactions = load_interactions_raw(db_path)
    return clean_and_enrich_interactions(raw_interactions)

## 🎯 Fonctions d'analyse spécialisées

In [6]:
def prepare_volume_analysis(df: pl.DataFrame) -> Dict[str, pd.DataFrame]:
    """Prépare les DataFrames pour l'analyse de volume (graphiques)."""
    return {
        "by_year": df.group_by("year").agg(pl.len().alias("n_interactions")).sort("year").to_pandas(),
        "by_month": df.group_by("month").agg(pl.len().alias("n_interactions")).sort("month").to_pandas(),
        "by_weekend": df.group_by("is_weekend").agg(pl.len().alias("n_interactions")).sort("is_weekend").to_pandas(),
        "by_season": df.group_by("season").agg(pl.len().alias("n_interactions")).to_pandas(),
    }

def prepare_rating_analysis(df: pl.DataFrame) -> Dict[str, pd.DataFrame]:
    """Prépare les DataFrames pour l'analyse des ratings."""
    return {
        "monthly_stats": df.group_by("year", "month").agg([
            pl.col("rating").mean().alias("mean_rating"),
            pl.col("rating").median().alias("median_rating"),
            pl.col("rating").std().alias("std_rating"),
            pl.len().alias("n_interactions")
        ]).sort(["year", "month"]).to_pandas(),
        
        "seasonal_stats": df.group_by("season").agg([
            pl.col("rating").mean().alias("mean_rating"),
            pl.col("rating").median().alias("median_rating"),
            pl.col("rating").std().alias("std_rating"),
            pl.len().alias("n_interactions")
        ]).to_pandas(),
        
        "weekend_stats": df.group_by("is_weekend").agg([
            pl.col("rating").mean().alias("mean_rating"),
            pl.col("rating").median().alias("median_rating"),
            pl.col("rating").std().alias("std_rating"),
            pl.len().alias("n_interactions")
        ]).to_pandas(),
    }

## 🧪 Fonctions de test rapide

In [16]:
def show_transformed_sample(df: pl.DataFrame, n: int = 5):
    """Affiche un échantillon formaté des données transformées."""
    print(f"📊 Aperçu des données transformées ({n} premières lignes):")
    
    # Colonnes de base
    base_cols = ["user_id", "recipe_id", "date", "rating"]
    # Nouvelles colonnes ajoutées
    new_cols = [col for col in df.columns if col not in base_cols]
    
    print(f"🆕 Nouvelles colonnes ajoutées: {', '.join(new_cols)}")
    
    # Affichage avec sélection de colonnes importantes
    display_cols = base_cols + ["year", "month", "season", "is_weekend", "normalized_rating"]
    available_cols = [col for col in display_cols if col in df.columns]
    
    sample_df = df.select(available_cols).head(n)
    display(sample_df)
    
    return sample_df

In [8]:
def test_data_pipeline():
    """Test rapide du pipeline de données."""
    print("🧪 Test du pipeline de données...")
    
    # Test connexion
    db_path = get_db_path()
    print(f"✅ DB trouvée: {db_path}")
    
    # Test chargement RAW (original, non modifié)
    df_raw = load_interactions_raw(db_path)
    print(f"✅ RAW_interactions chargées: {df_raw.shape}")
    
    # Test qualité RAW
    report_raw = analyze_data_quality(df_raw, "RAW_interactions")
    print_quality_report(report_raw)
    
    # Test version transformée (copie enrichie)
    df_transformed = load_clean_interactions(db_path)
    print(f"\n✅ Interactions transformées: {df_transformed.shape}")
    
    # Test qualité transformée
    report_transformed = analyze_data_quality(df_transformed, "TRANSFORMED_interactions")
    print_quality_report(report_transformed)
    
    # Aperçu formaté de la version transformée
    show_transformed_sample(df_transformed, n=5)
    
    return df_raw, df_transformed

# Exécution du test si ce notebook est lancé directement
if __name__ == "__main__":
    test_data_pipeline()

🧪 Test du pipeline de données...
✅ DB trouvée: /Users/antoinedalle/Desktop/MangeTaMain_Projet/backtothefuturekitchen/00_preprod/data/mangetamain.duckdb
✅ RAW_interactions chargées: (1132367, 5)
📊 Rapport de qualité : RAW_interactions
Shape: (1132367, 5)
Duplicatas: 0

🌟 Ratings:
  Moyenne: 4.411 ± 1.265
  Range: [0, 5]
  Zéros: 60847, Invalides: 0

📅 Dates: 2000-01-25 → 2018-12-20
✅ RAW_interactions chargées: (1132367, 5)
📊 Rapport de qualité : RAW_interactions
Shape: (1132367, 5)
Duplicatas: 0

🌟 Ratings:
  Moyenne: 4.411 ± 1.265
  Range: [0, 5]
  Zéros: 60847, Invalides: 0

📅 Dates: 2000-01-25 → 2018-12-20

✅ Interactions transformées: (1132367, 11)
📊 Rapport de qualité : TRANSFORMED_interactions
Shape: (1132367, 11)
Duplicatas: 0

🌟 Ratings:
  Moyenne: 4.411 ± 1.265
  Range: [0, 5]
  Zéros: 60847, Invalides: 0

📅 Dates: 2000-01-25 → 2018-12-20
📊 Aperçu des données transformées (5 premières lignes):
🆕 Nouvelles colonnes ajoutées: review, year, month, weekday, is_weekend, season, norm

user_id,recipe_id,date,rating,year,month,season,is_weekend,normalized_rating
i64,i64,date,i64,i32,i8,str,i8,f64
605866,53878,2009-12-26,4,2009,12,"""Winter""",1,-0.324978
35526,15777,2003-02-10,5,2003,2,"""Winter""",0,0.465691
360241,23689,2007-06-25,5,2007,6,"""Summer""",0,0.465691
13483,10386,2002-08-23,5,2002,8,"""Summer""",1,0.465691
425994,106611,2007-01-13,4,2007,1,"""Winter""",1,-0.324978


In [9]:
# Exploration des données brutes pour planifier les analyses
print("🔍 Exploration des tables RAW disponibles...")

# Aperçu des tables
db_path = get_db_path()
tables_overview = get_table_overview(db_path)
print("\n📊 Tables disponibles:")
display(tables_overview)

# Schéma RAW_interactions  
df_interactions = load_table("RAW_interactions", limit=5)
print(f"\n📋 Schéma RAW_interactions ({df_interactions.shape[0]} échantillon):")
display(df_interactions)
print(f"Colonnes: {list(df_interactions.columns)}")

# Schéma RAW_recipes
df_recipes = load_table("RAW_recipes", limit=5) 
print(f"\n📋 Schéma RAW_recipes ({df_recipes.shape[0]} échantillon):")
display(df_recipes)
print(f"Colonnes: {list(df_recipes.columns)}")

🔍 Exploration des tables RAW disponibles...

📊 Tables disponibles:


table,row_count
str,i64
"""RAW_interactions""",1132367
"""interactions_train""",698901
"""RAW_recipes""",231637
"""PP_recipes""",178265
"""PP_users""",25076
"""interactions_test""",12455
"""interactions_validation""",7023



📋 Schéma RAW_interactions (5 échantillon):


user_id,recipe_id,date,rating,review
i64,i64,date,i64,str
38094,40893,2003-02-17,4,"""Great with a salad. Cooked on …"
1293707,40893,2011-12-21,5,"""So simple, so delicious! Great…"
8937,44394,2002-12-01,4,"""This worked very well and is E…"
126440,85009,2010-02-27,5,"""I made the Mexican topping and…"
57222,85009,2011-10-01,5,"""Made the cheddar bacon topping…"


Colonnes: ['user_id', 'recipe_id', 'date', 'rating', 'review']

📋 Schéma RAW_recipes (5 échantillon):


name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
str,i64,i64,i64,date,str,str,i64,str,str,str,i64
"""arriba baked winter squash m…",137739,55,47892,2005-09-16,"""['60-minutes-or-less', 'time-t…","""[51.5, 0.0, 13.0, 0.0, 2.0, 0.…",11,"""['make a choice and proceed wi…","""autumn is my favorite time of …","""['winter squash', 'mexican sea…",7
"""a bit different breakfast piz…",31490,30,26278,2002-06-17,"""['30-minutes-or-less', 'time-t…","""[173.4, 18.0, 0.0, 17.0, 22.0,…",9,"""['preheat oven to 425 degrees …","""this recipe calls for the crus…","""['prepared pizza crust', 'saus…",6
"""all in the kitchen chili""",112140,130,196586,2005-02-25,"""['time-to-make', 'course', 'pr…","""[269.8, 22.0, 32.0, 48.0, 39.0…",6,"""['brown ground beef in large p…","""this modified version of 'mom'…","""['ground beef', 'yellow onions…",13
"""alouette potatoes""",59389,45,68585,2003-04-14,"""['60-minutes-or-less', 'time-t…","""[368.1, 17.0, 10.0, 2.0, 14.0,…",11,"""['place potatoes in a large po…","""this is a super easy, great ta…","""['spreadable cheese with garli…",11
"""amish tomato ketchup for can…",44061,190,41706,2002-10-25,"""['weeknight', 'time-to-make', …","""[352.9, 1.0, 337.0, 23.0, 3.0,…",5,"""['mix all ingredients& boil fo…","""my dh's amish mother raised hi…","""['tomato juice', 'apple cider …",8


Colonnes: ['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients']


## 🍳 NOUVELLES FONCTIONS - ANALYSE INGRÉDIENTS

Extension du système pour analyser les ratings par ingrédient dans le temps.

In [17]:
def extract_ingredients_from_string(ingredients_str: str) -> List[str]:
    """Extrait les ingrédients d'une string format "['ing1', 'ing2', ...]"."""
    if not ingredients_str or ingredients_str == 'null':
        return []
    
    # Supprime les crochets et guillemets, puis split sur les virgules
    cleaned = ingredients_str.strip("[]").replace("'", "").replace('"', '')
    ingredients = [ing.strip() for ing in cleaned.split(",")]
    
    # Filtre les éléments vides et normalise
    return [ing.lower().strip() for ing in ingredients if ing.strip()]

def explore_ingredients_format(db_path: Optional[Path] = None, n_samples: int = 10) -> Dict:
    """Fonction de test pour comprendre le format des ingrédients."""
    if db_path is None:
        db_path = get_db_path()
    
    # Charge quelques recettes avec ingrédients
    sql = """
    SELECT id as recipe_id, name, ingredients, n_ingredients
    FROM RAW_recipes 
    WHERE ingredients IS NOT NULL 
    LIMIT ?
    """
    
    with duckdb.connect(database=str(db_path), read_only=True) as conn:
        df_sample = conn.execute(sql, [n_samples]).pl()
    
    # Analyse du format
    analysis = {
        "sample_count": len(df_sample),
        "ingredients_samples": [],
        "format_analysis": {},
    }
    
    for row in df_sample.iter_rows(named=True):
        ingredients_str = row["ingredients"]
        extracted = extract_ingredients_from_string(ingredients_str)
        
        analysis["ingredients_samples"].append({
            "recipe_id": row["recipe_id"],
            "name": row["name"][:50] + "..." if len(row["name"]) > 50 else row["name"],
            "n_ingredients": row["n_ingredients"],
            "raw_string": ingredients_str[:100] + "..." if len(str(ingredients_str)) > 100 else ingredients_str,
            "extracted_count": len(extracted),
            "extracted_sample": extracted[:3]  # Premiers 3 ingrédients
        })
    
    # Stats globales
    all_extracted = []
    for sample in analysis["ingredients_samples"]:
        recipe_ingredients = extract_ingredients_from_string(
            df_sample.filter(pl.col("recipe_id") == sample["recipe_id"])["ingredients"][0]
        )
        all_extracted.extend(recipe_ingredients)
    
    analysis["format_analysis"] = {
        "total_ingredients_extracted": len(all_extracted),
        "unique_ingredients": len(set(all_extracted)),
        "most_common": pd.Series(all_extracted).value_counts().head(10).to_dict(),
        "avg_ingredients_per_recipe": len(all_extracted) / n_samples if n_samples > 0 else 0
    }
    
    return analysis

def load_ingredient_ratings(target_ingredients: List[str], db_path: Optional[Path] = None) -> pl.DataFrame:
    """
    Charge les ratings pour recettes contenant des ingrédients spécifiques.
    
    SCHÉMA DU DATASET FINAL:
    - recipe_id: ID de la recette
    - user_id: ID utilisateur  
    - date: Date de l'interaction
    - rating: Note donnée (1-5)
    - ingredient_name: Nom de l'ingrédient trouvé
    - recipe_name: Nom de la recette (pour debug)
    - n_ingredients: Nombre total ingrédients recette
    
    Une ligne par (recette × ingrédient_cible_trouvé × interaction)
    """
    df_enriched = load_enriched_interactions(db_path)
    
    # Explode les ingrédients - chaque recette devient N lignes (1 par ingrédient)
    df_with_ingredients = (df_enriched
        .with_columns([
            # Extraction des ingrédients individuels  
            pl.col("ingredients").map_elements(
                lambda x: extract_ingredients_from_string(str(x)) if x else [],
                return_dtype=pl.List(pl.Utf8)
            ).alias("ingredient_list")
        ])
        .explode("ingredient_list")  # Explose la liste = 1 ligne par ingrédient
        .rename({"ingredient_list": "ingredient_name"})
        .filter(pl.col("ingredient_name") != "")  # Supprime les vides
    )
    
    # Filtre sur les ingrédients cibles seulement
    df_filtered = df_with_ingredients.filter(
        pl.col("ingredient_name").is_in([ing.lower().strip() for ing in target_ingredients])
    )
    
    # Sélectionne les colonnes finales
    return df_filtered.select([
        "recipe_id", 
        "user_id", 
        "date", 
        "rating",
        "ingredient_name",
        "recipe_name",
        "n_ingredients"
    ])

print("✅ Fonctions ingrédients ajoutées au système!")

✅ Fonctions ingrédients ajoutées au système!


### 🧪 Test du format ingrédients

In [18]:
# Test d'exploration du format des ingrédients
print("🔍 TEST DU FORMAT INGRÉDIENTS")
print("="*50)

# Test avec 5 échantillons
analysis = explore_ingredients_format(n_samples=5)

print(f"📊 Échantillons analysés: {analysis['sample_count']}")
print(f"🧪 Stats extractions:")
print(f"  Total ingrédients: {analysis['format_analysis']['total_ingredients_extracted']}")
print(f"  Ingrédients uniques: {analysis['format_analysis']['unique_ingredients']}")
print(f"  Moyenne par recette: {analysis['format_analysis']['avg_ingredients_per_recipe']:.1f}")

print(f"\n📋 ÉCHANTILLONS DÉTAILLÉS:")
for sample in analysis['ingredients_samples']:
    print(f"  Recipe {sample['recipe_id']}: {sample['name']}")
    print(f"    N ingrédients déclarés: {sample['n_ingredients']}")
    print(f"    N ingrédients extraits: {sample['extracted_count']}")
    print(f"    Échantillon: {sample['extracted_sample']}")
    print(f"    Format brut: {sample['raw_string']}")
    print()

print(f"🔥 TOP INGRÉDIENTS COMMUNS:")
for ing, count in analysis['format_analysis']['most_common'].items():
    print(f"  {ing}: {count}")

🔍 TEST DU FORMAT INGRÉDIENTS
📊 Échantillons analysés: 5
🧪 Stats extractions:
  Total ingrédients: 45
  Ingrédients uniques: 40
  Moyenne par recette: 9.0

📋 ÉCHANTILLONS DÉTAILLÉS:
  Recipe 137739: arriba   baked winter squash mexican style
    N ingrédients déclarés: 7
    N ingrédients extraits: 7
    Échantillon: ['winter squash', 'mexican seasoning', 'mixed spice']
    Format brut: ['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil', 'salt']

  Recipe 31490: a bit different  breakfast pizza
    N ingrédients déclarés: 6
    N ingrédients extraits: 6
    Échantillon: ['prepared pizza crust', 'sausage patty', 'eggs']
    Format brut: ['prepared pizza crust', 'sausage patty', 'eggs', 'milk', 'salt and pepper', 'cheese']

  Recipe 112140: all in the kitchen  chili
    N ingrédients déclarés: 13
    N ingrédients extraits: 13
    Échantillon: ['ground beef', 'yellow onions', 'diced tomatoes']
    Format brut: ['ground beef', 'yellow onions', 'diced tomat

### 🍳 Test des ingrédients ciblés

In [19]:
# Test avec nos ingrédients cibles
print("🍳 TEST DE LOAD_INGREDIENT_RATINGS")
print("="*50)

# Nos ingrédients CORE + SPÉCIALISÉS proposés
core_ingredients = ['quinoa', 'kale', 'sriracha', 'avocado', 'coconut']
specialized_long_term = ['chia seeds', 'oat milk']
specialized_seasonal = ['pumpkin', 'asparagus']
specialized_weekend = ['truffle', 'bacon']

# Commençons par tester les CORE + quelques ingrédients basiques qu'on sait qui existent
test_ingredients = ['salt', 'eggs', 'quinoa', 'kale']

print(f"🎯 Ingrédients testés: {test_ingredients}")
print("📊 Chargement des données...")

try:
    df_result = load_ingredient_ratings(test_ingredients)
    
    print(f"✅ Dataset créé avec succès!")
    print(f"📏 Shape: {df_result.shape}")
    print(f"📋 Colonnes: {df_result.columns}")
    
    # Stats par ingrédient
    stats = df_result.group_by('ingredient_name').agg([
        pl.len().alias('n_ratings'),
        pl.col('rating').mean().alias('avg_rating'),
        pl.col('recipe_id').n_unique().alias('n_recipes'),
        pl.col('user_id').n_unique().alias('n_users')
    ]).sort('n_ratings', descending=True)
    
    print(f"\n📈 PERFORMANCE PAR INGRÉDIENT:")
    display(stats)
    
    print(f"\n📅 Période couverte: {df_result['date'].min()} → {df_result['date'].max()}")
    
    # Test spécifique quinoa (ingrédient trendy)
    quinoa_data = df_result.filter(pl.col('ingredient_name') == 'quinoa')
    if quinoa_data.height > 0:
        print(f"\n🌾 FOCUS QUINOA:")
        print(f"  Interactions: {quinoa_data.height:,}")
        print(f"  Rating moyen: {quinoa_data['rating'].mean():.3f}")
        print(f"  Première mention: {quinoa_data['date'].min()}")
        print(f"  Dernière mention: {quinoa_data['date'].max()}")
        
        # Évolution temporelle rapide
        yearly_quinoa = (quinoa_data
                        .with_columns(pl.col("date").dt.year().alias("year"))
                        .group_by("year")
                        .agg([
                            pl.len().alias("n_ratings"),
                            pl.col("rating").mean().alias("avg_rating")
                        ])
                        .sort("year"))
        print(f"\n📈 Évolution quinoa par année (échantillon):")
        display(yearly_quinoa.head(10))
    else:
        print(f"\n⚠️ Quinoa non trouvé - ingrédient peut-être absent ou format différent")
        
except Exception as e:
    print(f"❌ Erreur: {e}")
    import traceback
    traceback.print_exc()

🍳 TEST DE LOAD_INGREDIENT_RATINGS
🎯 Ingrédients testés: ['salt', 'eggs', 'quinoa', 'kale']
📊 Chargement des données...
✅ Dataset créé avec succès!
📏 Shape: (650828, 7)
📋 Colonnes: ['recipe_id', 'user_id', 'date', 'rating', 'ingredient_name', 'recipe_name', 'n_ingredients']

📈 PERFORMANCE PAR INGRÉDIENT:


ingredient_name,n_ratings,avg_rating,n_recipes,n_users
str,u32,f64,u32,u32
"""salt""",468403,4.38874,85746,130191
"""eggs""",178000,4.353152,33761,66216
"""kale""",2435,4.354004,359,1939
"""quinoa""",1990,4.401508,450,1295



📅 Période couverte: 2000-01-25 → 2018-12-20

🌾 FOCUS QUINOA:
  Interactions: 1,990
  Rating moyen: 4.402
  Première mention: 2003-04-22
  Dernière mention: 2018-11-28

📈 Évolution quinoa par année (échantillon):


year,n_ratings,avg_rating
i32,u32,f64
2003,9,4.666667
2004,8,3.875
2005,30,4.333333
2006,57,4.438596
2007,152,4.467105
2008,226,4.460177
2009,242,4.491736
2010,371,4.466307
2011,268,4.373134
2012,190,4.4


### 🎯 Définition des ingrédients finaux

Liste des 11 ingrédients sélectionnés pour l'analyse temporelle complète.

In [20]:
# Configuration finale des ingrédients pour analyse temporelle
print("🎯 CONFIGURATION INGRÉDIENTS - APPROCHE HYBRIDE")
print("="*70)

# Configuration CORE + SPÉCIALISÉS
ingredients_config = {
    "CORE": {
        "description": "Analysés sur les 3 axes temporels",
        "ingredients": ['quinoa', 'kale', 'sriracha', 'avocado', 'coconut'],
        "rationale": "Potentiel multi-temporel élevé"
    },
    "SPECIALIZED_LONG_TERM": {
        "description": "Optimisés pour tendances long-terme", 
        "ingredients": ['chia seeds', 'oat milk'],
        "rationale": "Émergence récente, adoption explosive"
    },
    "SPECIALIZED_SEASONAL": {
        "description": "Optimisés pour saisonnalité",
        "ingredients": ['pumpkin', 'asparagus'], 
        "rationale": "Cyclicité saisonnière forte"
    },
    "SPECIALIZED_WEEKEND": {
        "description": "Optimisés pour effet weekend",
        "ingredients": ['truffle', 'bacon'],
        "rationale": "Indulgence vs effort weekend"
    }
}

# Liste complète
all_target_ingredients = []
for category, config in ingredients_config.items():
    all_target_ingredients.extend(config["ingredients"])

print(f"📊 CONFIGURATION COMPLÈTE:")
for category, config in ingredients_config.items():
    print(f"\n{category}:")
    print(f"  📝 {config['description']}")
    print(f"  🍳 Ingrédients: {', '.join(config['ingredients'])}")
    print(f"  💡 Logique: {config['rationale']}")

print(f"\n🎯 LISTE FINALE ({len(all_target_ingredients)} ingrédients):")
print(f"  {', '.join(all_target_ingredients)}")

print(f"\n📋 STRATÉGIE D'ANALYSE:")
print(f"  1. CORE → 3 axes complets (Long-terme + Saisonnalité + Weekend)")
print(f"  2. SPÉCIALISÉS → 1 axe optimisé + vérification autres axes") 
print(f"  3. COMPARAISON → Efficacité approche hybride vs uniforme")

# Fonction utilitaire pour les notebooks d'analyse
def get_ingredients_for_analysis(analysis_type: str = "all") -> List[str]:
    """Retourne la liste d'ingrédients selon le type d'analyse."""
    if analysis_type == "core":
        return ingredients_config["CORE"]["ingredients"]
    elif analysis_type == "long_term":
        return ingredients_config["CORE"]["ingredients"] + ingredients_config["SPECIALIZED_LONG_TERM"]["ingredients"]
    elif analysis_type == "seasonal":
        return ingredients_config["CORE"]["ingredients"] + ingredients_config["SPECIALIZED_SEASONAL"]["ingredients"]  
    elif analysis_type == "weekend":
        return ingredients_config["CORE"]["ingredients"] + ingredients_config["SPECIALIZED_WEEKEND"]["ingredients"]
    elif analysis_type == "all":
        return all_target_ingredients
    else:
        raise ValueError(f"Type d'analyse non supporté: {analysis_type}")

print(f"\n✅ Configuration finalisée - Prêt pour les analyses!")

🎯 CONFIGURATION INGRÉDIENTS - APPROCHE HYBRIDE
📊 CONFIGURATION COMPLÈTE:

CORE:
  📝 Analysés sur les 3 axes temporels
  🍳 Ingrédients: quinoa, kale, sriracha, avocado, coconut
  💡 Logique: Potentiel multi-temporel élevé

SPECIALIZED_LONG_TERM:
  📝 Optimisés pour tendances long-terme
  🍳 Ingrédients: chia seeds, oat milk
  💡 Logique: Émergence récente, adoption explosive

SPECIALIZED_SEASONAL:
  📝 Optimisés pour saisonnalité
  🍳 Ingrédients: pumpkin, asparagus
  💡 Logique: Cyclicité saisonnière forte

SPECIALIZED_WEEKEND:
  📝 Optimisés pour effet weekend
  🍳 Ingrédients: truffle, bacon
  💡 Logique: Indulgence vs effort weekend

🎯 LISTE FINALE (11 ingrédients):
  quinoa, kale, sriracha, avocado, coconut, chia seeds, oat milk, pumpkin, asparagus, truffle, bacon

📋 STRATÉGIE D'ANALYSE:
  1. CORE → 3 axes complets (Long-terme + Saisonnalité + Weekend)
  2. SPÉCIALISÉS → 1 axe optimisé + vérification autres axes
  3. COMPARAISON → Efficacité approche hybride vs uniforme

✅ Configuration fina

### 🧪 Test complet des 11 ingrédients sélectionnés

Validation de l'existence et performance de tous nos ingrédients CORE + SPÉCIALISÉS.

In [21]:
# Test de validation des 11 ingrédients sélectionnés
print("🧪 VALIDATION COMPLÈTE DES 11 INGRÉDIENTS")
print("="*70)

# Test de tous nos ingrédients cibles
print("📊 Chargement et test des 11 ingrédients...")

try:
    # Test avec la liste complète
    df_all_ingredients = load_ingredient_ratings(all_target_ingredients)
    
    print(f"✅ Dataset global créé avec succès!")
    print(f"📏 Shape totale: {df_all_ingredients.shape}")
    
    # Stats détaillées par ingrédient
    stats_complete = df_all_ingredients.group_by('ingredient_name').agg([
        pl.len().alias('n_ratings'),
        pl.col('rating').mean().alias('avg_rating'),
        pl.col('recipe_id').n_unique().alias('n_recipes'),
        pl.col('user_id').n_unique().alias('n_users'),
        pl.col('date').min().alias('first_mention'),
        pl.col('date').max().alias('last_mention')
    ]).sort('n_ratings', descending=True)
    
    print(f"\n📈 PERFORMANCE COMPLÈTE PAR INGRÉDIENT:")
    display(stats_complete)
    
    # Analyse par catégorie
    print(f"\n🎯 ANALYSE PAR CATÉGORIE:")
    
    categories = {
        "CORE": ingredients_config["CORE"]["ingredients"],
        "LONG_TERM": ingredients_config["SPECIALIZED_LONG_TERM"]["ingredients"],
        "SEASONAL": ingredients_config["SPECIALIZED_SEASONAL"]["ingredients"],
        "WEEKEND": ingredients_config["SPECIALIZED_WEEKEND"]["ingredients"]
    }
    
    for cat_name, cat_ingredients in categories.items():
        print(f"\n{cat_name}:")
        cat_stats = stats_complete.filter(pl.col('ingredient_name').is_in(cat_ingredients))
        
        if cat_stats.height > 0:
            total_ratings = cat_stats['n_ratings'].sum()
            avg_rating = cat_stats['avg_rating'].mean()
            print(f"  ✅ {cat_stats.height}/{len(cat_ingredients)} ingrédients trouvés")
            print(f"  📊 Total interactions: {total_ratings:,}")
            print(f"  ⭐ Rating moyen: {avg_rating:.3f}")
            
            # Détail par ingrédient
            for row in cat_stats.iter_rows(named=True):
                print(f"    • {row['ingredient_name']}: {row['n_ratings']:,} interactions, ⭐{row['avg_rating']:.3f}")
        else:
            print(f"  ❌ Aucun ingrédient de cette catégorie trouvé")
    
    # Ingrédients manquants
    found_ingredients = set(stats_complete['ingredient_name'].to_list())
    missing_ingredients = set(all_target_ingredients) - found_ingredients
    
    if missing_ingredients:
        print(f"\n⚠️ INGRÉDIENTS MANQUANTS ({len(missing_ingredients)}):")
        for missing in missing_ingredients:
            print(f"  • {missing}")
            
        print(f"\n🔍 RECHERCHE D'ALTERNATIVES pour ingrédients manquants...")
        # Recherche d'ingrédients similaires dans le dataset
        all_unique_ingredients = (df_all_ingredients
                                 .select('ingredient_name')
                                 .unique()
                                 .sort('ingredient_name')
                                 .to_series()
                                 .to_list())
        
        print(f"💡 Suggestions d'alternatives (ingrédients contenant mots-clés):")
        for missing in missing_ingredients:
            suggestions = []
            for keyword in missing.split():
                suggestions.extend([ing for ing in all_unique_ingredients[:100] if keyword.lower() in ing.lower()])
            
            if suggestions:
                print(f"  {missing} → {suggestions[:3]}")
    else:
        print(f"\n✅ PARFAIT! Tous les 11 ingrédients sont présents dans le dataset!")
    
    # Recommandation finale
    print(f"\n" + "="*70)
    print(f"🎯 RECOMMANDATION FINALE:")
    
    viable_ingredients = len(found_ingredients)
    total_interactions = stats_complete['n_ratings'].sum()
    
    if viable_ingredients >= 8:  # Au moins 8/11
        print(f"✅ CONFIGURATION VALIDÉE!")
        print(f"  • {viable_ingredients}/11 ingrédients trouvés")
        print(f"  • {total_interactions:,} interactions totales") 
        print(f"  • Dataset suffisamment riche pour analyses temporelles")
        print(f"\n🚀 PRÊT POUR IMPLÉMENTATION dans les notebooks d'analyse!")
    else:
        print(f"⚠️ CONFIGURATION À AJUSTER")
        print(f"  • Seulement {viable_ingredients}/11 ingrédients trouvés")
        print(f"  • Considérer alternatives ou réduire la liste")
        
except Exception as e:
    print(f"❌ Erreur lors du test: {e}")
    import traceback
    traceback.print_exc()

🧪 VALIDATION COMPLÈTE DES 11 INGRÉDIENTS
📊 Chargement et test des 11 ingrédients...
✅ Dataset global créé avec succès!
📏 Shape totale: (57116, 7)

📈 PERFORMANCE COMPLÈTE PAR INGRÉDIENT:


ingredient_name,n_ratings,avg_rating,n_recipes,n_users,first_mention,last_mention
str,u32,f64,u32,u32,date,date
"""bacon""",32160,4.482214,6948,14387,2000-01-25,2018-12-18
"""avocado""",6766,4.546852,1894,3584,2001-07-24,2018-11-10
"""coconut""",5714,4.383269,1506,3749,2000-12-27,2018-12-18
"""pumpkin""",4154,4.344006,944,3163,2000-10-09,2018-12-19
"""asparagus""",3781,4.530812,1040,2160,2001-03-04,2018-10-25
"""kale""",2435,4.354004,359,1939,2001-07-16,2018-12-12
"""quinoa""",1990,4.401508,450,1295,2003-04-22,2018-11-28
"""chia seeds""",102,4.019608,48,81,2008-02-23,2018-11-25
"""truffle""",11,4.363636,5,10,2006-06-21,2016-12-04
"""oat milk""",3,4.666667,3,3,2001-10-22,2011-01-02



🎯 ANALYSE PAR CATÉGORIE:

CORE:
  ✅ 4/5 ingrédients trouvés
  📊 Total interactions: 16,905
  ⭐ Rating moyen: 4.421
    • avocado: 6,766 interactions, ⭐4.547
    • coconut: 5,714 interactions, ⭐4.383
    • kale: 2,435 interactions, ⭐4.354
    • quinoa: 1,990 interactions, ⭐4.402

LONG_TERM:
  ✅ 2/2 ingrédients trouvés
  📊 Total interactions: 105
  ⭐ Rating moyen: 4.343
    • chia seeds: 102 interactions, ⭐4.020
    • oat milk: 3 interactions, ⭐4.667

SEASONAL:
  ✅ 2/2 ingrédients trouvés
  📊 Total interactions: 7,935
  ⭐ Rating moyen: 4.437
    • pumpkin: 4,154 interactions, ⭐4.344
    • asparagus: 3,781 interactions, ⭐4.531

WEEKEND:
  ✅ 2/2 ingrédients trouvés
  📊 Total interactions: 32,171
  ⭐ Rating moyen: 4.423
    • bacon: 32,160 interactions, ⭐4.482
    • truffle: 11 interactions, ⭐4.364

⚠️ INGRÉDIENTS MANQUANTS (1):
  • sriracha

🔍 RECHERCHE D'ALTERNATIVES pour ingrédients manquants...
💡 Suggestions d'alternatives (ingrédients contenant mots-clés):

🎯 RECOMMANDATION FINALE:
✅ 

### 📊 Analyse de faisabilité temporelle

Test rapide de la richesse temporelle pour nos ingrédients validés.

In [22]:
# Test de faisabilité pour analyses temporelles
print("📊 ANALYSE DE FAISABILITÉ TEMPORELLE")
print("="*60)

if 'df_all_ingredients' in locals() and df_all_ingredients.shape[0] > 0:
    print("🔍 Test des patterns temporels sur ingrédients trouvés...")
    
    # Ajout des features temporelles
    df_temporal = df_all_ingredients.with_columns([
        pl.col("date").dt.year().alias("year"),
        pl.col("date").dt.month().alias("month"), 
        pl.col("date").dt.weekday().alias("weekday"),
        (pl.col("date").dt.weekday() >= 5).cast(pl.Int8).alias("is_weekend"),
        pl.when(pl.col("date").dt.month().is_in([12, 1, 2])).then(pl.lit("Winter"))
          .when(pl.col("date").dt.month().is_in([3, 4, 5])).then(pl.lit("Spring"))
          .when(pl.col("date").dt.month().is_in([6, 7, 8])).then(pl.lit("Summer"))
          .otherwise(pl.lit("Autumn")).alias("season")
    ])
    
    # Test 1: Richesse long-terme (années couvertes)
    temporal_coverage = (df_temporal
                        .group_by('ingredient_name')
                        .agg([
                            pl.col('year').min().alias('first_year'),
                            pl.col('year').max().alias('last_year'),
                            pl.col('year').n_unique().alias('n_years'),
                            pl.len().alias('total_ratings')
                        ])
                        .with_columns([
                            (pl.col('last_year') - pl.col('first_year') + 1).alias('year_span'),
                            (pl.col('total_ratings') / pl.col('n_years')).alias('ratings_per_year')
                        ])
                        .sort('total_ratings', descending=True))
    
    print(f"\n📈 COUVERTURE TEMPORELLE (Long-terme):")
    display(temporal_coverage)
    
    # Test 2: Richesse saisonnière (présence dans les 4 saisons)
    seasonal_coverage = (df_temporal
                        .group_by(['ingredient_name', 'season'])
                        .agg([pl.len().alias('n_ratings')])
                        .pivot(values='n_ratings', index='ingredient_name', columns='season', aggregate_function='first')
                        .fill_null(0))
    
    print(f"\n🌸 COUVERTURE SAISONNIÈRE:")
    display(seasonal_coverage)
    
    # Test 3: Richesse weekend (différence weekend/semaine détectable?)
    weekend_analysis = (df_temporal
                       .group_by(['ingredient_name', 'is_weekend'])
                       .agg([
                           pl.len().alias('n_ratings'),
                           pl.col('rating').mean().alias('avg_rating')
                       ])
                       .pivot(values=['n_ratings', 'avg_rating'], 
                             index='ingredient_name', 
                             columns='is_weekend')
                       .fill_null(0))
    
    print(f"\n📅 ANALYSE WEEKEND vs SEMAINE:")
    display(weekend_analysis)
    
    # Recommandations par type d'analyse
    print(f"\n🎯 FAISABILITÉ PAR TYPE D'ANALYSE:")
    
    print(f"\n📈 LONG-TERME:")
    long_term_viable = temporal_coverage.filter(
        (pl.col('n_years') >= 10) & (pl.col('ratings_per_year') >= 50)
    )
    print(f"  ✅ {long_term_viable.height} ingrédients viables (≥10 ans, ≥50 ratings/an)")
    if long_term_viable.height > 0:
        top_long_term = long_term_viable.head(5)['ingredient_name'].to_list()
        print(f"  🏆 Top candidates: {', '.join(top_long_term)}")
    
    print(f"\n🌸 SAISONNALITÉ:")
    # Check si présent dans les 4 saisons
    seasonal_cols = [col for col in seasonal_coverage.columns if col != 'ingredient_name']
    if len(seasonal_cols) >= 4:
        seasonal_viable = seasonal_coverage.filter(
            (pl.col('Autumn') > 0) & (pl.col('Spring') > 0) & 
            (pl.col('Summer') > 0) & (pl.col('Winter') > 0)
        )
        print(f"  ✅ {seasonal_viable.height} ingrédients viables (présents 4 saisons)")
        if seasonal_viable.height > 0:
            top_seasonal = seasonal_viable.head(5)['ingredient_name'].to_list()
            print(f"  🏆 Top candidates: {', '.join(top_seasonal)}")
    
    print(f"\n📅 WEEKEND:")
    # Présence weekend ET semaine avec volume suffisant
    weekend_cols = [col for col in weekend_analysis.columns if 'n_ratings' in col]
    if len(weekend_cols) >= 2:
        print(f"  ✅ Tous les ingrédients analysables (présence weekend + semaine)")
        print(f"  💡 Focus sur différentiels de ratings weekend vs semaine")
    
    # Résumé final
    total_ingredients = df_temporal['ingredient_name'].n_unique()
    print(f"\n" + "="*60)
    print(f"🏆 RÉSUMÉ FAISABILITÉ:")
    print(f"  📊 {total_ingredients} ingrédients analysables au total")
    print(f"  📈 Long-terme: {long_term_viable.height if 'long_term_viable' in locals() else '?'} candidats optimaux")
    print(f"  🌸 Saisonnalité: {seasonal_viable.height if 'seasonal_viable' in locals() else '?'} candidats optimaux")
    print(f"  📅 Weekend: {total_ingredients} candidats (tous analysables)")
    print(f"\n✅ Configuration VALIDÉE pour analyses temporelles complètes!")
    
else:
    print("⚠️ Aucune donnée disponible - exécuter d'abord la cellule de validation précédente")

📊 ANALYSE DE FAISABILITÉ TEMPORELLE
🔍 Test des patterns temporels sur ingrédients trouvés...

📈 COUVERTURE TEMPORELLE (Long-terme):


ingredient_name,first_year,last_year,n_years,total_ratings,year_span,ratings_per_year
str,i32,i32,u32,u32,i32,f64
"""bacon""",2000,2018,19,32160,19,1692.631579
"""avocado""",2001,2018,18,6766,18,375.888889
"""coconut""",2000,2018,19,5714,19,300.736842
"""pumpkin""",2000,2018,19,4154,19,218.631579
"""asparagus""",2001,2018,18,3781,18,210.055556
"""kale""",2001,2018,18,2435,18,135.277778
"""quinoa""",2003,2018,16,1990,16,124.375
"""chia seeds""",2008,2018,11,102,11,9.272727
"""truffle""",2006,2016,6,11,11,1.833333
"""oat milk""",2001,2011,3,3,11,1.0



🌸 COUVERTURE SAISONNIÈRE:


ingredient_name,Summer,Winter,Spring,Autumn
str,u32,u32,u32,u32
"""avocado""",2061,1561,1708,1436
"""chia seeds""",25,27,34,16
"""asparagus""",897,727,1518,639
"""bacon""",7592,8419,8076,8073
"""pumpkin""",457,970,536,2191
"""quinoa""",514,496,588,392
"""kale""",457,768,611,599
"""coconut""",1388,1549,1505,1272
"""oat milk""",0,1,0,2
"""truffle""",6,4,1,0



📅 ANALYSE WEEKEND vs SEMAINE:


ingredient_name,n_ratings_1,n_ratings_0,avg_rating_1,avg_rating_0
str,u32,u32,f64,f64
"""truffle""",4,7,4.75,4.142857
"""avocado""",2712,4054,4.537979,4.552787
"""asparagus""",1548,2233,4.534238,4.528437
"""kale""",965,1470,4.349223,4.357143
"""coconut""",2454,3260,4.392013,4.376687
"""quinoa""",804,1186,4.440299,4.375211
"""chia seeds""",40,62,4.175,3.919355
"""oat milk""",1,2,5.0,4.5
"""pumpkin""",1769,2385,4.303561,4.374004
"""bacon""",13464,18696,4.479501,4.484168



🎯 FAISABILITÉ PAR TYPE D'ANALYSE:

📈 LONG-TERME:
  ✅ 7 ingrédients viables (≥10 ans, ≥50 ratings/an)
  🏆 Top candidates: bacon, avocado, coconut, pumpkin, asparagus

🌸 SAISONNALITÉ:
  ✅ 8 ingrédients viables (présents 4 saisons)
  🏆 Top candidates: avocado, chia seeds, asparagus, bacon, pumpkin

📅 WEEKEND:
  ✅ Tous les ingrédients analysables (présence weekend + semaine)
  💡 Focus sur différentiels de ratings weekend vs semaine

🏆 RÉSUMÉ FAISABILITÉ:
  📊 10 ingrédients analysables au total
  📈 Long-terme: 7 candidats optimaux
  🌸 Saisonnalité: 8 candidats optimaux
  📅 Weekend: 10 candidats (tous analysables)

✅ Configuration VALIDÉE pour analyses temporelles complètes!
