# Phase 1: Data Discovery (Incremental Extraction)

This notebook handles the extraction of data from Supabase. It implements an incremental load strategy to minimize data transfer.

In [1]:
import pandas as pd
import yaml
import pathlib
import datetime
import json
import os
import sys

# --- ROBUST PATH CONFIGURATION ---
# Locate the project root whether the kernel was started from the repository
# root, from inside 'notebooks/', or from some deeper directory.
current_dir = pathlib.Path.cwd()

if current_dir.name == "notebooks":
    project_root = current_dir.parent
elif (current_dir / "notebooks").exists():
    project_root = current_dir
else:
    # Fallback: nearest ancestor that holds config.yaml; if none does,
    # stay in the current directory.
    project_root = next(
        (
            ancestor
            for ancestor in current_dir.parents
            if (ancestor / "config.yaml").exists()
        ),
        current_dir,
    )

print(f"Project Root detected at: {project_root}")

# Make the root importable so that `src.*` modules resolve
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Relative file access (config.yaml, .env) is resolved from the root
os.chdir(project_root)
print(f"Working directory set to: {os.getcwd()}")

# --- IMPORTS ---
from src.connectors.supabase_connector import get_supabase_client

# Load Configuration
# Read the file explicitly as UTF-8: without `encoding`, open() falls back to
# the platform's locale encoding, which can mis-decode non-ASCII text in the
# YAML file on systems whose default is not UTF-8.
try:
    with open("config.yaml", "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
except FileNotFoundError:
    print("Error: config.yaml not found in project root.")
    raise

# Define Paths
# Raw data location comes from the configuration; the experiment folders are
# fixed relative to the working directory. All are created if missing.
RAW_DATA_PATH = pathlib.Path(config['paths']['data']['raw'])
RAW_DATA_PATH.mkdir(parents=True, exist_ok=True)

EXPERIMENT_ARTIFACTS_PATH = pathlib.Path("experiments") / "phase_01_discovery" / "artifacts"
EXPERIMENT_ARTIFACTS_PATH.mkdir(parents=True, exist_ok=True)

FIGURES_PATH = pathlib.Path("experiments") / "phase_01_discovery" / "figures"
FIGURES_PATH.mkdir(parents=True, exist_ok=True)

# Initialize Supabase Client
# A failure is reported and then re-raised, since every later cell needs
# the client.
try:
    supabase = get_supabase_client()
except Exception as init_error:
    print(f"Failed to initialize Supabase client: {init_error}")
    raise
else:
    print("Supabase client initialized successfully.")

Project Root detected at: c:\Users\USUARIO\Documents\Forecaster\Forecaster_MisBunuelos
Working directory set to: c:\Users\USUARIO\Documents\Forecaster\Forecaster_MisBunuelos
Supabase client initialized successfully.


In [2]:
def get_remote_max_date(table_name, date_col):
    """
    Fetch the latest value of `date_col` stored remotely for `table_name`.

    Requests a single row, ordered by `date_col` descending, through the
    module-level `supabase` client.

    Returns the value of `date_col` from that row as delivered by the client,
    or None when no row comes back or when anything in the request fails
    (the error is printed, not raised).
    """
    try:
        latest_row_query = (
            supabase.table(table_name)
            .select(date_col)
            .order(date_col, desc=True)
            .limit(1)
        )
        rows = latest_row_query.execute().data
        return rows[0][date_col] if rows else None
    except Exception as exc:
        print(f"Error getting max date for {table_name}: {exc}")
        return None

def download_data(table_name, date_col, greater_than=None, batch_size=1000):
    """
    Download rows of `table_name` from Supabase, page by page.

    Parameters:
        table_name: name of the remote table.
        date_col: column used for ordering and for the optional lower bound.
        greater_than: when truthy, only rows with `date_col` strictly greater
            than this value are requested.
        batch_size: number of rows requested per page (default 1000, the
            value that used to be hard-coded).

    Returns:
        A pandas DataFrame with every row collected; empty when nothing was
        retrieved.

    Raises:
        ValueError: if `batch_size` is smaller than 1.

    Notes:
        A failing page request is printed and stops the loop; rows from the
        pages fetched before the failure are still returned. Because pages are
        requested in ascending `date_col` order, such a partial result holds
        the earliest rows.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    def build_query():
        # A fresh builder is created for every page: the same builder object
        # is otherwise reused across .range() calls, and any paging state the
        # client keeps on it would carry over from page to page.
        # NOTE(review): whether the builder mutates in place depends on the
        # installed client version - confirm against the client docs.
        query = supabase.table(table_name).select("*")
        if greater_than:
            query = query.gt(date_col, greater_than)
        return query.order(date_col)  # Ensure stable ordering

    all_rows = []
    start = 0
    while True:
        page_query = build_query()
        try:
            rows = page_query.range(start, start + batch_size - 1).execute().data
        except Exception as e:
            print(f"Error downloading {table_name}: {e}")
            break
        if not rows:
            break
        all_rows.extend(rows)
        if len(rows) < batch_size:
            # A short page means the remote side has no more rows
            break
        start += batch_size

    return pd.DataFrame(all_rows)

def _remote_is_newer(max_remote, max_local):
    """Return True when `max_remote` is strictly later than `max_local`.

    Both values are parsed as timestamps so that the same instant written in
    different ISO forms (e.g. '2026-02-10' and '2026-02-10T00:00:00') compares
    as equal. If either value cannot be parsed, the plain string comparison is
    used instead.
    """
    try:
        remote_ts = pd.to_datetime(max_remote, utc=True)
        local_ts = pd.to_datetime(max_local, utc=True)
        if pd.isna(remote_ts) or pd.isna(local_ts):
            raise ValueError("unparseable date value")
        return bool(remote_ts > local_ts)
    except Exception:
        return str(max_remote) > str(max_local)


def sync_table(table_name, date_col, full_update):
    """
    Bring the local parquet copy of `table_name` in line with Supabase.

    Parameters:
        table_name: remote table name; the local copy is
            RAW_DATA_PATH / "<table_name>.parquet".
        date_col: date column used to find the newest local/remote row.
        full_update: when truthy, the local copy is ignored and the whole
            table is downloaded again.

    Behaviour:
        * Full download when the local file is missing, unreadable, empty,
          lacks `date_col`, or `full_update` is set.
        * Otherwise the remote maximum of `date_col` is compared with the
          local one; only when the remote is later are the rows with
          `date_col` greater than the local maximum downloaded, merged with
          the local data and written back.

    Returns:
        dict with keys "table", "status" ("Up to Date", "Full Download" or
        "Incremental Download"), "new_rows", "total_rows" and
        "download_timestamp" (ISO string of the local clock).
    """
    local_file = RAW_DATA_PATH / f"{table_name}.parquet"
    operation_status = "Up to Date"
    new_rows_count = 0

    # Check local state
    local_df = pd.DataFrame()
    max_local = None

    if local_file.exists() and not full_update:
        try:
            local_df = pd.read_parquet(local_file)
            if not local_df.empty and date_col in local_df.columns:
                max_local = local_df[date_col].max()
                # Date-like values are reduced to a 'YYYY-MM-DD' string, which
                # is also the lower bound sent to the remote filter
                if isinstance(max_local, (pd.Timestamp, datetime.date, datetime.datetime)):
                    max_local = max_local.strftime('%Y-%m-%d')
        except Exception as e:
            print(f"Error reading local file {local_file}: {e}. Will trigger full update.")
            max_local = None

    # Decide action
    if not local_file.exists() or full_update or max_local is None:
        print(f"Downloading FULL table: {table_name}...")
        new_df = download_data(table_name, date_col)
        if not new_df.empty:
            if date_col in new_df.columns:
                new_df[date_col] = pd.to_datetime(new_df[date_col])
                # Sorting only makes sense (and only works) when the column
                # exists; sorting unconditionally raised KeyError otherwise
                new_df = new_df.sort_values(by=date_col)
            else:
                print(f"Warning: column {date_col} not found in {table_name}; saved without date ordering.")
            new_df.to_parquet(local_file, index=False)
            operation_status = "Full Download"
            new_rows_count = len(new_df)
            final_df = new_df
        else:
            # Nothing is written in this case and the status stays "Up to Date"
            final_df = pd.DataFrame()
            print(f"Warning: No data found for {table_name}")
    else:
        # Check remote max (None when the remote is empty or the query failed)
        max_remote = get_remote_max_date(table_name, date_col)

        # Compare as timestamps rather than raw strings, so that differing
        # ISO renderings of the same date do not trigger a download
        trigger_update = bool(max_remote) and _remote_is_newer(max_remote, max_local)

        if trigger_update:
            print(f"Downloading INCREMENTAL {table_name} (Remote {max_remote} > Local {max_local}) ...")
            delta_df = download_data(table_name, date_col, greater_than=max_local)
            if not delta_df.empty:
                if date_col in delta_df.columns:
                    delta_df[date_col] = pd.to_datetime(delta_df[date_col])

                # Concat and dedup; the newest copy of a date wins.
                # NOTE(review): deduplication keys on date_col alone, which
                # presumes one row per date in every table - confirm.
                final_df = (
                    pd.concat([local_df, delta_df])
                    .drop_duplicates(subset=[date_col], keep='last')
                    .sort_values(by=date_col)
                )

                final_df.to_parquet(local_file, index=False)

                operation_status = "Incremental Download"
                new_rows_count = len(delta_df)
            else:
                final_df = local_df
        else:
            print(f"Table {table_name} is up to date. (Max: {max_local})")
            final_df = local_df

    total_rows = len(final_df)

    return {
        "table": table_name,
        "status": operation_status,
        "new_rows": new_rows_count,
        "total_rows": total_rows,
        "download_timestamp": datetime.datetime.now().isoformat()
    }

In [3]:
# Run settings for this phase, taken from the 'data' section of the config
data_settings = config['data']
source_tables = data_settings['source_tables']
full_update_flag = data_settings['full_update']
date_column = data_settings['date_column']

# One summary dict per table, later embedded in the phase report
download_details = []

print(f"Starting pipeline. Tables: {source_tables}, Full Update: {full_update_flag}")

# Tables are synced one after another so that each summary line follows the
# messages printed while that table was processed
for table in source_tables:
    result = sync_table(table, date_column, full_update_flag)
    download_details.append(result)
    print(f"Table {table}: [{result['status']}] (New: {result['new_rows']}, Total: {result['total_rows']})")

Starting pipeline. Tables: ['ventas_diarias', 'redes_sociales', 'promocion_diaria', 'macro_economia'], Full Update: False
Table ventas_diarias is up to date. (Max: 2026-02-10)
Table ventas_diarias: [Up to Date] (New: 0, Total: 2963)
Table redes_sociales is up to date. (Max: 2026-02-10)
Table redes_sociales: [Up to Date] (New: 0, Total: 2963)
Table promocion_diaria is up to date. (Max: 2026-02-10)
Table promocion_diaria: [Up to Date] (New: 0, Total: 2963)
Table macro_economia is up to date. (Max: 2026-02-01)
Table macro_economia: [Up to Date] (New: 0, Total: 98)


In [4]:
# Per-table validation results, keyed by table name
validation_details = {}
# Stays True only while the critical sales-history check has not failed
critical_check_passed = True

print("\n--- SANITY CHECK & VALIDATION ---\n")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        print(f"WARNING: File for {table} not found.")
        if table == "ventas_diarias":
            # The critical check cannot pass without the sales data
            critical_check_passed = False
        continue

    df = pd.read_parquet(file_path)
    print(f"--- {table} ---")
    # info() prints its summary itself and returns None, so it must not be
    # wrapped in print() (that added a stray "None" line)
    df.info()
    print(df.head())
    print("\n")

    # Validation checks
    val_info = {
        "columns": list(df.columns),
        "rows": len(df),
        # Share of non-null cells; a frame without cells keeps the former
        # default of 1.0
        "completeness": round(float(df.notna().to_numpy().mean()), 4) if df.size else 1.0
    }

    # Critical Check: Ventas History > 36 Months
    if table == "ventas_diarias":
        if date_column in df.columns:
            min_date = df[date_column].min()
            max_date = df[date_column].max()
            if pd.notnull(min_date) and pd.notnull(max_date):
                # Months are approximated as 30-day blocks
                months_diff = (max_date - min_date) / pd.Timedelta(days=30)
                val_info["history_months"] = round(months_diff, 2)

                if months_diff < 36:
                    print(f"CRITICAL WARNING: {table} has less than 36 months history ({months_diff:.1f})")
                    val_info["history_check"] = "FAIL"
                else:
                    print(f"SUCCESS: {table} history sufficient ({months_diff:.1f} months)")
                    val_info["history_check"] = "PASS"
            else:
                val_info["history_check"] = "FAIL - Dates Null"
        else:
            val_info["history_check"] = "FAIL - No Date Col"

        # Record the outcome so the flag reflects the check instead of
        # remaining at its initial value
        if val_info["history_check"] != "PASS":
            critical_check_passed = False

    validation_details[table] = val_info

# JSON Report
report = {
    "phase": "Phase 1 - Data Discovery",
    "timestamp": datetime.datetime.now().isoformat(),
    "description": "Incremental download and sanity check of raw data.",
    "download_details": download_details,
    "validation_details": validation_details,
    "critical_check_passed": critical_check_passed
}

report_path = EXPERIMENT_ARTIFACTS_PATH / "phase_01_discovery.json"
with open(report_path, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=4)

print(f"\nReport saved to {report_path}")


--- SANITY CHECK & VALIDATION ---

--- ventas_diarias ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2963 entries, 0 to 2962
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          2963 non-null   int64         
 1   fecha                       2963 non-null   datetime64[ns]
 2   total_unidades_entregadas   2963 non-null   int64         
 3   unidades_precio_normal      2963 non-null   int64         
 4   unidades_promo_pagadas      2963 non-null   int64         
 5   unidades_promo_bonificadas  2963 non-null   int64         
 6   precio_unitario_full        2963 non-null   int64         
 7   costo_unitario              2963 non-null   int64         
 8   ingresos_totales            2963 non-null   int64         
 9   costo_total                 2963 non-null   float64       
 10  utilidad                    2963 non-null   float64       
dt