# 1. Setup & Configuration

In [1]:
import pandas as pd
import yaml
import numpy as np
import json
from pathlib import Path
from datetime import datetime, date
import os
import sys

# Make the project root importable so that `src.*` packages resolve.
# Assumes the notebook runs from a directory one level below the project
# root (e.g. project/notebooks/).
project_root = Path('..').resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.connectors.supabase_connector import get_supabase_client

# Load Configuration
config_path = Path('../config.yaml')
if not config_path.exists():
    raise FileNotFoundError("config.yaml not found in parent directory")

# Decode explicitly as UTF-8: without it Python falls back to the platform
# default encoding (e.g. cp1252 on Windows), which garbles non-ASCII text.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# An empty YAML document loads as None; fail here with a clear message
# instead of a "NoneType is not subscriptable" error in a later cell.
if not isinstance(config, dict):
    raise ValueError("config.yaml is empty or is not a mapping")

# Define Paths
RAW_DATA_PATH = Path('../data/01_raw')
RAW_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Initialize Supabase Client
supabase = get_supabase_client()

print("Setup Complete. Config and Supabase Client Loaded.")


Setup Complete. Config and Supabase Client Loaded.


# 2. Helper Functions (Data Loading & Sync)

In [2]:
def get_remote_max_date(table_name: str, date_col: str):
    """Return the largest `date_col` value stored remotely in `table_name`.

    Requests a single row sorted by `date_col` in descending order and
    returns that row's `date_col` value.

    Returns:
        The value as delivered by the client, or None when the table has no
        rows or when anything in the lookup raises (the error is printed,
        not re-raised).
    """
    try:
        newest_first = (
            supabase.table(table_name)
            .select(date_col)
            .order(date_col, desc=True)
            .limit(1)
        )
        rows = newest_first.execute().data
        if not rows:
            return None
        return rows[0][date_col]
    except Exception as e:
        print(f"Error getting max date for {table_name}: {e}")
        return None

def download_data(table_name: str, date_col: str, greater_than=None):
    """Download rows of `table_name` from Supabase into a DataFrame.

    Rows are requested in pages of up to 1000, ordered by `date_col`
    ascending, until an empty page is returned. Progress is printed as one
    dot per page.

    Args:
        table_name: Remote table to read.
        date_col: Column used for ordering and for the optional filter.
        greater_than: When truthy, only rows whose `date_col` is strictly
            greater than this value are requested.

    Returns:
        A DataFrame holding every retrieved row (empty when nothing matched).
    """
    all_data = []
    page_size = 1000
    offset = 0

    print(f"Downloading {table_name}...", end=" ")

    # NOTE(review): pages are ordered by date_col only; if that column is not
    # unique, rows sharing a value may shift between pages — confirm.
    query = supabase.table(table_name).select("*").order(date_col)

    if greater_than:
        query = query.gt(date_col, greater_than)

    while True:
        response = query.range(offset, offset + page_size - 1).execute()
        data = response.data
        if not data:
            break
        all_data.extend(data)
        # Advance by the number of rows actually received. If the server caps
        # a response below page_size, stepping by page_size would silently
        # skip the rows between the cap and the next page start.
        offset += len(data)
        print(".", end="")

    print(f" Done. Retrieved {len(all_data)} rows.")
    return pd.DataFrame(all_data)

def _as_naive_timestamp(value):
    """Parse `value` into a timezone-naive pandas Timestamp, or return None.

    Only strings and date/datetime objects are parsed. Timezone-aware values
    are converted to UTC before the timezone is dropped.
    """
    if not isinstance(value, (str, datetime, date)):
        return None
    try:
        parsed = pd.to_datetime(value, errors="coerce")
    except (ValueError, TypeError, OverflowError):
        return None
    if pd.isna(parsed):
        return None
    if parsed.tzinfo is not None:
        parsed = parsed.tz_convert("UTC").tz_localize(None)
    return parsed


def _remote_is_newer(max_remote, max_local):
    """Tell whether `max_remote` is strictly later than `max_local`."""
    remote_ts = _as_naive_timestamp(max_remote)
    local_ts = _as_naive_timestamp(max_local)
    if remote_ts is not None and local_ts is not None:
        # Compare as timestamps so differing text formats (date only vs.
        # full ISO timestamp, with or without offset) cannot skew the result.
        return remote_ts > local_ts
    # Either side is not a parseable date: fall back to text comparison.
    return str(max_remote) > str(max_local)


def _align_date_dtype(new_df, existing_df, date_col):
    """Return `new_df` with `date_col` parsed to match `existing_df`'s datetime dtype.

    Freshly downloaded rows are built from plain Python values, while the
    stored file may hold a datetime64 column. Concatenating both as-is gives
    a mixed object column, so the new values are parsed first.
    """
    if date_col not in new_df.columns or date_col not in existing_df.columns:
        return new_df
    target_dtype = existing_df[date_col].dtype
    if not pd.api.types.is_datetime64_any_dtype(target_dtype):
        return new_df
    if pd.api.types.is_datetime64_any_dtype(new_df[date_col].dtype):
        return new_df

    aligned = new_df.copy()
    parsed = pd.to_datetime(aligned[date_col], utc=True)
    target_tz = getattr(target_dtype, "tz", None)
    if target_tz is None:
        # NOTE(review): assumes naive local dates represent UTC wall time — confirm.
        aligned[date_col] = parsed.dt.tz_localize(None)
    else:
        aligned[date_col] = parsed.dt.tz_convert(target_tz)
    return aligned


def sync_table(table_name: str, date_col: str, full_update: bool):
    """Keep the local parquet copy of `table_name` in sync with Supabase.

    A full download happens when the file is missing, `full_update` is true,
    the file cannot be read, is empty, or lacks `date_col`. Otherwise the
    local and remote maximum of `date_col` are compared and only rows newer
    than the local maximum are downloaded and appended.

    Args:
        table_name: Remote table name; also the stem of the parquet file
            under RAW_DATA_PATH.
        date_col: Column used to detect and request new rows.
        full_update: Force a complete re-download.

    Returns:
        The DataFrame now representing the table: the combined or freshly
        downloaded data, or the existing data when nothing new was found.
    """
    file_path = RAW_DATA_PATH / f"{table_name}.parquet"

    download_needed = False
    start_date = None
    existing_df = pd.DataFrame()

    if not file_path.exists() or full_update:
        download_needed = True
        print(f"[{table_name}] Full download required (File missing or full_update=True).")
    else:
        try:
            existing_df = pd.read_parquet(file_path)
            if not existing_df.empty and date_col in existing_df.columns:
                max_local = existing_df[date_col].max()
                # Normalise date-like values to ISO text for the remote filter.
                if isinstance(max_local, (datetime, date)):
                    max_local = max_local.isoformat()

                max_remote = get_remote_max_date(table_name, date_col)

                if max_remote and _remote_is_newer(max_remote, max_local):
                    download_needed = True
                    start_date = max_local
                    print(f"[{table_name}] Incremental update required. Local: {max_local}, Remote: {max_remote}")
                else:
                    print(f"[{table_name}] Up to date.")
            else:
                download_needed = True
                print(f"[{table_name}] Full download required (Empty file or missing date col).")
        except Exception as e:
            # Any failure while inspecting the local file falls back to a
            # full download (start_date stays None).
            print(f"Error reading {file_path}: {e}")
            download_needed = True

    if download_needed:
        new_df = download_data(table_name, date_col, greater_than=start_date)

        if not new_df.empty:
            if not existing_df.empty and start_date:
                # Append the new rows to what is already stored.
                new_df = _align_date_dtype(new_df, existing_df, date_col)
                combined_df = pd.concat([existing_df, new_df], ignore_index=True)
                # Deduplicate on 'id' when present; otherwise on whole rows.
                dedupe_key = ['id'] if 'id' in combined_df.columns else None
                combined_df = (
                    combined_df
                    .drop_duplicates(subset=dedupe_key, keep='last')
                    .reset_index(drop=True)
                )
                combined_df.to_parquet(file_path, index=False)
                print(f"[{table_name}] Updated. New total rows: {len(combined_df)}")
                return combined_df
            else:
                new_df.to_parquet(file_path, index=False)
                print(f"[{table_name}] Saved. Total rows: {len(new_df)}")
                return new_df
        else:
            print(f"[{table_name}] No new data found.")
            return existing_df

    return existing_df


# 3. Execute Pipeline

In [3]:
# Pipeline settings all live under the `data` section of the config.
data_cfg = config['data']
tables = data_cfg['source_tables']
date_column = data_cfg['date_column']
full_update = data_cfg['full_update']

TABLE_DATA = {}
download_metadata = []

# Every table is synced with the single configured date column; there is no
# per-table override at this stage.
for table_name in tables:
    synced = sync_table(table_name, date_column, full_update)
    TABLE_DATA[table_name] = synced

    # Record how many rows the table holds and when it was synced.
    download_metadata.append(
        dict(
            table=table_name,
            rows=len(synced),
            timestamp=datetime.now().isoformat(),
        )
    )


[ventas_diarias] Up to date.
[redes_sociales] Up to date.
[promocion_diaria] Up to date.
[macro_economia] Up to date.


# 4. Sanity Check & History Validation

In [4]:
min_history = config['data']['min_history_months']
# Use the configured date column (the same one used for syncing) rather
# than a hard-coded name; 'fecha' remains the fallback.
history_col = config['data'].get('date_column', 'fecha')

for table, df in TABLE_DATA.items():
    print(f"--- {table} ---")
    # DataFrame.info() prints its summary itself and returns None, so
    # wrapping it in print() would emit a stray "None" line.
    df.info()
    print(df.head())

    # Try generic date parsing for history check
    if history_col in df.columns:
        temp_date = pd.to_datetime(df[history_col], errors='coerce')
        if not temp_date.isnull().all():
            # Approximate months as 30-day periods between first and last date.
            months = (temp_date.max() - temp_date.min()).days / 30
            print(f"History: {months:.1f} months")
            if table == 'ventas_diarias' and months < min_history:
                print("WARNING: Insufficient history!")


--- ventas_diarias ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2963 entries, 0 to 2962
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          2963 non-null   int64         
 1   fecha                       2963 non-null   datetime64[ns]
 2   total_unidades_entregadas   2963 non-null   int64         
 3   unidades_precio_normal      2963 non-null   int64         
 4   unidades_promo_pagadas      2963 non-null   int64         
 5   unidades_promo_bonificadas  2963 non-null   int64         
 6   precio_unitario_full        2963 non-null   int64         
 7   costo_unitario              2963 non-null   int64         
 8   ingresos_totales            2963 non-null   int64         
 9   costo_total                 2963 non-null   float64       
 10  utilidad                    2963 non-null   float64       
dtypes: datetime64[ns](1), float64(2),

# 5. Statistical Analysis (Numerical)

In [5]:
# Per-table analysis results; later cells add their own sections to it.
TABLE_ANALYSIS = {}

for table, df in TABLE_DATA.items():
    TABLE_ANALYSIS[table] = {}

    print(f"Analyzing {table}...")
    num_cols = df.select_dtypes(include=[np.number]).columns

    if len(num_cols) == 0:
        # describe() raises ValueError on a frame without columns (e.g. a
        # table that came back empty); record empty stats instead.
        stats = {}
    else:
        stats = df[num_cols].describe(percentiles=[.25, .5, .75]).to_dict()

    TABLE_ANALYSIS[table]['numerical_stats'] = stats


Analyzing ventas_diarias...
Analyzing redes_sociales...
Analyzing promocion_diaria...
Analyzing macro_economia...


# 6. Temporal Analysis

In [6]:
for table, df in TABLE_DATA.items():
    temporal = TABLE_ANALYSIS[table]['temporal_stats'] = {}

    # Candidate date columns are picked by name only.
    candidate_cols = [
        name for name in df.columns
        if 'fecha' in name.lower() or 'date' in name.lower()
    ]
    # Only these tables are checked for missing calendar days.
    is_daily = table in ['ventas_diarias', 'redes_sociales']

    for name in candidate_cols:
        try:
            parsed = pd.to_datetime(df[name])
            first, last = parsed.min(), parsed.max()
            summary = {
                'min': first.isoformat(),
                'max': last.isoformat(),
                'nulls': int(parsed.isnull().sum()),
            }
            if is_daily:
                # Count calendar days in [first, last] with no row at all.
                calendar = pd.date_range(start=first, end=last)
                observed_days = set(parsed.dt.date)
                summary['gaps_count'] = len(set(calendar.date) - observed_days)

            temporal[name] = summary
        except Exception as e:
            print(f"Error temporal analysis {table}.{name}: {e}")


# 7. Categorical Analysis

In [7]:
for table, df in TABLE_DATA.items():
    shares = TABLE_ANALYSIS[table]['categorical_stats'] = {}

    for name in df.select_dtypes(include=['object']).columns:
        # A date column stored as text would land here; leave it out.
        if name in ['fecha']:
            continue
        # Relative frequency of each distinct value in the column.
        shares[name] = df[name].value_counts(normalize=True).to_dict()


# 8. Outlier Detection

In [8]:
for table, df in TABLE_DATA.items():
    flagged = TABLE_ANALYSIS[table]['outliers'] = {}

    for name in df.select_dtypes(include=[np.number]).columns:
        # The row identifier is not a measurement.
        if name == 'id':
            continue

        values = df[name]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        # Fences at 1.5 times the interquartile range beyond each quartile.
        low_fence = q1 - 1.5 * spread
        high_fence = q3 + 1.5 * spread

        beyond_fences = (values < low_fence) | (values > high_fence)
        flagged[name] = {
            'count': len(df[beyond_fences]),
            'lower_bound': low_fence,
            'upper_bound': high_fence,
        }


# 9. Zero Variance Check

In [9]:
for table, df in TABLE_DATA.items():
    # Columns holding at most one distinct non-null value.
    TABLE_ANALYSIS[table]['zero_variance'] = [
        name for name in df.columns if df[name].nunique() <= 1
    ]


# 10. High Cardinality Check

In [10]:
# Share of distinct values above which a column counts as high-cardinality.
threshold = config['quality'].get('high_cardinality_threshold', 0.9)

for table, df in TABLE_DATA.items():
    high_card = []
    row_count = len(df)
    # A frame with columns but zero rows would otherwise divide by zero;
    # with no rows there is nothing to flag.
    if row_count > 0:
        for col in df.columns:
            ratio = df[col].nunique() / row_count
            if ratio > threshold:
                high_card.append({'column': col, 'ratio': ratio})
    TABLE_ANALYSIS[table]['high_cardinality'] = high_card


# 11. Zero Presence Check

In [11]:
# Share of zero values above which a numeric column is reported.
threshold = config['quality'].get('zero_presence_threshold', 0.3)

for table, df in TABLE_DATA.items():
    flagged = []
    for name in df.select_dtypes(include=[np.number]).columns:
        zero_count = (df[name] == 0).sum()
        share = zero_count / len(df)
        if share > threshold:
            flagged.append(
                {'column': name, 'ratio': share, 'count': int(zero_count)}
            )
    TABLE_ANALYSIS[table]['high_zeros'] = flagged


# 12. Duplicate Rows Check

In [12]:
for table, df in TABLE_DATA.items():
    # duplicated() marks every repeat of an earlier, fully identical row.
    repeated_rows = df.duplicated()
    TABLE_ANALYSIS[table]['duplicate_rows'] = int(repeated_rows.sum())


# 13. Null Analysis

In [13]:
for table, df in TABLE_DATA.items():
    # Null count per column; only columns that have nulls are reported.
    per_column = df.isnull().sum()
    with_nulls = per_column[per_column > 0]
    TABLE_ANALYSIS[table]['null_stats'] = with_nulls.to_dict()


# 14. Sentinel Values Check

In [14]:
sentinels = config['quality']['sentinel_values']

# (dtype selector, key into `sentinels`): numeric columns are scanned first,
# then object columns, each against its own list of sentinel values.
sentinel_groups = (
    ([np.number], 'numeric'),
    (['object'], 'categorical'),
)

for table, df in TABLE_DATA.items():
    hits = []

    for dtype_selector, group in sentinel_groups:
        for name in df.select_dtypes(include=dtype_selector).columns:
            for marker in sentinels[group]:
                occurrences = (df[name] == marker).sum()
                if occurrences > 0:
                    hits.append(
                        {'column': name, 'value': marker, 'count': int(occurrences)}
                    )

    TABLE_ANALYSIS[table]['sentinel_values'] = hits


# 15. Data Contract Validation

In [15]:
contracts = config['data_contract']

for table, df in TABLE_DATA.items():
    # Tables without a declared contract are not validated.
    if table not in contracts:
        continue

    report = {'status': 'PASS', 'issues': []}

    expected_cols = set(contracts[table].keys())
    actual_cols = set(df.columns)

    # Column names are sorted before being put in the message: a raw set
    # prints in an order that can change between runs, which would make the
    # saved report differ for identical data.
    missing = sorted(expected_cols - actual_cols, key=str)
    if missing:
        # A column required by the contract is absent: the contract fails.
        report['status'] = 'FAIL'
        report['issues'].append(f"Missing columns: {missing}")

    extra = sorted(actual_cols - expected_cols, key=str)
    if extra:
        # Unexpected columns are reported but do not fail the contract.
        report['issues'].append(f"Extra columns: {extra}")

    TABLE_ANALYSIS[table]['data_contract'] = report


# 16. Financial Health Validation

In [16]:
financial_targets = config['financial_health']['target_files']

# Columns the rules below read; checked up front so a missing column is
# reported instead of aborting the cell with a KeyError.
financial_required_cols = [
    'total_unidades_entregadas',
    'unidades_precio_normal',
    'unidades_promo_pagadas',
    'unidades_promo_bonificadas',
    'precio_unitario_full',
    'costo_unitario',
    'ingresos_totales',
    'costo_total',
    'utilidad',
]

for table in financial_targets:
    if table not in TABLE_DATA:
        continue

    # Nothing below mutates the frame, so no copy is needed.
    df = TABLE_DATA[table]

    missing_cols = [c for c in financial_required_cols if c not in df.columns]
    if missing_cols:
        print(f"[{table}] Financial health checks skipped. Missing columns: {missing_cols}")
        TABLE_ANALYSIS[table]['financial_health'] = {
            'skipped_missing_columns': missing_cols
        }
        continue

    health_report = {}

    # Every result is wrapped in bool(): pandas reductions return NumPy
    # booleans, which json.dump(..., default=str) would write as the strings
    # "True"/"False" rather than JSON booleans.

    # Rule 2.1: Units Integrity
    # total_unidades = normal + promo_pagadas + promo_bonificadas
    calc_units = (
        df['unidades_precio_normal']
        + df['unidades_promo_pagadas']
        + df['unidades_promo_bonificadas']
    )
    diff_units = df['total_unidades_entregadas'] - calc_units
    health_report['rule_2_1_units_integrity'] = bool(diff_units.abs().sum() == 0)

    # Rule 2.2: Promo Equality
    diff_promos = df['unidades_promo_pagadas'] - df['unidades_promo_bonificadas']
    health_report['rule_2_2_promo_equality'] = bool(diff_promos.abs().sum() == 0)

    # Rule 2.3: Margin Integrity (Price >= Cost)
    margin_check = (df['precio_unitario_full'] >= df['costo_unitario']).all()
    health_report['rule_2_3_margin_integrity'] = bool(margin_check)

    # Rule 2.4: Utility Calc
    # utilidad = ingresos - costo_total
    calc_util = df['ingresos_totales'] - df['costo_total']
    diff_util = df['utilidad'] - calc_util
    # Differences below one unit are tolerated as float rounding.
    health_report['rule_2_4_utility_calc'] = bool((diff_util.abs() < 1).all())

    # Rule 2.5: Revenue Calc
    # ingresos = (normal + pagadas) * precio_full
    calc_rev = (
        (df['unidades_precio_normal'] + df['unidades_promo_pagadas'])
        * df['precio_unitario_full']
    )
    diff_rev = df['ingresos_totales'] - calc_rev
    health_report['rule_2_5_revenue_calc'] = bool((diff_rev.abs() < 1).all())

    # Rule 2.6: Cost Calc
    # costo_total = total_unidades * costo_unitario
    calc_cost = df['total_unidades_entregadas'] * df['costo_unitario']
    diff_cost = df['costo_total'] - calc_cost
    health_report['rule_2_6_cost_calc'] = bool((diff_cost.abs() < 1).all())

    # Rule 2.7: Non-negative (applies to every numeric column)
    num_cols = df.select_dtypes(include=[np.number]).columns
    negatives = (df[num_cols] < 0).any().any()
    health_report['rule_2_7_non_negative'] = not bool(negatives)

    TABLE_ANALYSIS[table]['financial_health'] = health_report


# 17. Generate Final Report

In [17]:
# Artifacts folder for this phase; created on first use.
output_path = Path('../experiments/phase_01_discovery/artifacts')
output_path.mkdir(parents=True, exist_ok=True)

# Bundle the download log and every per-table analysis into one document.
final_report = dict(
    phase="Phase 1 - Data Discovery",
    timestamp=datetime.now().isoformat(),
    description="Data Discovery Execution from Notebook",
    download_details=download_metadata,
    data_analysis=TABLE_ANALYSIS,
)

report_file = output_path / 'phase_01_discovery.json'
with report_file.open('w') as handle:
    # default=str: anything json cannot encode natively is written as text.
    json.dump(final_report, handle, indent=4, default=str)

print(f"Report generated at: {report_file}")


Report generated at: ..\experiments\phase_01_discovery\artifacts\phase_01_discovery.json
