# Phase 1: Data Discovery (Incremental Extraction)

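This notebook extracts the configured source tables from Supabase into local Parquet files. It uses an incremental load strategy (only rows newer than the latest locally stored date are downloaded) to minimize data transfer, then runs statistical and data-quality checks on every table and saves the results as a JSON report.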

In [1]:
import pandas as pd
import yaml
import pathlib
import datetime
import json
import os
import sys
import numpy as np

# --- ROBUST PATH CONFIGURATION ---
def _detect_project_root(start_dir):
    """Infer the project root from the directory the kernel was started in.

    Resolution order:
      1. ``start_dir`` is itself named ``notebooks`` -> its parent is the root.
      2. ``start_dir`` contains a ``notebooks`` folder -> it is the root.
      3. Otherwise the nearest ancestor holding ``config.yaml`` is the root.
      4. If nothing matches, ``start_dir`` is returned unchanged.
    """
    if start_dir.name == "notebooks":
        return start_dir.parent
    if (start_dir / "notebooks").exists():
        return start_dir
    for ancestor in start_dir.parents:
        if (ancestor / "config.yaml").exists():
            return ancestor
    return start_dir


current_dir = pathlib.Path.cwd()
project_root = _detect_project_root(current_dir)

print(f"Project Root detected at: {project_root}")

# Make `src` importable by putting the root at the front of sys.path (once).
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Relative paths used below (config.yaml, .env) are resolved against the root.
os.chdir(project_root)
print(f"Working directory set to: {os.getcwd()}")

# --- IMPORTS ---
from src.connectors.supabase_connector import get_supabase_client

# Load Configuration
# The file is opened explicitly as UTF-8: without `encoding`, open() falls back
# to the locale encoding (e.g. cp1252 on Windows), which would corrupt any
# non-ASCII text stored in the YAML file.
try:
    with open("config.yaml", "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
except FileNotFoundError:
    print("Error: config.yaml not found in project root.")
    raise

# Define Paths
# Directory where the downloaded tables are stored as Parquet files.
RAW_DATA_PATH = pathlib.Path(config['paths']['data']['raw'])
RAW_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Output locations for this phase (JSON report and figures).
EXPERIMENT_ARTIFACTS_PATH = pathlib.Path("experiments") / "phase_01_discovery" / "artifacts"
EXPERIMENT_ARTIFACTS_PATH.mkdir(parents=True, exist_ok=True)

FIGURES_PATH = pathlib.Path("experiments") / "phase_01_discovery" / "figures"
FIGURES_PATH.mkdir(parents=True, exist_ok=True)

# Initialize Supabase Client
# Any failure is reported and re-raised: nothing below can run without a client.
try:
    supabase = get_supabase_client()
    print("Supabase client initialized successfully.")
except Exception as e:
    print(f"Failed to initialize Supabase client: {e}")
    raise

Project Root detected at: c:\Users\USUARIO\Documents\Forecaster\Forecaster_MisBunuelos
Working directory set to: c:\Users\USUARIO\Documents\Forecaster\Forecaster_MisBunuelos
Supabase client initialized successfully.


In [2]:
def get_remote_max_date(table_name, date_col):
    """
    Return the latest value of `date_col` stored remotely in `table_name`.

    The table is sorted by `date_col` descending and only the first row is
    fetched. Returns None when the table has no rows or when the request
    fails for any reason (the error is printed, not raised).
    """
    try:
        newest_row_query = (
            supabase.table(table_name)
            .select(date_col)
            .order(date_col, desc=True)
            .limit(1)
        )
        rows = newest_row_query.execute().data
        return rows[0][date_col] if rows else None
    except Exception as exc:
        print(f"Error getting max date for {table_name}: {exc}")
        return None

def download_data(table_name, date_col, greater_than=None, batch_size=1000):
    """
    Download rows of `table_name` from Supabase using manual pagination.

    Parameters
    ----------
    table_name : str
        Remote table to read.
    date_col : str
        Column used for ordering and, optionally, for filtering.
    greater_than : optional
        When truthy, only rows with `date_col` strictly greater than this
        value are requested.
    batch_size : int, default 1000
        Number of rows requested per page. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        All rows fetched, in the order returned by the server (ascending
        `date_col`). Empty when nothing was fetched.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.

    Notes
    -----
    Download is best effort: if a page request fails, the error is printed and
    the rows collected so far are returned, so the result may be partial.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    query = supabase.table(table_name).select("*")
    if greater_than:
        query = query.gt(date_col, greater_than)

    # Pages are only consistent if every request uses the same ordering.
    query = query.order(date_col)

    all_rows = []
    start = 0
    while True:
        try:
            # range() bounds are inclusive, hence the -1 on the upper bound.
            rows = query.range(start, start + batch_size - 1).execute().data
        except Exception as e:
            print(f"Error downloading {table_name}: {e}")
            break

        if not rows:
            break
        all_rows.extend(rows)
        # A short page means the server has no further rows.
        if len(rows) < batch_size:
            break
        start += batch_size

    return pd.DataFrame(all_rows)

def _is_remote_newer(max_remote, max_local):
    """Return True when `max_remote` is chronologically later than `max_local`.

    Both values are parsed as timestamps (naive values are treated as UTC) so
    that a date, a datetime and an ISO string with a time or offset compare
    correctly. If either value cannot be parsed, the comparison falls back to
    plain string ordering.
    """
    try:
        remote_ts = pd.to_datetime(max_remote, utc=True)
        local_ts = pd.to_datetime(max_local, utc=True)
    except (ValueError, TypeError, OverflowError):
        return str(max_remote) > str(max_local)
    if pd.isna(remote_ts) or pd.isna(local_ts):
        return str(max_remote) > str(max_local)
    return bool(remote_ts > local_ts)


def sync_table(table_name, date_col, full_update):
    """
    Synchronise the local Parquet copy of `table_name` with Supabase.

    A full download is performed when no local file exists, when `full_update`
    is truthy, or when the local maximum of `date_col` cannot be determined.
    Otherwise the remote maximum date is compared with the local one and only
    rows newer than the local maximum are downloaded and merged.

    Parameters
    ----------
    table_name : str
        Remote table name; the local file is `<RAW_DATA_PATH>/<table_name>.parquet`.
    date_col : str
        Date column used for ordering, filtering and de-duplication.
    full_update : bool
        Force a full re-download, ignoring any local file.

    Returns
    -------
    dict
        Keys: "table", "status" ("Up to Date", "Full Download" or
        "Incremental Download"), "new_rows", "total_rows" and
        "download_timestamp" (ISO string of the local clock).
    """
    local_file = RAW_DATA_PATH / f"{table_name}.parquet"
    operation_status = "Up to Date"
    new_rows_count = 0

    # Check local state
    local_df = pd.DataFrame()
    max_local = None       # date-only string used as the server-side filter
    max_local_raw = None   # untruncated local maximum, used for the comparison

    if local_file.exists() and not full_update:
        try:
            local_df = pd.read_parquet(local_file)
            if not local_df.empty and date_col in local_df.columns:
                max_local_raw = local_df[date_col].max()
                max_local = max_local_raw
                # The filter sent to the server is a plain 'YYYY-MM-DD' string.
                if isinstance(max_local, (pd.Timestamp, datetime.date, datetime.datetime)):
                    max_local = max_local.strftime('%Y-%m-%d')
        except Exception as e:
            print(f"Error reading local file {local_file}: {e}. Will trigger full update.")
            max_local = None
            max_local_raw = None

    # Decide action
    if not local_file.exists() or full_update or max_local is None:
        print(f"Downloading FULL table: {table_name}...")
        new_df = download_data(table_name, date_col)
        if not new_df.empty:
            if date_col in new_df.columns:
                new_df[date_col] = pd.to_datetime(new_df[date_col])
                # Sorting is only possible when the date column is present.
                new_df = new_df.sort_values(by=date_col)
            new_df.to_parquet(local_file, index=False)
            operation_status = "Full Download"
            new_rows_count = len(new_df)
            final_df = new_df
        else:
            final_df = pd.DataFrame()
            print(f"Warning: No data found for {table_name}")
    else:
        # Check remote max
        max_remote = get_remote_max_date(table_name, date_col)

        # Compare as timestamps against the untruncated local maximum. Comparing
        # the raw remote string with the date-only local string would report
        # "newer" for any remote value that carries a time component.
        trigger_update = bool(max_remote) and _is_remote_newer(max_remote, max_local_raw)

        if trigger_update:
            print(f"Downloading INCREMENTAL {table_name} (Remote {max_remote} > Local {max_local}) ...")
            delta_df = download_data(table_name, date_col, greater_than=max_local)
            if not delta_df.empty:
                if date_col in delta_df.columns:
                    delta_df[date_col] = pd.to_datetime(delta_df[date_col])

                # Merge; for a repeated date the freshly downloaded row wins.
                final_df = (
                    pd.concat([local_df, delta_df])
                    .drop_duplicates(subset=[date_col], keep='last')
                    .sort_values(by=date_col)
                )

                final_df.to_parquet(local_file, index=False)

                operation_status = "Incremental Download"
                new_rows_count = len(delta_df)
            else:
                final_df = local_df
        else:
            print(f"Table {table_name} is up to date. (Max: {max_local})")
            final_df = local_df

    total_rows = len(final_df)

    return {
        "table": table_name,
        "status": operation_status,
        "new_rows": new_rows_count,
        "total_rows": total_rows,
        "download_timestamp": datetime.datetime.now().isoformat()
    }

In [3]:
# Per-table sync summaries; reused later when the final report is assembled.
download_details = []

# Pipeline settings come from the `data` section of config.yaml.
data_settings = config['data']
source_tables = data_settings['source_tables']
full_update_flag = data_settings['full_update']
date_column = data_settings['date_column']

print(f"Starting pipeline. Tables: {source_tables}, Full Update: {full_update_flag}")

for table in source_tables:
    result = sync_table(table, date_column, full_update_flag)
    download_details.append(result)
    status, new_rows, total_rows = result['status'], result['new_rows'], result['total_rows']
    print(f"Table {table}: [{status}] (New: {new_rows}, Total: {total_rows})")

Starting pipeline. Tables: ['ventas_diarias', 'redes_sociales', 'promocion_diaria', 'macro_economia'], Full Update: False
Table ventas_diarias is up to date. (Max: 2026-02-10)
Table ventas_diarias: [Up to Date] (New: 0, Total: 2963)
Table redes_sociales is up to date. (Max: 2026-02-10)
Table redes_sociales: [Up to Date] (New: 0, Total: 2963)
Table promocion_diaria is up to date. (Max: 2026-02-10)
Table promocion_diaria: [Up to Date] (New: 0, Total: 2963)
Table macro_economia is up to date. (Max: 2026-02-01)
Table macro_economia: [Up to Date] (New: 0, Total: 98)


In [4]:
print("\n--- SANITY CHECK ---\n")

# Global dictionary to hold all analysis results per table
TABLE_ANALYSIS = {}
source_tables = config['data']['source_tables']

# Mean Gregorian month length in days. Dividing by a flat 30 days overstates
# the history by about 1.5% and lets data shorter than 3 years pass the gate.
AVG_DAYS_PER_MONTH = 365.25 / 12
MIN_HISTORY_MONTHS = 36

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if file_path.exists():
        try:
            df = pd.read_parquet(file_path)
            print(f"\n--- {table} ---")
            # DataFrame.info() prints by itself and returns None, so wrapping it
            # in print() would add a stray "None" line.
            df.info()
            print(df.head())

            # Basic Validation checks
            val_info = {
                "columns": list(df.columns),
                "rows": len(df),
            }

            # Critical Check: Ventas History > 36 Months
            if table == "ventas_diarias":
                if date_column in df.columns:
                    # Coerce so a string-typed column cannot raise on the
                    # subtraction below; unparseable values become NaT.
                    dates = pd.to_datetime(df[date_column], errors="coerce")
                    min_date = dates.min()
                    max_date = dates.max()
                    if pd.notnull(min_date) and pd.notnull(max_date):
                        months_diff = (max_date - min_date) / pd.Timedelta(days=AVG_DAYS_PER_MONTH)
                        val_info["history_months"] = round(months_diff, 2)

                        if months_diff < MIN_HISTORY_MONTHS:
                            print(f"CRITICAL WARNING: {table} has less than {MIN_HISTORY_MONTHS} months history ({months_diff:.1f})")
                            val_info["history_check"] = "FAIL"
                        else:
                            print(f"SUCCESS: {table} history sufficient ({months_diff:.1f} months)")
                            val_info["history_check"] = "PASS"
                    else:
                        val_info["history_check"] = "FAIL - Dates Null"
                else:
                    val_info["history_check"] = "FAIL - No Date Col"

            # Initialize analysis dict; later cells fill these sections only
            # for tables that have an entry here.
            TABLE_ANALYSIS[table] = {
                "validation": val_info,
                "statistics": {},
                "temporal": {},
                "categorical": {},
                "outliers": {},
                "zero_variance": [],
                "high_cardinality": {},
                "zero_presence": {},
                "duplicates": {},
                "null_values": {},
                "sentinels": {}
            }
        except Exception as e:
            print(f"Error reading local file {file_path}: {e}")
    else:
        print(f"WARNING: File for {table} not found at {file_path}.")


--- SANITY CHECK ---


--- ventas_diarias ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2963 entries, 0 to 2962
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   id                          2963 non-null   int64         
 1   fecha                       2963 non-null   datetime64[ns]
 2   total_unidades_entregadas   2963 non-null   int64         
 3   unidades_precio_normal      2963 non-null   int64         
 4   unidades_promo_pagadas      2963 non-null   int64         
 5   unidades_promo_bonificadas  2963 non-null   int64         
 6   precio_unitario_full        2963 non-null   int64         
 7   costo_unitario              2963 non-null   int64         
 8   ingresos_totales            2963 non-null   int64         
 9   costo_total                 2963 non-null   float64       
 10  utilidad                    2963 non-null   float64       
dtypes: dateti

In [5]:
print("\n--- STATISTICAL ANALYSIS (NUMERIC) ---\n")


def _describe_numeric(values):
    """Summarise one numeric column as a dict of plain Python floats."""
    summary = values.describe()
    return {
        "mean": float(summary['mean']),
        "median": float(values.median()),
        "std": float(summary['std']),
        "min": float(summary['min']),
        "max": float(summary['max']),
        "q25": float(summary['25%']),
        "q50": float(summary['50%']),
        "q75": float(summary['75%'])
    }


for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)
        numeric_cols = frame.select_dtypes(include=[np.number]).columns

        # One summary per numeric column, keyed by column name.
        stats_dict = {name: _describe_numeric(frame[name]) for name in numeric_cols}

        # Only tables registered by the sanity check receive results.
        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["statistics"] = stats_dict

        print(f"{table}: Analyzed {len(numeric_cols)} numeric columns.")
    except Exception as e:
        print(f"Skipping {table}: {e}")


--- STATISTICAL ANALYSIS (NUMERIC) ---

ventas_diarias: Analyzed 10 numeric columns.
redes_sociales: Analyzed 4 numeric columns.
promocion_diaria: Analyzed 2 numeric columns.
macro_economia: Analyzed 6 numeric columns.


In [6]:
print("\n--- TEMPORAL ANALYSIS (DATETIME) ---\n")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if file_path.exists():
        try:
            df = pd.read_parquet(file_path)

            # Make sure the configured date column is datetime-typed so that it
            # is picked up by select_dtypes below.
            if date_column in df.columns:
                df[date_column] = pd.to_datetime(df[date_column])

            datetime_cols = df.select_dtypes(include=['datetime64[ns]', 'datetime64', 'datetime']).columns

            temp_dict = {}

            for col in datetime_cols:
                series = df[col].dropna()
                if series.empty:
                    continue

                min_date = series.min()
                max_date = series.max()

                # Check for duplicates
                duplicates = series.duplicated().sum()

                # Check for gaps (only for the configured date column, which is
                # treated as a daily series)
                gaps_info = "N/A"
                if col == date_column:
                    try:
                        # Compare calendar days, not exact timestamps: a time
                        # of day on any value would otherwise make present
                        # days look missing.
                        observed_days = pd.DatetimeIndex(series).normalize().unique()
                        full_range = pd.date_range(
                            start=observed_days.min(), end=observed_days.max(), freq='D'
                        )
                        gaps_info = int(len(full_range.difference(observed_days)))
                    except Exception as gap_error:
                        # Keep "N/A" in the report, but say why it was not computed.
                        print(f"{table}: could not compute date gaps for {col}: {gap_error}")

                temp_dict[col] = {
                    "min_date": str(min_date),
                    "max_date": str(max_date),
                    "duplicates": int(duplicates),
                    "missing_days_in_sequence": gaps_info
                }

            if table in TABLE_ANALYSIS:
                TABLE_ANALYSIS[table]["temporal"] = temp_dict

            print(f"{table}: Analyzed {len(datetime_cols)} datetime columns.")
        except Exception as e:
            print(f"Skipping {table}: {e}")


--- TEMPORAL ANALYSIS (DATETIME) ---

ventas_diarias: Analyzed 1 datetime columns.
redes_sociales: Analyzed 1 datetime columns.
promocion_diaria: Analyzed 1 datetime columns.
macro_economia: Analyzed 1 datetime columns.


In [7]:
print("\n--- CATEGORICAL ANALYSIS (OBJECT) ---\n")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)
        row_total = len(frame)
        obj_cols = frame.select_dtypes(include=['object', 'category']).columns

        cat_dict = {}
        for col in obj_cols:
            # Only the 10 most frequent categories are kept so the JSON report
            # stays small.
            top_counts = frame[col].value_counts().head(10)

            # For each category: absolute frequency and its share of all rows (%).
            top_categories = {
                str(label): {
                    "freq": int(freq),
                    "weight_percent": round(float((freq / row_total) * 100), 2)
                }
                for label, freq in top_counts.items()
            }

            cat_dict[col] = {
                "unique_count": int(frame[col].nunique()),
                "top_categories": top_categories
            }

        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["categorical"] = cat_dict

        print(f"{table}: Analyzed {len(obj_cols)} categorical columns.")
    except Exception as e:
        print(f"Skipping {table}: {e}")


--- CATEGORICAL ANALYSIS (OBJECT) ---

ventas_diarias: Analyzed 0 categorical columns.
redes_sociales: Analyzed 1 categorical columns.
promocion_diaria: Analyzed 0 categorical columns.
macro_economia: Analyzed 0 categorical columns.


In [8]:
print("\n--- OUTLIER DETECTION ---\n")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)
        numeric_cols = frame.select_dtypes(include=[np.number]).columns
        outlier_dict = {}

        for col in numeric_cols:
            try:
                values = frame[col]

                # IQR fences: values beyond 1.5 * IQR from the quartiles count
                # as outliers.
                q1 = values.quantile(0.25)
                q3 = values.quantile(0.75)
                spread = q3 - q1
                lower_bound = q1 - 1.5 * spread
                upper_bound = q3 + 1.5 * spread

                n_low = len(values[values < lower_bound])
                n_high = len(values[values > upper_bound])

                outlier_dict[col] = {
                    "lower_limit": float(lower_bound),
                    "upper_limit": float(upper_bound),
                    "count_low": int(n_low),
                    "count_high": int(n_high),
                    "total_outliers": int(n_low + n_high)
                }
            except Exception as e:
                # A failure on one column must not stop the others.
                print(f"Skipping outlier check for {col}: {e}")

        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["outliers"] = outlier_dict

        print(f"{table}: Outlier analysis complete.")
    except Exception as e:
        print(f"Skipping {table}: {e}")


--- OUTLIER DETECTION ---

ventas_diarias: Outlier analysis complete.
redes_sociales: Outlier analysis complete.
promocion_diaria: Outlier analysis complete.
macro_economia: Outlier analysis complete.


In [9]:
print("\n--- CHECK: ZERO VARIANCE ---\n")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)

        # A column with at most one distinct value carries no information.
        zero_variance_cols = [
            name for name in frame.columns if frame[name].nunique() <= 1
        ]

        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["zero_variance"] = zero_variance_cols

        if not zero_variance_cols:
            print(f"{table}: No zero variance columns found.")
        else:
            print(f"{table}: Found {len(zero_variance_cols)} columns with zero variance: {zero_variance_cols}")
    except Exception as e:
        print(f"Skipping {table}: {e}")


--- CHECK: ZERO VARIANCE ---

ventas_diarias: No zero variance columns found.
redes_sociales: No zero variance columns found.
promocion_diaria: No zero variance columns found.
macro_economia: No zero variance columns found.


In [10]:
print("\n--- CHECK: HIGH CARDINALITY ---\n")

high_card_threshold = config.get('quality', {}).get('high_cardinality_threshold', 50)
print(f"Using High Cardinality Threshold: {high_card_threshold}")

# A float below 1.0 is read as a unique-values / rows ratio; any other value is
# read as an absolute count of unique values.
threshold_is_ratio = isinstance(high_card_threshold, float) and high_card_threshold < 1.0

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)
        n_rows = len(frame)
        high_card_cols = {}

        # Every column is inspected, not only the categorical ones.
        for col in frame.columns:
            unique_count = frame[col].nunique()

            if threshold_is_ratio:
                ratio = unique_count / n_rows if n_rows > 0 else 0
                if ratio > high_card_threshold:
                    high_card_cols[col] = {"unique": unique_count, "ratio": round(ratio, 4)}
            elif unique_count > high_card_threshold:
                high_card_cols[col] = {"unique": unique_count}

        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["high_cardinality"] = high_card_cols

        if not high_card_cols:
            print(f"{table}: No high cardinality columns found.")
        else:
            print(f"{table}: Found {len(high_card_cols)} columns with high cardinality.")
    except Exception as e:
        print(f"Skipping {table}: {e}")


--- CHECK: HIGH CARDINALITY ---

Using High Cardinality Threshold: 0.9
ventas_diarias: Found 2 columns with high cardinality.
redes_sociales: Found 2 columns with high cardinality.
promocion_diaria: Found 2 columns with high cardinality.
macro_economia: Found 6 columns with high cardinality.


In [11]:
print("\n--- CHECK: ZERO PRESENCE ---\n")

zero_presence_threshold = config.get('quality', {}).get('zero_presence_threshold', 0.5)
print(f"Using Zero Presence Threshold: {zero_presence_threshold}")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)
        total_count = len(frame)
        zero_presence_cols = {}

        # Only numeric columns are checked for zeros.
        for col in frame.select_dtypes(include=[np.number]).columns:
            zero_count = (frame[col] == 0).sum()
            ratio = zero_count / total_count if total_count > 0 else 0

            # Report the column only when its share of zeros exceeds the threshold.
            if ratio > zero_presence_threshold:
                zero_presence_cols[col] = {
                    "zero_count": int(zero_count),
                    "ratio": round(ratio, 4)
                }

        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["zero_presence"] = zero_presence_cols

        if not zero_presence_cols:
            print(f"{table}: No columns with high zero presence found.")
        else:
            print(f"{table}: Found {len(zero_presence_cols)} columns with high zero presence.")

    except Exception as e:
        print(f"Skipping {table}: {e}")


--- CHECK: ZERO PRESENCE ---

Using Zero Presence Threshold: 0.3
ventas_diarias: Found 2 columns with high zero presence.
redes_sociales: Found 3 columns with high zero presence.
promocion_diaria: Found 1 columns with high zero presence.
macro_economia: No columns with high zero presence found.


In [12]:
print("\n--- CHECK: REPEATED ROWS ---\n")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)

        # Rows identical in every column to an earlier row.
        duplicates_count = frame.duplicated().sum()
        has_duplicates = duplicates_count > 0

        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["duplicates"] = {
                "count": int(duplicates_count),
                "has_duplicates": bool(has_duplicates)
            }

        if has_duplicates:
            print(f"{table}: Found {duplicates_count} duplicated rows.")
        else:
            print(f"{table}: No duplicated rows found.")
    except Exception as e:
        print(f"Skipping {table}: {e}")


--- CHECK: REPEATED ROWS ---

ventas_diarias: No duplicated rows found.
redes_sociales: No duplicated rows found.
promocion_diaria: No duplicated rows found.
macro_economia: No duplicated rows found.


In [13]:
print("\n--- CHECK: NULL VALUES ---\n")

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if not file_path.exists():
        continue

    try:
        frame = pd.read_parquet(file_path)
        null_cols = {}

        for col in frame.columns:
            null_count = frame[col].isnull().sum()
            if null_count <= 0:
                continue
            # Only columns that actually contain nulls are reported.
            ratio = null_count / len(frame)
            null_cols[col] = {
                "count": int(null_count),
                "ratio": round(ratio, 4)
            }

        if table in TABLE_ANALYSIS:
            TABLE_ANALYSIS[table]["null_values"] = null_cols

        if not null_cols:
            print(f"{table}: No null values found.")
        else:
            print(f"{table}: Found {len(null_cols)} columns with null values.")
    except Exception as e:
        print(f"Skipping {table}: {e}")


--- CHECK: NULL VALUES ---

ventas_diarias: No null values found.
redes_sociales: No null values found.
promocion_diaria: No null values found.
macro_economia: No null values found.


In [14]:
print("\n--- CHECK: SENTINEL VALUES ---\n")

# Expected layout: {'numeric': [...], 'categorical': [...], 'datetime': [...],
# 'boolean': [...]}. `or {}` / `or []` guard against keys that are present in
# the YAML file but left empty (parsed as None).
sentinel_config = config.get('quality', {}).get('sentinel_values', {}) or {}

for table in source_tables:
    file_path = RAW_DATA_PATH / f"{table}.parquet"
    if file_path.exists():
        try:
            df = pd.read_parquet(file_path)
            sentinel_report = {}

            for col in df.columns:
                dtype = df[col].dtype

                # Determine which sentinels to check based on dtype.
                # Booleans are tested first: is_numeric_dtype() is also True
                # for bool, so a later boolean branch would never be reached.
                if pd.api.types.is_bool_dtype(dtype):
                    sentinels_to_check = sentinel_config.get('boolean') or []
                elif pd.api.types.is_numeric_dtype(dtype):
                    sentinels_to_check = sentinel_config.get('numeric') or []
                # isinstance() replaces the deprecated is_categorical_dtype().
                elif pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.CategoricalDtype):
                    sentinels_to_check = sentinel_config.get('categorical') or []
                elif pd.api.types.is_datetime64_any_dtype(dtype):
                    sentinels_to_check = sentinel_config.get('datetime') or []
                else:
                    sentinels_to_check = []

                # Check column against its relevant sentinels
                for sentinel in sentinels_to_check:
                    try:
                        if pd.api.types.is_datetime64_any_dtype(dtype):
                            # Parse the configured value so it compares as a timestamp.
                            count = (df[col] == pd.Timestamp(sentinel)).sum()
                        else:
                            count = (df[col] == sentinel).sum()

                        if count > 0:
                            sentinel_report.setdefault(col, []).append({
                                "value": sentinel,
                                "count": int(count)
                            })
                    except Exception:
                        # Best effort: a sentinel that cannot be compared with
                        # this column (e.g. string vs int) is ignored.
                        pass

            if table in TABLE_ANALYSIS:
                TABLE_ANALYSIS[table]["sentinels"] = sentinel_report

            if sentinel_report:
                print(f"{table}: Found sentinel values in {len(sentinel_report)} columns.")
            else:
                print(f"{table}: No sentinel values found.")
        except Exception as e:
            print(f"Skipping {table}: {e}")


--- CHECK: SENTINEL VALUES ---

ventas_diarias: Found sentinel values in 1 columns.
redes_sociales: Found sentinel values in 1 columns.
promocion_diaria: Found sentinel values in 1 columns.
macro_economia: Found sentinel values in 1 columns.


  elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_categorical_dtype(dtype):
  elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_categorical_dtype(dtype):
  elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_categorical_dtype(dtype):
  elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_categorical_dtype(dtype):


In [15]:
# Consolidate Report
# Everything gathered by the previous cells goes into one JSON document.
final_report = {
    "phase": "Phase 1 - Data Discovery",
    "timestamp": datetime.datetime.now().isoformat(),
    "description": "Incremental download and detailed statistical analysis (Includes Variance, Cardinality, Zeros, Duplicates, Nulls, Sentinels).",
    "download_details": download_details,
    "data_analysis": TABLE_ANALYSIS
}

report_path = EXPERIMENT_ARTIFACTS_PATH / "phase_01_discovery.json"
# `default=str` keeps one non-JSON-native value (for example a date-typed
# sentinel read from YAML, or a numpy scalar) from raising TypeError and losing
# the whole report. The encoding is fixed so the file does not depend on the
# machine's locale.
with open(report_path, "w", encoding="utf-8") as f:
    json.dump(final_report, f, indent=4, default=str)

print(f"\nFull Report saved to {report_path}")


Full Report saved to experiments\phase_01_discovery\artifacts\phase_01_discovery.json
