BigQuery to Hyper

In [None]:
%pip -qqq install pandas pantab psutil google-cloud-bigquery google-auth
%pip -qqq install ipython

In [23]:
import pandas as pd
import pantab
import gc
import os
import time
from datetime import datetime
from typing import Optional, Dict, Any, List, Tuple
import numpy as np
from pathlib import Path
import json
from google.cloud import bigquery
from google.oauth2 import service_account
import logging
import sys
import psutil
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import multiprocessing
import threading
from functools import partial
from IPython.display import clear_output
import concurrent.futures

class ProgressTracker:
    def __init__(self, total_rows: int, chunk_size: int, update_interval: float = 0.5):
        self.total_rows = total_rows
        self.chunk_size = chunk_size
        self.processed_rows = 0
        self.start_time = time.time()
        self.last_update = 0
        self.update_interval = update_interval
        self.chunks_processed = 0
        
    def _get_memory_usage(self) -> float:
        process = psutil.Process()
        return process.memory_info().rss / (1024 * 1024)
        
    def _format_bar(self, width: int = 50) -> str:
        progress = min(1.0, self.processed_rows / self.total_rows)
        filled = int(width * progress)
        bar = '█' * filled + '░' * (width - filled)
        return f'[{bar}]'
    
    def _format_time(self, seconds: float) -> str:
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        
        if hours > 0:
            return f"{hours}h {minutes}m {secs}s"
        elif minutes > 0:
            return f"{minutes}m {secs}s"
        else:
            return f"{secs}s"
    
    def _estimate_time_remaining(self, elapsed_time: float, progress: float) -> str:
        if progress <= 0:
            return "calculating..."
        
        total_estimated_time = elapsed_time / progress
        remaining_time = total_estimated_time - elapsed_time
        return self._format_time(remaining_time)
    
    def update(self, rows_processed: int):
        self.processed_rows += rows_processed
        self.chunks_processed += 1
        current_time = time.time()
        
        if (current_time - self.last_update) >= self.update_interval:
            elapsed_time = current_time - self.start_time
            processing_speed = self.processed_rows / elapsed_time if elapsed_time > 0 else 0
            progress = (self.processed_rows / self.total_rows)
            memory_usage = self._get_memory_usage()
            
            if 'ipykernel' in sys.modules:
                clear_output(wait=True)
            
            eta = self._estimate_time_remaining(elapsed_time, progress)
            status = f"""
╔══════════════════════════════════════════════════════════════════════════════
║ Progress: {self._format_bar()} {progress*100:.1f}%
║ Chunks: {self.chunks_processed}/{(self.total_rows + self.chunk_size - 1) // self.chunk_size}
║ Rows: {self.processed_rows:,}/{self.total_rows:,}
║ Speed: {processing_speed:,.1f} rows/sec
║ Memory: {memory_usage:.1f} MB
║ Elapsed: {self._format_time(elapsed_time)}
║ ETA: {eta}
╚══════════════════════════════════════════════════════════════════════════════"""
            print(status)
            self.last_update = current_time

    def finish(self):
        import gc
        total_time = time.time() - self.start_time
        final_speed = self.processed_rows / total_time if total_time > 0 else 0
        
        # Memory usage statistics
        memory_usage = self._get_memory_usage()
        gc.collect()  # Explicitly invoke garbage collection
        peak_memory = psutil.Process().memory_info().rss / (1024 * 1024)
        
        if 'ipykernel' in sys.modules:
            clear_output(wait=True)
            
        status = f"""
    ╔══════════════════════════════════════════════════════════════════════════════
    ║ Processing Complete!
    ║ Total Rows: {self.processed_rows:,}
    ║ Total Time: {self._format_time(total_time)}
    ║ Average Speed: {final_speed:,.1f} rows/sec
    ║ Final Memory: {memory_usage:.1f} MB
    ║ Peak Memory: {peak_memory:.1f} MB
    ║ Garbage Collected: {gc.collect()} objects
    ╚══════════════════════════════════════════════════════════════════════════════"""
        print(status)

class BigQueryToHyperETL:
    def __init__(
        self,
        project_id: str,
        source_project: str,
        dataset_id: str,
        table_id: str,
        service_account_path: Optional[str] = None,
        chunk_size: int = 100000,
        cache_dir: str = "./cache",
        cleanup_cache: bool = False
    ):
        self.project_id = project_id
        self.source_project = source_project
        self.dataset_id = dataset_id
        self.table_id = table_id
        self.service_account_path = service_account_path
        self.chunk_size = chunk_size
        self.cache_dir = Path(cache_dir)
        self.cleanup_cache = cleanup_cache
        self.setup_logging()
        self.schema = None
        self.order_by_columns = None
        
        self.type_mapping = {
            'STRING': 'string',
            'INTEGER': 'Int32',
            'FLOAT': 'float32',
            'BOOLEAN': 'boolean',
            'DATE': 'datetime64[ns]',
            'DATETIME': 'datetime64[ns]',
            'TIMESTAMP': 'datetime64[ns]',
            'TIME': 'string',
            'NUMERIC': 'float64',
            'BIGNUMERIC': 'float64',
            'BYTES': 'string',
            'RECORD': 'string',
        }
        
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def setup_logging(self):
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        
        if not self.logger.handlers:
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            
            console_handler = logging.StreamHandler(sys.stdout)
            console_handler.setFormatter(formatter)
            self.logger.addHandler(console_handler)
            
            file_handler = logging.FileHandler('etl_process.log')
            file_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)

    def get_cache_dir(self) -> Path:
        cache_dir = self.cache_dir / f"{self.source_project}_{self.dataset_id}_{self.table_id}"
        cache_dir.mkdir(parents=True, exist_ok=True)
        return cache_dir

    def initialize_auth(self):
        try:
            if self.service_account_path:
                self.logger.info(f"Authenticating using service account: {self.service_account_path}")
                credentials = service_account.Credentials.from_service_account_file(
                    self.service_account_path,
                    scopes=['https://www.googleapis.com/auth/bigquery']
                )
                self.client = bigquery.Client(
                    credentials=credentials,
                    project=self.project_id
                )
            else:
                self.logger.info("Attempting to use default credentials")
                self.client = bigquery.Client(project=self.project_id)
            
            self._verify_permissions()
            self.logger.info("Successfully authenticated with Google Cloud")
            
        except Exception as e:
            self.logger.error(f"Authentication failed: {str(e)}")
            raise

    def _verify_permissions(self):
        try:
            test_query = f"""
            SELECT 1
            FROM `{self.source_project}.{self.dataset_id}.{self.table_id}`
            LIMIT 1
            """
            self.client.query(test_query).result()
            self.logger.info("Successfully verified BigQuery permissions")
        except Exception as e:
            self.logger.error(f"Permission verification failed: {str(e)}")
            raise

    def get_total_rows(self) -> int:
        query = f"""
        SELECT COUNT(*) as count
        FROM `{self.source_project}.{self.dataset_id}.{self.table_id}`
        """
        query_job = self.client.query(query)
        rows = query_job.result()
        return next(rows)['count']

    def _get_table_schema(self):
        """Modified schema retrieval without focusing on ordering columns."""
        table_ref = f"{self.source_project}.{self.dataset_id}.{self.table_id}"
        table = self.client.get_table(table_ref)
        
        self.schema = {}
        for field in table.schema:
            pandas_type = self.type_mapping.get(field.field_type, 'string')
            self.schema[field.name] = {
                'bq_type': field.field_type,
                'pandas_type': pandas_type,
                'nullable': field.is_nullable
            }
        
        self.logger.info(f"Table schema with types: {self.schema}")

    def convert_types(self, df: pd.DataFrame) -> pd.DataFrame:
        for col, type_info in self.schema.items():
            if col not in df.columns:
                continue
                
            try:
                pandas_type = type_info['pandas_type']
                
                if pandas_type == 'datetime64[ns]':
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                elif pandas_type == 'Int64':
                    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int32')
                elif pandas_type == 'float64':
                    df[col] = pd.to_numeric(df[col], errors='coerce').astype('float32')
                elif pandas_type == 'boolean':
                    df[col] = df[col].astype('boolean')
                else:
                    df[col] = df[col].astype(str)
                
                if df[col].isna().any():
                    if pandas_type == 'string':
                        df[col] = df[col].fillna('')
                    elif pandas_type in ['Int32', 'float32']:
                        df[col] = df[col].fillna(0)
                    elif pandas_type == 'boolean':
                        df[col] = df[col].fillna(False)
                        
            except Exception as e:
                self.logger.warning(f"Error converting column {col}: {str(e)}. Converting to string.")
                df[col] = df[col].astype(str)
        
        return df

    def process_chunk(self, offset: int) -> pd.DataFrame:
        """Base chunk processing method without ordering."""
        try:
            query = f"""
            SELECT *
            FROM `{self.source_project}.{self.dataset_id}.{self.table_id}`
            LIMIT {self.chunk_size}
            OFFSET {offset}
            """
            
            job_config = bigquery.QueryJobConfig(
                priority=bigquery.QueryPriority.INTERACTIVE,
                use_query_cache=True,
                maximum_bytes_billed=1000000000000,
                allow_large_results=True,
                maximum_bytes_shuffled=None,
                use_legacy_sql=False
            )
            
            query_job = self.client.query(query, job_config=job_config)
            results = query_job.result()
            df = results.to_dataframe()
            
            return self.convert_types(df)
            
        except Exception as e:
            self.logger.error(f"Error processing chunk at offset {offset}: {str(e)}")
            if "Resources exceeded" in str(e):
                original_chunk_size = self.chunk_size
                self.chunk_size = max(1000, self.chunk_size // 2)
                self.logger.warning(
                    f"Memory limit exceeded. Reducing chunk size from "
                    f"{original_chunk_size:,} to {self.chunk_size:,}"
                )
                return self.process_chunk(offset)
            raise
        gc.collect()

class ParallelETL(BigQueryToHyperETL):
    def __init__(
        self,
        project_id: str,
        source_project: str,
        dataset_id: str,
        table_id: str,
        service_account_path: Optional[str] = None,
        chunk_size: int = 100000,
        cache_dir: str = "./cache",
        cleanup_cache: bool = False,
        max_workers: int = None,
        batch_size: int = 10
    ):
        super().__init__(
            project_id=project_id,
            source_project=source_project,
            dataset_id=dataset_id,
            table_id=table_id,
            service_account_path=service_account_path,
            chunk_size=chunk_size,
            cache_dir=cache_dir,
            cleanup_cache=cleanup_cache
        )
        self.max_workers = max_workers or multiprocessing.cpu_count()
        self.batch_size = batch_size
        self.thread_local = threading.local()

    def get_client(self):
        if not hasattr(self.thread_local, "client"):
            if self.service_account_path:
                credentials = service_account.Credentials.from_service_account_file(
                    self.service_account_path,
                    scopes=['https://www.googleapis.com/auth/bigquery']
                )
                self.thread_local.client = bigquery.Client(
                    credentials=credentials,
                    project=self.project_id
                )
            else:
                self.thread_local.client = bigquery.Client(project=self.project_id)
        return self.thread_local.client

    def process_chunk_parallel(self, offset: int, batch_id: int) -> Tuple[bool, int]:
        """Process a chunk using parallel processing without ordering."""
        cache_dir = self.get_cache_dir()
        cache_file = cache_dir / f"chunk_{batch_id}_{offset}.parquet"
        
        if cache_file.exists():
            df = pd.read_parquet(cache_file)
            return True, len(df)
        
        try:
            client = self.get_client()
            query = f"""
            SELECT *
            FROM `{self.source_project}.{self.dataset_id}.{self.table_id}`
            LIMIT {self.chunk_size}
            OFFSET {offset}
            """
            
            job_config = bigquery.QueryJobConfig(
                priority=bigquery.QueryPriority.INTERACTIVE,
                use_query_cache=True,
                maximum_bytes_billed=1000000000000,
                allow_large_results=True,
                use_legacy_sql=False
            )
            
            query_job = client.query(query, job_config=job_config)
            df = query_job.result().to_dataframe()
            
            if not df.empty:
                df = self.convert_types(df)
                cache_file.parent.mkdir(parents=True, exist_ok=True)
                df.to_parquet(
                    cache_file,
                    compression='snappy',
                    engine='pyarrow'
                )
                return True, len(df)
            return False, 0
            
        except Exception as e:
            self.logger.error(f"Error in batch {batch_id}, offset {offset}: {str(e)}")
            if "Resources exceeded" in str(e):
                self.chunk_size = max(1000, self.chunk_size // 2)
                self.logger.warning(f"Reducing chunk size to {self.chunk_size}")
                return self.process_chunk_parallel(offset, batch_id)
            raise

    @staticmethod
    def _convert_chunk(file: Path) -> pd.DataFrame:
        """Static method to convert a parquet file to DataFrame."""
        return pd.read_parquet(file)

    @staticmethod
    def _write_hyper_batch(dfs: List[pd.DataFrame], output_file: str, mode: str):
        """Static method to write DataFrames to hyper file."""
        combined_df = pd.concat(dfs, ignore_index=True)
        pantab.frame_to_hyper(
            df=combined_df,
            table='Extract',
            database=output_file,
            table_mode=mode,
            atomic=False
    )

    def parallel_convert_to_hyper(self, output_file: str, cached_files: List[Path]):
        """Convert cached files to hyper format using parallel processing."""
        batch_size = min(len(cached_files), self.batch_size)
        
        try:
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                for i in range(0, len(cached_files), batch_size):
                    batch_files = cached_files[i:i + batch_size]
                    
                    # Read parquet files in parallel
                    dfs = list(executor.map(self._convert_chunk, batch_files))
                    
                    # Write to hyper file
                    mode = 'w' if i == 0 else 'a'
                    self._write_hyper_batch(dfs, output_file, mode)
                    
                    del dfs
                    gc.collect()
                    
                    # Log progress
                    self.logger.info(f"Converted batch {i//batch_size + 1}/{(len(cached_files) + batch_size - 1)//batch_size}")
                    
        except Exception as e:
            self.logger.error(f"Error during parallel conversion: {str(e)}")
            self.logger.info("Falling back to sequential processing...")
            
            # Fallback to sequential processing
            for i, file in enumerate(cached_files):
                try:
                    df = self._convert_chunk(file)
                    mode = 'w' if i == 0 else 'a'
                    self._write_hyper_batch([df], output_file, mode)
                except Exception as e:
                    self.logger.error(f"Error processing file {file}: {str(e)}")
                    continue

    def execute(self, output_file: str):
        """Execute the ETL process with parallel processing."""
        try:
            self.initialize_auth()
            self._get_table_schema()
            total_rows = self.get_total_rows()
            
            # Initialize progress tracker
            progress = ProgressTracker(
                total_rows=total_rows,
                chunk_size=self.chunk_size
            )
            
            cache_dir = self.get_cache_dir()
            checkpoint_file = cache_dir / "checkpoint.json"
            processed_rows = 0
            
            # Calculate batch offsets
            offsets = list(range(0, total_rows, self.chunk_size))
            batches = [
                offsets[i:i + self.batch_size] 
                for i in range(0, len(offsets), self.batch_size)
            ]
            
            self.logger.info(f"Starting ETL process with {len(batches)} batches")
            self.logger.info(f"Each batch will process up to {self.batch_size} chunks")
            self.logger.info(f"Using {self.max_workers} workers for parallel processing")
            
            # Phase 1: Download and cache chunks in parallel
            failed_chunks = []
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                for batch_id, batch_offsets in enumerate(batches):
                    # Submit all chunks in the batch
                    future_to_offset = {
                        executor.submit(self.process_chunk_parallel, offset, batch_id): offset 
                        for offset in batch_offsets
                    }
                    
                    # Process completed chunks
                    for future in concurrent.futures.as_completed(future_to_offset):
                        try:
                            success, rows = future.result()
                            if success:
                                processed_rows += rows
                                progress.update(rows)
                                
                                # Save checkpoint
                                with open(checkpoint_file, 'w') as f:
                                    json.dump({
                                        'processed_rows': processed_rows,
                                        'last_batch': batch_id
                                    }, f)
                            else:
                                offset = future_to_offset[future]
                                failed_chunks.append((batch_id, offset))
                                
                        except Exception as e:
                            self.logger.error(f"Error processing chunk: {str(e)}")
                            offset = future_to_offset[future]
                            failed_chunks.append((batch_id, offset))
                            continue
            
            if failed_chunks:
                self.logger.warning(f"Failed to process {len(failed_chunks)} chunks")
            
            # Phase 2: Convert cached files to hyper format
            self.logger.info("Converting cached files to hyper format...")
            cached_files = sorted(
                cache_dir.glob("chunk_*.parquet"),
                key=lambda x: int(x.stem.split('_')[1])
            )
            
            if not cached_files:
                raise ValueError(f"No cached files found in {cache_dir}")
            
            self.logger.info(f"Found {len(cached_files)} cached files to convert")
            self.parallel_convert_to_hyper(output_file, cached_files)
            
            progress.finish()
            
            # Cleanup if requested
            if self.cleanup_cache:
                self.logger.info("Cleaning up cache...")
                for file in cache_dir.glob("*"):
                    file.unlink()
                cache_dir.rmdir()
            
        except Exception as e:
            self.logger.error(f"Critical error in ETL process: {str(e)}")
            self.logger.error(f"Cache directory contents: {list(cache_dir.glob('*'))}")
            raise

In [25]:
from google.cloud import bigquery
from google.oauth2 import service_account
from google.api_core import exceptions
import psutil
import multiprocessing

def get_system_resources(avg_row_size=None):
    """
    Get system resources and calculate optimal ETL parameters.
    
    Args:
        avg_row_size (float): Average row size in bytes from BigQuery table
    """
    total_memory_gb = psutil.virtual_memory().total / (1024 ** 3)
    available_cpus = multiprocessing.cpu_count()
    
    # Use actual row size if provided, otherwise use conservative estimate
    row_size = avg_row_size if avg_row_size is not None else 300
    
    # Calculate optimal parameters with actual row size
    optimal_chunk_size = int((total_memory_gb * 0.45 * 1024 * 1024 * 1024) / (row_size * 2))
    chunk_size = min(100000, optimal_chunk_size)
    
    optimal_workers = min(
        available_cpus - 2,
        int(total_memory_gb / 4),
        8
    )
    
    optimal_batch_size = min(
        max(1, int(optimal_workers * 1.5)),
        12
    )
    
    # Calculate actual memory usage for this configuration
    estimated_memory_usage_gb = (chunk_size * row_size * optimal_workers) / (1024 * 1024 * 1024)
    
    return {
        'total_memory_gb': total_memory_gb,
        'available_cpus': available_cpus,
        'chunk_size': chunk_size,
        'max_workers': optimal_workers,
        'batch_size': optimal_batch_size,
        'row_size': row_size,
        'estimated_memory_gb': estimated_memory_usage_gb
    }


def estimate_row_size(project_id, source_project, dataset_id, table_id, credentials_path=None):
    """
    Estimates the average row size of a BigQuery table and suggests optimal ETL parameters.
    
    Args:
        project_id (str): The GCP project ID where the query job will run.
        source_project (str): The GCP project ID where the dataset and table reside.
        dataset_id (str): The dataset ID containing the table.
        table_id (str): The name of the table.
        credentials_path (str, optional): Path to the service account JSON file for authentication.
    
    Returns:
        tuple: (average row size in bytes, system resources dict)
    """
    # Get system resources first
    system_resources = get_system_resources()
    
    # Initialize the BigQuery client
    try:
        if credentials_path:
            credentials = service_account.Credentials.from_service_account_file(
                credentials_path, 
                scopes=["https://www.googleapis.com/auth/cloud-platform"]
            )
            client = bigquery.Client(credentials=credentials, project=project_id)
        else:
            client = bigquery.Client(project=project_id)
        
        # Try table metadata first
        try:
            table_ref = client.dataset(dataset_id, project=source_project).table(table_id)
            table = client.get_table(table_ref)
            
            if table.num_bytes is not None and table.num_rows is not None and table.num_rows > 0:
                avg_row_size = table.num_bytes / table.num_rows
                # Pass the actual row size to get_system_resources
                system_resources = get_system_resources(avg_row_size)
                print_configuration(avg_row_size, system_resources, "table metadata")
                return avg_row_size, system_resources
                
        except exceptions.NotFound:
            print("\nTable metadata not directly accessible, trying alternative methods...")
        
        # Try INFORMATION_SCHEMA.TABLES
        size_query = f"""
            SELECT
                total_bytes,
                row_count
            FROM `{source_project}.{dataset_id}.INFORMATION_SCHEMA.TABLES`
            WHERE table_name = '{table_id}'
        """
        
        try:
            query_job = client.query(size_query)
            rows = list(query_job.result())
            
            if rows and len(rows) > 0:
                total_bytes = rows[0].total_bytes or 0
                row_count = rows[0].row_count or 1
                
                if row_count > 0:
                    avg_row_size = total_bytes / row_count
                    print_configuration(avg_row_size, system_resources, "INFORMATION_SCHEMA")
                    return avg_row_size, system_resources
                    
        except Exception as e:
            print(f"\nError querying INFORMATION_SCHEMA.TABLES: {str(e)}")
        
        # Final fallback method
        print("\nUsing fallback method with direct table query...")
        storage_query = f"""
            SELECT APPROXIMATE_STORAGE_USAGE AS total_bytes
            FROM `{source_project}.{dataset_id}.INFORMATION_SCHEMA.TABLE_STORAGE_STATS`
            WHERE table_name = '{table_id}'
        """
        count_query = f"""
            SELECT COUNT(*) AS row_count 
            FROM `{source_project}.{dataset_id}.{table_id}`
        """
        
        try:
            storage_job = client.query(storage_query)
            storage_result = list(storage_job.result())
            total_bytes = storage_result[0].total_bytes if storage_result else 0
            
            count_job = client.query(count_query)
            count_result = list(count_job.result())
            row_count = count_result[0].row_count if count_result else 1
            
            if row_count > 0:
                avg_row_size = total_bytes / row_count
                print_configuration(avg_row_size, system_resources, "fallback method")
                return avg_row_size, system_resources
            
        except Exception as e:
            print(f"\nError in fallback method: {str(e)}")
        
        return None, system_resources
        
    except exceptions.Forbidden as e:
        print(f"\nPermission error: {str(e)}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {str(e)}")
    return None, system_resources

def print_configuration(avg_row_size, resources, method):
    """Print a formatted configuration summary."""
    width = 60
    
    def center_text(text, width=width):
        return f"║ {text.center(width-4)} ║"
    
    def left_text(text, width=width):
        return f"║ {text:<{width-4}} ║"
    
    # Calculate memory estimations
    chunk_memory = (resources['chunk_size'] * resources['row_size']) / (1024 * 1024 * 1024)
    total_memory_usage = resources['estimated_memory_gb']
    memory_percentage = (total_memory_usage/resources['total_memory_gb']*100)
    
    # Single unified output
    print("╔" + "═" * (width-2) + "╗")
    print(center_text("ETL Configuration Summary"))
    print("╠" + "═" * (width-2) + "╣")
    
    # Memory Usage Estimation
    print(left_text("Memory Usage Estimation:"))
    print(left_text(f"  • Row Size Used:    {resources['row_size']:.2f} bytes"))
    print(left_text(f"  • Chunk Memory:     {chunk_memory:.2f} GB per chunk"))
    print(left_text(f"  • Total Memory:     {total_memory_usage:.2f} GB ({memory_percentage:.1f}% RAM)"))
    print(left_text(""))
    
    # System Resources
    print(left_text("System Resources:"))
    print(left_text(f"  • Total Memory:     {resources['total_memory_gb']:.2f} GB"))
    print(left_text(f"  • Available CPUs:   {resources['available_cpus']} cores"))
    print(left_text(""))
    
    # Table Statistics
    print(left_text("Table Statistics:"))
    print(left_text(f"  • Avg Row Size:     {avg_row_size:.2f} bytes"))
    print(left_text(f"  • Method Used:      {method}"))
    print(left_text(""))
    
    # Recommended Parameters
    print(left_text("Recommended Parameters:"))
    print(left_text(f"  • Chunk Size:       {resources['chunk_size']:,} rows"))
    print(left_text(f"  • Max Workers:      {resources['max_workers']} workers"))
    print(left_text(f"  • Batch Size:       {resources['batch_size']} chunks per batch"))
    
    print("╚" + "═" * (width-2) + "╝")

# Example usage
avg_row_size, resources = estimate_row_size(
    project_id='your_project_id',
    source_project='bigquery-public-data',
    dataset_id='samples',
    table_id='github_timeline',
    credentials_path='/path/to/credentials.json'
)


╔══════════════════════════════════════════════════════════╗
║                ETL Configuration Summary                 ║
╠══════════════════════════════════════════════════════════╣
║ Memory Usage Estimation:                                 ║
║   • Row Size Used:    611.27 bytes                       ║
║   • Chunk Memory:     0.06 GB per chunk                  ║
║   • Total Memory:     0.40 GB (1.3% RAM)                 ║
║                                                          ║
║ System Resources:                                        ║
║   • Total Memory:     31.35 GB                           ║
║   • Available CPUs:   16 cores                           ║
║                                                          ║
║ Table Statistics:                                        ║
║   • Avg Row Size:     611.27 bytes                       ║
║   • Method Used:      table metadata                     ║
║                                                          ║
║ Recommended Parameters

In [26]:
# Initialize the parallel ETL process
etl = ParallelETL(
    project_id='your_project_id',
    source_project='bigquery-public-data',
    dataset_id='samples',
    table_id='github_timeline',
    service_account_path='/path/to/credentials.json',
    chunk_size=100000,  # Start with a moderate chunk size
    cache_dir="./cache",
    max_workers=6,
    batch_size=6,  # Process x chunks at a time
    cleanup_cache=False  # Keep cache files for debugging
)

# Execute with proper error handling
try:
    etl.execute('github_timeline_v2.hyper')
except Exception as e:
    print(f"Error during execution: {str(e)}")


    ╔══════════════════════════════════════════════════════════════════════════════
    ║ Processing Complete!
    ║ Total Rows: 6,219,749
    ║ Total Time: 14m 51s
    ║ Average Speed: 6,974.5 rows/sec
    ║ Final Memory: 6593.0 MB
    ║ Peak Memory: 6592.0 MB
    ║ Garbage Collected: 0 objects
    ╚══════════════════════════════════════════════════════════════════════════════
