<a href="https://colab.research.google.com/github/larry-tableau/tableau/blob/main/Read_from_BQ_into_Hyper_via_Pantab_v3_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BigQuery Data Extraction Tool

A Python-based tool for extracting large datasets from Google BigQuery with support for parallel processing and multiple output formats (Hyper, Parquet, CSV).

---

## Features

- Dynamic chunk sizing for optimal memory usage.
- Parallel processing using ThreadPoolExecutor.
- Support for Hyper, Parquet, and CSV file outputs.
- Detailed logging and error handling.
- Schema-based type conversion for BigQuery data.

---

## Requirements

- Python 3.8+
- Google Cloud SDK with authenticated credentials.

In [None]:
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-cloud-bigquery pandas pantab==4.1.0



Before running, edit the configuration block below (project, dataset, table, output format, and limits) to match your environment:
```
        # Configuration setup
        config = BigQueryConfig(
            project_id='pre-sales-demo',
            source_project='bigquery-public-data',
            dataset_id='google_trends',
            table_id='top_terms',
            output_format='hyper',
            output_path='./data',
            max_bytes_billed=1000 * 1024 * 1024 * 1024,  # 1TB
            initial_chunk_size=500_000,  # Start with smaller chunks
            max_workers=4,
            clean_up_temp_files=True
        )
```



In [None]:
# Imports and Configuration
import subprocess
import sys
import os
import logging
import time
import json
from datetime import datetime, timezone
from google.colab import auth
from google.cloud import bigquery
import pandas as pd
import pandas_gbq
import numpy as np
from typing import List, Optional, Tuple, Dict, Any
from dataclasses import dataclass
from IPython.display import clear_output, display, HTML
from concurrent.futures import ThreadPoolExecutor, as_completed
import psutil
from pathlib import Path

def install_requirements():
    """Quietly pip-install every package this notebook depends on.

    Each package is installed with its own ``pip install -q`` invocation run
    through the current interpreter; a failing install raises
    ``subprocess.CalledProcessError``.
    """
    pip_command = [sys.executable, '-m', 'pip', 'install', '-q']
    package_names = (
        'pandas-gbq',
        'google-cloud-bigquery',
        'pyarrow',
        'pantab==4.1.0',
        'tableauHyperapi',
        'psutil',
    )
    for name in package_names:
        subprocess.check_call(pip_command + [name])

# Install the dependencies as soon as this cell runs.
# NOTE(review): the import statements at the top of the cell execute before
# this call, so a package missing at that point fails before it is installed.
install_requirements()

# Configure logging with more detailed format:
# timestamp, level, source file and line number, then the message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
)
# Module-level logger shared by the classes and functions below.
logger = logging.getLogger(__name__)

@dataclass
class BigQueryConfig:
    """Settings describing one BigQuery table extraction.

    Raises:
        ValueError: from ``__post_init__`` when ``output_format`` is not one
            of the supported formats, or when ``initial_chunk_size`` or
            ``max_workers`` is not positive.
    """
    # Project used to run (and bill) the queries.
    project_id: str
    # Project, dataset and table that hold the source data.
    source_project: str
    dataset_id: str
    table_id: str
    # Final output format: 'hyper', 'parquet', or 'csv'.
    output_format: str = 'hyper'
    # Directory for chunk files and the final output; made absolute on init.
    output_path: str = './data'
    # Per-query billing cap in bytes (100 * 1024**3).
    max_bytes_billed: int = 100 * 1024 * 1024 * 1024
    # Upper bound on the number of rows requested per chunk.
    initial_chunk_size: int = 1_000_000
    # Number of worker threads fetching chunks concurrently.
    max_workers: int = 4
    # Optional SQL predicate (without the WHERE keyword).
    where_clause: Optional[str] = None
    # Optional subset of columns to select; None selects every schema column.
    columns: Optional[List[str]] = None
    # Whether intermediate chunk files are deleted after merging.
    clean_up_temp_files: bool = True

    def __post_init__(self):
        """Check the settings and normalise ``output_path`` to an absolute path."""
        supported_formats = ('hyper', 'parquet', 'csv')
        if self.output_format not in supported_formats:
            raise ValueError("output_format must be one of: hyper, parquet, csv")

        # Both numeric settings must be strictly positive; checked in order.
        positive_checks = (
            (self.initial_chunk_size, "chunk_size must be positive"),
            (self.max_workers, "max_workers must be positive"),
        )
        for value, message in positive_checks:
            if value <= 0:
                raise ValueError(message)

        absolute_path = Path(self.output_path).resolve()
        self.output_path = str(absolute_path)

class BigQueryExtractor:
    """Extract a BigQuery table in chunks and merge them into one output file.

    The table is read with LIMIT/OFFSET queries submitted to a thread pool.
    Each chunk is type-converted and written to an intermediate Parquet file;
    the chunk files are then merged into the configured output format
    (Hyper, Parquet or CSV).
    """

    # BigQuery column types that cannot be used in an ORDER BY clause.
    _UNORDERABLE_TYPES = frozenset({'RECORD', 'STRUCT', 'ARRAY', 'GEOGRAPHY', 'JSON'})
    # Floor applied when the chunk size is shrunk to fit available memory.
    _MIN_CHUNK_SIZE = 100_000

    def __init__(self, config: "BigQueryConfig"):
        """Authenticate, load the table schema and initialise run statistics.

        Args:
            config: Extraction settings.

        Raises:
            Exception: whatever client initialisation or the schema lookup
                raises (logged, then re-raised).
        """
        # Local import, as done for pantab further down: the lock is the only
        # thing in this class that needs the threading module.
        import threading

        self.config = config
        self._ensure_output_directory()
        self.client = self._initialize_client()
        self.schema = self._get_schema()
        self.total_rows = 0
        self.start_time = time.time()
        self.chunk_size = self.config.initial_chunk_size
        self.type_mapping = self._get_bq_type_mapping()
        self.processed_chunks = 0
        self.failed_chunks = []
        # Worker threads update the counters above concurrently.
        self._stats_lock = threading.Lock()
        self._setup_progress_display()

    def _setup_progress_display(self):
        """Emit the CSS used by the HTML progress panel."""
        display(HTML("""
        <style>
            .bq-progress {
                font-family: monospace;
                padding: 10px;
                border: 1px solid #ccc;
                border-radius: 4px;
                margin: 10px 0;
                background-color: #f8f9fa;
            }
            .progress-bar {
                color: #fff;
                background-color: #28a745;
                height: 20px;
                border-radius: 3px;
                transition: width 0.3s ease;
            }
        </style>
        """))

    def _get_bq_type_mapping(self) -> Dict[str, Any]:
        """Return the mapping of BigQuery type names to Python/pandas dtypes."""
        return {
            'STRING': str,
            'BYTES': str,
            'INTEGER': 'Int64',
            'INT64': 'Int64',
            'FLOAT': 'float64',
            'FLOAT64': 'float64',
            'NUMERIC': 'float64',
            'BIGNUMERIC': 'float64',
            'BOOLEAN': 'boolean',
            'BOOL': 'boolean',
            'DATE': 'datetime64[ns]',
            'DATETIME': 'datetime64[ns]',
            'TIME': str,
            'TIMESTAMP': 'datetime64[ns]',
            'RECORD': str,
            'STRUCT': str,
            'ARRAY': str,
            'GEOGRAPHY': str
        }

    def _ensure_output_directory(self):
        """Create the output directory if it does not exist yet."""
        os.makedirs(self.config.output_path, exist_ok=True)

    def _initialize_client(self) -> "bigquery.Client":
        """Authenticate the Colab user and return a BigQuery client.

        Raises:
            Exception: any authentication/client error (logged, then re-raised).
        """
        try:
            auth.authenticate_user()
            return bigquery.Client(project=self.config.project_id)
        except Exception as e:
            logger.error(f"Failed to initialize BigQuery client: {str(e)}")
            raise

    def _get_schema(self) -> "List[bigquery.SchemaField]":
        """Fetch the schema of the configured source table.

        Raises:
            Exception: any lookup error (logged, then re-raised).
        """
        try:
            dataset_ref = self.client.dataset(self.config.dataset_id,
                                            project=self.config.source_project)
            table_ref = dataset_ref.table(self.config.table_id)
            return self.client.get_table(table_ref).schema
        except Exception as e:
            logger.error(f"Failed to get schema: {str(e)}")
            raise

    def adjust_chunk_size(self):
        """Shrink the chunk size when available memory looks insufficient.

        The estimate budgets ``200 * column_count`` bytes per row. The result
        never exceeds ``config.initial_chunk_size``; the ``_MIN_CHUNK_SIZE``
        floor only applies when the memory estimate falls below it. On any
        error the configured size is used unchanged.
        """
        try:
            available_memory = psutil.virtual_memory().available
            ideal_chunk_size = available_memory // (200 * len(self.schema))
            # min() is outermost so a user-configured size below the floor is
            # respected instead of being silently raised to the floor.
            self.chunk_size = min(
                self.config.initial_chunk_size,
                max(self._MIN_CHUNK_SIZE, ideal_chunk_size),
            )
            logger.info(f"Adjusted chunk size to {self.chunk_size:,} rows based on available memory")
        except Exception as e:
            logger.warning(f"Failed to adjust chunk size: {str(e)}. Using default size.")
            self.chunk_size = self.config.initial_chunk_size

    def _count_records(self) -> int:
        """Return the number of rows matching the configured filter."""
        query = f"""
        SELECT COUNT(*) as total
        FROM `{self.config.source_project}.{self.config.dataset_id}.{self.config.table_id}`
        """
        if self.config.where_clause:
            query += f" WHERE {self.config.where_clause}"

        df = pandas_gbq.read_gbq(query, project_id=self.config.project_id)
        total = int(df['total'].iloc[0])
        logger.info(f"Total records to process: {total:,}")
        return total

    def _order_by_columns(self) -> List[str]:
        """Return the schema columns usable as sort keys for paging.

        Repeated fields and types BigQuery cannot order by are skipped. If
        nothing qualifies, the first schema column is returned, as the
        original implementation did.
        """
        orderable = [
            field.name
            for field in self.schema
            if getattr(field, 'mode', None) != 'REPEATED'
            and field.field_type not in self._UNORDERABLE_TYPES
        ]
        return orderable or [self.schema[0].name]

    def _build_query(self, offset: int) -> str:
        """Build the query for the chunk that starts at ``offset``.

        Args:
            offset: Number of rows to skip (``chunk_num * chunk_size``).

        Returns:
            The SQL text selecting ``self.chunk_size`` rows.
        """
        columns = self.config.columns or [field.name for field in self.schema]

        # NOTE(review): the block comment inside the SQL is kept from the
        # original; it does not look like a documented BigQuery hint.
        query = f"""
        -- Add optimization hints
        /*
        @param partition_pruning=true
        */
        SELECT {', '.join(columns)}
        FROM `{self.config.source_project}.{self.config.dataset_id}.{self.config.table_id}`
        """

        if self.config.where_clause:
            query += f" WHERE {self.config.where_clause}"

        # Every chunk is an independent query, so LIMIT/OFFSET paging is only
        # consistent if all queries sort rows identically. Sorting on every
        # orderable column removes ties (other than between fully duplicated
        # rows), which would otherwise duplicate or drop rows across chunks.
        query += f"""
        ORDER BY {', '.join(self._order_by_columns())}
        LIMIT {self.chunk_size}
        OFFSET {offset}
        """

        return query

    @staticmethod
    def _json_default(value: Any) -> Any:
        """Convert values ``json`` cannot encode natively.

        NumPy arrays and scalars become their Python equivalents; anything
        else (dates, decimals, bytes, ...) is deliberately stringified so a
        single odd value does not discard the whole column.
        """
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        return str(value)

    @classmethod
    def _encode_record(cls, value: Any) -> Optional[str]:
        """JSON-encode one RECORD/STRUCT cell, keeping ``None`` as ``None``."""
        if value is None:
            return None
        return json.dumps(value, default=cls._json_default)

    @classmethod
    def _encode_repeated(cls, value: Any) -> Optional[str]:
        """JSON-encode one repeated (array) cell as a JSON list.

        Lists and arrays are encoded as-is, a stray scalar is wrapped in a
        one-element list, and ``None`` stays ``None``.
        """
        if value is None:
            return None
        if isinstance(value, np.ndarray):
            value = value.tolist()
        elif isinstance(value, (list, tuple)):
            value = list(value)
        else:
            value = [value]
        return json.dumps(value, default=cls._json_default)

    @staticmethod
    def _astype_or_keep(series: pd.Series, dtype: str) -> pd.Series:
        """Cast ``series`` to ``dtype``, returning it unchanged if the cast fails."""
        try:
            return series.astype(dtype)
        except (TypeError, ValueError):
            return series

    def _process_chunk(self, df: pd.DataFrame) -> pd.DataFrame:
        """Convert the columns of a fetched chunk according to the table schema.

        Args:
            df: Chunk as returned by the query; modified in place.

        Returns:
            The same frame with converted columns. A column whose conversion
            raises is logged and converted to ``str`` instead.
        """
        if df.empty:
            return df

        fields_by_name = {field.name: field for field in self.schema}

        for column in df.columns:
            try:
                field = fields_by_name.get(column)
                bq_type = field.field_type if field is not None else 'STRING'
                # BigQuery reports arrays as mode REPEATED with the element
                # type in field_type, so the mode has to be checked as well.
                is_repeated = getattr(field, 'mode', None) == 'REPEATED'

                if is_repeated or bq_type == 'ARRAY':
                    df[column] = df[column].apply(self._encode_repeated)
                elif bq_type in ['RECORD', 'STRUCT']:
                    df[column] = df[column].apply(self._encode_record)
                elif bq_type in ['DATE', 'DATETIME', 'TIMESTAMP']:
                    df[column] = pd.to_datetime(df[column], errors='coerce')
                elif bq_type in ['NUMERIC', 'BIGNUMERIC', 'FLOAT', 'FLOAT64']:
                    df[column] = pd.to_numeric(df[column], errors='coerce')
                elif bq_type in ['INTEGER', 'INT64']:
                    df[column] = self._astype_or_keep(df[column], 'Int64')
                elif bq_type in ['BOOLEAN', 'BOOL']:
                    df[column] = self._astype_or_keep(df[column], 'boolean')
                elif bq_type == 'STRING' and df[column].nunique() / len(df) < 0.5:
                    # Mostly repeated values: store as categorical.
                    df[column] = df[column].astype('category')
            except Exception as e:
                logger.warning(f"Error processing column {column}: {str(e)}")
                df[column] = df[column].astype(str)

        return df

    def _save_chunk(self, df: pd.DataFrame, chunk_num: int) -> str:
        """Write a chunk to an intermediate Parquet file.

        Chunks are always stored as Parquet, whatever the final format is;
        conversion happens when the chunks are merged.

        Returns:
            The file name (not the full path) of the written chunk.

        Raises:
            Exception: any write error (logged, then re-raised).
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{self.config.table_id}_chunk_{chunk_num}_{timestamp}.parquet"
        full_path = f"{self.config.output_path}/{filename}"

        try:
            df.to_parquet(full_path, index=False)
            logger.debug(f"Saved chunk {chunk_num} to {full_path}")
            return filename
        except Exception as e:
            logger.error(f"Failed to save chunk {chunk_num}: {str(e)}")
            raise

    def _record_failure(self, chunk_num: int) -> None:
        """Remember a failed chunk once, however many times it fails."""
        with self._stats_lock:
            if chunk_num not in self.failed_chunks:
                self.failed_chunks.append(chunk_num)

    def _fetch_and_save_chunk(self, offset: int, chunk_num: int) -> Optional[str]:
        """Fetch one chunk, convert it and store it as a Parquet file.

        Runs on worker threads, so shared statistics are updated under
        ``self._stats_lock``.

        Returns:
            The chunk file name, or ``None`` when the chunk is empty or
            failed (failures are logged and recorded in ``failed_chunks``).
        """
        try:
            query = self._build_query(offset)
            df_chunk = pandas_gbq.read_gbq(
                query,
                project_id=self.config.project_id,
                configuration={
                    'query': {
                        'useQueryCache': True,
                        'maximumBytesBilled': self.config.max_bytes_billed
                    }
                }
            )

            if df_chunk.empty:
                return None

            df_chunk = self._process_chunk(df_chunk)
            saved_file = self._save_chunk(df_chunk, chunk_num)
            with self._stats_lock:
                self.total_rows += len(df_chunk)
                self.processed_chunks += 1
            return saved_file

        except Exception as e:
            logger.error(f"Error processing chunk {chunk_num}: {str(e)}")
            self._record_failure(chunk_num)
            return None

    def _update_progress(self, total_chunks: int):
        """Redraw the HTML progress panel.

        Args:
            total_chunks: Number of chunks in the run; a non-positive value is
                shown as 0% instead of raising ``ZeroDivisionError``.
        """
        clear_output(wait=True)  # Clear previous output

        elapsed_time = time.time() - self.start_time
        progress = (self.processed_chunks / total_chunks) * 100 if total_chunks > 0 else 0.0

        # Calculate processing statistics
        rows_per_second = self.total_rows / elapsed_time if elapsed_time > 0 else 0
        remaining_chunks = total_chunks - self.processed_chunks
        estimated_remaining = (remaining_chunks * elapsed_time) / max(1, self.processed_chunks)

        # Get memory information
        memory_info = psutil.Process(os.getpid()).memory_info()
        memory_usage_mb = memory_info.rss / 1024 / 1024

        # Create progress display
        progress_html = f"""
        <div class="bq-progress">
            <div>Processing BigQuery Table: {self.config.source_project}.{self.config.dataset_id}.{self.config.table_id}</div>
            <div style="margin: 10px 0;">
                <div style="width: 100%; background-color: #eee; border-radius: 3px;">
                    <div class="progress-bar" style="width: {min(100, progress)}%;">
                        {progress:.1f}%
                    </div>
                </div>
            </div>
            <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 10px;">
                <div>Chunks: {self.processed_chunks}/{total_chunks}</div>
                <div>Rows: {self.total_rows:,}</div>
                <div>Failed Chunks: {len(self.failed_chunks)}</div>
                <div>Memory: {memory_usage_mb:.1f} MB</div>
                <div>Rate: {rows_per_second:.1f} rows/sec</div>
                <div>Time Left: {estimated_remaining:.1f}s</div>
            </div>
        </div>
        """
        display(HTML(progress_html))

    def _read_chunks(self, chunk_paths: List[str]) -> pd.DataFrame:
        """Load the given Parquet chunk files into a single frame."""
        return pd.concat(
            [pd.read_parquet(path) for path in chunk_paths],
            ignore_index=True
        )

    def _merge_to_final_format(self, saved_files: List[str]) -> str:
        """Merge the chunk files into one file of the configured format.

        Args:
            saved_files: Chunk file names relative to the output directory;
                falsy entries are ignored.

        Returns:
            The name of the final file, relative to the output directory.

        Raises:
            Exception: any read/write error (logged, then re-raised). Chunk
                files are only deleted after the final file has been written.
        """
        output_format = self.config.output_format
        final_filename = f"{self.config.table_id}_complete_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        final_path = f"{self.config.output_path}/{final_filename}.{output_format}"
        chunk_paths = [f"{self.config.output_path}/{name}" for name in saved_files if name]

        try:
            if output_format == 'hyper':
                import pantab

                combined_df = self._read_chunks(chunk_paths)
                pantab.frame_to_hyper(combined_df, final_path, table=self.config.table_id)
            elif output_format == 'csv':
                self._read_chunks(chunk_paths).to_csv(final_path, index=False)
            else:
                # 'parquet': without this branch the chunk files were deleted
                # below and the returned file name pointed at nothing.
                self._read_chunks(chunk_paths).to_parquet(final_path, index=False)

            # Clean up temporary files if requested
            if self.config.clean_up_temp_files:
                for file in saved_files:
                    if file:
                        try:
                            os.remove(f"{self.config.output_path}/{file}")
                        except Exception as e:
                            logger.warning(f"Failed to remove temporary file {file}: {str(e)}")

            return f"{final_filename}.{output_format}"

        except Exception as e:
            logger.error(f"Error merging files: {str(e)}")
            raise

    def extract_data(self) -> Tuple[int, Optional[str]]:
        """Run the full extraction: fetch chunks in parallel, retry, merge.

        Returns:
            ``(rows_extracted, final_file_name)``, or ``(0, None)`` when the
            table has no matching rows or no chunk could be extracted.

        Raises:
            Exception: any error outside per-chunk handling (logged, then
                re-raised).
        """
        try:
            # Adjust chunk size based on memory
            self.adjust_chunk_size()

            # Get total record count
            total_records = self._count_records()
            if total_records == 0:
                logger.warning("No records found to extract")
                return 0, None

            total_chunks = (total_records + self.chunk_size - 1) // self.chunk_size
            # Keyed by chunk number so the merge order does not depend on the
            # order in which worker threads happen to finish.
            saved_by_chunk: Dict[int, str] = {}

            # Extract data in parallel
            with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
                future_to_chunk = {
                    executor.submit(
                        self._fetch_and_save_chunk,
                        chunk_num * self.chunk_size,
                        chunk_num
                    ): chunk_num
                    for chunk_num in range(total_chunks)
                }

                for future in as_completed(future_to_chunk):
                    chunk_num = future_to_chunk[future]
                    try:
                        saved_file = future.result()
                    except Exception as e:
                        logger.error(f"Error processing chunk {chunk_num}: {str(e)}")
                        self._record_failure(chunk_num)
                    else:
                        if saved_file:
                            saved_by_chunk[chunk_num] = saved_file
                    # Refresh after every chunk so failures are shown too.
                    self._update_progress(total_chunks)

            # Handle failed chunks retry
            if self.failed_chunks:
                logger.info(f"Retrying {len(self.failed_chunks)} failed chunks...")
                for chunk_num in list(self.failed_chunks):  # Copy list to iterate
                    try:
                        saved_file = self._fetch_and_save_chunk(chunk_num * self.chunk_size, chunk_num)
                        if saved_file:
                            saved_by_chunk[chunk_num] = saved_file
                            self.failed_chunks.remove(chunk_num)
                    except Exception as e:
                        logger.error(f"Retry failed for chunk {chunk_num}: {str(e)}")

                if self.failed_chunks:
                    logger.error(
                        f"Chunks still failing after retry, output will be incomplete: "
                        f"{sorted(self.failed_chunks)}"
                    )

            saved_files = [saved_by_chunk[chunk_num] for chunk_num in sorted(saved_by_chunk)]

            # Merge chunks to final format if any data was extracted
            if not saved_files:
                logger.warning("No data was extracted successfully")
                return 0, None

            try:
                final_file = self._merge_to_final_format(saved_files)
            except Exception as e:
                logger.error(f"Failed to create final file: {str(e)}")
                raise
            return self.total_rows, final_file

        except Exception as e:
            logger.error(f"Error during extraction: {str(e)}")
            raise

    def get_memory_usage(self) -> str:
        """Return a short report of process and system memory usage in MB."""
        process = psutil.Process(os.getpid())
        memory_info = process.memory_info()
        return f"""Memory Usage:
        - RSS: {memory_info.rss / 1024 / 1024:.2f} MB
        - VMS: {memory_info.vms / 1024 / 1024:.2f} MB
        - Available System Memory: {psutil.virtual_memory().available / 1024 / 1024:.2f} MB"""

def extract_bigquery_data(config: BigQueryConfig) -> Tuple[int, Optional[str]]:
    """Build an extractor for ``config`` and run the extraction.

    Returns:
        Whatever ``BigQueryExtractor.extract_data`` returns: the extracted
        row count and the name of the final output file (``None`` when
        nothing was extracted).
    """
    return BigQueryExtractor(config).extract_data()

if __name__ == "__main__":
    # Script entry point: configure one extraction, run it and log the outcome.
    try:
        logger.info("Starting BigQuery data extraction process")

        # Configuration setup
        config = BigQueryConfig(
            project_id='pre-sales-demo',
            source_project='bigquery-public-data',
            dataset_id='google_trends',
            table_id='top_terms',
            output_format='hyper',
            output_path='./data',
            max_bytes_billed=1000 * 1024 * 1024 * 1024,  # 1TB
            initial_chunk_size=500_000,  # Start with smaller chunks
            max_workers=4,
            clean_up_temp_files=True
        )

        # Summarise the run settings before starting.
        for summary_line in (
            "Configuration:",
            f"- Source: {config.source_project}.{config.dataset_id}.{config.table_id}",
            f"- Output Format: {config.output_format}",
            f"- Initial Chunk Size: {config.initial_chunk_size:,} rows",
            f"- Max Workers: {config.max_workers}",
        ):
            logger.info(summary_line)

        # Extract data
        total_rows, final_file = extract_bigquery_data(config)

        # Log results
        for result_line in (
            "Extraction completed successfully",
            f"Total rows processed: {total_rows:,}",
            f"Final output file: {final_file}",
        ):
            logger.info(result_line)

    except Exception as e:
        # Log with traceback, then let the failure propagate.
        logger.error(f"Extraction failed: {str(e)}", exc_info=True)
        raise
    finally:
        # Always logged, whether the run succeeded or failed.
        logger.info("Process completed")