# BigQuery Data Analysis

This notebook demonstrates how to connect to Google BigQuery, run queries, and export data as CSV.

## 1. Setup and Authentication

In [None]:
import os
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

In [None]:
# Option 1: Using service account key file
# Place your service account JSON file in the project directory
# and update the path below

# credentials = service_account.Credentials.from_service_account_file(
#     'path/to/your/service-account-key.json'
# )

# Option 2: Using environment variable
# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable to your JSON key path
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/your/service-account-key.json'

# Initialize BigQuery client
# client = bigquery.Client(credentials=credentials, project='your-project-id')
# OR if using environment variable:
# client = bigquery.Client()

## 2. Running Queries

In [None]:
# Example aggregation query - replace the placeholder table and column
# names with your own. It counts rows per (column1, column2) pair for dates
# on or after 2024-01-01 and keeps the 1000 pairs with the highest counts.
query = """
    SELECT 
        column1,
        column2,
        COUNT(*) as count
    FROM 
        `your-project.dataset.table`
    WHERE 
        date >= '2024-01-01'
    GROUP BY 
        column1, column2
    ORDER BY 
        count DESC
    LIMIT 1000
"""

# Run the query and convert the result to a pandas DataFrame
# (uncomment once `client` has been initialized in section 1)
# df = client.query(query).to_dataframe()

# Display first few rows
# df.head()

## 3. Data Exploration

In [None]:
# Basic data exploration
# print(f"Dataset shape: {df.shape}")
# print(f"\nColumn types:")
# print(df.dtypes)
# print(f"\nBasic statistics:")
# df.describe()

## 4. Export to CSV

In [None]:
# Create the output directory up front; exist_ok=True makes this a no-op
# when 'data' already exists, so the cell can be re-run safely
os.makedirs('data', exist_ok=True)

# Export to CSV (uncomment once `df` has been created by the query cell)
# output_file = 'data/query_results.csv'
# df.to_csv(output_file, index=False)
# print(f"Data exported to {output_file}")

## 5. Advanced Query with Parameters

In [None]:
# Query parameters keep the values out of the SQL text, so the query never
# has to be assembled by string formatting.
from datetime import datetime

# Define parameters.
# `start_date` is bound to a DATE parameter below, so it must be a
# `datetime.date`. A `datetime.datetime` would be serialized with a time
# component ('2024-01-01T00:00:00'), which is not a valid DATE value.
start_date = datetime(2024, 1, 1).date()
limit = 1000

# Parameterized query: @start_date and @limit are filled in from job_config
param_query = """
    SELECT *
    FROM `your-project.dataset.table`
    WHERE date >= @start_date
    LIMIT @limit
"""

# Configure query parameters; each name must match an @placeholder above
# and each declared type must match the Python value passed with it
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("start_date", "DATE", start_date),
        bigquery.ScalarQueryParameter("limit", "INT64", limit),
    ]
)

# Run parameterized query (uncomment once `client` has been initialized)
# df_param = client.query(param_query, job_config=job_config).to_dataframe()

## 6. Helper Functions

In [None]:
def run_query_to_csv(client, query, output_filename, chunk_size=None):
    """
    Run a BigQuery query and save results to CSV.

    Args:
        client: BigQuery client instance
        query: SQL query string
        output_filename: Path for output CSV file
        chunk_size: If specified, fetch the results in pages of at most this
            many rows and write each page to the file as it arrives (useful
            for large datasets). If omitted (or 0), the whole result is
            loaded into a single DataFrame first.

    Returns:
        The ``output_filename`` that was passed in.
    """
    print("Running query...")
    query_job = client.query(query)

    if chunk_size:
        # For very large datasets, write in chunks.
        # to_dataframe_iterable() is a method of the RowIterator returned by
        # result(), not of the job itself, and it takes no row-count
        # argument: the size of each yielded DataFrame is set by page_size.
        print(f"Writing to {output_filename} in chunks of {chunk_size}...")
        rows = query_job.result(page_size=chunk_size)
        for i, chunk in enumerate(rows.to_dataframe_iterable()):
            # The first chunk creates/truncates the file and writes the
            # header; later chunks are appended without one.
            is_first = i == 0
            chunk.to_csv(
                output_filename,
                mode='w' if is_first else 'a',
                header=is_first,
                index=False,
            )
            print(f"Written chunk {i+1}")
    else:
        # For smaller datasets, load all at once
        df = query_job.to_dataframe()
        df.to_csv(output_filename, index=False)
        print(f"Query complete. {len(df)} rows saved to {output_filename}")

    return output_filename