In [3]:
import boto3
import pandas as pd
import time

# Query text: four columns from glue_transformed, restricted to pickup_hour
# values inside calendar year 2023 and sorted by pickup_hour.
query = """
SELECT
    pickup_hour,
    pickup_location_id,
    rides,
    month
FROM glue_transformed
WHERE pickup_hour BETWEEN
    '2023-01-01 00:00:00' AND
    '2023-12-31 23:59:59'
ORDER BY pickup_hour;
"""

# S3 prefix passed to Athena as the output location for the query results
s3_output = "s3://jkim27-etl-5b9d2da3-5f5d-4ab5-bda1-80307b8dc702/athena/"

# Athena API client; no region or credentials are passed explicitly here
athena_client = boto3.client("athena")

# Submit the query and keep only the execution ID needed for status polling
# and result retrieval.
query_execution_id = athena_client.start_query_execution(
    QueryString=query,
    QueryExecutionContext={'Database': 'etl_taxi_transformed'},
    ResultConfiguration={'OutputLocation': s3_output},
)['QueryExecutionId']

# Poll once per second until the query leaves the QUEUED/RUNNING states.
# `response` and `state` keep the values from the last poll so later code can
# inspect the final status.
while True:
    response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    state = response['QueryExecution']['Status']['State']
    if state not in ('RUNNING', 'QUEUED'):
        break
    time.sleep(1)

# If the query succeeded, page through the results and load them into `df`;
# otherwise report the final state and Athena's explanation, if it gave one.
if state == 'SUCCEEDED':
    all_rows = []
    columns = None
    row_count = 0
    last_reported = 0  # number of 100,000-row milestones already announced

    print("Starting to fetch results...")

    # The paginator follows NextToken internally, so there is one call site
    # instead of two near-identical get_query_results branches.
    paginator = athena_client.get_paginator('get_query_results')
    pages = paginator.paginate(
        QueryExecutionId=query_execution_id,
        PaginationConfig={'PageSize': 1000},  # Maximum allowed value
    )

    for results in pages:
        rows = results['ResultSet']['Rows']

        if columns is None:
            # First page: take the column labels from the metadata and drop
            # the header row that precedes the data rows.
            columns = [
                col['Label']
                for col in results['ResultSet']['ResultSetMetadata']['ColumnInfo']
            ]
            rows = rows[1:]

        # Every cell arrives as a string under 'VarCharValue'. A cell without
        # that key (NULL) becomes '', which will make the integer casts below
        # raise if any such cell is present in those columns.
        batch_rows = [
            [field.get('VarCharValue', '') for field in row['Data']]
            for row in rows
        ]

        all_rows.extend(batch_rows)
        row_count += len(batch_rows)

        # Report progress each time the running total passes another
        # multiple of 100,000 rows.
        if row_count // 100000 > last_reported:
            last_reported = row_count // 100000
            print(f"Fetched {row_count:,} rows so far...")

    print(f"Completed fetching {row_count:,} total rows")

    # Create pandas DataFrame
    df = pd.DataFrame(all_rows, columns=columns)

    # All values are strings at this point; convert to the working dtypes.
    df['pickup_hour'] = pd.to_datetime(df['pickup_hour'])
    df['pickup_location_id'] = df['pickup_location_id'].astype(int)
    df['rides'] = df['rides'].astype(int)
    df['month'] = df['month'].astype(int)

    print(f"Query results loaded into DataFrame with {len(df)} rows")
    print(df.head())
else:
    print(f"Query failed with state: {state}")
    # StateChangeReason is not always present in the status, so read it
    # defensively rather than letting the error report itself raise KeyError.
    print(
        response['QueryExecution']['Status'].get(
            'StateChangeReason', 'No reason provided by Athena'
        )
    )

Starting to fetch results...
Fetched 100,999 rows so far...
Fetched 200,999 rows so far...
Fetched 300,999 rows so far...
Fetched 400,999 rows so far...
Fetched 500,999 rows so far...
Fetched 600,999 rows so far...
Fetched 700,999 rows so far...
Fetched 800,999 rows so far...
Fetched 900,999 rows so far...
Fetched 1,000,999 rows so far...
Fetched 1,100,999 rows so far...
Fetched 1,200,999 rows so far...
Fetched 1,300,999 rows so far...
Fetched 1,400,999 rows so far...
Fetched 1,500,999 rows so far...
Fetched 1,600,999 rows so far...
Fetched 1,700,999 rows so far...
Fetched 1,800,999 rows so far...
Fetched 1,900,999 rows so far...
Fetched 2,000,999 rows so far...
Fetched 2,100,999 rows so far...
Fetched 2,200,224 rows so far...
Completed fetching 2,200,224 total rows
Query results loaded into DataFrame with 2200224 rows
  pickup_hour  pickup_location_id  rides  month
0  2023-01-01                 106      0      1
1  2023-01-01                  66      2      1
2  2023-01-01            