# Sheets to BigQuery - Scheduled Version

This notebook is designed for scheduled execution via Airflow with PythonVirtualenvOperator.
It reads data from three Google Sheets, writes each one to a table in the BigQuery `temp` dataset, and builds a filtered results table from them.

Packages are pre-installed in the virtualenv, so no runtime installation is needed.

In [None]:
# Parameters cell: the values below are defaults that Papermill overrides.
import logging

# Placeholder settings; Papermill injects the real project and region.
GCP_PROJECT = "your-project-id"
GCP_REGION = "us-central1"

# Logging is how the scheduled run reports its progress.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
from oauth2client.service_account import ServiceAccountCredentials
import bigframes.pandas as bpd
import gspread
import os
import subprocess

# Announce which project this run targets before touching any service.
logger.info(f"Starting notebook execution for project: {GCP_PROJECT}")

# Point bigframes at the project and region supplied as parameters.
bq_options = bpd.options.bigquery
bq_options.project = GCP_PROJECT
bq_options.location = GCP_REGION

# GCS object holding the Drive API credentials JSON for this project.
credentials_path = f"gs://{GCP_PROJECT}-notebooks/credentials/drive-api.json"

# Log the values read back from the options object, not the inputs.
logger.info(f"BigQuery project: {bq_options.project}")
logger.info(f"BigQuery location: {bq_options.location}")

In [None]:
# Build an authorised gspread client from a service-account key file.
local_creds = "drive-api.json"

# OAuth scopes requested for the key: the Sheets feed and Drive.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]

# Copy the key down from GCS only when no local copy exists yet.
creds_present = os.path.exists(local_creds)
if not creds_present:
    logger.info(f"Downloading credentials from {credentials_path}")
    # check=True raises CalledProcessError if gsutil exits non-zero.
    subprocess.run(["gsutil", "cp", credentials_path, local_creds], check=True)
    logger.info("Credentials downloaded successfully")

credentials = ServiceAccountCredentials.from_json_keyfile_name(
    filename=local_creds,
    scopes=scope,
)
googleClient = gspread.authorize(credentials)

logger.info("Successfully authenticated with Google Sheets API")

In [None]:
# Load legacy_charges data from the first worksheet of the configured sheet.
# The sheet ID can be overridden through the LEGACY_CHARGES_SHEET_ID
# environment variable.
LEGACY_CHARGES_SHEET_ID = os.getenv(
    "LEGACY_CHARGES_SHEET_ID",
    "1kQENu6sumzEQX60fjQtgmXvwPGlUfaNRgW7v_TWFUXo"
)

sheet = googleClient.open_by_key(LEGACY_CHARGES_SHEET_ID)
worksheet = sheet.get_worksheet(0)
# numericise_ignore=['all'] turns off gspread's numeric conversion for every
# column, so installment_count has to be converted explicitly below.
legacy_charges = worksheet.get_all_records(numericise_ignore=['all'])

# Clean data in place: empty cells become None, installment counts become ints.
for record in legacy_charges:
    if record['mid_label'] == '':
        record['mid_label'] = None

    raw_count = record['installment_count']
    # A cell holding only whitespace is treated like an empty cell; int()
    # would otherwise raise ValueError on it and abort the whole run.
    if isinstance(raw_count, str) and not raw_count.strip():
        record['installment_count'] = None
    else:
        record['installment_count'] = int(raw_count)

# Convert to BigQuery DataFrame
legacy_charges_df = bpd.DataFrame(legacy_charges)
# Nullable Int64 keeps the column integer-typed even though it contains None.
legacy_charges_df['installment_count'] = legacy_charges_df['installment_count'].astype(
    'Int64')

logger.info(
    f"Successfully loaded {len(legacy_charges)} records from legacy_charges sheet")

In [None]:
# merchant_send_mid_label sheet; the ID may be overridden via the environment.
MERCHANT_SEND_MID_LABEL_SHEET_ID = os.environ.get(
    "MERCHANT_SEND_MID_LABEL_SHEET_ID",
    "1_8sm8QciAU3T8oDlNS1Pfj-GQlmlJBrAi1TYdnnMlkw",
)

# Pull all records from the first worksheet (default numeric conversion).
sheet = googleClient.open_by_key(MERCHANT_SEND_MID_LABEL_SHEET_ID)
worksheet = sheet.get_worksheet(0)
merchant_send_mid_label = worksheet.get_all_records()

# Wrap the records in a bigframes DataFrame for the later BigQuery write.
merchant_send_mid_label_df = bpd.DataFrame(merchant_send_mid_label)

logger.info(
    f"Successfully loaded {len(merchant_send_mid_label)} records from merchant_send_mid_label sheet"
)

In [None]:
# merchant_excluded sheet; the ID may be overridden via the environment.
MERCHANT_EXCLUDED_SHEET_ID = os.environ.get(
    "MERCHANT_EXCLUDED_SHEET_ID",
    "1orVBlPP77HTt9d8x-lC1Oo5xrPp0r1FgVUQ-43DYqYc",
)

# Pull all records from the first worksheet (default numeric conversion).
sheet = googleClient.open_by_key(MERCHANT_EXCLUDED_SHEET_ID)
worksheet = sheet.get_worksheet(0)
merchant_excluded = worksheet.get_all_records()

# Wrap the records in a bigframes DataFrame for the later BigQuery write.
merchant_excluded_df = bpd.DataFrame(merchant_excluded)

logger.info(
    f"Successfully loaded {len(merchant_excluded)} records from merchant_excluded sheet"
)

In [None]:
# Show the first five legacy_charges rows as this cell's output.
legacy_charges_df.head(5)

In [None]:
# Persist each DataFrame to its table in the `temp` dataset, replacing any
# table left over from a previous run.
for frame, destination in (
    (legacy_charges_df, 'temp.legacy_charges'),
    (merchant_send_mid_label_df, 'temp.merchant_send_mid_label'),
    (merchant_excluded_df, 'temp.merchant_excluded'),
):
    frame.to_gbq(destination, if_exists='replace')

logger.info("Successfully wrote all dataframes to BigQuery temp tables")

In [None]:
# Create final results table. The destination can be overridden through the
# RESULTS_TABLE environment variable.
RESULTS_TABLE = os.getenv("RESULTS_TABLE", "temp.filtered_legacy_charges")

# Anti-join: keep only legacy charges whose business_id appears in neither
# merchant_send_mid_label nor merchant_excluded (both joined keys are NULL).
#
# legacy_charges is read from the sheet with numeric conversion disabled, while
# the two merchant sheets use the default conversion, so business_id may be a
# STRING on one side and a numeric type on the other. Casting both sides to
# STRING keeps the comparison valid either way and changes nothing when the
# types already match.
create_table_sql = f"""
CREATE OR REPLACE TABLE `{RESULTS_TABLE}` AS
SELECT
    lc.*,
    CURRENT_TIMESTAMP() as processed_at
FROM
    `temp.legacy_charges` lc
LEFT JOIN
    `temp.merchant_send_mid_label` msml
    ON CAST(lc.business_id AS STRING) = CAST(msml.business_id AS STRING)
LEFT JOIN
    `temp.merchant_excluded` me
    ON CAST(lc.business_id AS STRING) = CAST(me.business_id AS STRING)
WHERE
    msml.business_id IS NULL AND me.business_id IS NULL
"""

# Execute using bpd
bpd.read_gbq_query(create_table_sql)

In [None]:
# Get row count and log completion
count_df = bpd.read_gbq(f"SELECT COUNT(*) as cnt FROM `{RESULTS_TABLE}`")
count = count_df['cnt'].iloc[0]

# RESULTS_TABLE is configurable, so derive the console link from it instead of
# hard-coding the default dataset/table. Accepts "dataset.table" as well as
# "project.dataset.table"; without a project part, GCP_PROJECT is used.
results_dataset_path, _, results_table_id = RESULTS_TABLE.rpartition(".")
results_project_id, _, results_dataset_id = results_dataset_path.rpartition(".")
results_project_id = results_project_id or GCP_PROJECT

logger.info(f"Successfully created table {RESULTS_TABLE} with {count} records")
logger.info(
    f"View results: https://console.cloud.google.com/bigquery?project={results_project_id}&d={results_dataset_id}&t={results_table_id}&page=table")
logger.info("Notebook execution completed successfully")