# Sheets to BigQuery - Scheduled Version

This notebook is designed for scheduled execution via Airflow/Vertex AI.
It reads data from Google Sheets, writes it to tables in the BigQuery `temp` dataset,
and then builds a filtered results table from those tables.

In [None]:
import os
import logging

# Set up logging for scheduled execution.
# The root logger is configured at INFO so the progress messages logged by the
# cells below are emitted.
logging.basicConfig(level=logging.INFO)
# Logger used by the remaining cells of this notebook.
logger = logging.getLogger(__name__)

In [None]:
import bigframes.pandas as bpd

# Point BigQuery DataFrames at the target project and location.
# Each value is taken from the environment (GCP_PROJECT / GCP_REGION) and
# falls back to the default given here when the variable is not set.
# NOTE(review): "your-project-id" looks like a placeholder -- confirm that
# GCP_PROJECT is always provided for scheduled runs.
bpd.options.bigquery.project = os.environ.get("GCP_PROJECT", "your-project-id")
bpd.options.bigquery.location = os.environ.get("GCP_REGION", "us-central1")

# Record the effective settings in the run log.
logger.info(f"BigQuery project: {bpd.options.bigquery.project}")
logger.info(f"BigQuery location: {bpd.options.bigquery.location}")

In [None]:
# Prepare The Environment
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from IPython.core.magic import register_cell_magic
from IPython import get_ipython


@register_cell_magic
def skip(line, cell):
    """Cell magic ``%%skip``: ignore the cell body and run nothing.

    Both the magic's argument line and the cell contents are discarded.
    """
    return None


@register_cell_magic
def skip_if(line, cell):
    """Cell magic ``%%skip_if <expr>``: run the cell only when ``<expr>`` is falsy.

    ``line`` is evaluated as a Python expression with ``eval``; when the
    result is truthy the cell body is skipped, otherwise the body is executed
    through the active IPython shell.
    """
    # NOTE: ``eval`` runs whatever expression the notebook author wrote on the
    # magic line; do not feed it text from an untrusted source.
    if not eval(line):
        get_ipython().ex(cell)

In [None]:
# Authenticate against Google and build the gspread client.
# Scopes requested for the service account: the Sheets feed and Drive.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]

# Location of the service-account key file: GOOGLE_APPLICATION_CREDENTIALS
# when it is set, otherwise "drive-api.json" relative to the working
# directory (this is what lets the notebook run both locally and scheduled).
credentials_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "drive-api.json",
)

# Load the key file and authorize a gspread client with it.
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    credentials_path,
    scope,
)
googleClient = gspread.authorize(credentials)

logger.info("Successfully authenticated with Google Sheets API")

In [None]:
# Load the legacy-charges sheet into a BigQuery DataFrame.
# The spreadsheet ID can be overridden with the LEGACY_CHARGES_SHEET_ID
# environment variable; otherwise the built-in default is used.
LEGACY_CHARGES_SHEET_ID = os.environ.get(
    "LEGACY_CHARGES_SHEET_ID",
    "1kQENu6sumzEQX60fjQtgmXvwPGlUfaNRgW7v_TWFUXo",
)

# Open the spreadsheet by ID and take its first worksheet (index 0).
sheet = googleClient.open_by_key(LEGACY_CHARGES_SHEET_ID)
worksheet = sheet.get_worksheet(0)

# Read every row as a dict. numericise_ignore=['all'] asks gspread not to
# convert numeric-looking cells, so the conversions below are explicit.
legacy_charges = worksheet.get_all_records(numericise_ignore=['all'])

# Normalize blanks: an empty mid_label or installment_count becomes None,
# and any non-empty installment_count is converted to int.
for record in legacy_charges:
    if record['mid_label'] == '':
        record['mid_label'] = None
    if record['installment_count'] == '':
        record['installment_count'] = None
    else:
        record['installment_count'] = int(record['installment_count'])

# Build the DataFrame and store installment_count as nullable Int64 so the
# None entries are kept as missing values.
legacy_charges_df = bpd.DataFrame(legacy_charges)
legacy_charges_df['installment_count'] = (
    legacy_charges_df['installment_count'].astype('Int64')
)

logger.info(
    f"Successfully loaded {len(legacy_charges)} records from legacy_charges sheet")

In [None]:
# Load the merchant_send_mid_label sheet into a BigQuery DataFrame.
# The spreadsheet ID can be overridden with the
# MERCHANT_SEND_MID_LABEL_SHEET_ID environment variable.
MERCHANT_SEND_MID_LABEL_SHEET_ID = os.environ.get(
    "MERCHANT_SEND_MID_LABEL_SHEET_ID",
    "1_8sm8QciAU3T8oDlNS1Pfj-GQlmlJBrAi1TYdnnMlkw",
)

# Open the spreadsheet by ID and take its first worksheet (index 0).
sheet = googleClient.open_by_key(MERCHANT_SEND_MID_LABEL_SHEET_ID)
worksheet = sheet.get_worksheet(0)

# Read every row as a dict, using gspread's default value conversion.
merchant_send_mid_label = worksheet.get_all_records()

# Wrap the rows in a BigQuery DataFrame.
merchant_send_mid_label_df = bpd.DataFrame(merchant_send_mid_label)

logger.info(
    f"Successfully loaded {len(merchant_send_mid_label)} records from merchant_send_mid_label sheet")

In [None]:
# Load the merchant_excluded sheet into a BigQuery DataFrame.
# The spreadsheet ID can be overridden with the MERCHANT_EXCLUDED_SHEET_ID
# environment variable.
MERCHANT_EXCLUDED_SHEET_ID = os.environ.get(
    "MERCHANT_EXCLUDED_SHEET_ID",
    "1orVBlPP77HTt9d8x-lC1Oo5xrPp0r1FgVUQ-43DYqYc",
)

# Open the spreadsheet by ID and take its first worksheet (index 0).
sheet = googleClient.open_by_key(MERCHANT_EXCLUDED_SHEET_ID)
worksheet = sheet.get_worksheet(0)

# Read every row as a dict, using gspread's default value conversion.
merchant_excluded = worksheet.get_all_records()

# Wrap the rows in a BigQuery DataFrame.
merchant_excluded_df = bpd.DataFrame(merchant_excluded)

logger.info(
    f"Successfully loaded {len(merchant_excluded)} records from merchant_excluded sheet")

In [None]:
# Preview the first rows of the legacy charges DataFrame; as the last
# expression in the cell, the result is shown as the cell's output.
legacy_charges_df.head()

In [None]:
# Write each DataFrame to its table in the `temp` dataset, replacing any
# table that already exists under that name. The results query in a later
# cell reads these three tables.
for destination, frame in (
    ('temp.legacy_charges', legacy_charges_df),
    ('temp.merchant_send_mid_label', merchant_send_mid_label_df),
    ('temp.merchant_excluded', merchant_excluded_df),
):
    frame.to_gbq(destination, if_exists='replace')

logger.info("Successfully wrote all dataframes to BigQuery temp tables")

In [None]:
# Create final results table using bpd
# The destination table name can be overridden with the RESULTS_TABLE
# environment variable.
RESULTS_TABLE = os.getenv("RESULTS_TABLE", "temp.filtered_legacy_charges")

# Anti-join: both lookups are LEFT JOINed on business_id and the WHERE clause
# keeps only rows where neither join found a match, i.e. legacy charges whose
# business_id appears in neither temp.merchant_send_mid_label nor
# temp.merchant_excluded. Each kept row also gets a processed_at timestamp.
# The statement replaces RESULTS_TABLE if it already exists.
create_table_sql = f"""
CREATE OR REPLACE TABLE `{RESULTS_TABLE}` AS
SELECT
    lc.*,
    CURRENT_TIMESTAMP() as processed_at
FROM
    `temp.legacy_charges` lc
LEFT JOIN
    `temp.merchant_send_mid_label` msml ON lc.business_id = msml.business_id
LEFT JOIN
    `temp.merchant_excluded` me ON lc.business_id = me.business_id
WHERE
    msml.business_id IS NULL AND me.business_id IS NULL
"""

# Execute using bpd
bpd.read_gbq_query(create_table_sql)

In [None]:
# Count the rows in the results table so the run log records the output size.
count_df = bpd.read_gbq(f"SELECT COUNT(*) as cnt FROM `{RESULTS_TABLE}`")
count = count_df['cnt'].iloc[0]

logger.info(f"Successfully created table {RESULTS_TABLE} with {count} records")

# Build the console link from RESULTS_TABLE itself rather than hard-coding
# the default dataset/table, so the link stays correct when RESULTS_TABLE is
# overridden through the environment. The name is expected to be either
# "dataset.table" or "project.dataset.table": the last two dot-separated
# parts are the dataset and table, anything before them is the project.
_results_parts = RESULTS_TABLE.split(".")
_results_table_name = _results_parts[-1]
_results_dataset = _results_parts[-2] if len(_results_parts) >= 2 else ""
# Fall back to the configured BigQuery project when the name is unqualified.
_results_project = (
    ".".join(_results_parts[:-2]) or bpd.options.bigquery.project
)

logger.info(
    f"View results: https://console.cloud.google.com/bigquery?project={bpd.options.bigquery.project}&p={_results_project}&d={_results_dataset}&t={_results_table_name}&page=table")
logger.info("Notebook execution completed successfully")