In [None]:
from google.cloud.dataproc_spark_connect import DataprocSparkSession
from google.cloud.dataproc_v1 import Session, SparkConnectConfig
# Dataproc Spark Connect session settings.
# The project and location are defined once so that the session template path
# and the session builder below cannot drift apart.
PROJECT_ID = "your-project-id"
LOCATION = "us-central1"
SESSION_TEMPLATE_ID = "runtime-00000b96da90"

session_config = Session()
session_config.spark_connect_session = SparkConnectConfig()
session_config.session_template = (
    f"projects/{PROJECT_ID}/locations/{LOCATION}"
    f"/sessionTemplates/{SESSION_TEMPLATE_ID}"
)

# Create the Spark session (or reuse an existing one) with the configuration above.
spark = (
    DataprocSparkSession.builder
    .projectId(PROJECT_ID)
    .location(LOCATION)
    .dataprocSessionConfig(session_config)
    .getOrCreate()
)

In [None]:
# Prepare The Environment
import gspread
import json
import pandas as pd
from datetime import datetime
from oauth2client.service_account import ServiceAccountCredentials
from pyspark.sql.functions import col

In [None]:
from IPython.core.magic import register_cell_magic
from IPython import get_ipython


@register_cell_magic
def skip(line, cell):
    """Cell magic ``%%skip``: do nothing, so the cell body is never executed.

    Args:
        line: Text following ``%%skip`` on the magic line (unused).
        cell: The cell body (unused).

    Returns:
        None.
    """
    return None


@register_cell_magic
def skip_if(line, cell):
    """Cell magic ``%%skip_if <expr>``: run the cell body unless ``<expr>`` is truthy.

    Args:
        line: Python expression written after ``%%skip_if``; it is passed to
            ``eval`` and its truthiness decides whether the cell is skipped.
        cell: The cell body, executed through the shell's ``ex`` method when
            the expression is falsy.

    Returns:
        None.
    """
    # NOTE(review): `ex` presumably runs the body as plain Python, so
    # IPython-only syntax (magics, `!cmd`) inside the cell may not work — confirm.
    if not eval(line):
        get_ipython().ex(cell)

In [None]:
# Build an authorised gspread client from the service-account key file.
# Two scopes are requested: the Sheets feeds endpoint and Drive.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]

# The key file path is relative to the notebook's working directory.
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'drive-api.json',
    scope,
)
googleClient = gspread.authorize(credentials)

In [None]:
# Load the legacy charges sheet and register it as the `legacy_charges` temp view.
sheet_id = "1kQENu6sumzEQX60fjQtgmXvwPGlUfaNRgW7v_TWFUXo"
sheet = googleClient.open_by_key(sheet_id)

# Only the first worksheet of the spreadsheet is read.
worksheet = sheet.get_worksheet(0)

# numericise_ignore=['all'] asks gspread not to convert any column to numbers,
# so the blank checks and the int() conversion below work on the raw cell text.
legacy_charges = worksheet.get_all_records(numericise_ignore=['all'])

# Normalise blanks in place: an empty mid_label or installment_count becomes
# None, and a non-empty installment_count is converted to int.
for record in legacy_charges:
    if record['mid_label'] == '':
        record['mid_label'] = None
    if record['installment_count'] == '':
        record['installment_count'] = None
    else:
        record['installment_count'] = int(record['installment_count'])

# Build the Spark DataFrame, force installment_count to an int column,
# and expose the result to Spark SQL.
legacy_charges_df = spark.createDataFrame(legacy_charges)
legacy_charges_df = legacy_charges_df.withColumn(
    "installment_count",
    col("installment_count").cast("int"),
)
legacy_charges_df.createOrReplaceTempView("legacy_charges")

In [None]:
# Load the sheet into Spark and register it as the `merchant_send_mid_label`
# temp view.
sheet_id = "1_8sm8QciAU3T8oDlNS1Pfj-GQlmlJBrAi1TYdnnMlkw"
sheet = googleClient.open_by_key(key=sheet_id)

# Only the first worksheet of the spreadsheet is read.
worksheet = sheet.get_worksheet(index=0)

# One dict per row, using gspread's default conversion settings.
# NOTE(review): legacy_charges is read with numericise_ignore=['all'] while this
# sheet is not, so business_id may end up with a different type here than in
# legacy_charges — confirm the join on business_id compares as intended.
merchant_send_mid_label = worksheet.get_all_records()

# Expose the rows to Spark SQL.
merchant_send_mid_label_df = spark.createDataFrame(
    data=merchant_send_mid_label)
merchant_send_mid_label_df.createOrReplaceTempView(
    name="merchant_send_mid_label")

In [None]:
# Load the sheet into Spark and register it as the `merchant_excluded` temp view.
sheet_id = "1orVBlPP77HTt9d8x-lC1Oo5xrPp0r1FgVUQ-43DYqYc"
sheet = googleClient.open_by_key(key=sheet_id)

# Only the first worksheet of the spreadsheet is read.
worksheet = sheet.get_worksheet(index=0)

# One dict per row, using gspread's default conversion settings.
# NOTE(review): legacy_charges is read with numericise_ignore=['all'] while this
# sheet is not, so business_id may end up with a different type here than in
# legacy_charges — confirm the join on business_id compares as intended.
merchant_excluded = worksheet.get_all_records()

# Expose the rows to Spark SQL.
merchant_excluded_df = spark.createDataFrame(data=merchant_excluded)
merchant_excluded_df.createOrReplaceTempView(name="merchant_excluded")

In [None]:
# Preview the legacy_charges temp view as a sanity check.
legacy_charges_preview_query = "SELECT * FROM legacy_charges"
spark.sql(legacy_charges_preview_query).show()

In [None]:
# Monitoring query
# Select the legacy_charges rows whose business_id has no match in either the
# merchant_send_mid_label or the merchant_excluded temp view.
# NOTE(review): this cell was previously described as fetching charges from
# Databricks; the query only reads temp views in this Spark session — confirm.

join_df = spark.sql("""
SELECT
    lc.*
FROM
    legacy_charges lc
LEFT ANTI JOIN
    merchant_send_mid_label msml ON lc.business_id = msml.business_id
LEFT ANTI JOIN
    merchant_excluded me ON lc.business_id = me.business_id
""")

In [None]:
# Display the rows of the monitoring query result (join_df).
join_df.show()