In [None]:
# Target date in YYYYMMDD form (no dashes). The history-table load below uses it
# to filter the Oracle rows, to name the staging table, and to build the `dt`
# partition value via parse_date('%Y%m%d', ...).
DT_NODASH = "20220217"

In [None]:
# Copy the Oracle JDBC driver jar from HDFS into the current working directory
# (-f overwrites an existing local copy). get_spark() below refers to it by the
# relative name "ojdbc8.jar" in spark.driver.extraClassPath.
! hdfs dfs -get -f /jars/ojdbc8.jar .

In [None]:
from pydatafabric.gcp import df_to_bq_table, bq_insert_overwrite

In [None]:
def _spark_resource_settings(scale, queue, is_spark_3):
    """Choose the sizing options for a session.

    Args:
        scale: Sizing tier requested by the caller (1-4 and 5-8 select the
            scaled profiles; anything else selects the default profile).
        queue: Resolved YARN queue name; "nrt" selects a fixed profile and
            takes precedence over ``scale``.
        is_spark_3: True when the installed PySpark major version is >= 3.

    Returns:
        A ``(settings, add_driver_classpath)`` tuple: ``settings`` maps Spark
        option names to string values, and ``add_driver_classpath`` says
        whether ``spark.driver.extraClassPath`` should point at ojdbc8.jar
        for this profile.
    """
    if queue == "nrt":
        return (
            {
                "spark.driver.memory": "6g",
                "spark.executor.memory": "4g",
                "spark.driver.maxResultSize": "6g",
                # The option name is plural; "spark.executor.core" is not a
                # Spark property and was silently ignored.
                "spark.executor.cores": "4",
                "spark.executor.instances": "32",
            },
            True,
        )
    if scale in (1, 2, 3, 4):
        return (
            {
                "spark.driver.memory": f"{scale*8}g",
                "spark.executor.memory": f"{scale*3}g",
                "spark.executor.instances": f"{scale*8}",
                "spark.driver.maxResultSize": f"{scale*4}g",
            },
            False,
        )
    if scale in (5, 6, 7, 8):
        return (
            {
                "spark.driver.memory": "8g",
                "spark.executor.memory": f"{2 ** scale}g",
                "spark.executor.instances": "32",
                "spark.driver.maxResultSize": "8g",
            },
            False,
        )
    if is_spark_3:
        return (
            {
                "spark.driver.memory": "8g",
                "spark.executor.memory": "8g",
                "spark.executor.instances": "8",
                "spark.driver.maxResultSize": "6g",
            },
            True,
        )
    # Spark 2 default profile: dynamic allocation instead of a fixed
    # executor count.
    return (
        {
            "spark.driver.memory": "6g",
            "spark.executor.memory": "8g",
            "spark.shuffle.service.enabled": "true",
            "spark.dynamicAllocation.enabled": "true",
            "spark.dynamicAllocation.maxExecutors": "200",
            "spark.driver.maxResultSize": "6g",
        },
        True,
    )


def get_spark(scale=0, queue=None):
    """Build (or fetch) a Hive-enabled SparkSession configured for this job.

    Side effects: writes the GCP service-account key fetched from the secret
    store to a temporary file and points ``GOOGLE_APPLICATION_CREDENTIALS`` at
    it, and sets ``ARROW_PRE_0_15_IPC_FORMAT`` in the process environment.
    The key file is intentionally left on disk because the environment
    variable keeps referring to it after this function returns.

    Args:
        scale: Sizing tier. 1-4 scale driver/executor memory and executor
            count linearly; 5-8 use 32 executors with ``2 ** scale`` GB each;
            any other value (default 0) uses the default profile.
        queue: YARN queue name. When falsy, "dmig_eda" is used if
            ``JUPYTERHUB_USER`` is set in the environment, otherwise
            "airflow_job". The queue "nrt" selects a fixed profile regardless
            of ``scale``.

    Returns:
        The SparkSession, with Arrow-based pandas conversion enabled.
    """
    import os
    import uuid
    import tempfile
    from pyspark.sql import SparkSession
    from pydatafabric.vault_utils import get_secrets
    from pyspark import version as spark_version

    # Compare the numeric major version: a plain string comparison would rank
    # "10.0.0" below "3.0.0".
    is_spark_3 = int(spark_version.__version__.split(".")[0]) >= 3

    tmp_uuid = str(uuid.uuid4())
    app_name = f"emart-{os.environ.get('USER', 'default')}-{tmp_uuid}"

    key = get_secrets("gcp/emart-datafabric/dataflow")["config"]
    # mkstemp returns an already-open descriptor; wrap it so it is closed
    # instead of leaked.
    key_fd, key_file_name = tempfile.mkstemp()
    with os.fdopen(key_fd, "wb") as key_file:
        key_file.write(key.encode())
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file_name

    if not queue:
        if "JUPYTERHUB_USER" in os.environ:
            queue = "dmig_eda"
        else:
            queue = "airflow_job"

    bigquery_jars = (
        "hdfs:///jars/spark-bigquery-with-dependencies_2.12-0.21.0.jar,hdfs:///jars/ojdbc8.jar"
        if is_spark_3
        else "hdfs:///jars/spark-bigquery-with-dependencies_2.11-0.17.3.jar,hdfs:///jars/ojdbc8.jar"
    )

    arrow_enabled = "spark.sql.execution.arrow.pyspark.enabled" if is_spark_3 else "spark.sql.execution.arrow.enabled"

    arrow_pre_ipc_format = "0" if is_spark_3 else "1"
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = arrow_pre_ipc_format

    sizing, add_driver_classpath = _spark_resource_settings(scale, queue, is_spark_3)

    # Options shared by every profile.
    common = {
        "spark.rpc.message.maxSize": "1024",
        "spark.yarn.queue": queue,
        "spark.ui.enabled": "false",
        "spark.port.maxRetries": "128",
        "spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT": arrow_pre_ipc_format,
        "spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT": arrow_pre_ipc_format,
        "spark.jars": bigquery_jars,
    }

    builder = SparkSession.builder.config("spark.app.name", app_name)
    for option_name, option_value in {**sizing, **common}.items():
        builder = builder.config(option_name, option_value)
    if add_driver_classpath:
        # Relative path: expects ojdbc8.jar in the driver's working directory.
        builder = builder.config("spark.driver.extraClassPath", "ojdbc8.jar")

    spark = builder.enableHiveSupport().getOrCreate()
    spark.conf.set(arrow_enabled, "true")
    return spark

# Create the session used by the rest of the notebook, with the default
# arguments (scale=0, queue chosen from the environment).
spark = get_spark()

In [None]:
def read_oracle_to_df(spark, dbtable, url=None, user=None, password=None):
    """Load an Oracle table or subquery into a Spark DataFrame over JDBC.

    Args:
        spark: Active SparkSession.
        dbtable: Value for the JDBC ``dbtable`` option: either a
            ``schema.table`` name or a parenthesised subquery with an alias.
        url: JDBC connection URL. Defaults to ``$ORACLE_JDBC_URL`` if set,
            otherwise the legacy built-in URL.
        user: Database user. Defaults to ``$ORACLE_USER`` if set, otherwise
            the legacy built-in user.
        password: Database password. Defaults to ``$ORACLE_PASSWORD`` if set,
            otherwise the legacy built-in password.

    Returns:
        The DataFrame produced by the JDBC reader's ``load()``.
    """
    import os

    # FIXME(security): the fallback literals below are kept only so existing
    # callers keep working. The password should be rotated and supplied via
    # the environment variables (or a secret store) instead of living in the
    # notebook.
    if url is None:
        url = os.environ.get("ORACLE_JDBC_URL", "jdbc:oracle:thin:@150.204.1.46:1525/WZDB")
    if user is None:
        user = os.environ.get("ORACLE_USER", "metatron")
    if password is None:
        password = os.environ.get("ORACLE_PASSWORD", "wisenut2021!")

    return (
        spark.read.format("jdbc")
        .option("url", url)
        .option("dbtable", dbtable)
        .option("user", user)
        .option("password", password)
        .load()
    )

In [None]:
# Tables loaded one day at a time: (schema, table, date column to filter on).
history_tables = [
    ("wisenut", table_name, date_column)
    for table_name, date_column in (
        ("wise_neo_tbl_fail_querycnt", "fail_dt"),
        ("wise_neo_tbl_pop_querycnt", "pop_dt"),
        ("wise_neo_tbl_total_querycnt", "total_dt"),
    )
]

# Tables read in full on every run: (schema, table).
snapshot_tables = [
    ("wisenut", table_name)
    for table_name in (
        "wise_neo_tbl_col_label",
        "wise_neo_tbl_dic",
        "wise_neo_tbl_except_word",
        "wise_neo_tbl_word_code",
    )
]

In [None]:
# For each history table: read the rows for DT_NODASH from Oracle, stage them
# in temp_1d, then overwrite the matching `dt` partition of the target table.
for db, table, dt_col in history_tables:
    staging_table = f"{table}__{DT_NODASH}"
    oracle_subquery = f"(SELECT * FROM {db}.{table} WHERE {dt_col} = '{DT_NODASH}') INPUT"
    partition_query = f"SELECT *, parse_date('%Y%m%d', '{DT_NODASH}') as dt from temp_1d.{staging_table}"
    target_table = f"emart-datafabric.tworld.{table}"

    df = read_oracle_to_df(spark, oracle_subquery)
    df_to_bq_table(df, "temp_1d", staging_table)
    bq_insert_overwrite(partition_query, target_table, partition="dt")
    

In [None]:
# For each snapshot table: read the whole Oracle table and pass it to
# df_to_bq_table under the "tworld" dataset, reporting progress per table.
for db, table in snapshot_tables:
    source_name = f"{db}.{table}"
    df = read_oracle_to_df(spark, source_name)
    df_to_bq_table(df, "tworld", table)
    print(f"{source_name} done.")

In [None]:
# Stop the Spark session now that all loads above have run.
spark.stop()