In [129]:
from google.cloud import bigquery

# GCP project that owns the target BigQuery dataset (also used to build the client).
GCP_PROJECT = "smart-ruler-304409"

# Source table on the Oracle side: schema owner and table name.
ORA_OWNER = "C##DPC"
ORA_TABLE = "GTPSD_EP_PAY_RESULT_A"

# Target table layout in BigQuery: time partitioning on BQ_PARTITION_FIELD, clustering on BQ_CLUSTERING_FIELD.
BQ_PARTITION_TYPE = bigquery.TimePartitioningType.DAY
BQ_PARTITION_COLUMN = "BIZ_DATE"  # Oracle-side column name; used in the extraction WHERE clause
BQ_PARTITION_FIELD = "biz_date"  # DataFrame / BigQuery column name used as the partitioning field
BQ_CLUSTERING_FIELD = "pos_no"
BQ_DATASET = "temp_1d"
BQ_TABLE = "temp_gtpsd_ep_pay_result_a"

# Business date to extract, as a YYYYMMDD string (parsed later with format '%Y%m%d').
DT = '20220923'

In [130]:
from pydatafabric.vault_utils import get_secrets
from pydatafabric.oracle import get_oracle_schema, set_oracle_secret, generate_oracle_select_query, oracle_to_pandas

# Read the Oracle connection secret from the secret store and hand it to
# pydatafabric; presumably this is what lets the Oracle helpers below connect
# (set_oracle_secret is opaque from here -- confirm against pydatafabric).
oracle_secret = get_secrets(mount_point="datafabric",path="oracle/datafabric/oracle-db")
set_oracle_secret(oracle_secret)

In [133]:
def _oracle_number_to_bq_type(precision, scale):
    """Map an Oracle NUMBER(precision, scale) declaration to a BigQuery type name."""
    # NUMBER declared without a usable precision: keep the generic decimal type.
    if not (precision and precision >= 0):
        return "NUMERIC"

    # Precision is declared but the scale is missing ("" default, None or NaN).
    # Comparing such a value with 0 would either raise or silently fall through,
    # so use the same generic fallback as an unconstrained NUMBER.
    if scale is None or isinstance(scale, str) or scale != scale:
        return "NUMERIC"

    if scale > 0:
        return "FLOAT"

    # scale <= 0 means integer-valued data. A negative scale rounds to the left
    # of the decimal point, so values can have up to precision + |scale| digits.
    integer_digits = precision - scale
    if integer_digits < 9:
        return "INTEGER"
    if integer_digits < 22:
        return "NUMERIC"
    return "BIGNUMERIC"


def convert_oracle_to_bigquery_schema(owner, table_name, partition_field=None):
    """Build a BigQuery schema from an Oracle table's column metadata.

    Args:
        owner: Oracle schema owner of the table.
        table_name: Oracle table name.
        partition_field: Optional ``bigquery.SchemaField`` for the partitioning
            column. When given, it is placed first in the schema and the Oracle
            column with the same (lower-cased) name is skipped so that it is
            not declared twice. Any other value is ignored.

    Returns:
        list of ``bigquery.SchemaField``, one per Oracle column, with
        lower-cased column names.
    """
    oracle_schema = get_oracle_schema(owner, table_name)
    schema = []

    # Only skip a source column when an explicit partition field really
    # replaces it; otherwise the column would be missing from the schema.
    partition_column = None
    if isinstance(partition_field, bigquery.SchemaField):
        schema.append(partition_field)
        partition_column = partition_field.name.lower()

    for _, row in oracle_schema.iterrows():
        column = row["column_name"].lower()
        if column == partition_column:
            continue

        description = getDefault(row["column_comments"], "")
        oc_type = row["data_type_name"]

        if oc_type == "NUMBER":
            bq_type = _oracle_number_to_bq_type(
                getDefault(row["precision"], ""),
                getDefault(row["scale"], ""),
            )
        elif oc_type == "DATE" or oc_type.startswith("TIMESTAMP"):
            bq_type = "DATETIME"
        else:
            # Character types (VARCHAR2, CHAR, NVARCHAR2, CLOB), Oracle LONG
            # (a character type, not a numeric one) and anything unrecognised.
            bq_type = "STRING"

        mode = "REQUIRED" if row["is_nullable"] == "N" else "NULLABLE"

        schema.append(
            bigquery.SchemaField(column, bq_type, mode=mode, description=description)
        )

    return schema

In [134]:
from pydatafabric.gcp import get_bigquery_client
from google.cloud.bigquery.job import LoadJobConfig
from google.cloud.bigquery.table import TimePartitioning, TableReference
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery.schema import SchemaField

# BigQuery client bound to GCP_PROJECT; the load step further down submits its job through it.
bq = get_bigquery_client(project=GCP_PROJECT)

def get_bigquery_load_job_config(schema, write_disposition, partition_type = None, partition_field = None, clustering_fields = None):
    """Create a Parquet ``LoadJobConfig`` with an explicit schema.

    Args:
        schema: List of ``bigquery.SchemaField`` describing the target table.
        write_disposition: BigQuery write disposition, e.g. ``"WRITE_APPEND"``.
        partition_type: Time partitioning type; only applied together with
            ``partition_field``.
        partition_field: Name of the column to partition on.
        clustering_fields: Column name or iterable of column names to cluster
            on. A single string is treated as one column name.

    Returns:
        bigquery.LoadJobConfig: config that creates the table if needed and
        never auto-detects the schema.
    """
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.write_disposition = write_disposition
    job_config.create_disposition = "CREATE_IF_NEEDED"
    job_config.source_format = bigquery.SourceFormat.PARQUET
    job_config.autodetect = False

    if partition_type and partition_field:
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=partition_type,
            field=partition_field
        )

    if clustering_fields:
        # The client expects a list of column names. A bare string such as
        # "pos_no" must become ["pos_no"], not be forwarded as a string.
        if isinstance(clustering_fields, str):
            clustering_fields = [clustering_fields]
        job_config.clustering_fields = list(clustering_fields)

    return job_config

In [135]:
import time
import pandas as pd

# Extract the rows of one business date (DT) from Oracle into pandas and
# append them to the partitioned BigQuery table.
where = f"""
    {BQ_PARTITION_COLUMN} = '{DT}'
"""

oracle_to_pd_convert_start_t = time.time()

query = generate_oracle_select_query(ORA_OWNER, ORA_TABLE, where)
pdf = oracle_to_pandas(ORA_OWNER, ORA_TABLE, sql = query)

print("target table: ", ORA_TABLE)
print("create oracle to pandas excute time: ", round((time.time()- oracle_to_pd_convert_start_t), 2))
bq_load_start_t = time.time()

if len(pdf.index) > 0:
    # Convert the string columns to the Python types matching the BigQuery
    # schema: DATE for the partition column, DATETIME for the audit timestamps.
    pdf[BQ_PARTITION_FIELD] = pd.to_datetime(pdf[BQ_PARTITION_FIELD.lower()], format='%Y%m%d').dt.date
    pdf["crtn_dt"] = pd.to_datetime(pdf["crtn_dt"], format='%Y-%m-%d %H:%M:%S')
    pdf["chg_dt"] = pd.to_datetime(pdf["chg_dt"], format='%Y-%m-%d %H:%M:%S')

    # The partition column is declared explicitly as a REQUIRED DATE
    # (the description string is Korean for "business date").
    partition_schema_field = bigquery.SchemaField(BQ_PARTITION_FIELD, 'DATE', mode='REQUIRED', description='영업일자')
    schema = convert_oracle_to_bigquery_schema(ORA_OWNER, ORA_TABLE, partition_schema_field)

    if schema:
        t_ref = TableReference(DatasetReference(GCP_PROJECT, BQ_DATASET), BQ_TABLE.lower())
        # NOTE(review): WRITE_APPEND means re-running this cell for the same DT
        # appends the same rows again -- confirm this is intended.
        # Clustering columns are passed as a list, as the BigQuery client expects.
        jc = get_bigquery_load_job_config(schema, "WRITE_APPEND", BQ_PARTITION_TYPE, BQ_PARTITION_FIELD, [BQ_CLUSTERING_FIELD])
        j = bq.load_table_from_dataframe(pdf, t_ref, job_config=jc)
        j.result()  # block until the load job finishes; raises if it failed

        print("pandas to bq load time: ", round((time.time()-bq_load_start_t), 2))
else:
    # Make the "nothing to load" case visible instead of finishing silently.
    print("no rows returned from oracle, skipping bq load: ", ORA_TABLE)

        
# TODO: pydatafabric core needs to be fixed before this helper can be used; call kept for reference.
# oracle_to_bq_overwrite_table(owner=ORA_OWNER
#                             , table=ORA_TABLE
#                             , dataset=BQ_DATASET
#                             , bq_table=BQ_TABLE
#                             , partition_column=BQ_PARTITION_FIELD
#                             , partition_type=BQ_PARTITION_TYPE
#                             , partition_field=BQ_PARTITION_FIELD
#                             , query=query)

target table:  GTPSD_EP_PAY_RESULT_A
create oracle to pandas excute time:  0.17
pandas to bq load time:  2.79
