In [1]:
from pydatafabric.vault_utils import get_secrets

# Fetch the Oracle connection settings through pydatafabric's vault_utils.
# read_oracle_to_df() later reads the 'jdbc_url', 'user' and 'password'
# entries of this mapping.
oracle_info = get_secrets(
    mount_point="datafabric",
    path="oracle/entsal/prd",
)

In [2]:
from pydatafabric.ye import get_spark
# Create the Spark session, handing get_spark a comma-separated list of the
# Oracle JDBC jars (ojdbc8 driver and orai18n) stored in GCS.
spark = get_spark(
    extra_jars=(
        "gs://emart-datafabric-resources/jars/ojdbc8.jar,"
        "gs://emart-datafabric-resources/jars/orai18n.jar"
    )
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/17 11:49:55 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/05/17 11:49:55 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/05/17 11:49:55 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/05/17 11:49:55 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [3]:
# Set spark.sql.debug.maxToStringFields to 2000 for this session.
# NOTE(review): per the Spark configuration docs this caps how many fields are
# rendered when plans/schemas are converted to strings — confirm 2000 is still
# the intended limit.
spark.conf.set("spark.sql.debug.maxToStringFields", 2000)

In [4]:
from functools import reduce

# AMERICAN_AMERICA.US7ASCII
# KOREAN_KOREA.KO16MSWIN949
# KOREAN_KOREA.KO16KSC5601
# KOREAN_KOREA.AL32UTF8
    
def read_oracle_to_df(spark, dbtable):
    """Read an Oracle table or subquery through Spark's JDBC data source.

    Connection settings are taken from the module-level ``oracle_info``
    mapping (keys ``jdbc_url``, ``user`` and ``password``).

    Args:
        spark: Object exposing a ``read`` attribute (the Spark session in
            this notebook).
        dbtable: Value passed as the JDBC ``dbtable`` option, e.g. a table
            name or a parenthesised subquery followed by an alias.

    Returns:
        The result of ``load()`` on the configured reader.
    """
    # Options are applied in this exact order, one ``option`` call each.
    jdbc_options = (
        ("url", oracle_info['jdbc_url']),
        ("user", oracle_info['user']),
        ("password", oracle_info['password']),
        ("driver", "oracle.jdbc.driver.OracleDriver"),
        # NOTE(review): presumably set to sidestep Oracle timezone-region
        # negotiation errors — confirm before changing.
        ("oracle.jdbc.timezoneAsRegion", "false"),
        ("dbtable", dbtable),
    )

    reader = spark.read.format("jdbc")
    for option_name, option_value in jdbc_options:
        reader = reader.option(option_name, option_value)
    return reader.load()

def convert_col(col):
    """Return ``col`` with each uppercase character replaced by its lowercase form.

    Characters for which ``str.isupper()`` is false are copied through
    unchanged; the conversion is done one character at a time.
    """
    pieces = []
    for ch in col:
        pieces.append(ch.lower() if ch.isupper() else ch)
    return "".join(pieces)

In [5]:
# Read the column comments of GTPSS_MALL_SALE as raw bytes: Oracle's CONVERT
# re-encodes COMMENTS from KO16KSC5601 to AL32UTF8, and UTL_RAW.CAST_TO_RAW
# returns the result as RAW, so the column comes back as binary.
df = read_oracle_to_df(
    spark,
    "(SELECT UTL_RAW.CAST_TO_RAW(CONVERT(COMMENTS, 'AL32UTF8', 'KO16KSC5601')) AS COMMENTS "
    "FROM ALL_COL_COMMENTS "
    "WHERE TABLE_NAME = 'GTPSS_MALL_SALE') INPUT",
)
# Alternative kept for reference (not executed): casts the RAW value back with
# UTL_RAW.CAST_TO_VARCHAR2. NOTE(review): presumably abandoned in favour of the
# bytes-based query above — confirm before deleting.
# df = read_oracle_to_df(spark, "(SELECT UTL_RAW.CAST_TO_VARCHAR2(UTL_RAW.CAST_TO_RAW(CONVERT(COMMENTS, 'AL32UTF8', 'KO16KSC5601'))) AS COMMENTS FROM ALL_COL_COMMENTS WHERE TABLE_NAME = 'GTPSS_MALL_SALE') INPUT")

In [6]:
from pyspark.sql.functions import *

# Get the DataFrame's column names from its schema.
cols = df.schema.names

# Convert the DataFrame column names to lowercase with one positional rename.
# toDF() is used instead of folding withColumnRenamed() through reduce():
# the wildcard import from pyspark.sql.functions in this cell runs after
# `from functools import reduce`, and from Spark 3.5 on that module exports
# its own `reduce`, which would shadow functools.reduce and break the fold.
df = df.toDF(*[convert_col(name) for name in cols])
df.printSchema()

# Show a single row without truncation so the raw byte values are visible.
df.show(n=1, truncate=False, vertical=False)

root
 |-- comments: binary (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+-------------------------------------------------------+
|comments                                               |
+-------------------------------------------------------+
|[53 53 47 EB B6 80 EB 8B B4 EC 97 90 EB 88 84 EB A6 AC]|
+-------------------------------------------------------+
only showing top 1 row



                                                                                

In [7]:
from pydatafabric.gcp import df_to_bq_table, bq_insert_overwrite

# Name of the destination table; also used by the BigQuery query further down.
gcp_table = "us7ascii_to_utf"

# Hand the Spark DataFrame to pydatafabric's df_to_bq_table with dataset
# "temp_1d" and mode="overwrite".
# NOTE(review): presumably this replaces any existing table contents — confirm
# against the helper's implementation.
df_to_bq_table(
    df,
    "temp_1d",
    gcp_table,
    project="emart-datafabric",
    mode="overwrite",
)

                                                                                

In [8]:
from pydatafabric.gcp import get_bigquery_client

# Be sure to specify the project explicitly when creating the client.
bq = get_bigquery_client(project="emart-datafabric")

# Decode the stored bytes back to text on the BigQuery side and fetch up to
# ten rows from the table written above.
r = bq.query(
    f"SELECT SAFE_CONVERT_BYTES_TO_STRING(comments) FROM temp_1d.{gcp_table} LIMIT 10"
)

In [9]:
# Wait for the query to finish and convert its rows to a DataFrame for display.
# NOTE(review): this rebinds `df`, which until here held the Spark DataFrame
# read from Oracle; re-running the earlier Spark cells afterwards requires
# re-running the Oracle read first. The result is presumably a pandas
# DataFrame — confirm.
df = r.result().to_dataframe()
# Bare expression so the notebook renders the decoded comments.
df

Unnamed: 0,f0_
0,SSG부담에누리
1,영업일자
2,점포코드
3,계획금액
4,당월매출계획누계금액
5,금년매출계획누계금액
6,매출수량
7,매출금액
8,고객수
9,에누리금액


In [10]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()