### S3로 Parquet파일 업로드

In [3]:
PROJECT = "emart-datafabric"
DATASET = "common_dev"
TABLE = "dfm_sample_eapp_review_keywords"
LIMIT = 30

In [4]:
query = f"""
        SELECT
          to_json_string ( STRUCT ( CAST(RANK() OVER(ORDER BY GENERATE_UUID()) AS STRING) AS id,
              STRUCT ( prdt_cd,
                store_cd ) AS
            values
              )) AS
        values
        FROM
          {PROJECT}.{DATASET}.{TABLE}
  """
path = "app_review_keywords/gzip"

In [2]:
import os
from pydatafabric.vault_utils import get_secrets
aws_info = get_secrets(mount_point="datafabric",path="aws/credentials/datafabric")

In [3]:
env = "dev" 
bucket = f"emart-datafabric-{env}"
ds = "20220725"
destination = f"s3a://{bucket}/bigquery-db/{path}/dt={ds}/op=put/"

In [4]:
from pydatafabric.ye import get_spark

spark = get_spark()
spark.conf.set("fs.s3a.fast.upload.buffer", "disk")
spark.conf.set("fs.s3a.buffer.dir", "/tmp")
spark.conf.set("fs.s3a.access.key", aws_info["aws_access_key_id"])
spark.conf.set("fs.s3a.secret.key", aws_info["aws_secret_access_key"])

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/30 13:11:17 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/09/30 13:11:17 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/09/30 13:11:17 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/09/30 13:11:17 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


### gzip 압축 옵션 추가하여 저장

In [5]:
from pydatafabric.gcp import bq_to_df
import pyspark.sql.functions as f
import json

df = bq_to_df(query, spark_session=spark)

In [6]:
df.show(10, False)

[Stage 0:>                                                          (0 + 1) / 1]

+---------------------------------------------------------------------+
|values                                                               |
+---------------------------------------------------------------------+
|{"id":"2938","values":{"prdt_cd":"8801045375038","store_cd":"1125"}} |
|{"id":"13726","values":{"prdt_cd":"8801492374158","store_cd":"1016"}}|
|{"id":"40445","values":{"prdt_cd":"1113150000000","store_cd":"1010"}}|
|{"id":"57701","values":{"prdt_cd":"2500000145346","store_cd":"1047"}}|
|{"id":"69973","values":{"prdt_cd":"2500000079597","store_cd":"1055"}}|
|{"id":"71190","values":{"prdt_cd":"8809558842016","store_cd":"1158"}}|
|{"id":"83034","values":{"prdt_cd":"8803006001802","store_cd":"1161"}}|
|{"id":"89387","values":{"prdt_cd":"2438520000000","store_cd":"1161"}}|
|{"id":"90360","values":{"prdt_cd":"2418430000000","store_cd":"1111"}}|
|{"id":"91899","values":{"prdt_cd":"2500000118562","store_cd":"1135"}}|
+---------------------------------------------------------------

                                                                                

In [7]:
from pydatafabric.gcp import bq_insert_overwrite

bq_insert_overwrite(sql=query, destination="emart-datafabric.temp_1d.app_review_keywords_json")

destination: emart-datafabric.temp_1d.app_review_keywords_json
total_rows: 237978
slot_secs: 27.426



In [8]:
from pydatafabric.gcp import import_bigquery_ipython_magic
import_bigquery_ipython_magic()

In [9]:
%%bq

SELECT *
FROM temp_1d.app_review_keywords_json

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 2168.16query/s]                        
Downloading: 100%|██████████| 237978/237978 [00:00<00:00, 265190.00rows/s]

BigQuery execution took 3 seconds.





Unnamed: 0,values
0,"{""id"":""1036"",""values"":{""prdt_cd"":""241854000000..."
1,"{""id"":""15769"",""values"":{""prdt_cd"":""88014923741..."
2,"{""id"":""18006"",""values"":{""prdt_cd"":""25000003080..."
3,"{""id"":""20025"",""values"":{""prdt_cd"":""25000002723..."
4,"{""id"":""63702"",""values"":{""prdt_cd"":""88093155904..."
...,...
237973,"{""id"":""190272"",""values"":{""prdt_cd"":""2500000222..."
237974,"{""id"":""199804"",""values"":{""prdt_cd"":""2500000196..."
237975,"{""id"":""225403"",""values"":{""prdt_cd"":""2500000094..."
237976,"{""id"":""226742"",""values"":{""prdt_cd"":""8809142023..."


In [10]:
df = bq_to_df("SELECT * FROM temp_1d.app_review_keywords_json")
df.show(10, False)

+----------------------------------------------------------------------+
|values                                                                |
+----------------------------------------------------------------------+
|{"id":"136697","values":{"prdt_cd":"2366520000000","store_cd":"1082"}}|
|{"id":"140996","values":{"prdt_cd":"2500000038549","store_cd":"1001"}}|
|{"id":"43706","values":{"prdt_cd":"8801492374158","store_cd":"1091"}} |
|{"id":"177012","values":{"prdt_cd":"1113150000000","store_cd":"1030"}}|
|{"id":"217517","values":{"prdt_cd":"2500000077265","store_cd":"1075"}}|
|{"id":"13415","values":{"prdt_cd":"2500000306990","store_cd":"1115"}} |
|{"id":"89443","values":{"prdt_cd":"8809558842016","store_cd":"1026"}} |
|{"id":"151517","values":{"prdt_cd":"8809315590402","store_cd":"1063"}}|
|{"id":"175647","values":{"prdt_cd":"2500000011481","store_cd":"1042"}}|
|{"id":"27090","values":{"prdt_cd":"2500000107740","store_cd":"1144"}} |
+--------------------------------------------------

In [11]:
df.write.mode("overwrite").option("compression", "gzip").text(destination)

22/09/30 13:12:24 WARN org.apache.hadoop.fs.s3a.commit.AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

In [12]:
!hdfs dfs -Dhadoop.security.credential.provider.path=jceks:///datafabric/credentials/aws.jceks -ls {destination}

2022-09-30 13:12:32,243 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2022-09-30 13:12:32,292 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2022-09-30 13:12:32,293 INFO impl.MetricsSystemImpl: s3a-file-system metrics system started
Found 2 items
-rw-rw-rw-   1 datafabric datafabric          0 2022-09-30 13:12 s3a://emart-datafabric-dev/bigquery-db/app_review_keywords/dt=20220725/op=put/_SUCCESS
-rw-rw-rw-   1 datafabric datafabric    2220715 2022-09-30 13:12 s3a://emart-datafabric-dev/bigquery-db/app_review_keywords/dt=20220725/op=put/part-00000-8a83d8c8-11cb-45c1-9306-325f26aad724-c000.txt.gz
2022-09-30 13:12:33,908 INFO impl.MetricsSystemImpl: Stopping s3a-file-system metrics system...
2022-09-30 13:12:33,908 INFO impl.MetricsSystemImpl: s3a-file-system metrics system stopped.
2022-09-30 13:12:33,908 INFO impl.MetricsSystemImpl: s3a-file-system metrics system shutdown complete.
