### BigQuery 데이터를 S3로 적재

In [1]:
# BigQuery source coordinates; combined as "{PROJECT}:{DATASET}.{TABLE}" when
# the table is loaded through Spark.
PROJECT = "emart-datafabric"
DATASET = "common_dev"
# Also reused as the last path segment of the S3 export location.
TABLE = "dfm_sample_eapp_review_keywords"
# NOTE(review): LIMIT is not referenced by any visible cell — presumably a row
# cap intended for the query; confirm whether it should be applied or removed.
LIMIT = 30

In [2]:
from pydatafabric.vault_utils import get_secrets

# Look up the AWS credential entries through the vault_utils helper. The
# result is indexed by "aws_access_key_id" and "aws_secret_access_key" when
# the S3A connector is configured.
aws_info = get_secrets(
    mount_point="datafabric",
    path="aws/credentials/datafabric",
)

In [3]:
from pydatafabric.ye import get_spark
import os

# Obtain the Spark session object used by the rest of the notebook
# (conf, read, sql and stop are all called on it).
spark = get_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/06 14:45:45 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/10/06 14:45:45 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/10/06 14:45:45 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/10/06 14:45:45 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


### Spark 설정 (필수)

In [4]:
# S3A connector settings applied to the session before any S3 write:
# uploads are buffered on local disk under /tmp, and the access/secret key
# pair comes from the secrets fetched earlier.
s3a_settings = {
    "fs.s3a.fast.upload.buffer": "disk",
    "fs.s3a.buffer.dir": "/tmp",
    "fs.s3a.access.key": aws_info["aws_access_key_id"],
    "fs.s3a.secret.key": aws_info["aws_secret_access_key"],
}
# dict preserves insertion order, so the settings are applied in the order
# listed above.
for conf_key, conf_value in s3a_settings.items():
    spark.conf.set(conf_key, conf_value)

In [5]:
# Table reference assembled from the notebook constants.
bq_table_id = f"{PROJECT}:{DATASET}.{TABLE}"

# Load the BigQuery table as a DataFrame through the 'bigquery' data source.
review_keywords = (
    spark.read.format('bigquery')
    .option('table', bq_table_id)
    .load()
)

# Register the DataFrame under a name that Spark SQL queries can refer to.
review_keywords.createOrReplaceTempView('temp_review_keywords')

### Temp View에서 질의를 통해 저장 

In [6]:
# SQL run against the temp view registered from the BigQuery table; it selects
# every row and column (no LIMIT clause is applied).
query="select * from temp_review_keywords"

In [7]:
# Run the query and print the first 5 rows as a quick sanity check.
df = spark.sql(query)
df.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+-------------+--------+----------+------------------------------------+------------------------+----------------------------------+----------+-------------------------------+
|      prdt_cd|store_cd|order_date|                            sku_name|           link_sku_name|                           comment|score_text|                       keywords|
+-------------+--------+----------+------------------------------------+------------------------+----------------------------------+----------+-------------------------------+
|1102150000000|    1018|  20220102|(후레쉬팩)미국냉장초이스척아이로스트|(후레쉬팩)미국산CH갈비살|    기름기가 많아서 
먹기 불편했음|       Bad|    기름기 많아서 먹기 불편했음|
|1102150000000|    1108|  20220101|(후레쉬팩)미국냉장초이스척아이로스트|(후레쉬팩)미국산CH갈비살| 기름이 너무너무 많아요.버린 기...|       Bad|       기름 많아요 기름 무게 만|
|1102150000000|    1108|  20220101|(후레쉬팩)미국냉장초이스척아이로스트|(후레쉬팩)미국산CH갈비살|세일전주에 샀을때 손질해보니 기...|       Bad|전주 때 손질 기름 근육 고기 ...|
|1102150000000|    1048|  20220101|(후레쉬팩)미국냉장초이스척아이로스트|(후레쉬팩)미국산CH갈비살| 고기가 상태가 별로였어요
빨리 ...|      

                                                                                

### S3로 데이터 적재

In [8]:
# Target environment; it selects which bucket receives the export.
env = "dev"
bucket = f"emart-datafabric-{env}"

# Export location: one prefix per source table under bigquery-db/.
s3_path = f"s3a://{bucket}/bigquery-db/{TABLE}"

# Re-run the query and write the result as Parquet, replacing whatever was
# previously stored at s3_path.
export_df = spark.sql(query)
parquet_writer = export_df.write.mode("overwrite")
parquet_writer.parquet(s3_path)

                                                                                

In [9]:
# List the objects written under s3_path (IPython substitutes {s3_path} before
# running the shell command). The -D option points Hadoop at a JCEKS credential
# provider — presumably holding the S3 keys; confirm against cluster setup.
!hdfs dfs -Dhadoop.security.credential.provider.path=jceks:///datafabric/credentials/aws.jceks -ls {s3_path}

2022-09-30 13:03:54,208 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2022-09-30 13:03:54,259 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2022-09-30 13:03:54,259 INFO impl.MetricsSystemImpl: s3a-file-system metrics system started
Found 2 items
-rw-rw-rw-   1 datafabric datafabric          0 2022-09-30 13:03 s3a://emart-datafabric-dev/bigquery-db/_SUCCESS
-rw-rw-rw-   1 datafabric datafabric   14702163 2022-09-30 13:03 s3a://emart-datafabric-dev/bigquery-db/part-00000-12819580-5b71-4929-8823-63f2c574f7b5-c000.snappy.parquet
2022-09-30 13:03:55,901 INFO impl.MetricsSystemImpl: Stopping s3a-file-system metrics system...
2022-09-30 13:03:55,901 INFO impl.MetricsSystemImpl: s3a-file-system metrics system stopped.
2022-09-30 13:03:55,901 INFO impl.MetricsSystemImpl: s3a-file-system metrics system shutdown complete.


### Spark Context 종료

In [10]:
# Stop the Spark session now that the export has been written and checked.
spark.stop()