## BigQuery 데이터를 Kafka에 저장

In [1]:
# BigQuery source table coordinates and the row cap used by the sample query.
PROJECT: str = "emart-datafabric"
DATASET: str = "common_dev"
TABLE: str = "dfm_sample_eapp_data"
LIMIT: int = 10

In [2]:
# Kafka connection settings read by the Producer/Consumer classes below.
BOOTSTRAP_SERVERS: str = "datafabric-kafka-kafka-bootstrap.kafka-farm.svc.cluster.local:9092"
TOPICS: str = "test"  # a single topic name, despite the plural
CONSUMER_GROUP: str = "test-datafabric"

In [3]:
import traceback
from pydatafabric.gcp import bq_to_pandas

# Pull non-empty review comments as (key, value) pairs, capped at LIMIT rows.
sample_query = f"""
    select review_id as key, comments as value
    from  `{PROJECT}.{DATASET}.{TABLE}`
    where comments != ''
    limit {LIMIT}
"""
df = bq_to_pandas(sample_query)

unsupported operand type(s) for /: 'NoneType' and 'int'


Downloading: 100%|██████████| 10/10 [00:00<00:00, 11.19rows/s]


In [4]:
# Preview the query result (the query itself caps the row count at LIMIT).
df.head(10)

Unnamed: 0,key,value
0,2208062001168751,수박껍질부분에 가까울수록 많이 싱겁네요
1,2208061419398993,좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아...
2,2208060738244837,달고 맛있어요 수분이 풍부하고요\r\n맛있게 잘먹었습니다
3,2208061038412404,황도 좋아해서 샀어요 ㅎㅎ
4,2208062325391646,당도 높고 맛있어요
5,2208061555067780,저렴한 가격에 구매했습니다. 만족합니다.
6,2208061053319664,싱싱하고 잘 익어 맛있게 먹었습니다~
7,2208062319573664,과일좋아하는 울식구들\r\n씻어서 통에 넣어놓으면 하나씩 꺼내먹기 좋아요
8,2208061357215606,냉장고에 넣었는데 이렇게 되었네요
9,2208062002191078,올해는 천도가 비싸네요 맛은 좋아요


In [5]:
# Inspect the column dtypes of the query result.
print(df.dtypes)

key      object
value    object
dtype: object


### Kafka Producer, Consumer, Create Topic Example

In [6]:
!python -m pip install kafka-python

Defaulting to user installation because normal site-packages is not writeable


In [7]:
import threading, time
from kafka import KafkaProducer, KafkaConsumer, KafkaAdminClient
from kafka.admin import NewTopic
from json import loads, dumps
    
class Producer(threading.Thread):
    """Thread that publishes the notebook-level DataFrame ``df`` to Kafka once.

    When the thread runs it reads the module-level names
    ``BOOTSTRAP_SERVERS``, ``TOPICS`` and ``df``.
    """

    def __init__(self):
        super().__init__()
        # Set by stop(); run() sends a single message and does not poll it.
        self.stop_event = threading.Event()

    def stop(self):
        """Set the stop flag."""
        self.stop_event.set()

    def run(self):
        """Send ``df`` as one JSON message, then close the producer."""
        producer = KafkaProducer(acks="all",
                                 compression_type='gzip',
                                 bootstrap_servers=BOOTSTRAP_SERVERS,
                                 value_serializer=lambda x: dumps(x).encode('utf-8'))
        try:
            # NOTE: df.to_json() already returns a JSON string and the
            # value_serializer JSON-encodes it a second time, so the message
            # payload is a JSON *string*. Left unchanged because the cell that
            # post-processes main()'s result parses that string.
            producer.send(TOPICS, value=df.to_json(force_ascii=False))
            time.sleep(1)
        finally:
            # Close even if serialization or send() raises, so the producer's
            # connections are not leaked.
            producer.close()
        

class Consumer(threading.Thread):
    """Thread that consumes ``TOPICS`` and remembers the latest message value.

    When the thread runs it reads the module-level names
    ``BOOTSTRAP_SERVERS``, ``TOPICS`` and ``CONSUMER_GROUP``.
    """

    def __init__(self):
        super().__init__()
        self.stop_event = threading.Event()
        # Deserialized value of the most recently consumed message; remains an
        # empty dict until a message has been received.
        self._consumer_dict = {}

    def stop(self):
        """Ask the consume loop to finish."""
        self.stop_event.set()

    def run(self):
        """Print every received message and keep the last value until stopped."""
        consumer = KafkaConsumer(bootstrap_servers=BOOTSTRAP_SERVERS,
                                 auto_offset_reset='earliest',
                                 group_id=CONSUMER_GROUP,
                                 value_deserializer=lambda x: loads(x.decode('utf-8')),
                                 max_poll_records=1,
                                 consumer_timeout_ms=1000)
        try:
            consumer.subscribe([TOPICS])

            # Iterating the consumer stops after consumer_timeout_ms without a
            # message, so the outer loop gets to re-check the stop flag.
            while not self.stop_event.is_set():
                for message in consumer:
                    print("Topic: %s, Partition: %d, Offset: %d, Key: %s, Value: %s" % (
                            message.topic, message.partition, message.offset, message.key, message.value))

                    # Only the latest value is kept; earlier ones are overwritten.
                    self._consumer_dict = message.value

                    if self.stop_event.is_set():
                        break
        finally:
            # Close even when deserialization or iteration raises, so the
            # consumer is not left open.
            consumer.close()

    def get_consumer_value_dict(self):
        """Return the value of the last consumed message (``{}`` if none)."""
        return self._consumer_dict

                                 
def main(wait_seconds=10):
    """Run one Producer and one Consumer thread and return the consumed value.

    Args:
        wait_seconds: How long to let both threads run before stopping them.

    Returns:
        Whatever ``Consumer.get_consumer_value_dict()`` reports once both
        threads have been joined.
    """
    # Optional topic creation, left disabled:
    # try:
    #     admin = KafkaAdminClient(bootstrap_servers=BOOTSTRAP_SERVERS)
    #     topic = NewTopic(name=TOPICS,
    #                      num_partitions=1,
    #                      replication_factor=1)
    #     admin.create_topics([topic])
    # except Exception as e:
    #     print(e)

    producer = Producer()
    consumer = Consumer()
    tasks = [producer, consumer]

    for task in tasks:
        task.start()

    try:
        time.sleep(wait_seconds)
    finally:
        # Always stop and join, even when the wait is interrupted (e.g. the
        # notebook cell is cancelled); otherwise the consumer thread would
        # keep running in the background.
        for task in tasks:
            task.stop()

        for task in tasks:
            task.join()

    return consumer.get_consumer_value_dict()

In [8]:
consumer_dict = main()

Topic: test, Partition: 3, Offset: 199, Key: None, Value: {"key":{"0":"2208062001168751","1":"2208061419398993","2":"2208060738244837","3":"2208061038412404","4":"2208062325391646","5":"2208061555067780","6":"2208061053319664","7":"2208062319573664","8":"2208061357215606","9":"2208062002191078"},"value":{"0":"수박껍질부분에 가까울수록  많이 싱겁네요","1":"좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요","2":"달고 맛있어요 수분이 풍부하고요\r\n맛있게 잘먹었습니다","3":"황도 좋아해서 샀어요 ㅎㅎ","4":"당도 높고 맛있어요","5":"저렴한 가격에 구매했습니다. 만족합니다.","6":"싱싱하고 잘 익어 맛있게 먹었습니다~","7":"과일좋아하는 울식구들\r\n씻어서 통에 넣어놓으면 하나씩 꺼내먹기 좋아요","8":"냉장고에 넣었는데 이렇게 되었네요","9":"올해는 천도가 비싸네요 맛은 좋아요"}}


### Kafka에서 받아온 데이터를 AWS S3에 Parquet 파일로 저장

In [9]:
import json
from ast import literal_eval  # no longer used here; kept in case other cells rely on it

# main() hands back the message value as a JSON string (the producer
# JSON-encodes the output of df.to_json()), so parse it as JSON.
# ast.literal_eval only understands Python literals and fails on JSON
# null/true/false. The isinstance guard keeps the cell safe to re-run and
# leaves the empty-dict default untouched when no message was consumed.
if isinstance(consumer_dict, str):
    consumer_dict = json.loads(consumer_dict)

In [10]:
import pandas as pd

# Rebuild a pandas DataFrame from the parsed Kafka payload
# (outer keys become columns, inner keys become the index).
parquet_df = pd.DataFrame.from_dict(consumer_dict)
parquet_df.head()

Unnamed: 0,key,value
0,2208062001168751,수박껍질부분에 가까울수록 많이 싱겁네요
1,2208061419398993,좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아...
2,2208060738244837,달고 맛있어요 수분이 풍부하고요\r\n맛있게 잘먹었습니다
3,2208061038412404,황도 좋아해서 샀어요 ㅎㅎ
4,2208062325391646,당도 높고 맛있어요


In [11]:
from pydatafabric.vault_utils import get_secrets

# Load AWS credentials via pydatafabric's get_secrets helper (presumably backed
# by Vault — confirm). The Spark setup cell reads the "aws_access_key_id" and
# "aws_secret_access_key" entries from the result.
aws_info = get_secrets(mount_point="datafabric",path="aws/credentials/datafabric")
# Target S3 location (s3a scheme) for the Parquet output.
bucket = "emart-datafabric-dev"
path = "dfm_sample_eapp_data/parquet"
destination = f"s3a://{bucket}/bigquery-db/{path}/op=put/"

In [12]:
from pydatafabric.ye import get_spark

spark = get_spark()

# S3A connector options applied to the session, in this order: buffer uploads
# on local disk under /tmp and authenticate with the fetched AWS credentials.
s3a_settings = {
    "fs.s3a.fast.upload.buffer": "disk",
    "fs.s3a.buffer.dir": "/tmp",
    "fs.s3a.access.key": aws_info["aws_access_key_id"],
    "fs.s3a.secret.key": aws_info["aws_secret_access_key"],
}
for option, setting in s3a_settings.items():
    spark.conf.set(option, setting)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/18 17:21:36 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/10/18 17:21:36 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/10/18 17:21:36 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/10/18 17:21:36 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [13]:
# Convert the pandas DataFrame into a Spark DataFrame and print it.
# NOTE(review): this rebinds the global `df` that Producer.run() serializes
# with df.to_json(); re-running main() after this cell would pick up the Spark
# object instead of the pandas one — confirm whether a separate name is wanted.
df = spark.createDataFrame(parquet_df)
df.show()



+----------------+-----------------------------------+
|             key|                              value|
+----------------+-----------------------------------+
|2208062001168751| 수박껍질부분에 가까울수록  많이...|
|2208061419398993|  좋아요 좋아요 좋아요 좋아요 좋...|
|2208060738244837| 달고 맛있어요 수분이 풍부하고요...|
|2208061038412404|          황도 좋아해서 샀어요 ㅎㅎ|
|2208062325391646|                 당도 높고 맛있어요|
|2208061555067780|  저렴한 가격에 구매했습니다. 만...|
|2208061053319664|싱싱하고 잘 익어 맛있게 먹었습니다~|
|2208062319573664|  과일좋아하는 울식구들
씻어서 ...|
|2208061357215606|  냉장고에 넣었는데 이렇게 되었네요|
|2208062002191078| 올해는 천도가 비싸네요 맛은 좋아요|
+----------------+-----------------------------------+



                                                                                

In [14]:
# Write the Spark DataFrame as Parquet under `destination`; "overwrite" mode
# replaces any output already present at that location.
df.write.mode("overwrite").parquet(destination)

                                                                                

In [16]:
# List the objects written under `destination`; IPython substitutes
# {destination} with the notebook variable before running the shell command.
!hdfs dfs -Dhadoop.security.credential.provider.path=jceks:///datafabric/credentials/aws.jceks -ls {destination}

2022-10-18 17:22:37,527 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2022-10-18 17:22:37,579 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2022-10-18 17:22:37,579 INFO impl.MetricsSystemImpl: s3a-file-system metrics system started
Found 11 items
-rw-rw-rw-   1 datafabric datafabric          0 2022-10-18 17:21 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/parquet/op=put/_SUCCESS
-rw-rw-rw-   1 datafabric datafabric       1162 2022-10-18 17:21 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/parquet/op=put/part-00000-6b2e885c-f698-42ac-bb46-d7a0f72f73b0-c000.snappy.parquet
-rw-rw-rw-   1 datafabric datafabric       1459 2022-10-18 17:21 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/parquet/op=put/part-00001-6b2e885c-f698-42ac-bb46-d7a0f72f73b0-c000.snappy.parquet
-rw-rw-rw-   1 datafabric datafabric       1255 2022-10-18 17:21 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/p

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 47510)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.8/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 260, in handle
    poll(authenticate_and_accum_updates)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 235, in poll
    if func():
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 250, in authenticate_and_accum_updates