### bq 데이터를 kafka에 저장

In [18]:
# BigQuery source table for this demo; the three parts are combined as
# `PROJECT.DATASET.TABLE` in the query below.
PROJECT = "emart-datafabric"
DATASET = "common_dev"
TABLE = "dfm_sample_eapp_data"
# Row cap applied to the sample query via its `limit` clause.
LIMIT = 10

In [19]:
# Kafka connection settings used by the Producer/Consumer classes defined later.
BOOTSTRAP_SERVERS = "datafabric-kafka-kafka-bootstrap.kafka-farm.svc.cluster.local:9092"
# Despite the plural name, this holds a single topic name.
TOPICS = 'test'
# Consumer group id passed to KafkaConsumer.
CONSUMER_GROUP = 'test-datafabric'

In [20]:
import traceback
from pydatafabric.gcp import bq_to_pandas

# Fetch up to LIMIT rows with a non-empty `comments` column from the
# configured BigQuery table, aliasing review_id -> key and comments -> value.
# The result is bound to `df` and later read by Producer.run.
df = bq_to_pandas(f"""
    select review_id as key, comments as value
    from  `{PROJECT}.{DATASET}.{TABLE}`
    where comments != ''
    limit {LIMIT}
""")

unsupported operand type(s) for /: 'NoneType' and 'int'


Downloading: 100%|██████████| 10/10 [00:00<00:00, 10.78rows/s]


In [21]:
# Preview the first 10 rows of the query result.
df.head(10)

Unnamed: 0,key,value
0,2208062001168751,수박껍질부분에 가까울수록 많이 싱겁네요
1,2208061419398993,좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아...
2,2208060738244837,달고 맛있어요 수분이 풍부하고요\r\n맛있게 잘먹었습니다
3,2208061038412404,황도 좋아해서 샀어요 ㅎㅎ
4,2208062325391646,당도 높고 맛있어요
5,2208061555067780,저렴한 가격에 구매했습니다. 만족합니다.
6,2208061053319664,싱싱하고 잘 익어 맛있게 먹었습니다~
7,2208062319573664,과일좋아하는 울식구들\r\n씻어서 통에 넣어놓으면 하나씩 꺼내먹기 좋아요
8,2208061357215606,냉장고에 넣었는데 이렇게 되었네요
9,2208062002191078,올해는 천도가 비싸네요 맛은 좋아요


In [22]:
# Inspect the column dtypes of the fetched frame.
print(df.dtypes)

key      object
value    object
dtype: object


### Kafka Producer, Consumer, Create Topic Example

In [10]:
# Install the kafka-python client, which provides the `kafka` package imported below.
!python -m pip install kafka-python

Defaulting to user installation because normal site-packages is not writeable


In [57]:
import threading, time
from kafka import KafkaProducer, KafkaConsumer, KafkaAdminClient
from kafka.admin import NewTopic
from json import loads, dumps
    
class Producer(threading.Thread):
    """Background thread that publishes the module-level ``df`` to Kafka once.

    The frame is rendered with ``df.to_json(force_ascii=False)`` and sent as a
    single record to ``TOPICS`` on ``BOOTSTRAP_SERVERS``.
    """

    def __init__(self):
        super().__init__()
        # Set by stop(). run() sends exactly one record and does not poll it.
        self.stop_event = threading.Event()

    def stop(self):
        """Set ``stop_event`` to signal that the thread should stop."""
        self.stop_event.set()

    def run(self):
        """Send one record containing the whole frame, then close the client."""
        producer = KafkaProducer(acks="all",
                                 compression_type='gzip',
                                 bootstrap_servers=BOOTSTRAP_SERVERS,
                                 value_serializer=lambda x: dumps(x).encode('utf-8'))
        try:
            # NOTE: to_json() already returns JSON text and the serializer
            # JSON-encodes that string again, so a consumer calling loads()
            # gets a str (JSON text), not a dict. The wire format is kept
            # unchanged so existing readers of this topic keep working.
            producer.send(TOPICS, value=df.to_json(force_ascii=False))
            # Block until the buffered record has been handed to the broker
            # instead of sleeping for a fixed interval.
            producer.flush()
        finally:
            # Always close the client, even if send()/flush() raises.
            producer.close()
        

class Consumer(threading.Thread):
    """Background thread that reads ``TOPICS`` and remembers the last value seen.

    Each received message is printed, and its deserialized value replaces the
    previously stored one. Retrieve it with ``get_consumer_value_dict()``.
    """

    def __init__(self):
        super().__init__()
        self.stop_event = threading.Event()
        # Most recently consumed (deserialized) message value; stays {} until
        # the first message arrives.
        self._consumer_dict = {}

    def stop(self):
        """Set ``stop_event`` so that ``run()`` leaves its polling loop."""
        self.stop_event.set()

    def run(self):
        """Poll the topic until ``stop()`` is called, then close the client."""
        consumer = KafkaConsumer(bootstrap_servers=BOOTSTRAP_SERVERS,
                                 auto_offset_reset='earliest',
                                 group_id=CONSUMER_GROUP,
                                 value_deserializer=lambda x: loads(x.decode('utf-8')),
                                 max_poll_records=1,
                                 consumer_timeout_ms=1000)
        try:
            consumer.subscribe([TOPICS])

            while not self.stop_event.is_set():
                # consumer_timeout_ms ends the iteration after one second
                # without messages, which lets the outer loop re-check the
                # stop flag.
                for message in consumer:
                    print("Topic: %s, Partition: %d, Offset: %d, Key: %s, Value: %s" % (
                            message.topic, message.partition, message.offset, message.key, message.value))

                    self._consumer_dict = message.value

                    if self.stop_event.is_set():
                        break
        finally:
            # Close the client even if polling or deserialization raises.
            consumer.close()

    def get_consumer_value_dict(self):
        """Return the last consumed message value, or ``{}`` if none was received."""
        return self._consumer_dict

                                 
def main(run_seconds=10):
    """Run one producer and one consumer thread and return the consumed value.

    Args:
        run_seconds: How long to let both threads run before stopping them.
            Defaults to 10, the previously hard-coded duration.

    Returns:
        Whatever ``Consumer.get_consumer_value_dict()`` reports after both
        threads have been joined.
    """
    # Optional topic creation, left disabled; uncomment to create TOPICS with
    # one partition and a replication factor of one before producing.
    # try:
    #     admin = KafkaAdminClient(bootstrap_servers=BOOTSTRAP_SERVERS)
    #     topic = NewTopic(name=TOPICS,
    #                      num_partitions=1,
    #                      replication_factor=1)
    #     admin.create_topics([topic])
    # except Exception as e:
    #     print(e)

    producer = Producer()
    consumer = Consumer()
    tasks = [producer, consumer]

    for task in tasks:
        task.start()

    try:
        time.sleep(run_seconds)
    finally:
        # Stop and join even when the wait is interrupted (e.g. a notebook
        # kernel interrupt); otherwise the consumer thread would keep polling.
        for task in tasks:
            task.stop()

        for task in tasks:
            task.join()

    return consumer.get_consumer_value_dict()

In [58]:
# Run the produce/consume round trip and keep the last value the consumer saw.
consumer_dict = main()

Topic: test, Partition: 0, Offset: 11176, Key: None, Value: {"key":{"0":"2208062001168751","1":"2208061419398993","2":"2208060738244837","3":"2208061038412404","4":"2208062325391646","5":"2208061555067780","6":"2208061053319664","7":"2208062319573664","8":"2208061357215606","9":"2208062002191078"},"value":{"0":"수박껍질부분에 가까울수록  많이 싱겁네요","1":"좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요","2":"달고 맛있어요 수분이 풍부하고요\r\n맛있게 잘먹었습니다","3":"황도 좋아해서 샀어요 ㅎㅎ","4":"당도 높고 맛있어요","5":"저렴한 가격에 구매했습니다. 만족합니다.","6":"싱싱하고 잘 익어 맛있게 먹었습니다~","7":"과일좋아하는 울식구들\r\n씻어서 통에 넣어놓으면 하나씩 꺼내먹기 좋아요","8":"냉장고에 넣었는데 이렇게 되었네요","9":"올해는 천도가 비싸네요 맛은 좋아요"}}


### Kafka에서 받아온 데이터를 aws s3에 parquet파일로 저장

In [67]:
from ast import literal_eval
# The consumed value is JSON text (the producer JSON-encodes the output of
# df.to_json()), so parse it with a JSON parser. ast.literal_eval only accepts
# Python literals: it rejects JSON's null/true/false and leaves JSON escapes
# such as "\/" undecoded. The isinstance guard keeps this cell safe to re-run
# after consumer_dict has already been parsed.
import json

if isinstance(consumer_dict, str):
    consumer_dict = json.loads(consumer_dict)

In [76]:
import pandas as pd

# Rebuild a pandas DataFrame from the parsed payload (a dict of
# column name -> {row label -> value}) and preview the first rows.
parquet_df = pd.DataFrame(consumer_dict)
parquet_df.head()

Unnamed: 0,key,value
0,2208062001168751,수박껍질부분에 가까울수록 많이 싱겁네요
1,2208061419398993,좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아요 좋아...
2,2208060738244837,달고 맛있어요 수분이 풍부하고요\r\n맛있게 잘먹었습니다
3,2208061038412404,황도 좋아해서 샀어요 ㅎㅎ
4,2208062325391646,당도 높고 맛있어요


In [72]:
from pydatafabric.vault_utils import get_secrets

# Load the AWS credentials through pydatafabric's get_secrets helper; the
# "aws_access_key_id" and "aws_secret_access_key" entries are read later when
# configuring Spark.
aws_info = get_secrets(mount_point="datafabric",path="aws/credentials/datafabric")
# Target location of the Parquet output: s3a://<bucket>/bigquery-db/<path>/op=put/
path = "dfm_sample_eapp_data/parquet"
bucket = "emart-datafabric-dev"
destination = f"s3a://{bucket}/bigquery-db/{path}/op=put/"

s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/parquet/op=put/


In [73]:
from pydatafabric.ye import get_spark

# Obtain a Spark session from the pydatafabric helper and configure S3A on it.
spark = get_spark()
# S3A upload buffering: buffer type "disk", buffer directory /tmp.
spark.conf.set("fs.s3a.fast.upload.buffer", "disk")
spark.conf.set("fs.s3a.buffer.dir", "/tmp")
# S3A credentials taken from the secrets loaded into aws_info.
spark.conf.set("fs.s3a.access.key", aws_info["aws_access_key_id"])
spark.conf.set("fs.s3a.secret.key", aws_info["aws_secret_access_key"])

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/18 12:32:26 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/10/18 12:32:26 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/10/18 12:32:26 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/10/18 12:32:26 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [77]:
# Convert the pandas frame to a Spark DataFrame and print its rows.
# NOTE: this rebinds `df`, which earlier held the pandas frame that
# Producer.run reads; re-running main() after this cell would hand it the
# Spark DataFrame instead.
df = spark.createDataFrame(parquet_df)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----------------+-----------------------------------+
|             key|                              value|
+----------------+-----------------------------------+
|2208062001168751| 수박껍질부분에 가까울수록  많이...|
|2208061419398993|  좋아요 좋아요 좋아요 좋아요 좋...|
|2208060738244837| 달고 맛있어요 수분이 풍부하고요...|
|2208061038412404|          황도 좋아해서 샀어요 ㅎㅎ|
|2208062325391646|                 당도 높고 맛있어요|
|2208061555067780|  저렴한 가격에 구매했습니다. 만...|
|2208061053319664|싱싱하고 잘 익어 맛있게 먹었습니다~|
|2208062319573664|  과일좋아하는 울식구들
씻어서 ...|
|2208061357215606|  냉장고에 넣었는데 이렇게 되었네요|
|2208062002191078| 올해는 천도가 비싸네요 맛은 좋아요|
+----------------+-----------------------------------+



                                                                                

In [78]:
# Write the Spark DataFrame to `destination` as Parquet; "overwrite" mode
# replaces any output already present at that path.
df.write.mode("overwrite").parquet(destination)

                                                                                

In [79]:
# List the files under `destination` with the Hadoop CLI, pointing it at the
# JCEKS credential provider file; `{destination}` is expanded by IPython.
!hdfs dfs -Dhadoop.security.credential.provider.path=jceks:///datafabric/credentials/aws.jceks -ls {destination}

2022-10-18 12:36:56,333 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties
2022-10-18 12:36:56,386 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).
2022-10-18 12:36:56,386 INFO impl.MetricsSystemImpl: s3a-file-system metrics system started
Found 6 items
-rw-rw-rw-   1 datafabric datafabric          0 2022-10-18 12:34 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/parquet/op=put/_SUCCESS
-rw-rw-rw-   1 datafabric datafabric       1135 2022-10-18 12:34 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/parquet/op=put/part-00000-dce0e222-c5b3-4b46-b304-dbedd31a2853-c000.snappy.parquet
-rw-rw-rw-   1 datafabric datafabric       1059 2022-10-18 12:34 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/parquet/op=put/part-00001-dce0e222-c5b3-4b46-b304-dbedd31a2853-c000.snappy.parquet
-rw-rw-rw-   1 datafabric datafabric        987 2022-10-18 12:34 s3a://emart-datafabric-dev/bigquery-db/dfm_sample_eapp_data/pa