In [15]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
from service.utils.spark import get_spark_session

In [16]:
# Create the Spark session through the project helper.
# NOTE(review): dev=True presumably selects a development configuration —
# confirm in service.utils.spark.
spark = get_spark_session(dev=True)

In [17]:
# Namespaces used by this notebook: inputs are read from `src_namespace`,
# results are written to `test_namespace`.
src_namespace, test_namespace = 'silver', 'gold'

# Make sure the destination namespace exists before any table is written to it.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}")

DataFrame[]

In [18]:
# Load the tab-separated review metadata file; the first line is the header.
# NOTE(review): no schema / inferSchema is supplied, so the CSV reader's default
# typing applies (columns presumably load as strings) — confirm before doing
# numeric work on review_score.
review_metadata_path = f"s3a://warehousedev/{src_namespace}/tsv/review_metadata.tsv"
review_metadata_df = (
    spark.read
    .options(header=True, sep='\t')
    .csv(review_metadata_path)
)
review_metadata_df.show(2)

+--------------------+-----------------------+--------------------+--------------------+------------+
|review_creation_date|review_answer_timestamp|           review_id|            order_id|review_score|
+--------------------+-----------------------+--------------------+--------------------+------------+
| 2016-10-02 00:00:00|    2016-10-26 12:31:00|69ac6a27fde9855eb...|809a282bbd5dbcabb...|           1|
| 2016-10-06 00:00:00|    2016-10-07 18:32:28|6916ca4502d6d3bfd...|bfbd0f9bdef843021...|           1|
+--------------------+-----------------------+--------------------+--------------------+------------+
only showing top 2 rows



In [19]:
# Load the tab-separated review inference file; the first line is the header.
# NOTE(review): no schema / inferSchema is supplied, so the score columns
# (negative / neutral / positive) presumably load as strings — confirm.
review_inference_path = f"s3a://warehousedev/{src_namespace}/tsv/review_inference.tsv"
review_inference_df = (
    spark.read
    .options(header=True, sep='\t')
    .csv(review_inference_path)
)
review_inference_df.show(2)

+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|           review_id|        message_type|main_sentiment|                 eng|          negative|           neutral|          positive|
+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|11eb20303bc6c13c4...|review_comment_me...|      negative|I did not receive...|0.9991900324821472|0.0005851832684129|0.0002247465599793|
|3c7388d3ed1f8a85f...|review_comment_me...|      negative|I bought 2 backpa...|0.9302172660827636|0.0691679194569587|0.0006147770909592|
+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
only showing top 2 rows



In [20]:
# Read the delivered-order/product table from the destination namespace.
bcg_table_name = f"{test_namespace}.delivered_order_product_bcg"
delivered_order_product_bcg = spark.read.table(bcg_table_name)
delivered_order_product_bcg.show(2)

+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------+
|          product_id|            order_id|     category|order_item_id|           seller_id|total_price|       segment|
+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------+
|310ae3c140ff94b03...|0005a1a1728c9d785...|health_beauty|            1|a416b6a846a117243...|      157.6|    Niche Gems|
|d63c1011f49d98b97...|00061f2a7bc09da83...|health_beauty|            1|cc419e0650a3c5ba7...|      68.87|Volume Drivers|
+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------+
only showing top 2 rows



In [21]:
"""
CAUTION: 하나의 주문에는 다수의 동일 제품이 들어갈 수 있으나, 리뷰 스코어는 order_id
"""
review_metatdata_product = review_metadata_df.join(delivered_order_product_bcg.select('order_id', 'product_id'), on='order_id', how='inner')
review_metatdata_product.show(2)

+--------------------+--------------------+-----------------------+--------------------+------------+--------------------+
|            order_id|review_creation_date|review_answer_timestamp|           review_id|review_score|          product_id|
+--------------------+--------------------+-----------------------+--------------------+------------+--------------------+
|95d31d53ccc53bca2...| 2016-10-18 00:00:00|    2016-10-21 05:00:10|e9651b370d483da0e...|           5|d5f4f441812a9a67c...|
|dabf2b0e35b423f94...| 2016-10-19 00:00:00|    2016-10-19 20:43:13|28112c56c882286e0...|           5|7cd29da0653abeb44...|
+--------------------+--------------------+-----------------------+--------------------+------------+--------------------+
only showing top 2 rows



In [10]:
# Persist the joined reviews as a table in the destination namespace, creating
# it or replacing an existing one.
# NOTE(review): the "metatdata" spelling is part of the table name that gets
# written; renaming it would need to be coordinated with any readers.
review_metatdata_product.writeTo(f'{test_namespace}.review_metatdata_product').createOrReplace()

                                                                                

In [12]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()