In [17]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
from service.utils.spark import get_spark_session

In [18]:
spark = get_spark_session(dev=True)

In [19]:
src_namespace = 'silver'
test_namespace = 'gold' 
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}")

DataFrame[]

In [20]:
review_metadata_df = spark.read.csv(f"s3a://warehousedev/{src_namespace}/tsv/review_metadata.tsv", header=True, sep='\t')
review_metadata_df.show(2)

+--------------------+-----------------------+--------------------+--------------------+------------+
|review_creation_date|review_answer_timestamp|           review_id|            order_id|review_score|
+--------------------+-----------------------+--------------------+--------------------+------------+
| 2016-10-02 00:00:00|    2016-10-26 12:31:00|69ac6a27fde9855eb...|809a282bbd5dbcabb...|           1|
| 2016-10-06 00:00:00|    2016-10-07 18:32:28|6916ca4502d6d3bfd...|bfbd0f9bdef843021...|           1|
+--------------------+-----------------------+--------------------+--------------------+------------+
only showing top 2 rows



In [21]:
review_inference_df = spark.read.csv(f"s3a://warehousedev/{src_namespace}/tsv/review_inference.tsv", header=True, sep='\t')
review_inference_df.show(2)

+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|           review_id|        message_type|main_sentiment|                 eng|          negative|           neutral|          positive|
+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|11eb20303bc6c13c4...|review_comment_me...|      negative|I did not receive...|0.9991900324821472|0.0005851832684129|0.0002247465599793|
|3c7388d3ed1f8a85f...|review_comment_me...|      negative|I bought 2 backpa...|0.9302172660827636|0.0691679194569587|0.0006147770909592|
+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
only showing top 2 rows



In [22]:
target_product_order_info_with_segment = spark.read.table(f"{test_namespace}.target_product_order_info_with_segment")
target_product_order_info_with_segment.show(2)

+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------+
|          product_id|            order_id|     category|order_item_id|           seller_id|total_price|         customer_id|           purchase|           approved|  delivered_carrier| delivered_customer|      segment|
+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------+
|05f0fe07929d35be0...|362cd36939c1c4638...|health_beauty|            1|c70c1b0d8ca86052f...|     118.35|32ca12567c820aa0b...|2018-06-25 23:00:42|2018-06-27 08:53:50|2018-06-29 14:16:00|2018-07-02 21:05:36|Star Products|
|05f0fe07929d35be0...|b41cc5a0872a7fd88...|health_beauty|            1|c70c1b0d8ca86052f...|     118.35|ada8dea5e28ec3e9

                                                                                

In [64]:
"""
CAUTION: 하나의 주문에는 다수의 동일 제품이 들어갈 수 있으나, 리뷰 스코어는 order_id
"""
review_metatdata_product = review_metadata_df.join(target_product_order_info_with_segment.select('order_id', 'product_id'), on='order_id', how='inner')
review_metatdata_product.show(2)



+--------------------+--------------------+-----------------------+--------------------+------------+--------------------+
|            order_id|review_creation_date|review_answer_timestamp|           review_id|review_score|          product_id|
+--------------------+--------------------+-----------------------+--------------------+------------+--------------------+
|95d31d53ccc53bca2...| 2016-10-18 00:00:00|    2016-10-21 05:00:10|e9651b370d483da0e...|           5|d5f4f441812a9a67c...|
|dabf2b0e35b423f94...| 2016-10-19 00:00:00|    2016-10-19 20:43:13|28112c56c882286e0...|           5|7cd29da0653abeb44...|
+--------------------+--------------------+-----------------------+--------------------+------------+--------------------+
only showing top 2 rows



                                                                                

In [63]:
# title은 제외: 경향성에 큰 영향 없음
review_eng_message = review_inference_df.filter(F.col('message_type') == 'review_comment_message')
review_eng_message.show(2)

+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|           review_id|        message_type|main_sentiment|                 eng|          negative|           neutral|          positive|
+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|11eb20303bc6c13c4...|review_comment_me...|      negative|I did not receive...|0.9991900324821472|0.0005851832684129|0.0002247465599793|
|3c7388d3ed1f8a85f...|review_comment_me...|      negative|I bought 2 backpa...|0.9302172660827636|0.0691679194569587|0.0006147770909592|
+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
only showing top 2 rows



In [66]:
review_product = review_metatdata_product.join(review_eng_message, on='review_id', how='left')
review_product.show(2)

+--------------------+--------------------+--------------------+-----------------------+------------+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|           review_id|            order_id|review_creation_date|review_answer_timestamp|review_score|          product_id|        message_type|main_sentiment|                 eng|          negative|           neutral|          positive|
+--------------------+--------------------+--------------------+-----------------------+------------+--------------------+--------------------+--------------+--------------------+------------------+------------------+------------------+
|e9651b370d483da0e...|95d31d53ccc53bca2...| 2016-10-18 00:00:00|    2016-10-21 05:00:10|           5|d5f4f441812a9a67c...|review_comment_me...|      positive|it came before th...|0.0001487636036472|0.0007651383639313|0.9990861415863036|
|28112c56c882286e0...|dabf2b0e35b423f94...| 2016-10-

In [None]:
# review 메시지가 없지만 스코어는 있는 경우가 있으므로 중복제거 안함
print(review_product.dropna().count())
print(review_product.count())

                                                                                

3702


                                                                                

9554


In [53]:
# 하나의 주문에 여러 리뷰를 남긴 경우가 있을 수 있음: (`order_id` 개수 != `review_id` 개수)
print(review_product.select('order_id').distinct().count())
print(review_product.select('review_id').distinct().count())

                                                                                

8602




8601


                                                                                

In [50]:
tmp = review_product.groupBy('review_id').agg(F.count('order_id').alias('order_count'))
tmp.filter("'order_count' > 1").show()

+---------+-----------+
|review_id|order_count|
+---------+-----------+
+---------+-----------+



In [None]:
review_product.writeTo(f'{test_namespace}.review_product').createOrReplace()

In [16]:
# spark.stop()