In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
from service.utils.spark import get_spark_session

In [2]:
# Obtain the Spark session used by every cell below.
# NOTE(review): `get_spark_session` is a project helper; `dev=True` presumably
# selects a local/development configuration — confirm against service.utils.spark.
spark = get_spark_session(dev=True)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/23 06:27:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Namespace that holds both the input tables and the table written at the end.
test_namespace = 'gold'

# Make sure the namespace exists; this is a no-op when it is already there.
spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}")

DataFrame[]

In [4]:
# Load the per-order-item table (one row per product/order/seller/customer,
# already labelled with a product segment) and preview two rows.
target_product_order_info_with_segment = spark.read.table(
    f"{test_namespace}.target_product_order_info_with_segment"
)
target_product_order_info_with_segment.show(n=2)

                                                                                

+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------+
|          product_id|            order_id|     category|order_item_id|           seller_id|total_price|         customer_id|           purchase|           approved|  delivered_carrier| delivered_customer|      segment|
+--------------------+--------------------+-------------+-------------+--------------------+-----------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------+
|05f0fe07929d35be0...|362cd36939c1c4638...|health_beauty|            1|c70c1b0d8ca86052f...|     118.35|32ca12567c820aa0b...|2018-06-25 23:00:42|2018-06-27 08:53:50|2018-06-29 14:16:00|2018-07-02 21:05:36|Star Products|
|05f0fe07929d35be0...|b41cc5a0872a7fd88...|health_beauty|            1|c70c1b0d8ca86052f...|     118.35|ada8dea5e28ec3e9

In [5]:
# CAUTION: `dropDuplicates()` is intentionally NOT applied here.
# - `order_id` and `order_item_id` are dropped so this frame can be unioned
#   with `seller_order`, but the same customer may buy the same product again,
#   so rows that look identical are still distinct purchases and must be kept.
# - A temporary unique identifier (`tmp_id`) is attached to every row so that
#   these repeated purchases are not collapsed by any later de-duplication.
customer_order = (
    target_product_order_info_with_segment
    .select('product_id', 'category', 'customer_id')
    .withColumnRenamed('customer_id', 'user_id')
    .withColumn("tmp_id", F.expr("uuid()"))
)
customer_order.show(n=2, truncate=False)

+--------------------------------+-------------+--------------------------------+------------------------------------+
|product_id                      |category     |user_id                         |tmp_id                              |
+--------------------------------+-------------+--------------------------------+------------------------------------+
|05f0fe07929d35be0748fac56eecbeb2|health_beauty|32ca12567c820aa0b0e5c20876829bdc|0fbe29f9-ce0d-48e2-9c72-76a912f4e5b3|
|05f0fe07929d35be0748fac56eecbeb2|health_beauty|ada8dea5e28ec3e93fc7d67ac4428363|6580b11d-93ce-4853-b980-31e444a89235|
+--------------------------------+-------------+--------------------------------+------------------------------------+
only showing top 2 rows



In [6]:
# Why `dropDuplicates()` IS applied here:
# - On the seller side a (product, category, seller) combination should be
#   counted only once, so repeated order rows are collapsed.
# - A temporary unique identifier (`tmp_id`) is then attached so the schema
#   matches `customer_order`.
seller_order = (
    target_product_order_info_with_segment
    .select('product_id', 'category', 'seller_id')
    .withColumnRenamed('seller_id', 'user_id')
    .dropDuplicates()
    .withColumn("tmp_id", F.expr("uuid()"))
)
seller_order.show(n=2)



+--------------------+-------------+--------------------+--------------------+
|          product_id|     category|             user_id|              tmp_id|
+--------------------+-------------+--------------------+--------------------+
|8a5334e25cfd596c6...|health_beauty|4aba6a02a788d3ec8...|605b0017-3034-4a6...|
|25e9a0d431fe3359e...|health_beauty|402916f742e5c740c...|27e73798-f433-48a...|
+--------------------+-------------+--------------------+--------------------+
only showing top 2 rows



                                                                                

In [7]:
# Stack sellers and customers into a single frame. Columns are matched by name,
# so both inputs must expose product_id / category / user_id / tmp_id.
user_order = seller_order.unionByName(customer_order)
user_order.show(n=2)



+--------------------+-------------+--------------------+--------------------+
|          product_id|     category|             user_id|              tmp_id|
+--------------------+-------------+--------------------+--------------------+
|8a5334e25cfd596c6...|health_beauty|4aba6a02a788d3ec8...|605b0017-3034-4a6...|
|25e9a0d431fe3359e...|health_beauty|402916f742e5c740c...|27e73798-f433-48a...|
+--------------------+-------------+--------------------+--------------------+
only showing top 2 rows



                                                                                

In [8]:
# Load the per-user location table and cast both coordinate columns to
# 32-bit floats; the remaining columns are passed through unchanged.
user_location = (
    spark.read.table(f"{test_namespace}.user_location")
    .withColumns({
        axis: F.col(axis).cast(FloatType())
        for axis in ('lat', 'lng')
    })
)
user_location.show(n=2)

+--------+--------------------+---------+----------+---------+-----+---------+
|zip_code|             user_id|user_type|       lat|      lng|state|     city|
+--------+--------------------+---------+----------+---------+-----+---------+
|    1001|8602a61d680a10a82...|   seller| -23.55019|-46.63402|   SP|sao paulo|
|    1003|0c1a20644f0dc126c...| customer|-23.548994|-46.63573|   SP|sao paulo|
+--------+--------------------+---------+----------+---------+-----+---------+
only showing top 2 rows



In [9]:
# Attach location attributes to every seller/customer row. The inner join
# drops users that have no entry in `user_location`. The result is sorted by
# product so that rows for the same product sit next to each other.
order_location = (
    user_order
    .join(user_location, on='user_id', how='inner')
    .orderBy('product_id')
)

In [10]:
# Preview two rows of the joined result. This is the first action on
# `order_location`, so the join and the sort are executed here.
order_location.show(2)



+--------------------+--------------------+-------------+--------------------+--------+---------+----------+----------+-----+------------+
|             user_id|          product_id|     category|              tmp_id|zip_code|user_type|       lat|       lng|state|        city|
+--------------------+--------------------+-------------+--------------------+--------+---------+----------+----------+-----+------------+
|e9779976487b77c6d...|00210e41887c2a8ef...|health_beauty|941b2890-5a39-40a...|   11701|   seller|-24.008923|-46.419125|   SP|praia grande|
|c9b884c5a603d602b...|00210e41887c2a8ef...|health_beauty|304f1703-1954-422...|   98910| customer|-27.776575| -54.23552|   RS|tres de maio|
+--------------------+--------------------+-------------+--------------------+--------+---------+----------+----------+-----+------------+
only showing top 2 rows



                                                                                

In [11]:
# Check column names, types and nullability before the frame is persisted.
order_location.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- tmp_id: string (nullable = false)
 |-- zip_code: string (nullable = true)
 |-- user_type: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lng: float (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)



In [12]:
# Persist the result as `<namespace>.order_location`, creating the table or
# replacing it if it already exists (any previous contents are overwritten).
order_location.writeTo(f"{test_namespace}.order_location").createOrReplace()

                                                                                

In [13]:
# Shut down the Spark session; no cell above can be re-run after this
# without creating a new session first.
spark.stop()