In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType
from service.utils.spark import get_spark_session

In [2]:
# Obtain the SparkSession that every later cell uses, via the project helper.
# NOTE(review): `dev=True` presumably selects a development configuration —
# confirm against `service.utils.spark.get_spark_session`.
spark = get_spark_session(dev=True)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/27 13:20:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Namespace that every table in this notebook is read from and written to.
# It is created up front so the reads/writes below do not fail on a fresh catalog.
test_namespace = 'gold'
spark.sql(
    f"CREATE NAMESPACE IF NOT EXISTS {test_namespace}"
)

DataFrame[]

In [4]:
# Read the `delivered_order_timestamp` table, then project it down to the
# order -> customer mapping needed to attach a customer to each ordered product.
delivered_order_customer = spark.read.table(
    f"{test_namespace}.delivered_order_timestamp"
)
delivered_order_customer_id = delivered_order_customer.select(
    "order_id",
    "customer_id",
)

In [5]:
# Read the `delivered_order_product_bcg` table (the seller cell selects
# product_id, category and seller_id from it; the join below relies on its order_id).
delivered_order_product_bcg = spark.read.table(
    f"{test_namespace}.delivered_order_product_bcg"
)

# Attach the purchasing customer to each row by matching on order_id.
# Inner join: rows without a matching order on either side are dropped.
users_order_produt = delivered_order_product_bcg.join(
    delivered_order_customer_id,
    on="order_id",
    how="inner",
)

In [6]:
# CAUTION: why `dropDuplicates()` is NOT applied here
# - The order-level columns (`order_id`, `order_item` per the original note) are
#   dropped so this frame can be unioned with `seller_order`. A customer may buy
#   the same product again, so identical rows are legitimate and must be kept.
# - A fresh temporary unique id is attached to every row so that repeated
#   purchases are not collapsed by a later de-duplication.
customer_order = (
    users_order_produt
    .select("product_id", "category", "customer_id")
    .withColumnRenamed("customer_id", "user_id")
    .withColumn("tmp_id", F.expr("uuid()"))
)
customer_order.show(n=2, truncate=False)

                                                                                

+--------------------------------+-------------+--------------------------------+------------------------------------+
|product_id                      |category     |user_id                         |tmp_id                              |
+--------------------------------+-------------+--------------------------------+------------------------------------+
|340e3e28357bb29f7612c86c13d9729a|health_beauty|ab4f4c7e4c8db698d574957a76b94bc2|b42f472c-2d47-4814-a2f9-b38b1d5a79df|
|70869aae4d447a8815db20569604a9e7|health_beauty|a525b3b29560b44b401e1f7b119a1172|413c0b88-a51a-48f9-b55c-b70a5d18265d|
+--------------------------------+-------------+--------------------------------+------------------------------------+
only showing top 2 rows



In [7]:
# Why `dropDuplicates()` IS applied here
# - On the seller side a (product, category, seller) combination should appear
#   only once, so repeated rows are collapsed.
# - A temporary unique id is then attached to every remaining row so the schema
#   lines up with `customer_order`.
seller_order = (
    delivered_order_product_bcg
    .select("product_id", "category", "seller_id")
    .withColumnRenamed("seller_id", "user_id")
    .dropDuplicates()
    .withColumn("tmp_id", F.expr("uuid()"))
)
seller_order.show(n=2)

+--------------------+-------------+--------------------+--------------------+
|          product_id|     category|             user_id|              tmp_id|
+--------------------+-------------+--------------------+--------------------+
|47f699d9462e071f9...|health_beauty|56642bcb79900e777...|c0b426da-ebb5-4c7...|
|56f096959d0abfbb6...|health_beauty|56642bcb79900e777...|604a15fc-2c87-4b1...|
+--------------------+-------------+--------------------+--------------------+
only showing top 2 rows



In [8]:
# Stack seller rows and customer rows into one frame. Columns are matched by
# name (product_id, category, user_id, tmp_id), not by position.
user_order = seller_order.unionByName(
    customer_order
)
user_order.show(n=2)

+--------------------+-------------+--------------------+--------------------+
|          product_id|     category|             user_id|              tmp_id|
+--------------------+-------------+--------------------+--------------------+
|47f699d9462e071f9...|health_beauty|56642bcb79900e777...|c0b426da-ebb5-4c7...|
|56f096959d0abfbb6...|health_beauty|56642bcb79900e777...|604a15fc-2c87-4b1...|
+--------------------+-------------+--------------------+--------------------+
only showing top 2 rows



In [9]:
# Read the `user_location` table and cast both coordinate columns to FloatType,
# replacing the original `lat` / `lng` columns in place.
user_location = (
    spark.read
    .table(f"{test_namespace}.user_location")
    .withColumns({
        coord: F.col(coord).cast(FloatType())
        for coord in ("lat", "lng")
    })
)
user_location.show(n=2)

+--------+--------------------+---------+----------+---------+-----+---------+
|zip_code|             user_id|user_type|       lat|      lng|state|     city|
+--------+--------------------+---------+----------+---------+-----+---------+
|    1001|8602a61d680a10a82...|   seller| -23.55019|-46.63402|   SP|sao paulo|
|    1003|0c1a20644f0dc126c...| customer|-23.548994|-46.63573|   SP|sao paulo|
+--------+--------------------+---------+----------+---------+-----+---------+
only showing top 2 rows



In [10]:
# Enrich every seller/customer row with that user's location. Inner join on
# user_id: users without a location row are dropped. The result is then sorted
# by product_id.
order_location = (
    user_order
    .join(user_location, on="user_id", how="inner")
    .orderBy("product_id")
)

In [11]:
# Preview two rows of the joined, product-sorted frame; this triggers a Spark job
# that evaluates the join and the sort.
order_location.show(2)



+--------------------+--------------------+-------------+--------------------+--------+---------+----------+----------+-----+-----------+
|             user_id|          product_id|     category|              tmp_id|zip_code|user_type|       lat|       lng|state|       city|
+--------------------+--------------------+-------------+--------------------+--------+---------+----------+----------+-----+-----------+
|771f0e0905089b99e...|00210e41887c2a8ef...|health_beauty|67dda72c-1094-4ce...|    9290| customer| -23.65236|-46.507896|   SP|santo andre|
|3f36fa15a16c48509...|00210e41887c2a8ef...|health_beauty|62347e4f-7c1c-442...|   72270| customer|-15.805427| -48.14381|   DF|   brasilia|
+--------------------+--------------------+-------------+--------------------+--------+---------+----------+----------+-----+-----------+
only showing top 2 rows



                                                                                

In [12]:
# Print column names, types and nullability as a sanity check before the frame
# is written out.
order_location.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- tmp_id: string (nullable = false)
 |-- zip_code: string (nullable = true)
 |-- user_type: string (nullable = true)
 |-- lat: float (nullable = true)
 |-- lng: float (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)



In [13]:
# Persist the result as `<namespace>.order_location`. `createOrReplace()` creates
# the table if it is missing and otherwise replaces it with this frame.
(
    order_location
    .writeTo(f"{test_namespace}.order_location")
    .createOrReplace()
)

                                                                                

In [14]:
# Stop the SparkSession created at the top of the notebook; `spark` cannot be
# used by any cell after this without creating a new session.
spark.stop()