In [1]:
from service.utils.spark import get_spark_session
from pyspark.sql import Window
from pyspark.sql import functions as F
from operator import or_
from functools import reduce  # functools에서 reduce를 직접 import합니다.

import time

In [None]:
spark_session = get_spark_session(dev=True)
order_item_df = spark_session.read.table('warehousedev.silver.order_item').dropDuplicates()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/07 14:32:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/07 14:32:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
product_df = spark_session.read.table('warehousedev.silver.product').dropDuplicates()

In [4]:
product_df.groupBy('product_id').agg(F.count('product_id')).show(n=100, truncate=False)

                                                                                

+--------------------------------+-----------------+
|product_id                      |count(product_id)|
+--------------------------------+-----------------+
|7946cc9288ba7328ba3f7ef55c146d70|1                |
|03d7ad0ce97624c931b3abde02b5d819|1                |
|3f1a741cf5591384428c1cbb0ef07ec0|1                |
|42a2bd596fda1baef5719cb74f73030c|1                |
|2f9c2888168b8c2d8e625905f3737057|1                |
|ec8514b52ebb78103c68fddbb2d9f0a8|1                |
|9798f14fd2be3c482aaff824a05ac605|1                |
|37274552ed878a9f6279f30be24b1473|1                |
|1a596515dcbe1a261f8dc0640ccb5149|1                |
|90509918fbc0f45016520d833ae25f78|1                |
|a760ef0bb70c954d79e62ec6023d380c|1                |
|38c56c5c5b16958d95bf8254498ad555|1                |
|290ada89b05e1dca2a4fe42f1cfc012e|1                |
|f42b6d7b1de09b0216895a6dd9621c79|1                |
|746c36df2702006bac777bbb9bf8a0a5|1                |
|8b90be4893a4277a9f33c5b2348cf9c6|1           

In [5]:
select_exprs = [
    "o.shipping_limit_date",
    "p.category",
    "o.seller_id",
    "o.order_id",
    "o.order_item_id",
    "o.price as unit_price",
    "o.freight_value as unit_freight",
    "p.weight_g",
    "p.length_cm",
    "p.height_cm",
    "p.width_cm"
]

order_product = order_item_df.alias('o').join(product_df.alias('p'), on='product_id', how='right').selectExpr(select_exprs).dropDuplicates()

# order_item_df.createOrReplaceTempView("order_items")
# product_df.createOrReplaceTempView("products")

# # 2. spark.sql() 안에서는 데이터프레임 변수명이 아닌, 등록한 뷰의 이름을 사용합니다.
# #    'join' 중복 오류도 수정합니다.
# order_product = spark_session.sql("""
#     select 
#         o.shipping_limit_date,
#         p.category,
#         o.seller_id,
#         o.order_id,
#         o.order_item_id,
#         o.price,
#         o.freight_value,
#         p.weight_g,
#         p.length_cm,
#         p.height_cm,
#         p.width_cm
#     from
#         order_items as o
#     right join
#         products as p
#     ON o.product_id = p.product_id
# """)
order_product.show()



+-------------------+--------------------+--------------------+--------------------+-------------+----------+------------+--------+---------+---------+--------+
|shipping_limit_date|            category|           seller_id|            order_id|order_item_id|unit_price|unit_freight|weight_g|length_cm|height_cm|width_cm|
+-------------------+--------------------+--------------------+--------------------+-------------+----------+------------+--------+---------+---------+--------+
|2017-03-27 05:45:56|        home_confort|4a3ca9315b744ce9f...|7aaffbd34cee1970a...|            1|      89.9|        14.8|  4050.0|     45.0|     15.0|    35.0|
|2017-03-07 19:05:29|     furniture_decor|d20b021d3efdf267a...|a1a68e110acccf453...|            2|     10.99|       14.52|   100.0|     80.0|     10.0|    60.0|
|2017-04-20 07:50:17|                auto|8581055ce74af1dab...|c7037ea6603ae90fc...|            1|     117.3|       20.64|  4105.0|     67.0|     10.0|    67.0|
|2017-02-25 09:40:29|       health

                                                                                

In [6]:
print(order_item_df.count())    # 8238
print(order_product.count())    # 7854

                                                                                

8687




8323


                                                                                

In [7]:
order_product.columns

['shipping_limit_date',
 'category',
 'seller_id',
 'order_id',
 'order_item_id',
 'unit_price',
 'unit_freight',
 'weight_g',
 'length_cm',
 'height_cm',
 'width_cm']

In [8]:
null_conditions = [F.col(c).isNull() for c in order_product.columns]
any_null_condition = reduce(or_, null_conditions, F.lit(False))
rows_with_any_na = order_product.where(any_null_condition)

rows_with_any_na.show(truncate=False)

                                                                                

+-------------------+--------+---------+--------+-------------+----------+------------+--------+---------+---------+--------+
|shipping_limit_date|category|seller_id|order_id|order_item_id|unit_price|unit_freight|weight_g|length_cm|height_cm|width_cm|
+-------------------+--------+---------+--------+-------------+----------+------------+--------+---------+---------+--------+
+-------------------+--------+---------+--------+-------------+----------+------------+--------+---------+---------+--------+



                                                                                

In [9]:
rows_with_any_na.count()

                                                                                

0

In [None]:
order_product.filter(F.col('order_id') == 'c1aee3898c27b4b86706a7a0259efd19').show()



+-------------------+--------+---------+--------+-------------+----------+------------+--------+---------+---------+--------+
|shipping_limit_date|category|seller_id|order_id|order_item_id|unit_price|unit_freight|weight_g|length_cm|height_cm|width_cm|
+-------------------+--------+---------+--------+-------------+----------+------------+--------+---------+---------+--------+
+-------------------+--------+---------+--------+-------------+----------+------------+--------+---------+---------+--------+



                                                                                

In [None]:
# # 1. order_id로 그룹을 나누고, quantity를 기준으로 내림차순 정렬하는 Window를 정의합니다.
# windowSpec = Window.partitionBy(["order_id",'').orderBy(F.col("quantity").desc())

# # 2. 위에서 정의한 Window를 사용해 각 그룹 내 순위(rank)를 계산합니다.
# ranked_df = df.withColumn("rank", rank().over(windowSpec))

# # 3. 순위가 1인 (quantity가 max인) 행들만 필터링합니다.
# result_df = ranked_df.where(col("rank") == 1).drop("rank")

# # 4. 결과를 출력합니다.
# result_df.show()