In [215]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, countDistinct

spark = SparkSession.builder.appName('products').getOrCreate()

# Join categories ENG

In [216]:
PRODUCTS_TABLE_NAME = f"warehouse_dev.silver.dedup.olist_products_dataset"
products = spark.table(PRODUCTS_TABLE_NAME)
products.show(2)
print(products.count())

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|a453b4ccccf554efb...|            telefonia|                 60|                       818|                 6|             300|               17|                4|              12|
|5657897eec3381c5a...|           cool_stuff|                 35|                       377|                 2|             500|               19|               17|              25|
+--------------------+---------------------+-------------------+--------------------------+----

In [217]:
unique_categories = products.select('product_category_name') \
                .dropDuplicates() \
                .filter(~col('product_category_name').isNull())

unique_categories.count()

73

In [231]:
CATEGORIES_TABLE_NAME = f"warehouse_dev.silver.dedup.product_category_name_translation"
categories = spark.table(CATEGORIES_TABLE_NAME)
categories.show(2)
print(categories.count())

+---------------------+-----------------------------+
|product_category_name|product_category_name_english|
+---------------------+-----------------------------+
|           automotivo|                         auto|
|                  pcs|                    computers|
+---------------------+-----------------------------+
only showing top 2 rows

71


In [230]:
joined_categories = unique_categories.join(categories, on='product_category_name', how='left')
joined_categories.show(truncate=False)
# print(joined_categories.count())

+---------------------------------+-------------------------------+
|product_category_name            |product_category_name_english  |
+---------------------------------+-------------------------------+
|pcs                              |computers                      |
|bebes                            |baby                           |
|artes                            |art                            |
|cine_foto                        |cine_photo                     |
|moveis_decoracao                 |furniture_decor                |
|pc_gamer                         |NULL                           |
|construcao_ferramentas_construcao|construction_tools_construction|
|tablets_impressao_imagem         |tablets_printing_image         |
|artigos_de_festas                |party_supplies                 |
|fashion_roupa_masculina          |fashion_male_clothing          |
|artigos_de_natal                 |christmas_supplies             |
|la_cuisine                       |la_cuisine   

In [223]:
null_categories = joined_categories.filter(col("product_category_name_english").isNull())
null_categories.show(truncate=False)

+---------------------------------------------+-----------------------------+
|product_category_name                        |product_category_name_english|
+---------------------------------------------+-----------------------------+
|pc_gamer                                     |NULL                         |
|portateis_cozinha_e_preparadores_de_alimentos|NULL                         |
+---------------------------------------------+-----------------------------+



In [229]:
type(null_categories)

pyspark.sql.dataframe.DataFrame

In [228]:
filled_categories = joined_categories.withColumn(
    "product_category_name_english",
    when( col("product_category_name") == "pc_gamer", "gaming_pc")
    .when( col("product_category_name") == "portateis_cozinha_e_preparadores_de_alimentos", "portable_kitchen_and_food_preparators")
    .otherwise(col('product_category_name_english'))
)

# filled_categories.show(truncate=False)
filled_categories.filter(col('product_category_name_english').isin(["gaming_pc", "portable_kitchen_and_food_preparators"])).show(truncate=False)

+---------------------------------------------+-------------------------------------+
|product_category_name                        |product_category_name_english        |
+---------------------------------------------+-------------------------------------+
|pc_gamer                                     |gaming_pc                            |
|portateis_cozinha_e_preparadores_de_alimentos|portable_kitchen_and_food_preparators|
+---------------------------------------------+-------------------------------------+



In [226]:
joined_categories.show(truncate=False)

+---------------------------------+-------------------------------+
|product_category_name            |product_category_name_english  |
+---------------------------------+-------------------------------+
|pcs                              |computers                      |
|bebes                            |baby                           |
|artes                            |art                            |
|cine_foto                        |cine_photo                     |
|moveis_decoracao                 |furniture_decor                |
|pc_gamer                         |NULL                           |
|construcao_ferramentas_construcao|construction_tools_construction|
|tablets_impressao_imagem         |tablets_printing_image         |
|artigos_de_festas                |party_supplies                 |
|fashion_roupa_masculina          |fashion_male_clothing          |
|artigos_de_natal                 |christmas_supplies             |
|la_cuisine                       |la_cuisine   

2

# Check null categories in products

In [95]:
null_categories_in_products = products.filter(col("product_category_name").isNull())
null_categories_in_products.show(2)
print(null_categories_in_products.count())

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|88c09dd121ebe0b11...|                 NULL|               NULL|                      NULL|              NULL|             300|               22|                9|              16|
|0ba14c257ad706bac...|                 NULL|               NULL|                      NULL|              NULL|             300|               16|               10|              11|
+--------------------+---------------------+-------------------+--------------------------+----

In [96]:
not_null_categories_in_products = products.filter(col("product_category_name").isNotNull())
not_null_categories_in_products.show(2)
print(not_null_categories_in_products.count())

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|a453b4ccccf554efb...|            telefonia|                 60|                       818|                 6|             300|               17|                4|              12|
|5657897eec3381c5a...|           cool_stuff|                 35|                       377|                 2|             500|               19|               17|              25|
+--------------------+---------------------+-------------------+--------------------------+----

In [97]:
# null_categories_in_products.join(not_null_categories_in_products, on='product_id', how='left').count()

# 교집합 product_id만 추출
common_product_ids = not_null_categories_in_products.select("product_id") \
    .intersect(null_categories_in_products.select("product_id"))

common_product_ids.show(10)
print("겹치는 product_id 개수:", common_product_ids.count())

+----------+
|product_id|
+----------+
+----------+

겹치는 product_id 개수: 0


# NULL 값 채울 수 있는 지 확인
1. order_items과 NUL_product_id 를 조인해서 셀러를 추출
2. seller_id의 판매 물품이 1개라면 채울 수 있음

In [137]:
ORDER_ITEMS_TABLE_NAME = "warehouse_dev.silver.dedup.olist_order_items_dataset"
order_items = spark.table(ORDER_ITEMS_TABLE_NAME)
order_items.show(2)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00ed64bc080d87b4a...|            1|fbb1cfc2810efabf3...|0c8380b62e38e8a1e...|2017-06-06 18:22:47| 44.9|        16.11|
|048f4f6b6d2d3bc13...|            1|c075b8e1313535522...|f80edd2c5aaa505cc...|2017-05-31 22:35:20| 48.9|         37.9|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
only showing top 2 rows



In [138]:
# gold: 셀러별 판매 물품 수
num_products_by_seller = order_items.groupBy('seller_id').agg(countDistinct('product_id').alias('number_of_proudcts_sold'))
num_products_by_seller.show()

+--------------------+-----------------------+
|           seller_id|number_of_proudcts_sold|
+--------------------+-----------------------+
|0ea22c1cfbdc755f8...|                    152|
|9803a40e82e45418a...|                     13|
|b3f19518fcec265b2...|                      4|
|297d5eccd19fa9a83...|                      3|
|8e6cc767478edae94...|                     61|
|da7039f29f90ce5b4...|                      7|
|ff063b022a9a0aab9...|                      9|
|062ce95fa2ad4dfae...|                     14|
|2009a095de2a2a416...|                      4|
|791cfcfe22fe4a771...|                      1|
|4d600e08ecbe08258...|                      8|
|ec8879960bd2221d5...|                      9|
|9c068d10aca38e85c...|                     13|
|c522be04e020c1e7b...|                      4|
|e63e8bfa530fb1691...|                     12|
|6eeed17989b0ae47c...|                      2|
|a49928bcdf77c55c6...|                      3|
|0b64bcdb0784abc13...|                      2|
|7aa4334be125

In [None]:
# gold: 동일 물건을 파는 셀러
num_seller_by_product = order_items.groupBy('product_id').agg(countDistinct('seller_id').alias('number_of_seller'))
num_seller_by_product.filter(col('number_of_seller') > 1).show()

In [139]:
sell_only_one_type_seller_id = num_products_by_seller.filter(col('number_of_proudcts_sold') == 1).select('seller_id')
sell_only_one_type_seller_id.count()

746

In [158]:
# ??????
one_product_seller = sell_only_one_type_seller_id.join(order_items, on='seller_id', how='left')
unique_product_id_one = one_product_seller.select('product_id').distinct()
unique_product_id_one.show()

+--------------------+
|          product_id|
+--------------------+
|75f3ef6a5cb0f2d5a...|
|d6a98c2faf26312d4...|
|9b8fae0d687fadb80...|
|7946cc9288ba7328b...|
|7368c0d612a177b6c...|
|2c92025a638964976...|
|54e423e4e4f906d5e...|
|4f8303640418a925c...|
|b575098a6da9b8138...|
|d65a890d49d3bb020...|
|ffaf0af7eebb57c7f...|
|b11b6e603959e09bb...|
|9dac5ef9b28fa1204...|
|f043e249c55c1ab30...|
|804f35d5834309996...|
|d13816a79f03bf978...|
|6e1c2008dea1929b9...|
|99c3deeef8923c103...|
|e93a471ae7f8acd97...|
|17787817a95955793...|
+--------------------+
only showing top 20 rows



In [161]:
unique_product_id_one.join(products, on='product_id', how='left').filter(col('product_category_name').isNull()).count()

39

# 물품 스펙이 동일하면 동일한 카테고리가 아닐까?

In [207]:
spec_cols = ["product_weight_g", "product_length_cm", "product_height_cm", "product_width_cm"]
products_spec = products.select(*spec_cols)
dedup_products_spec = products_spec.dropDuplicates()
dedup_products_spec.show(2)
dedup_products_spec.count()

+----------------+-----------------+-----------------+----------------+
|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+----------------+-----------------+-----------------+----------------+
|            1250|               32|               18|              32|
|            1150|               33|                9|              38|
+----------------+-----------------+-----------------+----------------+
only showing top 2 rows



23340

In [209]:
dedup_joined = dedup_products_spec.join(products, on=spec_cols, how='left').sort('product_weight_g').dropna()
dedup_joined.show(2)
dedup_joined.filter(col('product_category_name').isNull()).count()

+----------------+-----------------+-----------------+----------------+--------------------+---------------------+-------------------+--------------------------+------------------+
|product_weight_g|product_length_cm|product_height_cm|product_width_cm|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|
+----------------+-----------------+-----------------+----------------+--------------------+---------------------+-------------------+--------------------------+------------------+
|               0|               30|               25|              30|e673e90efa65a5409...|      cama_mesa_banho|                 53|                       528|                 1|
|               0|               30|               25|              30|8038040ee2a71048d...|      cama_mesa_banho|                 48|                       528|                 1|
+----------------+-----------------+-----------------+----------------+--------------------+---

0

In [210]:
null_products = products.filter(col('product_category_name').isNull()).select('product_id')
null_products.show(2)

+--------------------+
|          product_id|
+--------------------+
|88c09dd121ebe0b11...|
|0ba14c257ad706bac...|
+--------------------+
only showing top 2 rows



In [213]:
# "dedup_joined의 product_id가 null_products에 포함되는 것만 남김"
dedup_joined.join(
    null_products.select('product_id'), 
    on='product_id',
    how='left_semi'
).show()

+----------+----------------+-----------------+-----------------+----------------+---------------------+-------------------+--------------------------+------------------+
|product_id|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|
+----------+----------------+-----------------+-----------------+----------------+---------------------+-------------------+--------------------------+------------------+
+----------+----------------+-----------------+-----------------+----------------+---------------------+-------------------+--------------------------+------------------+

