In [18]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, isnan, when, count


# Create (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("HDFS File Read Example")
    .getOrCreate()
)

# Load the raw products table and the category-name translation table from
# HDFS. Both files have a header row; column types are inferred by Spark.
products = spark.read.csv(
    "hdfs://master:9000/raw/olist/olist_products_dataset.csv",
    header=True,
    inferSchema=True,
)
products_eng = spark.read.csv(
    "hdfs://master:9000/raw/olist/product_category_name_translation.csv",
    header=True,
    inferSchema=True,
)

                                                                                

In [19]:
# Number of distinct category names in the products table.
(
    products
    .select('product_category_name')
    .distinct()
    .count()
)

74

In [20]:
# Number of distinct English category names in the translation table.
(
    products_eng
    .select('product_category_name_english')
    .distinct()
    .count()
)

71

In [21]:
# Category names that occur in the products table but have no row in the
# translation table.
diff1 = (
    products.select('product_category_name').distinct()
    .subtract(products_eng.select('product_category_name').distinct())
)

diff1.show(truncate=False)

+---------------------------------------------+
|product_category_name                        |
+---------------------------------------------+
|pc_gamer                                     |
|portateis_cozinha_e_preparadores_de_alimentos|
|NULL                                         |
+---------------------------------------------+



In [22]:
# Show the products whose category name is missing.
# product_category_name is a string column, so a NULL check is the complete
# test. isnan() is a float/double predicate: applying it here forces an
# implicit string-to-double cast of every category name, which adds nothing
# for text values and may raise when ANSI mode is enabled.
products.filter(col("product_category_name").isNull()).show()

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|a41e356c76fab6633...|                 NULL|               NULL|                      NULL|              NULL|             650|               17|               14|              12|
|d8dee61c2034d6d07...|                 NULL|               NULL|                      NULL|              NULL|             300|               16|                7|              20|
|56139431d72cd51f1...|                 NULL|               NULL|                      NULL|    

In [23]:
# Reverse check: category names present in the translation table that never
# occur in the products table.
diff2 = (
    products_eng.select('product_category_name').distinct()
    .subtract(products.select('product_category_name').distinct())
)

diff2.show(truncate=False)

+---------------------+
|product_category_name|
+---------------------+
+---------------------+



In [24]:
# Translations for the two category names that the products table uses but the
# translation table lacks. The field names mirror the translation table's
# columns (product_category_name, product_category_name_english) so these rows
# match it by name as well as by position.
new_rows = [
    Row(
        product_category_name='pc_gamer',
        product_category_name_english='gaming_pc',
    ),
    Row(
        product_category_name='portateis_cozinha_e_preparadores_de_alimentos',
        product_category_name_english='portable_kitchen_and_food_processors',
    ),
]

new_df = spark.createDataFrame(new_rows)

In [25]:
# Append the extra translation rows to the translation table. union() pairs
# columns by position, not by name, so new_df must list its columns in the
# same order as products_eng.
products_eng_updated = products_eng.union(new_df)
# Row count after the append.
products_eng_updated.count()

73

In [26]:
# Preview the extended translation table (show() prints the first rows only).
products_eng_updated.show()

+---------------------+-----------------------------+
|product_category_name|product_category_name_english|
+---------------------+-----------------------------+
|         beleza_saude|                health_beauty|
| informatica_acess...|         computers_accesso...|
|           automotivo|                         auto|
|      cama_mesa_banho|               bed_bath_table|
|     moveis_decoracao|              furniture_decor|
|        esporte_lazer|               sports_leisure|
|           perfumaria|                    perfumery|
| utilidades_domest...|                   housewares|
|            telefonia|                    telephony|
|   relogios_presentes|                watches_gifts|
|    alimentos_bebidas|                   food_drink|
|                bebes|                         baby|
|            papelaria|                   stationery|
| tablets_impressao...|         tablets_printing_...|
|           brinquedos|                         toys|
|       telefonia_fixa|     

In [None]:
# Export the extended translation table as CSV with a header row to a local
# (file://) directory. coalesce(1) reduces the data to one partition so a
# single part file is written; an existing output directory is overwritten.
(
    products_eng_updated
    .coalesce(1)
    .write
    .option("header", True)
    .mode("overwrite")
    .csv("file:///tmp/products_eng_updated_single")
)

# Manual follow-up run in a shell: rename the generated part file to a stable
# name. The part file name below is the one from a previous run.
# cd /tmp/products_eng_updated_single && mv part-00000-78579e90-a292-4c4a-95f6-e98c8c464fbe-c000.csv products_eng_updated_single.csv

                                                                                

In [35]:
# Read the exported translation table back from HDFS and count its rows as a
# sanity check.
# NOTE(review): the export cell writes to a local file:// path; the step that
# places the file at this HDFS location is not shown in this notebook.
test = spark.read.csv(
    "hdfs://master:9000/preprocessed/olist/products_eng_updated_single.csv",
    header=True,
    inferSchema=True,
)
test.count()

73