In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Bucket name: of the special characters, only `-` and `.` are allowed.
BUCKET = "warehouse-dev"
BRONZE_NAMESPACE = "bronze"

# Both the s3a:// and s3:// schemes work here; see spark-defaults.conf.
bronze_path = f"s3a://{BUCKET}/{BRONZE_NAMESPACE}"

# Reach Hadoop's FileSystem API through the JVM gateway: build a Hadoop Path
# for the bronze location and resolve its FileSystem from the session's
# Hadoop configuration.
hadoop_conf = spark._jsc.hadoopConfiguration()
bronze_dir = spark._jvm.org.apache.hadoop.fs.Path(bronze_path)
bronze_fs = bronze_dir.getFileSystem(hadoop_conf)

# Collect the full path of every listed entry that is a file
# (entries that are not files, e.g. directories, are skipped).
file_list = []
for entry in bronze_fs.listStatus(bronze_dir):
    if entry.isFile():
        file_list.append(entry.getPath().toString())
file_list

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/08 06:59:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/08 06:59:34 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


['s3a://warehouse-dev/bronze/olist_customers_dataset.csv',
 's3a://warehouse-dev/bronze/olist_geolocation_dataset.csv',
 's3a://warehouse-dev/bronze/olist_order_items_dataset.csv',
 's3a://warehouse-dev/bronze/olist_order_payments_dataset.csv',
 's3a://warehouse-dev/bronze/olist_order_reviews_dataset.csv',
 's3a://warehouse-dev/bronze/olist_orders_dataset.csv',
 's3a://warehouse-dev/bronze/olist_products_dataset.csv',
 's3a://warehouse-dev/bronze/olist_sellers_dataset.csv',
 's3a://warehouse-dev/bronze/product_category_name_translation.csv']

In [None]:
# Catalog name: `-` is not allowed, `_` is.
CATALOG = "warehouse_dev"
SILVER_NAMESPACE = "silver.dedup"

# Fully qualified namespace: catalog name followed by the namespace.
TARGET_QUALIFIED_NAMESPACE = ".".join((CATALOG, SILVER_NAMESPACE))

# Create the namespace only when it does not exist yet.
create_namespace_sql = f"CREATE NAMESPACE IF NOT EXISTS {TARGET_QUALIFIED_NAMESPACE}"
spark.sql(create_namespace_sql)

DataFrame[]

In [3]:

# Load every listed CSV, drop exact duplicate rows, and write the result to a
# table named after the file inside TARGET_QUALIFIED_NAMESPACE.
CSV_SUFFIX = ".csv"

for file_path in file_list:
    # First line is the header; column types are inferred from the data.
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    # The de-duplicated frame feeds both the row count printed below and the
    # table write, so cache it to avoid running the de-duplication twice.
    dedup_df = df.dropDuplicates().cache()
    try:
        print(f"{file_path}, before: {df.count()}, after: {dedup_df.count()}")

        # Table name = file name without its trailing ".csv". Only the suffix
        # is stripped, so a ".csv" elsewhere in the name is left intact.
        file_name = file_path.split("/")[-1]
        if file_name.endswith(CSV_SUFFIX):
            table_name = file_name[: -len(CSV_SUFFIX)]
        else:
            table_name = file_name
        full_table_name = f"{TARGET_QUALIFIED_NAMESPACE}.{table_name}"

        if not spark.catalog.tableExists(full_table_name):
            # First run for this file: create the table from the DataFrame.
            dedup_df.writeTo(full_table_name).create()
        else:
            # Re-run: overwrite the existing table's partitions with the new data.
            # NOTE(review): for an unpartitioned table this is expected to
            # replace the whole table - confirm for the catalog in use.
            dedup_df.writeTo(full_table_name).overwritePartitions()
    finally:
        # Release the cached data before moving on to the next file.
        dedup_df.unpersist()

s3a://warehouse-dev/bronze/olist_customers_dataset.csv, before: 99441, after: 99441


                                                                                

s3a://warehouse-dev/bronze/olist_geolocation_dataset.csv, before: 1000163, after: 738332
s3a://warehouse-dev/bronze/olist_order_items_dataset.csv, before: 112650, after: 112650
s3a://warehouse-dev/bronze/olist_order_payments_dataset.csv, before: 103886, after: 103886
s3a://warehouse-dev/bronze/olist_order_reviews_dataset.csv, before: 104162, after: 104077
s3a://warehouse-dev/bronze/olist_orders_dataset.csv, before: 99441, after: 99441
s3a://warehouse-dev/bronze/olist_products_dataset.csv, before: 32951, after: 32951
s3a://warehouse-dev/bronze/olist_sellers_dataset.csv, before: 3095, after: 3095
s3a://warehouse-dev/bronze/product_category_name_translation.csv, before: 71, after: 71


In [4]:
# Ask the catalog which tables exist in the target namespace and keep only
# their names.
table_rows = spark.sql(f"SHOW TABLES IN {TARGET_QUALIFIED_NAMESPACE}").collect()
table_names = [row["tableName"] for row in table_rows]
table_names

['olist_customers_dataset',
 'olist_geolocation_dataset',
 'olist_order_items_dataset',
 'olist_order_payments_dataset',
 'olist_order_reviews_dataset',
 'olist_orders_dataset',
 'olist_products_dataset',
 'olist_sellers_dataset',
 'product_category_name_translation']

In [None]:
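# Optional cleanup, intentionally left commented out: uncomment to drop every
# table listed in `table_names` from the target namespace.
# for table_name in table_names:
#     spark.sql(f"DROP TABLE IF EXISTS {TARGET_QUALIFIED_NAMESPACE}.{table_name} PURGE")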

In [5]:
# Shut down the Spark session now that all work in the notebook is done.
spark.stop()