In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F 

StatementMeta(, 33b722e4-8056-4092-8ca5-afe6875f43f7, 6, Finished, Available, Finished)

In [None]:
# ============================================
# PART 2: CLEAN PAYMENTS
# ============================================

# Read the order-payments source table that the cleaning steps start from
source_table = "br_order_payments"
payments_df = spark.read.table(source_table)

# Inspect column types and a handful of rows before any transformation
payments_df.printSchema()
payments_df.show(5)

# Row count of the untouched input, kept as a baseline for later comparison
paymentCount = payments_df.count()
print("Total payments:", paymentCount)

StatementMeta(, 33b722e4-8056-4092-8ca5-afe6875f43f7, 9, Finished, Available, Finished)

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|
+--------------------+------------------+

In [None]:
# ---- Cleaning ----
# NOTE: this cell reassigns `payments_df`, so it is not safe to re-run on its
# own. After the first run payment_type holds display labels ("Credit Card"),
# which no longer match `valid_types` and would all be rewritten to "Other".
# Re-run the load cell first if this cell has to be executed again.

# Drop rows that are identical across every column, then keep only rows whose
# payment_value is non-negative (a NULL payment_value does not satisfy the
# predicate, so those rows are dropped as well).
payments_df = payments_df.dropDuplicates()
payments_df = payments_df.filter(F.col("payment_value") >= 0)

# Collapse any payment_type outside the known set into a single "other" bucket
valid_types = ["credit_card", "boleto", "voucher", "debit_card", "not_defined"]
payments_df = payments_df.withColumn(
    "payment_type",
    F.when(F.col("payment_type").isin(valid_types), F.col("payment_type")).otherwise("other")
)

# Turn the snake_case codes into display labels: "credit_card" -> "Credit Card"
payments_df = payments_df.withColumn(
    "payment_type",
    F.initcap(F.regexp_replace(F.col("payment_type"), "_", " "))
)

# count payments by type
payments_df.groupBy("payment_type").count().orderBy("payment_type").show()

payments_df.printSchema()
payments_df.show(5)

# Count once and reuse the value: every .count() launches a separate Spark job.
cleaned_count = payments_df.count()
print("✅ Payments cleaned rows:", cleaned_count)

# Compare against the raw row count captured in the load cell (`paymentCount`).
# The previous version subtracted the cleaned count from itself and therefore
# always reported 0; it also blamed a `payment_id` column that does not exist.
removed_count = paymentCount - cleaned_count
print(f"Removed {removed_count} rows (exact duplicates or negative payment_value)")

StatementMeta(, 33b722e4-8056-4092-8ca5-afe6875f43f7, 10, Finished, Available, Finished)

+------------+-----+
|payment_type|count|
+------------+-----+
|      Boleto|19784|
| Credit Card|76795|
|  Debit Card| 1529|
| Not Defined|    3|
|     Voucher| 5775|
+------------+-----+

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|cf30fe76d1505192a...|                 1| Credit Card|                   2|        47.72|
|a04639b53f2cbd1f7...|                 1|      Boleto|                   1|        44.83|
|2e2dd1119ebf597a9...|                 1| Credit Card|                   2|       105.42|
|440a666da55232dbd...|         

In [None]:

# ---- Persist to the silver table ----
silver_table = "lh_silver_olist.sl_payment"
(
    payments_df.write
    .format("delta")
    .mode("overwrite")  # replaces whatever the table held before
    .saveAsTable(silver_table)
)
print("✅ Payments cleaned. Silver rows:", payments_df.count())

# ---- Read back & quick EDA ----
payments_silver = spark.read.table(silver_table)
payments_silver.printSchema()
print("📊 Payments Silver count:", payments_silver.count())

# Peek at a few rows, restricted to the payment attributes
sample_cols = ["payment_type", "payment_installments", "payment_value"]
payments_silver.select(*sample_cols).show(5)

# Rows per payment type, most frequent first
type_counts = payments_silver.groupBy("payment_type").count()
type_counts.orderBy("count", ascending=False).show()

StatementMeta(, 33b722e4-8056-4092-8ca5-afe6875f43f7, 17, Finished, Available, Finished)

✅ Payments cleaned. Silver rows: 103886
root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)

📊 Payments Silver count: 103886
+------------+--------------------+-------------+
|payment_type|payment_installments|payment_value|
+------------+--------------------+-------------+
| Credit Card|                   1|        75.78|
|      Boleto|                   1|        56.42|
|      Boleto|                   1|       131.47|
| Credit Card|                   3|        37.09|
| Credit Card|                   2|       103.93|
+------------+--------------------+-------------+
only showing top 5 rows

+------------+-----+
|payment_type|count|
+------------+-----+
| Credit Card|76795|
|      Boleto|19784|
|     Voucher| 5775|
|  Debit Card| 1529|
| Not Defined|    3|
+------------+-----+



In [None]:

# ============================================
# STOP SPARK
# ============================================
# Shut the session down; any cell run after this point needs a new session.
spark.stop()
print("🔚 Spark session stopped.")