In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum

spark = SparkSession.builder.appName("ETLPipeline").getOrCreate()

# Extract
df = spark.read.csv("sales_data.csv", header=True, inferSchema=True)

# Transform
df_filtered = df.filter(col("Revenue") > 100)
df_result = df_filtered.groupBy("Product_Category").agg(sum("Revenue").alias("total_sales"))

# Load
df_result.write.mode("overwrite").parquet("output_sales.parquet")

spark.stop()


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, sum, count

# Inisialisasi SparkSession
spark = SparkSession.builder.appName("ETLPipeline").getOrCreate()

# Membaca file CSV
df = spark.read.csv("sales_data.csv", header=True, inferSchema=True)

# Menghitung total pendapatan per bulan
df_revenue = df.withColumn("month", month("Date")) \
    .groupBy("month") \
    .agg(sum(df["Unit_Price"] * df["Order_Quantity"]).alias("total_revenue"))

# Menampilkan hasil
df_revenue.show()
 # Menampilkan produk terlaris 
df_top_products = df.groupBy("product")\
                    .agg(count("*").alias("totaL_orders"))\
                    .orderBy("total_orders", ascending=False)\
                    .limit(5)
df_top_products.show()


+-----+-------------+
|month|total_revenue|
+-----+-------------+
|   12|     10158080|
|    1|      7832338|
|    6|     10085537|
|    3|      8201790|
|    5|      9859851|
|    9|      6517880|
|    4|      8485163|
|    8|      6348349|
|    7|      6392045|
|   10|      6709394|
|   11|      6977157|
|    2|      7608734|
+-----+-------------+

+--------------------+------------+
|             product|totaL_orders|
+--------------------+------------+
|Water Bottle - 30...|       10794|
| Patch Kit/8 Patches|       10416|
|  Mountain Tire Tube|        6816|
|        AWC Logo Cap|        4358|
|Sport-100 Helmet,...|        4220|
+--------------------+------------+



In [6]:
#simpan hasil 
df_revenue.write.parquet("revenue_by_month.parquet")
df_top_products.write.parquet("top_products.write.parquet")
