In [2]:
# import libraries
import pandas as pd
import numpy as np


In [6]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m994.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Obtaining dependency information for py4j==0.10.9.7 from https://files.pythonhosted.org/packages/10/30/a58b32568f1623aaad7db22aa9eafc4c6c194b429ff35bdc55ca2726da47/py4j-0.10.9.7-py2.py3-none-any.whl.metadata
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488496 sha256=110d9de6270e465a7637553df1bc2648

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, sum

In [9]:
# create the spark session
spark = SparkSession.builder \
    .appName("Time_Based_Aggregations") \
    .getOrCreate()

# read in csv file
ecommerce_df = spark.read \
    .format("csv") \
    .option("header", "true") \
    .load('/Users/jacquelineyu/src/kaizenflow/sorrentum_sandbox/projects/SorrTask792_Windowed_Streaming_Analysis_With_Apache_Spark_Streaming/archive/Amazon Sale Report.csv')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/25 22:11:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [11]:
# list columns to drop
col_drop = ['Order ID', 'Style', 'SKU', 'ASIN', 'Courier Status', 'promotion-ids', 'B2B', 'fulfilled-by', 'Unnamed: 22']

# drop the columns
ecommerce_df = ecommerce_df.drop(*col_drop)

In [13]:
ecommerce_df.show(5)

+-----+--------+--------------------+----------+--------------+------------------+-------------+----+---+--------+------+-----------+-----------+----------------+------------+
|index|    Date|              Status|Fulfilment|Sales Channel |ship-service-level|     Category|Size|Qty|currency|Amount|  ship-city| ship-state|ship-postal-code|ship-country|
+-----+--------+--------------------+----------+--------------+------------------+-------------+----+---+--------+------+-----------+-----------+----------------+------------+
|    0|04-30-22|           Cancelled|  Merchant|     Amazon.in|          Standard|          Set|   S|  0|     INR|647.62|     MUMBAI|MAHARASHTRA|        400081.0|          IN|
|    1|04-30-22|Shipped - Deliver...|  Merchant|     Amazon.in|          Standard|        kurta| 3XL|  1|     INR| 406.0|  BENGALURU|  KARNATAKA|        560085.0|          IN|
|    2|04-30-22|             Shipped|    Amazon|     Amazon.in|         Expedited|        kurta|  XL|  1|     INR| 329.0

In [14]:
# Convert 'Date' column to timestamp type
ecommerce_df = ecommerce_df.withColumn("Date", ecommerce_df["Date"].cast("timestamp"))

# time-based aggregations using tumbling windows
# aggregate the sales using 'Amount' and 'Qty' for each day
tumbling_window_df = ecommerce_df \
    .groupBy(window("Date", "1 day")) \
    .agg(sum("Amount").alias("TotalAmount"), sum("Qty").alias("TotalQty"))

# time-based aggregations using sliding windows
# aggregate the sales using 'Amount' and 'Qty' for each day using sliding window of 3 days
sliding_window_df = ecommerce_df \
    .groupBy(window("Date", "3 days", slideDuration="1 day")) \
    .agg(sum("Amount").alias("TotalAmount"), sum("Qty").alias("TotalQty"))

# print tumbling window df
print("Tumbling Window Aggregations:")
tumbling_window_df.show(truncate=False)

# print sliding window df
print("Sliding Window Aggregations:")
sliding_window_df.show(truncate=False)

Tumbling Window Aggregations:




+------+-----------+--------+
|window|TotalAmount|TotalQty|
+------+-----------+--------+
+------+-----------+--------+

Sliding Window Aggregations:


[Stage 5:>                                                          (0 + 4) / 4]

+------+-----------+--------+
|window|TotalAmount|TotalQty|
+------+-----------+--------+
+------+-----------+--------+



In [None]:
# stop the SparkSession when the app is done
spark.stop()