In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

In [2]:
import os

import findspark

# Tell PySpark where the JDK and the unpacked Spark distribution live.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.4.1-bin-hadoop3",
)

# Must run after SPARK_HOME is set and before any `pyspark` import.
findspark.init()

In [3]:
from google.colab import drive

# Expose Google Drive inside the Colab VM; the dataset is read from this mount.
drive.mount(mountpoint="/content/gdrive")

Mounted at /content/gdrive


In [5]:
from pyspark import SparkConf
# NOTE(review): the wildcard imports below shadow Python builtins such as
# `sum`, `min`, `max` and `round` with their Spark column-function counterparts.
# They are kept because cells outside this view may rely on the bare names.
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Single-process local Spark, labelled with this lab's application name.
config = SparkConf()
config.setMaster('local')
config.setAppName('lab53')

# Reuse an already-running session when one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)
sc = spark.sparkContext

# Weekly invoice summary stored on the mounted Google Drive.
DATASET_PATH = '/content/gdrive/MyDrive/summary.parquet'

In [6]:
# Parquet files carry their own schema, so no inference option is required
# ("inferSchema" is a CSV/JSON reader option and has no effect on Parquet).
summary_df = spark.read.parquet(DATASET_PATH)

summary_df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- WeekNumber: integer (nullable = true)
 |-- NumInvoices: long (nullable = true)
 |-- TotalQuantity: long (nullable = true)
 |-- InvoiceValue: double (nullable = true)



Hãy sử dụng các thao tác Windowing để tạo thêm cột `RunningTotal`, tính tổng lũy kế của `InvoiceValue` từ tuần đầu tiên đến tuần hiện tại theo từng `Country`.
<img src = "https://firebasestorage.googleapis.com/v0/b/funix-way.appspot.com/o/xSeries%2FData%20Engineer%2FDEP303x%2FSumary_Image%2FDEP303_sum_L8_4.png?alt=media&token=d9ce255c-42f5-4bf5-9cc3-fde26db5f2bc">

In [8]:
from pyspark.sql import Window
from pyspark.sql import functions as f

# Running total per country: for each row, sum InvoiceValue over every row of
# the same Country from the earliest WeekNumber up to and including this one.
# The frame must start at `Window.unboundedPreceding`; a fixed offset such as
# -2 would give a 3-row sliding sum instead of a cumulative total.
running_total_window = (
    Window.partitionBy("Country")
    .orderBy("WeekNumber")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

summary_df.withColumn(
    "RunningTotal",
    f.sum("InvoiceValue").over(running_total_window),
).show()

+---------------+----------+-----------+-------------+------------+------------------+
|        Country|WeekNumber|NumInvoices|TotalQuantity|InvoiceValue|      RunningTotal|
+---------------+----------+-----------+-------------+------------+------------------+
|      Australia|        48|          1|          107|      358.25|            358.25|
|      Australia|        49|          1|          214|       258.9|            617.15|
|      Australia|        50|          2|          133|      387.95|1005.0999999999999|
|        Austria|        50|          2|            3|      257.04|            257.04|
|        Bahrain|        51|          1|           54|      205.74|            205.74|
|        Belgium|        48|          1|          528|       346.1|             346.1|
|        Belgium|        50|          2|          285|      625.16|            971.26|
|        Belgium|        51|          2|          942|      838.65|1809.9099999999999|
|Channel Islands|        49|          1|   