In [2]:
// Smoke test: confirms the notebook kernel is alive before any Spark work is done.
Console.println("Start")

Start


In [4]:
import org.apache.spark.sql.expressions.Window

import org.apache.spark.sql.expressions.Window


In [34]:
// Sample input. Each pair is (Value, Sequences); every row is placed in Category 1.
// Sequences is a 0/1 flag; the cells below filter on 0 to build ranges and on 1 to pick units.
val df = Seq(
  10 -> 0,
  11 -> 1, 13 -> 1, 16 -> 1,
  20 -> 0,
  21 -> 0,
  22 -> 1, 25 -> 1, 27 -> 1, 29 -> 1,
  30 -> 0,
  32 -> 1, 34 -> 1, 35 -> 1,
  38 -> 0
).map { case (value, flag) => (1, value, flag) }.toDF("Category", "Value", "Sequences")

df: org.apache.spark.sql.DataFrame = [Category: int, Value: int ... 1 more field]


In [35]:
// Print the input rows (side-effecting call, hence the explicit parentheses).
df.show()

+--------+-----+---------+
|Category|Value|Sequences|
+--------+-----+---------+
|       1|   10|        0|
|       1|   11|        1|
|       1|   13|        1|
|       1|   16|        1|
|       1|   20|        0|
|       1|   21|        0|
|       1|   22|        1|
|       1|   25|        1|
|       1|   27|        1|
|       1|   29|        1|
|       1|   30|        0|
|       1|   32|        1|
|       1|   34|        1|
|       1|   35|        1|
|       1|   38|        0|
+--------+-----+---------+



Assign each row a unique id

In [36]:
// Keep every input column and append "zip", an increasing unique id per row.
// The later cells only compare zip values with < and >, so the ids need not be consecutive.
val zipped = df.select(col("*"), monotonically_increasing_id().as("zip"))

zipped: org.apache.spark.sql.DataFrame = [Category: int, Value: int ... 2 more fields]


In [37]:
// Print the rows together with their generated "zip" id.
zipped.show()

+--------+-----+---------+---+
|Category|Value|Sequences|zip|
+--------+-----+---------+---+
|       1|   10|        0|  0|
|       1|   11|        1|  1|
|       1|   13|        1|  2|
|       1|   16|        1|  3|
|       1|   20|        0|  4|
|       1|   21|        0|  5|
|       1|   22|        1|  6|
|       1|   25|        1|  7|
|       1|   27|        1|  8|
|       1|   29|        1|  9|
|       1|   30|        0| 10|
|       1|   32|        1| 11|
|       1|   34|        1| 12|
|       1|   35|        1| 13|
|       1|   38|        0| 14|
+--------+-----+---------+---+



Build a range from each zero row to the next zero row

In [38]:
// Window over the rows of one Category, ordered by the generated row id.
val categoryWindow = Window.partitionBy(col("Category")).orderBy(col("zip"))

categoryWindow: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@47877c3b


In [39]:
// One row per zero-flagged row: rangeStart is that row's own id, rangeEnd is the id of the
// next zero-flagged row in the same Category (null for the last one, which has no successor).
// The rename must come after lead(), because categoryWindow orders by the column name "zip".
val groups = zipped
             .where(col("Sequences") === 0)
             .withColumn("rangeEnd", lead("zip", 1).over(categoryWindow))
             .withColumnRenamed("zip", "rangeStart")

groups: org.apache.spark.sql.DataFrame = [Category: int, Value: int ... 3 more fields]


In [40]:
// Print the ranges without truncating cell contents.
groups.show(truncate = false)

+--------+-----+---------+----------+--------+
|Category|Value|Sequences|rangeStart|rangeEnd|
+--------+-----+---------+----------+--------+
|1       |10   |0        |0         |4       |
|1       |20   |0        |4         |5       |
|1       |21   |0        |5         |10      |
|1       |30   |0        |10        |14      |
|1       |38   |0        |14        |null    |
+--------+-----+---------+----------+--------+



Assign a range to each unit

In [41]:
// A unit belongs to a range when its id lies strictly between the range's two boundary ids.
// The column names refer to the "units" and "groups" aliases applied at join time.
val joinCondition = $"units.zip" > $"groups.rangeStart" && $"units.zip" < $"groups.rangeEnd"

joinCondition: org.apache.spark.sql.Column = ((units.zip > groups.rangeStart) AND (units.zip < groups.rangeEnd))


In [42]:
// Attach to every unit row (Sequences == 1) the rangeStart of the range that contains it.
//
// The join also requires the Category to match. The "zip" ids are unique across the whole
// DataFrame, while ranges are built per Category, so without this check a unit could be
// matched to a range from a different Category whenever categories interleave. The equality
// also gives the join a key to match on instead of a pure range comparison.
//
// It is a left join: units that fall inside no range are kept with a null rangeStart.
val unitsByRange = zipped
                    .filter($"Sequences" === 1).alias("units")
                    .join(
                      groups.alias("groups"),
                      ($"units.Category" === $"groups.Category").and(joinCondition),
                      "left")
                    .select("units.Category", "units.Value", "groups.rangeStart")

unitsByRange: org.apache.spark.sql.DataFrame = [Category: int, Value: int ... 1 more field]


In [43]:
// Print each unit with the start id of the range it was assigned to.
unitsByRange.show()

+--------+-----+----------+
|Category|Value|rangeStart|
+--------+-----+----------+
|       1|   11|         0|
|       1|   13|         0|
|       1|   16|         0|
|       1|   22|         5|
|       1|   25|         5|
|       1|   27|         5|
|       1|   29|         5|
|       1|   32|        10|
|       1|   34|        10|
|       1|   35|        10|
+--------+-----+----------+



group by range

In [44]:
// Sum the unit values per (Category, range), order the sums by where their range starts,
// then drop the helper column so only Category and the sum remain.
val result = unitsByRange
                .groupBy("Category", "rangeStart")
                .agg(sum($"Value").as("summing"))
                .sort($"rangeStart")
                .drop("rangeStart")

result: org.apache.spark.sql.DataFrame = [Category: int, summing: bigint]


In [45]:
// Print the per-range sums without truncating cell contents.
result.show(truncate = false)

+--------+-------+
|Category|summing|
+--------+-------+
|1       |40     |
|1       |103    |
|1       |101    |
+--------+-------+

