In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum

from pyspark.sql import functions as F

# Get the active SparkSession, or start one named "GroupBySumExample".
spark = SparkSession.builder.appName("GroupBySumExample").getOrCreate()

# Sample (Name, Amount) rows: two entries each for Alice and Bob.
data = [("Alice", 100), ("Bob", 200), ("Alice", 150), ("Bob",  300)]
df = spark.createDataFrame(data, ["Name", "Amount"])

# Total "Amount" per "Name". The aggregate is referenced as F.sum so this
# step does not depend on the bare name `sum`, which is also a Python builtin
# and is easy to rebind by accident in a long-lived kernel.
result = df.groupBy("Name").agg(F.sum("Amount").alias("TotalAmount"))

# Print the aggregated rows (one row per distinct name).
result.show()

+-----+-----------+
| Name|TotalAmount|
+-----+-----------+
|Alice|        250|
|  Bob|        500|
+-----+-----------+



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lag, col
from pyspark.sql.window import Window

# Obtain a SparkSession for this example.
spark = SparkSession.builder.appName("LagExample").getOrCreate()

# Raw rows: (timestamp string, user name, dollar amount).
data = [
    ("2022-01-01 09:00:00", "Alice", 1),
    ("2022-01-01 09:10:00", "Alice", 2),
    ("2022-01-01 09:20:00", "Bob", 20),
    ("2022-01-01 09:30:00", "Alice", 3),
    ("2022-01-01 09:30:00", "Bob", 10),
    ("2022-01-01 09:50:00", "Bob", 30),
]

# One window per user, rows ordered chronologically inside each window.
per_user_by_time = Window.partitionBy("name").orderBy("timestamp")

df = (
    spark.createDataFrame(data, ["timestamp", "name", "dollars"])
    # Parse the string column into a real timestamp so ordering is chronological.
    .withColumn("timestamp", col("timestamp").cast("timestamp"))
    # Previous "dollars" value for the same user; null on each user's first row.
    .withColumn("previous_dollars", lag("dollars").over(per_user_by_time))
    # Change versus the previous row; null wherever there is no previous row.
    .withColumn("amount_difference", col("dollars") - col("previous_dollars"))
)

# Print the enriched rows.
df.show()

+-------------------+-----+-------+----------------+-----------------+
|          timestamp| name|dollars|previous_dollars|amount_difference|
+-------------------+-----+-------+----------------+-----------------+
|2022-01-01 09:00:00|Alice|      1|            null|             null|
|2022-01-01 09:10:00|Alice|      2|               1|                1|
|2022-01-01 09:30:00|Alice|      3|               2|                1|
|2022-01-01 09:20:00|  Bob|     20|            null|             null|
|2022-01-01 09:30:00|  Bob|     10|              20|              -10|
|2022-01-01 09:50:00|  Bob|     30|              10|               20|
+-------------------+-----+-------+----------------+-----------------+

