In [2]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook (getOrCreate returns the active
# session when one already exists).
spark = (
    SparkSession.builder
    .appName("Simple DataFrame Example")
    .getOrCreate()
)

# Sample rows: one (name, age) tuple per person.
data = [("Alice", 25), ("Matt", 30), ("Tokio", 28), ("David", 35)]

# Column labels, in the same order as the tuple fields above.
columns = ["Name", "Age"]

# Build the DataFrame from the in-memory rows, using `columns` as the schema.
df = spark.createDataFrame(data, columns)

# Print the full (four-row) table.
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
| Matt| 30|
|Tokio| 28|
|David| 35|
+-----+---+



In [3]:
# Projection: first only the Name column, then Name together with Age.
df.select(df["Name"]).show()
df.select(df["Name"], df["Age"]).show()

# Row filter: keep only people whose Age is strictly greater than 28.
df.where(df["Age"] > 28).show()

# Aggregate: number of rows for each distinct Age value.
df.groupBy(df["Age"]).count().show()

+-----+
| Name|
+-----+
|Alice|
| Matt|
|Tokio|
|David|
+-----+

+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
| Matt| 30|
|Tokio| 28|
|David| 35|
+-----+---+

+-----+---+
| Name|Age|
+-----+---+
| Matt| 30|
|David| 35|
+-----+---+



[Stage 10:>                                                         (0 + 2) / 2]

+---+-----+
|Age|count|
+---+-----+
| 25|    1|
| 30|    1|
| 28|    1|
| 35|    1|
+---+-----+



                                                                                

In [4]:
# Append a derived column holding each Age increased by ten, then display it.
df.withColumn("Age_Plus_10", df["Age"] + 10).show()

+-----+---+-----------+
| Name|Age|Age_Plus_10|
+-----+---+-----------+
|Alice| 25|         35|
| Matt| 30|         40|
|Tokio| 28|         38|
|David| 35|         45|
+-----+---+-----------+



In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the Spark session for the window-function example.
spark = (
    SparkSession.builder
    .appName("Window Function Example")
    .getOrCreate()
)

# Sales records: (name, date as a YYYY-MM-DD string, sales amount).
data = [
    ("Alice", "2024-09-01", 100), ("Alice", "2024-09-02", 200), ("Alice", "2024-09-03", 300),
    ("Bob", "2024-09-01", 150), ("Bob", "2024-09-02", 250), ("Bob", "2024-09-03", 350),
]
columns = ["Name", "Date", "Sales"]

# NOTE: this rebinds `df`; from here on it refers to the sales table rather
# than the Name/Age table built earlier in the notebook.
df = spark.createDataFrame(data, columns)
df.show()

# Per-Name window, ordered by Date, spanning from the first row of the
# partition up to and including the current row.
windowSpec = (
    Window.partitionBy("Name")
    .orderBy("Date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Summing Sales over that window gives a running total for each person.
df_running_total = df.withColumn(
    "Running_Total",
    F.sum(F.col("Sales")).over(windowSpec),
)
df_running_total.show()

+-----+----------+-----+
| Name|      Date|Sales|
+-----+----------+-----+
|Alice|2024-09-01|  100|
|Alice|2024-09-02|  200|
|Alice|2024-09-03|  300|
|  Bob|2024-09-01|  150|
|  Bob|2024-09-02|  250|
|  Bob|2024-09-03|  350|
+-----+----------+-----+

+-----+----------+-----+-------------+
| Name|      Date|Sales|Running_Total|
+-----+----------+-----+-------------+
|Alice|2024-09-01|  100|          100|
|Alice|2024-09-02|  200|          300|
|Alice|2024-09-03|  300|          600|
|  Bob|2024-09-01|  150|          150|
|  Bob|2024-09-02|  250|          400|
|  Bob|2024-09-03|  350|          750|
+-----+----------+-----+-------------+



In [8]:
# Per-Name window with rows ordered from the highest Sales value to the lowest.
windowSpecRank = Window.partitionBy("Name").orderBy(F.col("Sales").desc())

# Rank each row inside its Name partition; rank 1 is the largest Sales value.
df_rank = df.withColumn("Rank", F.rank().over(windowSpecRank))
df_rank.show()


+-----+----------+-----+----+
| Name|      Date|Sales|Rank|
+-----+----------+-----+----+
|Alice|2024-09-03|  300|   1|
|Alice|2024-09-02|  200|   2|
|Alice|2024-09-01|  100|   3|
|  Bob|2024-09-03|  350|   1|
|  Bob|2024-09-02|  250|   2|
|  Bob|2024-09-01|  150|   3|
+-----+----------+-----+----+

