In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [12]:
# Obtain the Spark session for this notebook (an existing session is reused
# by getOrCreate(), otherwise a new one is started under this app name).
spark = (
    SparkSession.builder
    .appName("PivotUnpivotExample")
    .getOrCreate()
)

In [20]:
# Sample records in long format: one (student, subject, score) entry per row.
score_records = [
    ("Alice", "Math", 85),
    ("Alice", "English", 78),
    ("Alice", "Science", 92),
    ("Bob", "Math", 85),
    ("Bob", "English", 85),
    ("Bob", "Science", 85),
]

# Wrap each record in a Row so the field names travel with the values.
data = [
    Row(name=student, subject=subject, score=score)
    for student, subject, score in score_records
]

In [21]:
# Sanity check: show which Python container currently holds the sample records.
container_type = type(data)
print(container_type)

<class 'list'>


In [22]:
# Build the DataFrame from the Row objects; no schema is passed, so Spark
# infers the column names and types from the rows themselves.
df = spark.createDataFrame(data=data)

In [23]:
# Display the long-format rows (arguments are show()'s defaults, spelled out).
df.show(n=20, truncate=True)

+-----+-------+-----+
| name|subject|score|
+-----+-------+-----+
|Alice|   Math|   85|
|Alice|English|   78|
|Alice|Science|   92|
|  Bob|   Math|   85|
|  Bob|English|   85|
|  Bob|Science|   85|
+-----+-------+-----+



In [24]:
# Print the DataFrame's schema as a tree: each column with its type and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- score: long (nullable = true)



In [25]:
# Pivot long -> wide: one row per student, one column per subject,
# each cell holding the sum of that student's scores for the subject.
pivot_df = (
    df.groupBy("name")
    .pivot("subject")
    .sum("score")
)

In [26]:
# Display the pivoted table (arguments are show()'s defaults, spelled out).
pivot_df.show(n=20, truncate=True)

+-----+-------+----+-------+
| name|English|Math|Science|
+-----+-------+----+-------+
|  Bob|     85|  85|     85|
|Alice|     78|  85|     92|
+-----+-------+----+-------+



In [27]:
# Second sample set: some (student, subject) pairs appear more than once, so
# the choice of aggregate (sum/avg/min/max/count) visibly changes the pivot.
scores_by_student = {
    "Alice": [
        ("Math", 85),
        ("Math", 15),
        ("English", 78),
        ("English", 22),
        ("Science", 92),
    ],
    "Bob": [
        ("Math", 89),
        ("Math", 11),
        ("English", 76),
        ("Science", 94),
        ("Science", 6),
    ],
}

# Flatten to (name, subject, score) tuples, keeping the listed order.
data = [
    (student, subject, score)
    for student, entries in scores_by_student.items()
    for subject, score in entries
]

In [28]:
# The records are plain tuples here, so the column names are supplied explicitly.
column_names = ["name", "subject", "score"]
df = spark.createDataFrame(data, column_names)

print("Original DataFrame:")
df.show()

Original DataFrame:
+-----+-------+-----+
| name|subject|score|
+-----+-------+-----+
|Alice|   Math|   85|
|Alice|   Math|   15|
|Alice|English|   78|
|Alice|English|   22|
|Alice|Science|   92|
|  Bob|   Math|   89|
|  Bob|   Math|   11|
|  Bob|English|   76|
|  Bob|Science|   94|
|  Bob|Science|    6|
+-----+-------+-----+



In [29]:
# Pivot long -> wide using the average score per (student, subject).
# The printed label names the aggregate actually applied (avg, not sum),
# so the heading matches the values shown in the table.
pivot_df = df.groupBy("name").pivot("subject").avg("score")
print("After Pivot with Avg:")
pivot_df.show()

After Pivot with Sum:
+-----+-------+----+-------+
| name|English|Math|Science|
+-----+-------+----+-------+
|  Bob|   76.0|50.0|   50.0|
|Alice|   50.0|50.0|   92.0|
+-----+-------+----+-------+



In [30]:
# NOTE(review): this import rebinds the Python built-ins `sum`, `max` and `min`
# to Spark column functions for every cell that runs afterwards. Only `count`
# and `countDistinct` are used in this cell; the line is kept unchanged in case
# other cells rely on the imported names.
from pyspark.sql.functions import sum, avg, max, min, count, countDistinct

# Group and pivot once, then derive every aggregate from the same handle.
# pivot() is called without an explicit value list, so Spark has to work out
# the distinct subjects itself; building the handle once avoids repeating
# that for each of the six aggregates below.
pivoted_scores = df.groupBy("name").pivot("subject")

# One wide table (row per student, column per subject) for each aggregate.
pivot_df_sum = pivoted_scores.sum("score")
pivot_df_avg = pivoted_scores.avg("score")
pivot_df_max = pivoted_scores.max("score")
pivot_df_min = pivoted_scores.min("score")
# count / countDistinct have no shortcut taking a column name, so go through agg().
pivot_df_count = pivoted_scores.agg(count("score"))
pivot_df_countDistinct = pivoted_scores.agg(countDistinct("score"))

In [31]:
# Each heading paired with the pivot table it introduces, in display order.
pivot_results = [
    ("Pivot with Sum:", pivot_df_sum),
    ("Pivot with Avg:", pivot_df_avg),
    ("Pivot with Max:", pivot_df_max),
    ("Pivot with Min:", pivot_df_min),
    ("Pivot with Count:", pivot_df_count),
    ("Pivot with CountDistinct:", pivot_df_countDistinct),
]

# Print the heading, then the table, for every aggregate.
for heading, pivot_table in pivot_results:
    print(heading)
    pivot_table.show()

Pivot with Sum:
+-----+-------+----+-------+
| name|English|Math|Science|
+-----+-------+----+-------+
|  Bob|     76| 100|    100|
|Alice|    100| 100|     92|
+-----+-------+----+-------+

Pivot with Avg:
+-----+-------+----+-------+
| name|English|Math|Science|
+-----+-------+----+-------+
|  Bob|   76.0|50.0|   50.0|
|Alice|   50.0|50.0|   92.0|
+-----+-------+----+-------+

Pivot with Max:
+-----+-------+----+-------+
| name|English|Math|Science|
+-----+-------+----+-------+
|  Bob|     76|  89|     94|
|Alice|     78|  85|     92|
+-----+-------+----+-------+

Pivot with Min:
+-----+-------+----+-------+
| name|English|Math|Science|
+-----+-------+----+-------+
|  Bob|     76|  11|      6|
|Alice|     22|  15|     92|
+-----+-------+----+-------+

Pivot with Count:
+-----+-------+----+-------+
| name|English|Math|Science|
+-----+-------+----+-------+
|  Bob|      1|   2|      2|
|Alice|      2|   2|      1|
+-----+-------+----+-------+

Pivot with CountDistinct:
+-----+-------+--

In [32]:
# Unpivot wide -> long: stack() turns the three subject columns into
# three (label, value) rows per student, named Nsubject / Nscore.
stack_expr = (
    "stack(3, 'NMath', Math, 'NEnglish', English, 'NScience', Science) "
    "as (Nsubject, Nscore)"
)
unpivot_df = pivot_df.selectExpr("name", stack_expr)
unpivot_df.show()

+-----+--------+------+
| name|Nsubject|Nscore|
+-----+--------+------+
|  Bob|   NMath|  50.0|
|  Bob|NEnglish|  76.0|
|  Bob|NScience|  50.0|
|Alice|   NMath|  50.0|
|Alice|NEnglish|  50.0|
|Alice|NScience|  92.0|
+-----+--------+------+

