In [1]:
!pip install pyspark

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, explode, flatten, concat_ws
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType, ArrayType, MapType
)

# Obtain the SparkSession used by every cell below
# (getOrCreate returns the existing session if one is already running).
spark = (
    SparkSession.builder
    .appName("Practical3")
    .getOrCreate()
)



In [2]:
# A) Spark Array and Map operations

# 1 Build a DataFrame whose "fruits" column is an array of strings
data = [
    (1, ["apple", "banana", "cherry"]),
    (2, ["watermelon"]),
    (3, ["lemon", "lime"]),
]
# Explicit schema: non-nullable integer id plus a nullable array<string> column.
schema = (
    StructType()
    .add("id", IntegerType(), nullable=False)
    .add("fruits", ArrayType(StringType()), nullable=True)
)
df = spark.createDataFrame(data=data, schema=schema)
print("ArrayType column:")
df.show(truncate=False)

ArrayType column:
+---+-----------------------+
|id |fruits                 |
+---+-----------------------+
|1  |[apple, banana, cherry]|
|2  |[watermelon]           |
|3  |[lemon, lime]          |
+---+-----------------------+



In [3]:
# 2 Build a DataFrame with a map column
# Only column names are supplied, so Spark infers the column types from the
# Python values (each dict becomes a map of fruit name -> count).
dataWithMap = [
    (1, dict(apple=3, banana=5)),
    (2, dict(watermelon=1)),
    (3, dict(lemon=2, lime=4)),
]
dfWithMap = spark.createDataFrame(data=dataWithMap, schema=["id", "fruit_counts"])
print("MapType column:")
dfWithMap.show(truncate=False)

MapType column:
+---+-------------------------+
|id |fruit_counts             |
+---+-------------------------+
|1  |{banana -> 5, apple -> 3}|
|2  |{watermelon -> 1}        |
|3  |{lemon -> 2, lime -> 4}  |
+---+-------------------------+



In [4]:
# 3 Turn every array element into its own row
# All existing columns are kept and the exploded element is appended as "fruit".
dfExploded = df.select("*", explode("fruits").alias("fruit"))
print("Exploded Array column:")
dfExploded.show()

Exploded Array column:
+---+--------------------+----------+
| id|              fruits|     fruit|
+---+--------------------+----------+
|  1|[apple, banana, c...|     apple|
|  1|[apple, banana, c...|    banana|
|  1|[apple, banana, c...|    cherry|
|  2|        [watermelon]|watermelon|
|  3|       [lemon, lime]|     lemon|
|  3|       [lemon, lime]|      lime|
+---+--------------------+----------+



In [5]:
# 4 Build a DataFrame whose column is an array of (fruit, count) structs
# Each struct is given as a dict keyed by the struct's field names.
dataWithStructArray = [
    (1, [dict(fruit="apple", count=3), dict(fruit="banana", count=5)]),
    (2, [dict(fruit="watermelon", count=1)]),
    (3, [dict(fruit="lemon", count=2), dict(fruit="lime", count=4)]),
]
schemaWithStructArray = (
    StructType()
    .add("id", IntegerType(), nullable=False)
    .add(
        "fruit_counts",
        ArrayType(
            StructType()
            .add("fruit", StringType(), nullable=True)
            .add("count", IntegerType(), nullable=True)
        ),
        nullable=True,
    )
)
dfWithStructArray = spark.createDataFrame(
    data=dataWithStructArray,
    schema=schemaWithStructArray,
)
print("Array of Structs:")
dfWithStructArray.show(truncate=False)

Array of Structs:
+---+-------------------------+
|id |fruit_counts             |
+---+-------------------------+
|1  |[{apple, 3}, {banana, 5}]|
|2  |[{watermelon, 1}]        |
|3  |[{lemon, 2}, {lime, 4}]  |
+---+-------------------------+



In [6]:
# 5 Explode Array and Map columns

# Array: one output row per element, appended as the column "fruit".
print("Explode Array column:")
dfExplodedArray = df.withColumn("fruit", explode(df["fruits"]))
dfExplodedArray.show()

# Map: one output row per entry; the two generated columns (key, value)
# are named "fruit" and "count" through the two-name alias.
print("Explode Map column:")
dfExplodedMap = dfWithMap.select(
    "id",
    explode("fruit_counts").alias("fruit", "count"),
)
dfExplodedMap.show()

Explode Array column:
+---+--------------------+----------+
| id|              fruits|     fruit|
+---+--------------------+----------+
|  1|[apple, banana, c...|     apple|
|  1|[apple, banana, c...|    banana|
|  1|[apple, banana, c...|    cherry|
|  2|        [watermelon]|watermelon|
|  3|       [lemon, lime]|     lemon|
|  3|       [lemon, lime]|      lime|
+---+--------------------+----------+

Explode Map column:
+---+----------+-----+
| id|     fruit|count|
+---+----------+-----+
|  1|    banana|    5|
|  1|     apple|    3|
|  2|watermelon|    1|
|  3|     lemon|    2|
|  3|      lime|    4|
+---+----------+-----+



In [7]:
# 6 Explode the array of structs, then lift each struct field to a top-level column
dfExplodedStructArray = dfWithStructArray.withColumn(
    "fruit_count",
    explode(dfWithStructArray["fruit_counts"]),
)
# Dotted paths select the nested fields of the exploded struct.
dfExplodedStructArray.select(
    col("id"),
    col("fruit_count.fruit"),
    col("fruit_count.count"),
).show()

+---+----------+-----+
| id|     fruit|count|
+---+----------+-----+
|  1|     apple|    3|
|  1|    banana|    5|
|  2|watermelon|    1|
|  3|     lemon|    2|
|  3|      lime|    4|
+---+----------+-----+



In [8]:
# 7 Explode an array of maps down to one row per map entry
dataWithMapArray = [
    (1, [dict(apple=3, banana=5)]),
    (2, [dict(watermelon=1)]),
    (3, [dict(lemon=2, lime=4)]),
]
schemaWithMapArray = (
    StructType()
    .add("id", IntegerType(), nullable=False)
    .add(
        "fruit_counts",
        ArrayType(MapType(StringType(), IntegerType())),
        nullable=True,
    )
)
dfWithMapArray = spark.createDataFrame(data=dataWithMapArray, schema=schemaWithMapArray)
# Stage 1: explode the array so each row holds a single map in "fruit_count".
dfExplodedMapArray = dfWithMapArray.withColumn("fruit_count", explode("fruit_counts"))
# Stage 2: explode that map into (fruit, count) rows.
dfExplodedMapArray.select(
    "id",
    explode("fruit_count").alias("fruit", "count"),
).show()

+---+----------+-----+
| id|     fruit|count|
+---+----------+-----+
|  1|    banana|    5|
|  1|     apple|    3|
|  2|watermelon|    1|
|  3|     lemon|    2|
|  3|      lime|    4|
+---+----------+-----+



In [9]:
# B) Nested Array operations

# 1 Build a DataFrame whose "subjects" column is an array of string arrays
dataNested = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]]),
]
schemaNested = (
    StructType()
    .add("name", StringType(), nullable=True)
    .add("subjects", ArrayType(ArrayType(StringType())), nullable=True)
)
dfNested = spark.createDataFrame(data=dataNested, schema=schemaNested)
print("Nested Array:")
dfNested.show(truncate=False)

Nested Array:
+-------+-----------------------------------+
|name   |subjects                           |
+-------+-----------------------------------+
|James  |[[Java, Scala, C++], [Spark, Java]]|
|Michael|[[Spark, Java, C++], [Spark, Java]]|
|Robert |[[CSharp, VB], [Spark, Python]]    |
+-------+-----------------------------------+



In [10]:
# 2 Explode the outer array: every inner list becomes its own row
explodedDF = dfNested.select(
    "name",
    explode("subjects").alias("subject"),
)
print("Exploded Nested Arrays:")
explodedDF.show(truncate=False)

Exploded Nested Arrays:
+-------+------------------+
|name   |subject           |
+-------+------------------+
|James  |[Java, Scala, C++]|
|James  |[Spark, Java]     |
|Michael|[Spark, Java, C++]|
|Michael|[Spark, Java]     |
|Robert |[CSharp, VB]      |
|Robert |[Spark, Python]   |
+-------+------------------+



In [11]:
# 3 Collapse the array of arrays into a single flat array per row
flattenedDF = dfNested.select(
    "name",
    flatten("subjects").alias("subjects_flat"),
)
print("Flattened Array:")
flattenedDF.show(truncate=False)

Flattened Array:
+-------+-------------------------------+
|name   |subjects_flat                  |
+-------+-------------------------------+
|James  |[Java, Scala, C++, Spark, Java]|
|Michael|[Spark, Java, C++, Spark, Java]|
|Robert |[CSharp, VB, Spark, Python]    |
+-------+-------------------------------+



In [12]:
# 4 Join the elements of the flat array into one ", "-separated string
stringDF = flattenedDF.select(
    "name",
    concat_ws(", ", "subjects_flat").alias("subjects_string"),
)
print("Array of String → Single String:")
stringDF.show(truncate=False)

Array of String → Single String:
+-------+-----------------------------+
|name   |subjects_string              |
+-------+-----------------------------+
|James  |Java, Scala, C++, Spark, Java|
|Michael|Spark, Java, C++, Spark, Java|
|Robert |CSharp, VB, Spark, Python    |
+-------+-----------------------------+

