In [None]:
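The `arrays_zip()` function in PySpark merges multiple array columns into a single array of structs, pairing corresponding elements together: the N-th struct holds the N-th element of each input array. If one array is shorter than the others, its missing positions are filled with null.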

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import arrays_zip

# Create (or reuse) a local Spark session that runs on a single core
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: each one carries two parallel arrays (names and their ages)
data = [
    (["John", "Alice", "Bob"], [25, 30, 22]),
    (["Mike", "Emma"], [40, 35]),
]
array_columns = ["names", "ages"]

# Build the DataFrame and pair names[i] with ages[i] in a new
# "zipped_array" column (one struct per position)
df = (
    spark.createDataFrame(data, array_columns)
    .withColumn("zipped_array", arrays_zip(*array_columns))
)

# Display every column without truncating the cell contents
df.show(truncate=False)


+------------------+------------+------------------------------------+
|names             |ages        |zipped_array                        |
+------------------+------------+------------------------------------+
|[John, Alice, Bob]|[25, 30, 22]|[{John, 25}, {Alice, 30}, {Bob, 22}]|
|[Mike, Emma]      |[40, 35]    |[{Mike, 40}, {Emma, 35}]            |
+------------------+------------+------------------------------------+



In [2]:
from pyspark.sql.functions import explode

# Produce one output row per struct in "zipped_array"; the struct is exposed
# as a new "person" column and all existing columns are repeated on each row.
person_col = explode("zipped_array").alias("person")
df_exploded = df.select("*", person_col)
df_exploded.show(truncate=False)


+------------------+------------+------------------------------------+-----------+
|names             |ages        |zipped_array                        |person     |
+------------------+------------+------------------------------------+-----------+
|[John, Alice, Bob]|[25, 30, 22]|[{John, 25}, {Alice, 30}, {Bob, 22}]|{John, 25} |
|[John, Alice, Bob]|[25, 30, 22]|[{John, 25}, {Alice, 30}, {Bob, 22}]|{Alice, 30}|
|[John, Alice, Bob]|[25, 30, 22]|[{John, 25}, {Alice, 30}, {Bob, 22}]|{Bob, 22}  |
|[Mike, Emma]      |[40, 35]    |[{Mike, 40}, {Emma, 35}]            |{Mike, 40} |
|[Mike, Emma]      |[40, 35]    |[{Mike, 40}, {Emma, 35}]            |{Emma, 35} |
+------------------+------------+------------------------------------+-----------+



In [4]:
# Sample employee records, grouped by department.
# Every value (including salary and age) is kept as a string.
sales_employees = [
    {'employee_name': 'John', 'salary': '10000', 'age': '32'},
    {'employee_name': 'Alice', 'salary': '7500', 'age': '30'},
    {'employee_name': 'Bob', 'salary': '8500', 'age': '31'},
    {'employee_name': 'Cathy', 'salary': '6800', 'age': '28'},
]
hr_employees = [
    {'employee_name': 'Emma', 'salary': '5000', 'age': '24'},
    {'employee_name': 'Sofia', 'salary': '7000', 'age': '31'},
    {'employee_name': 'Katy', 'salary': '8000', 'age': '32'},
    {'employee_name': 'Perry', 'salary': '6000', 'age': '25'},
]

# One row per department: (department name, list of employee records)
data = [
    ('Sales', sales_employees),
    ('HR', hr_employees),
]

# Build the DataFrame; the printed schema shows each record inferred as a
# string -> string map inside the "Employee" array
df2 = spark.createDataFrame(data, ["Department", "Employee"])
df2.printSchema()

# Show the initial data without truncation
df2.show(truncate=False)

root
 |-- Department: string (nullable = true)
 |-- Employee: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)

+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Department|Employee                                                                                                                                                                                                          |
+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Sales     |[{employee_name -> John, salary -> 10000, age -> 32}, {employee_name -> Alice, salary -> 7500, age ->

In [10]:
# arrays_zip on a single array wraps each Employee map in a one-field struct
# (field name "Employee"), stored in the new "zipped_Emparray" column.
zipped_employees = arrays_zip(df2["Employee"])
df3 = df2.withColumn("zipped_Emparray", zipped_employees)
df3.show(truncate=False)

+----------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Department|Employee                                                                                                                                                                                                          |zipped_Emparray                                                                                                                                                                                                           |
+----------+------------------------------------------------------------------------------------------------------

In [12]:
# Turn each struct in "zipped_Emparray" into its own row, exposed as the
# "Explode" column; all existing columns are kept alongside it.
exploded_struct = explode(df3["zipped_Emparray"]).alias("Explode")
df4 = df3.select("*", exploded_struct)
df4.show()

+----------+--------------------+--------------------+--------------------+
|Department|            Employee|     zipped_Emparray|             Explode|
+----------+--------------------+--------------------+--------------------+
|     Sales|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
|     Sales|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
|     Sales|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
|     Sales|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
|        HR|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
|        HR|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
|        HR|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
|        HR|[{employee_name -...|[{{employee_name ...|{{employee_name -...|
+----------+--------------------+--------------------+--------------------+



In [19]:
# Flatten the exploded struct: keep Department and lift the three map entries
# (employee_name, salary, age) into top-level columns. The helper columns
# Employee, zipped_Emparray and Explode are not selected, so they are left out.
df5 = df4.select(
    "Department",
    df4["Explode.Employee.employee_name"].alias("employee_name"),
    df4["Explode.Employee.salary"].alias("salary"),
    df4["Explode.Employee.age"].alias("age"),
)
df5.show(truncate=False)

+----------+-------------+------+---+
|Department|employee_name|salary|age|
+----------+-------------+------+---+
|Sales     |John         |10000 |32 |
|Sales     |Alice        |7500  |30 |
|Sales     |Bob          |8500  |31 |
|Sales     |Cathy        |6800  |28 |
|HR        |Emma         |5000  |24 |
|HR        |Sofia        |7000  |31 |
|HR        |Katy         |8000  |32 |
|HR        |Perry        |6000  |25 |
+----------+-------------+------+---+

