In [1]:
# findspark makes the local Spark installation's pyspark importable.
# init() has to run before the first `pyspark` import in the next cell.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) a session named "MyApp" that runs locally on all cores.
# NOTE(review): the spark-avro package is requested here, but no visible cell
# reads or writes Avro -- confirm it is still needed.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.0")
    .getOrCreate()
)

# create dataframe

In [4]:
# Sample employees: first name, last name, an array of languages and a map of
# properties. The rows deliberately include a None array/map, an empty map,
# empty strings and a None map value so later cells can show how each is handled.
data = (
    ('Alicia', 'Joseph', ['Java', 'Scala', 'Spark'], {'hair': 'black', 'eye': 'brown'}),
    ('Robert', 'Gee', ['Spark', 'Java'], {'hair': 'brown', 'eye': None}),
    ('Mike', 'Bianca', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('John', 'Kumar', None, None),
    ('Jeff', 'L', ['1', '2'], {}),
)
schema = ('FirstName', 'LastName', 'Languages', 'properties')
emp1 = spark.createDataFrame(data, schema)

In [5]:
# Preview the frame. show() truncates long cell values by default, which is
# why the map column is cut off with "..." in the output below.
emp1.show()

+---------+--------+--------------------+--------------------+
|FirstName|LastName|           Languages|          properties|
+---------+--------+--------------------+--------------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|
|     John|   Kumar|                NULL|                NULL|
|     Jeff|       L|              [1, 2]|                  {}|
+---------+--------+--------------------+--------------------+



# explode

In [8]:
from pyspark.sql.functions import explode

# explode() emits one output row per array element. Rows whose array is NULL
# produce no rows at all, which is why John Kumar is absent from the result.
# The column is referenced by its exact name, 'Languages', so the lookup does
# not depend on Spark resolving column names case-insensitively.
# NOTE(review): the alias 'exploded lanagues' is misspelled; it is kept
# unchanged so this cell still matches its saved output.
(
    emp1
    .withColumn('exploded lanagues', explode('Languages'))
    .show()
)

+---------+--------+--------------------+--------------------+-----------------+
|FirstName|LastName|           Languages|          properties|exploded lanagues|
+---------+--------+--------------------+--------------------+-----------------+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|             Java|
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|            Scala|
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|            Spark|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|            Spark|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|             Java|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|           CSharp|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|                 |
|     Jeff|       L|              [1, 2]|                  {}|                1|
|     Jeff|       L|              [1, 2]|                  {}|                2|
+---------+--------+--------------------+--------------------+-----------------+

In [11]:
# Exploding the map column yields one row per map entry, split into `key` and
# `value` columns. Rows whose map is NULL (John) or empty (Jeff) yield no rows.
(
    emp1
    .select('*', explode('properties'))
    .show()
)

+---------+--------+--------------------+--------------------+----+-----+
|FirstName|LastName|           Languages|          properties| key|value|
+---------+--------+--------------------+--------------------+----+-----+
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...| eye|brown|
|   Alicia|  Joseph|[Java, Scala, Spark]|{eye -> brown, ha...|hair|black|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...| eye| NULL|
|   Robert|     Gee|       [Spark, Java]|{eye -> NULL, hai...|hair|brown|
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...| eye|     |
|     Mike|  Bianca|          [CSharp, ]|{eye -> , hair ->...|hair|  red|
+---------+--------+--------------------+--------------------+----+-----+



# flatten

In [16]:
# Each employee carries an array whose elements are themselves one-item
# arrays; one of Robert's inner arrays holds a None.
data = (
    ('Alicia', [['Java'], ['Scala'], ['Python']]),
    ('Robert', [[None], ['Java'], ['Hadoop']]),
)
schema = ('empName', 'ArrayofArray')
emp = spark.createDataFrame(data, schema)
emp.show()

+-------+--------------------+
|empName|        ArrayofArray|
+-------+--------------------+
| Alicia|[[Java], [Scala],...|
| Robert|[[NULL], [Java], ...|
+-------+--------------------+



In [17]:
from pyspark.sql.functions import flatten

# flatten() merges the array of arrays into a single array per row; the NULL
# element inside Robert's nested arrays is carried through to the result.
(
    emp
    .select(emp['empName'], flatten(emp['ArrayofArray']))
    .show()
)

+-------+---------------------+
|empName|flatten(ArrayofArray)|
+-------+---------------------+
| Alicia| [Java, Scala, Pyt...|
| Robert| [NULL, Java, Hadoop]|
+-------+---------------------+



In [18]:
# Stop the SparkSession created earlier in this notebook now that all
# examples have run.
spark.stop()