In [0]:
# Load the raw JSON file into a Spark DataFrame.
# NOTE(review): "multiline" is presumably needed because the JSON document
# spans several lines instead of one record per line -- confirm against the file.
df = (
    spark.read
    .option("multiline", "true")
    .json("/FileStore/tables/Sample_JSON_file_with_multiple_records_download.json")
)

display(df)

users
"List(List(krish.lee@learningcontainer.com, Krish, Lee, 123456, 1), List(racks.jacson@learningcontainer.com, racks, jacson, 123456, 2), List(denial.roast@learningcontainer.com, denial, roast, 33333333, 3), List(devid.neo@learningcontainer.com, devid, neo, 222222222, 4), List(jone.mac@learningcontainer.com, jone, mac, 111111111, 5))"


In [0]:
# Define a Python function to flatten deeply nested JSON data

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Flatten structs and arrays (including arrays of structs)
def _quote_identifier(name):
    """Wrap a column/field name in backticks so it is read as one literal identifier."""
    # Without quoting, a '.' inside a name is parsed as nested-field access.
    # An embedded backtick is escaped by doubling it.
    return "`" + name.replace("`", "``") + "`"


def _complex_fields(df):
    """Return {column name: data type} for top-level array and struct columns of df."""
    return {
        field.name: field.dataType
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType))
    }


def flatten(df):
    """Flatten every struct and array column of a DataFrame into plain columns.

    The schema is rescanned after each step, so nesting of any depth is
    eventually unwrapped:

    * A struct column ``parent`` is replaced by one column per field, named
      ``parent_<field>``. The new columns are appended after the existing
      ones and the original struct column is dropped.
    * An array column is replaced in place by its elements, one row per
      element, using ``explode_outer``.

    Array columns are exploded one after another, so two independent array
    columns multiply the number of rows.

    A progress line is printed for every column processed.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame; it is not modified.

    Returns
    -------
    DataFrame
        A DataFrame with no top-level array or struct columns left.
    """
    complex_fields = _complex_fields(df)
    while complex_fields:
        # Take the first remaining complex column; the mapping is rebuilt
        # from the new schema at the end of each iteration.
        col_name = next(iter(complex_fields))
        col_type = complex_fields[col_name]
        print ("Processing :"+col_name+" Type : "+str(type(col_type)))

        # Quote the name so keys containing '.' or other special characters
        # resolve to this exact column rather than to a nested path.
        quoted_parent = _quote_identifier(col_name)

        if isinstance(col_type, StructType):
            # Promote each struct field to a top-level column "<parent>_<field>".
            expanded = [
                col(quoted_parent + '.' + _quote_identifier(child.name))
                .alias(col_name + '_' + child.name)
                for child in col_type.fields
            ]
            df = df.select("*", *expanded).drop(col_name)
        elif isinstance(col_type, ArrayType):
            # Turn array elements into rows, keeping the column name.
            df = df.withColumn(col_name, explode_outer(col(quoted_parent)))

        complex_fields = _complex_fields(df)
    return df

In [0]:
# Run the raw DataFrame through flatten() and show the flattened result.
df_flatten = flatten(df)

display(df_flatten)

Processing :users Type : <class 'pyspark.sql.types.ArrayType'>
Processing :users Type : <class 'pyspark.sql.types.StructType'>


users_emailAddress,users_firstName,users_lastName,users_phoneNumber,users_userId
krish.lee@learningcontainer.com,Krish,Lee,123456,1
racks.jacson@learningcontainer.com,racks,jacson,123456,2
denial.roast@learningcontainer.com,denial,roast,33333333,3
devid.neo@learningcontainer.com,devid,neo,222222222,4
jone.mac@learningcontainer.com,jone,mac,111111111,5
