In [1]:
# Run findspark.init() before the pyspark import in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# One session for the whole notebook: app name "MyApp", local[*] master.
# NOTE(review): the spark-avro package is requested here, but no cell visible
# in this notebook reads or writes Avro — confirm whether it is still needed.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.4.0")
    .getOrCreate()
)

# create dataframe

In [5]:
# One sample row: an integer id plus a JSON object stored as plain text.
# This frame is the input for the map-type conversion further down.
data = [
    (1, '{"Zipcode":85016,"ZipCodeType":"STANDARD","City":"Phoenix","State":"AZ"}'),
]

df_map = spark.createDataFrame(data, ("id", "value"))
# truncate=False so the full JSON string is visible in the output.
df_map.show(truncate=False)

                                                                                

+---+------------------------------------------------------------------------+
|id |value                                                                   |
+---+------------------------------------------------------------------------+
|1  |{"Zipcode":85016,"ZipCodeType":"STANDARD","City":"Phoenix","State":"AZ"}|
+---+------------------------------------------------------------------------+



In [6]:
# One sample row whose "value" column holds a JSON array as plain text.
# This frame is the input for the array-type conversion further down.
data = [
    (1, "[1, 2, 3]"),
]
df_arr = spark.createDataFrame(data, ("id", "value"))
df_arr.show()

+---+---------+
| id|    value|
+---+---------+
|  1|[1, 2, 3]|
+---+---------+



In [7]:
# Same JSON object text as the map example; this copy is the input for the
# struct-type conversion and for schema_of_json further down.
data = [
    (1, '{"Zipcode":85016,"ZipCodeType":"STANDARD","City":"Phoenix","State":"AZ"}'),
]

df_struct = spark.createDataFrame(data, ("id", "value"))
# truncate=False so the full JSON string is visible in the output.
df_struct.show(truncate=False)

+---+------------------------------------------------------------------------+
|id |value                                                                   |
+---+------------------------------------------------------------------------+
|1  |{"Zipcode":85016,"ZipCodeType":"STANDARD","City":"Phoenix","State":"AZ"}|
+---+------------------------------------------------------------------------+



# Convert JSON string columns to array/map/struct types

In [13]:
# convert to map type
from pyspark.sql.types import MapType, StringType
from pyspark.sql.functions import from_json

# Parse the JSON text into a map with string keys and string values.
schema = MapType(StringType(), StringType())
df_map_new = df_map.withColumn(
    "map_column",
    from_json(df_map["value"], schema),
)
df_map_new.printSchema()

root
 |-- id: long (nullable = true)
 |-- value: string (nullable = true)
 |-- map_column: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [15]:
# Display the raw JSON text next to the parsed map column. show() is called
# with its defaults here, so long cell values are cut off with "..." in the
# rendered table.
df_map_new.show()

+---+--------------------+--------------------+
| id|               value|          map_column|
+---+--------------------+--------------------+
|  1|{"Zipcode":85016,...|{Zipcode -> 85016...|
+---+--------------------+--------------------+



In [25]:
# convert to array type
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.sql.functions import from_json

# Parse the JSON text into an array of integers.
schema = ArrayType(IntegerType())
df_arr_new = df_arr.withColumn(
    "arr_column",
    from_json(df_arr["value"], schema),
)
df_arr_new.printSchema()

root
 |-- id: long (nullable = true)
 |-- value: string (nullable = true)
 |-- arr_column: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [17]:
# Display the raw JSON text next to the parsed array column.
df_arr_new.show()

+---+---------+----------+
| id|    value|arr_column|
+---+---------+----------+
|  1|[1, 2, 3]| [1, 2, 3]|
+---+---------+----------+



In [20]:
# convert to struct type
from pyspark.sql.types import StructField, StructType, StringType
from pyspark.sql.functions import from_json

# All four fields are declared as strings, including Zipcode, which is a bare
# number in the sample JSON text.
schema = StructType(
    [
        StructField(field_name, StringType())
        for field_name in ("Zipcode", "ZipCodeType", "City", "State")
    ]
)
df_struct_new = df_struct.withColumn(
    "struct_column",
    from_json(df_struct["value"], schema),
)
df_struct_new.printSchema()

root
 |-- id: long (nullable = true)
 |-- value: string (nullable = true)
 |-- struct_column: struct (nullable = true)
 |    |-- Zipcode: string (nullable = true)
 |    |-- ZipCodeType: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)



In [21]:
# Display the raw JSON text next to the parsed struct column. show() is called
# with its defaults here, so long cell values are cut off with "..." in the
# rendered table.
df_struct_new.show()

+---+--------------------+--------------------+
| id|               value|       struct_column|
+---+--------------------+--------------------+
|  1|{"Zipcode":85016,...|{85016, STANDARD,...|
+---+--------------------+--------------------+



# Convert array/map/struct columns back to JSON string columns

In [23]:
from pyspark.sql.functions import to_json

# Serialize the map column back to JSON text; only the resulting schema is
# printed, no rows are displayed.
df_map_new.select(
    "*",
    to_json(df_map_new["map_column"]),
).printSchema()

root
 |-- id: long (nullable = true)
 |-- value: string (nullable = true)
 |-- map_column: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- to_json(map_column): string (nullable = true)



In [27]:
from pyspark.sql.functions import to_json

# Serialize the array column back to JSON text; only the resulting schema is
# printed, no rows are displayed.
df_arr_new.select(
    "*",
    to_json(df_arr_new["arr_column"]),
).printSchema()

root
 |-- id: long (nullable = true)
 |-- value: string (nullable = true)
 |-- arr_column: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- to_json(arr_column): string (nullable = true)



In [29]:
from pyspark.sql.functions import to_json

# Serialize the struct column back to JSON text; only the resulting schema is
# printed, no rows are displayed.
df_struct_new.select(
    "*",
    to_json(df_struct_new["struct_column"]),
).printSchema()

root
 |-- id: long (nullable = true)
 |-- value: string (nullable = true)
 |-- struct_column: struct (nullable = true)
 |    |-- Zipcode: string (nullable = true)
 |    |-- ZipCodeType: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)
 |-- to_json(struct_column): string (nullable = true)



# schema of json

In [35]:
from pyspark.sql.functions import lit, schema_of_json

# Pull the JSON text of the first row back to the driver as a Python string.
sample_json = (
    df_struct
    .select("value")
    .first()["value"]
)

# Wrap the sample as a literal column and build the schema_of_json expression.
json_schema = schema_of_json(lit(sample_json))

# json_schema is an unevaluated Column, so this prints the expression itself
# rather than the inferred schema.
print(json_schema)

Column<'schema_of_json('{"Zipcode":85016,"ZipCodeType":"STANDARD","City":"Phoenix","State":"AZ"}')'>


In [36]:
# Evaluate the schema_of_json expression by selecting it against df_struct;
# the result is aliased to "schema" and shown untruncated so the whole
# schema string is readable.
df_struct.select(json_schema.alias("schema")).show(truncate=False)

+-------------------------------------------------------------------------+
|schema                                                                   |
+-------------------------------------------------------------------------+
|STRUCT<City: STRING, State: STRING, ZipCodeType: STRING, Zipcode: BIGINT>|
+-------------------------------------------------------------------------+

