In [2]:
import findspark
findspark.init()

In [11]:
import pyspark
from pyspark.sql import SparkSession,Row

In [12]:
spark = SparkSession.builder.appName('cgpysparksqllabs').getOrCreate()

In [13]:
jsonString = """{"Zipcode":704, "ZipcodeType":"STANDARD","City":"PARIS","State":"PR"}"""
df = spark.createDataFrame([(1, jsonString)],["id", "value"])
df.show(truncate=False)

                                                                                

+---+---------------------------------------------------------------------+
|id |value                                                                |
+---+---------------------------------------------------------------------+
|1  |{"Zipcode":704, "ZipcodeType":"STANDARD","City":"PARIS","State":"PR"}|
+---+---------------------------------------------------------------------+



In [14]:
# Convert json string column to Map type 
from pyspark.sql.types import MapType,StringType
from pyspark.sql.functions import from_json
df2 = df.withColumn("value", from_json(df.value,MapType(StringType(),StringType())))
df2.printSchema()
df2.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- value: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+---+---------------------------------------------------------------------+
|id |value                                                                |
+---+---------------------------------------------------------------------+
|1  |{Zipcode -> 704, ZipcodeType -> STANDARD, City -> PARIS, State -> PR}|
+---+---------------------------------------------------------------------+



In [15]:
# to_json() func 
from pyspark.sql.functions import to_json,col 
df2.withColumn("value",to_json(col("value"))).show(truncate=False)

+---+----------------------------------------------------------------------+
|id |value                                                                 |
+---+----------------------------------------------------------------------+
|1  |{"Zipcode":"704","ZipcodeType":"STANDARD","City":"PARIS","State":"PR"}|
+---+----------------------------------------------------------------------+



In [16]:
# json_tuple() func
from pyspark.sql.functions import json_tuple 
df.select(col("id"),json_tuple(col("value"),"Zipcode","ZipCodeType","City")).toDF("id","ZipCode","ZipCodeType","City").show(truncate=False)

+---+-------+-----------+-----+
|id |ZipCode|ZipCodeType|City |
+---+-------+-----------+-----+
|1  |704    |null       |PARIS|
+---+-------+-----------+-----+



In [17]:
# get_json_object() func in PySpark SQL 
from pyspark.sql.functions import get_json_object
df.select(col("id"),get_json_object(col("value"),"$.ZipCodeType").alias("ZipCodeType")).show(truncate=False)

+---+-----------+
|id |ZipCodeType|
+---+-----------+
|1  |null       |
+---+-----------+



In [18]:
from pyspark.sql.functions import schema_of_json,lit
schemaStr=spark.range(1).select(schema_of_json(lit("""{"Zipcode":704, "ZipCodetype":"STANDARD","City":"PARIS","State":"PR"}"""))).collect()[0][0]
print(schemaStr)

STRUCT<City: STRING, State: STRING, ZipCodetype: STRING, Zipcode: BIGINT>
