# Read JSON Data from a DataFrame Column

In [None]:
# Start (or reuse) the Spark session used by every cell below

from pyspark.sql import SparkSession

# Point the builder at the master URL first, then name the application;
# getOrCreate() hands back the session for this configuration.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Read JSON data")
    .getOrCreate()
)

# Bare expression: show the session object as the cell output
spark

In [None]:
# Sample records: an employee number paired with a JSON document stored as a plain string.
# The third record has no "lname" key.
_data = [
    ["EMP001", '{"dept" : "account", "fname": "Ramesh", "lname": "Singh", "skills": ["excel", "tally", "word"]}'],
    ["EMP002", '{"dept" : "sales", "fname": "Siv", "lname": "Kumar", "skills": ["biking", "sales"]}'],
    ["EMP003", '{"dept" : "hr", "fname": "MS Raghvan", "skills": ["communication", "soft-skills"]}'],
]

# Column names, in the same order as the values in each record
_cols = ["emp_no", "raw_data"]

# Build the raw Data Frame, then print its schema and (untruncated) contents
df_raw = spark.createDataFrame(_data, _cols)
df_raw.printSchema()
df_raw.show(n=3, truncate=False)

root
 |-- emp_no: string (nullable = true)
 |-- raw_data: string (nullable = true)



                                                                                

+------+-----------------------------------------------------------------------------------------------+
|emp_no|raw_data                                                                                       |
+------+-----------------------------------------------------------------------------------------------+
|EMP001|{"dept" : "account", "fname": "Ramesh", "lname": "Singh", "skills": ["excel", "tally", "word"]}|
|EMP002|{"dept" : "sales", "fname": "Siv", "lname": "Kumar", "skills": ["biking", "sales"]}            |
|EMP003|{"dept" : "hr", "fname": "MS Raghvan", "skills": ["communication", "soft-skills"]}             |
+------+-----------------------------------------------------------------------------------------------+



In [5]:
# raw_data is only a string column, so the structure of its JSON payload is not known yet.

# Feed the JSON strings to the JSON reader as an RDD and keep the schema it comes up with.
json_schema_df = spark.read.json(
    df_raw.rdd.map(lambda rec: rec["raw_data"])
)
json_schema = json_schema_df.schema

                                                                                

In [6]:
# Display the schema taken from json_schema_df (bare expression renders as the cell output)
json_schema

StructType([StructField('dept', StringType(), True), StructField('fname', StringType(), True), StructField('lname', StringType(), True), StructField('skills', ArrayType(StringType(), True), True)])

In [None]:
# Parse the JSON strings with the schema held in json_schema
from pyspark.sql.functions import from_json

# Add the parsed struct as "parsed_data", then discard the original string column
df_details = (
    df_raw
    .withColumn("parsed_data", from_json(df_raw.raw_data, json_schema))
    .drop("raw_data")
)
df_details.printSchema()

root
 |-- emp_no: string (nullable = true)
 |-- parsed_data: struct (nullable = true)
 |    |-- dept: string (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |    |-- skills: array (nullable = true)
 |    |    |-- element: string (containsNull = true)



In [8]:
# Check the parsed result: "parsed_data.*" expands the struct's fields into separate columns
df_details.select("emp_no", "parsed_data.*").show(n=10, truncate=False)

+------+-------+----------+-----+----------------------------+
|emp_no|dept   |fname     |lname|skills                      |
+------+-------+----------+-----+----------------------------+
|EMP001|account|Ramesh    |Singh|[excel, tally, word]        |
|EMP002|sales  |Siv       |Kumar|[biking, sales]             |
|EMP003|hr     |MS Raghvan|null |[communication, soft-skills]|
+------+-------+----------+-----+----------------------------+



In [None]:
# Turn the skills array into one output row per skill
from pyspark.sql.functions import explode

# Select the scalar fields and the exploded array in a single projection.
# Note: explode() emits no row for a null or empty array; explode_outer() would keep such rows.
df_details.select(
    "emp_no",
    "parsed_data.dept",
    "parsed_data.fname",
    "parsed_data.lname",
    explode("parsed_data.skills").alias("skills"),
).show(n=100, truncate=False)

+------+-------+----------+-----+-------------+
|emp_no|dept   |fname     |lname|skills       |
+------+-------+----------+-----+-------------+
|EMP001|account|Ramesh    |Singh|excel        |
|EMP001|account|Ramesh    |Singh|tally        |
|EMP001|account|Ramesh    |Singh|word         |
|EMP002|sales  |Siv       |Kumar|biking       |
|EMP002|sales  |Siv       |Kumar|sales        |
|EMP003|hr     |MS Raghvan|null |communication|
|EMP003|hr     |MS Raghvan|null |soft-skills  |
+------+-------+----------+-----+-------------+



In [10]:
# Stop the Spark session now that the walkthrough is finished
spark.stop()