## Import the libraries and modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = (SparkSession.builder.appName("read-json-data")\
                             .master("spark://spark-master:7077")\
                             .config("spark.executor.memory", "512m")
                             .getOrCreate())

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/08 21:41:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load the JSON data into Spark Dataframe

In [4]:
df = spark.read.format("json")\
                .option("multiLine", "true")\
                .option("inferSchema", "true")\
                .load("data/nobel_prizes.json")
                

## View the schema

In [5]:
df.printSchema()

root
 |-- category: string (nullable = true)
 |-- laureates: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- firstname: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- motivation: string (nullable = true)
 |    |    |-- share: string (nullable = true)
 |    |    |-- surname: string (nullable = true)
 |-- overallMotivation: string (nullable = true)
 |-- year: string (nullable = true)



## View the dataframe

In [6]:
df.show(5)

+----------+--------------------+-----------------+----+
|  category|           laureates|overallMotivation|year|
+----------+--------------------+-----------------+----+
| chemistry|[{Carolyn, 1015, ...|             null|2022|
| economics|[{Ben, 1021, "for...|             null|2022|
|literature|[{Annie, 1017, "f...|             null|2022|
|     peace|[{Ales, 1018, "Th...|             null|2022|
|   physics|[{Alain, 1012, "f...|             null|2022|
+----------+--------------------+-----------------+----+
only showing top 5 rows



## Flatten the nested Stuctures within Json

In [15]:
df_flattened = (df.withColumn("laureates", explode(col("laureates")))\
                  .select(
                          col("category"),\
                          col('year'),
                          col('overallMotivation'),
                          col("laureates.id"),\
                          col('laureates.firstname'),\
                          col('laureates.surname'),\
                          col('laureates.share'),\
                          col('laureates.motivation')))
                        

In [16]:
type(df)

pyspark.sql.dataframe.DataFrame

In [17]:
type(df_flattened)

pyspark.sql.dataframe.DataFrame

In [18]:
df_flattened.show(5)

+---------+----+-----------------+----+---------+---------+-----+--------------------+
| category|year|overallMotivation|  id|firstname|  surname|share|          motivation|
+---------+----+-----------------+----+---------+---------+-----+--------------------+
|chemistry|2022|             null|1015|  Carolyn| Bertozzi|    3|"for the developm...|
|chemistry|2022|             null|1016|   Morten|   Meldal|    3|"for the developm...|
|chemistry|2022|             null| 743|    Barry|Sharpless|    3|"for the developm...|
|economics|2022|             null|1021|      Ben| Bernanke|    3|"for research on ...|
|economics|2022|             null|1022|  Douglas|  Diamond|    3|"for research on ...|
+---------+----+-----------------+----+---------+---------+-----+--------------------+
only showing top 5 rows



In [19]:
schema_dict = {field.name : field.dataType for field in df.schema.fields}
schema_dict

{'category': StringType(),
 'laureates': ArrayType(StructType([StructField('firstname', StringType(), True), StructField('id', StringType(), True), StructField('motivation', StringType(), True), StructField('share', StringType(), True), StructField('surname', StringType(), True)]), True),
 'overallMotivation': StringType(),
 'year': StringType()}

## Enforce our schema

In [25]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [26]:
json_schema = StructType(
                        [
                         StructField('category',StringType(),True),\
                         StructField('laureates', ArrayType(StructType(
                                                                       [StructField('firstname', StringType(), True),
                                                                        StructField('id', StringType(), True), 
                                                                        StructField('motivation', StringType(), True), 
                                                                        StructField('share', StringType(), True), 
                                                                        StructField('surname', StringType(), True)]), True),True),
 StructField('overallMotivation',StringType(), True),
 StructField('year', IntegerType(),True)
                                     ])


In [27]:
json_df_with_schema = spark.read.format("json")\
                                .schema(json_schema)\
                                .option("multiLine","true")\
                                .option("mode", "permissive")\
                                .option("columnNameOfCorruptRecord", "corrupt_record")\
                                .load("data/nobel_prizes.json")

In [29]:
json_df_with_schema.show(5)

25/03/08 22:12:19 ERROR TaskSetManager: Task 0 in stage 5.0 failed 4 times; aborting job


Py4JJavaError: An error occurred while calling o87.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 5.0 failed 4 times, most recent failure: Lost task 0.3 in stage 5.0 (TID 11) (172.19.0.7 executor 1): java.lang.ClassCastException: class org.apache.spark.sql.catalyst.util.GenericArrayData cannot be cast to class org.apache.spark.unsafe.types.UTF8String (org.apache.spark.sql.catalyst.util.GenericArrayData and org.apache.spark.unsafe.types.UTF8String are in unnamed module of loader 'app')
	at org.apache.spark.sql.catalyst.expressions.BaseGenericInternalRow.getUTF8String(rows.scala:46)
	at org.apache.spark.sql.catalyst.expressions.BaseGenericInternalRow.getUTF8String$(rows.scala:46)
	at org.apache.spark.sql.catalyst.expressions.GenericInternalRow.getUTF8String(rows.scala:195)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.$anonfun$apply$1(FileFormat.scala:156)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.next(FileScanRDD.scala:211)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1206)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1206)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2984)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2284)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4177)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3382)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:284)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:323)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.base/java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Unknown Source)
Caused by: java.lang.ClassCastException: class org.apache.spark.sql.catalyst.util.GenericArrayData cannot be cast to class org.apache.spark.unsafe.types.UTF8String (org.apache.spark.sql.catalyst.util.GenericArrayData and org.apache.spark.unsafe.types.UTF8String are in unnamed module of loader 'app')
	at org.apache.spark.sql.catalyst.expressions.BaseGenericInternalRow.getUTF8String(rows.scala:46)
	at org.apache.spark.sql.catalyst.expressions.BaseGenericInternalRow.getUTF8String$(rows.scala:46)
	at org.apache.spark.sql.catalyst.expressions.GenericInternalRow.getUTF8String(rows.scala:195)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.$anonfun$apply$1(FileFormat.scala:156)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.next(FileScanRDD.scala:211)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:888)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:888)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:92)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:139)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1529)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


## Using get_json_object()

In [39]:
from pyspark.sql.functions import get_json_object
from pyspark.sql.types import StringType

#create a dataframe

df = spark.createDataFrame([(1, '{"name" : "Krishna", "age" : 25}'),
                            (2, '{"name" : "Radha", "age" : 25}')
                           ], ["id", "json_data"])

## Extract the name field from the Json string column

In [40]:
name_df = df.select(get_json_object(col("json_data"), "$.name").alias("name"))

In [41]:
name_df.show()

+-------+
|   name|
+-------+
|Krishna|
|  Radha|
+-------+



## Cast the extracted value to a string

In [42]:
name_string_df = name_df.withColumn("name_str", name_df["name"].cast(StringType()))

In [43]:
name_string_df.show()

+-------+--------+
|   name|name_str|
+-------+--------+
|Krishna| Krishna|
|  Radha|   Radha|
+-------+--------+



## Using json_tuple()

In [49]:
from pyspark.sql.functions import json_tuple

# Create a dataframe with JSON string column

df = spark.createDataFrame([
                            (1, '{"name" : "Krishna", "age" : 25}'),
                            (2, '{"name" : "Radha", "age" : 25}')], ["id", "json_data"])

In [52]:
name_age_df = df.select(json_tuple(col("json_data"), "name", "age"))


name_age_df.show()

+-------+---+
|     c0| c1|
+-------+---+
|Krishna| 25|
|  Radha| 25|
+-------+---+



In [54]:
name_age_df = df.select(json_tuple(col("json_data"), "name", "age").alias("name", "age"))

name_age_df.show()

+-------+---+
|   name|age|
+-------+---+
|Krishna| 25|
|  Radha| 25|
+-------+---+



In [55]:
spark.stop()