### Reading a line-delimited JSON file

In [2]:
from pyspark.sql import SparkSession

In [6]:
# Get the active Spark session, or create one, under the application name "My App".
spark = (
    SparkSession.builder
    .appName("My App")
    .getOrCreate()
)

In [12]:
# Load the line-delimited JSON file (one JSON object per line) into a DataFrame.
# No "header"/"inferSchema" options are passed: those belong to the CSV reader
# and are ignored for JSON, where the schema is inferred from the records
# whenever no explicit schema is supplied.
df = spark.read.json(
    path=r"F:\Git\NucleusTeq Training PySpark\dataset\json\line_delimited_file.jsonl"
)

In [13]:
# Print the DataFrame as a text table to check the parsed columns and rows.
df.show()

+---+-------------+-------+
|age|         city|   name|
+---+-------------+-------+
| 30|     New York|  Alice|
| 25|San Francisco|    Bob|
| 35|      Chicago|Charlie|
| 28|  Los Angeles|  David|
| 32|      Seattle|    Eva|
| 40|       Boston|  Frank|
| 22|      Houston|  Grace|
| 27|        Miami| Hannah|
| 31|       Denver|    Ian|
| 29|      Atlanta|   Jack|
+---+-------------+-------+



### Reading a multiline JSON file

In [15]:
# Load a JSON file whose records span several lines. multiLine=True tells the
# reader to parse the file as a whole JSON document instead of expecting one
# complete record per line.
# No "header"/"inferSchema" options are passed: those belong to the CSV reader
# and are ignored for JSON.
df = spark.read.json(
    path=r"F:\Git\NucleusTeq Training PySpark\dataset\json\multiline_file.json",
    multiLine=True,
)

In [16]:
# Print the DataFrame as a text table to check the parsed columns and rows.
df.show()

+---+-------------+-------+
|age|         city|   name|
+---+-------------+-------+
| 30|     New York|  Alice|
| 25|San Francisco|    Bob|
| 35|      Chicago|Charlie|
| 28|  Los Angeles|  David|
| 32|      Seattle|    Eva|
| 40|       Boston|  Frank|
| 22|      Houston|  Grace|
| 27|        Miami| Hannah|
| 31|       Denver|    Ian|
| 29|      Atlanta|   Jack|
+---+-------------+-------+



### What if a JSON file has extra fields in some records?

In [19]:
# Load a line-delimited JSON file in which some records carry keys that the
# other records do not have.
# No "header"/"inferSchema" options are passed: those belong to the CSV reader
# and are ignored for JSON, where the schema is inferred from the records.
df = spark.read.json(
    path=r"F:\Git\NucleusTeq Training PySpark\dataset\json\extra_key_in_one_record.jsonl"
)

In [21]:
df.show()
# Spark adds an extra column for each field that appears in only some records
# (`Father` and `email` in the output below); records without that field show NULL.

+------+---+-------------+-----------------+-------+
|Father|age|         city|            email|   name|
+------+---+-------------+-----------------+-------+
|  NULL| 30|     New York|             NULL|  Alice|
|  NULL| 25|San Francisco|             NULL|    Bob|
|  NULL| 35|      Chicago|             NULL|Charlie|
|  NULL| 28|  Los Angeles|david@example.com|  David|
|  NULL| 32|      Seattle|             NULL|    Eva|
|  NULL| 40|       Boston|             NULL|  Frank|
|  NULL| 22|      Houston|             NULL|  Grace|
|  NULL| 27|        Miami|             NULL| Hannah|
|  Alex| 31|       Denver|             NULL|    Ian|
|  NULL| 29|      Atlanta|             NULL|   Jack|
+------+---+-------------+-----------------+-------+



### Working with a corrupted JSON file

The JSON file below is line-delimited JSON (LDJSON).

In [61]:
# Read a line-delimited JSON file that contains malformed records and print it.
# With mode=PERMISSIVE the read does not fail on bad lines: as the output shows,
# each malformed line is kept as raw text in `_corrupt_record` while its other
# columns are NULL.
# No "inferSchema" option is passed: it is a CSV reader option and is ignored
# for JSON.
(
    spark.read
    .format("json")
    .option("mode", "PERMISSIVE")
    .load(path=r"F:\Git\NucleusTeq Training PySpark\dataset\json\corrupt_jsonl_file.jsonl")
    .show()
)

+--------------------+----+-------------+---------------+-------+
|     _corrupt_record| age|         city|          email|   name|
+--------------------+----+-------------+---------------+-------+
|                NULL|  30|     New York|           NULL|  Alice|
|                NULL|  25|San Francisco|           NULL|    Bob|
|                NULL|  35|      Chicago|           NULL|Charlie|
|                NULL|  28|  Los Angeles|           NULL|  David|
|                NULL|  32|      Seattle|eva@example.com|    Eva|
|{"name": "Frank",...|NULL|         NULL|           NULL|   NULL|
|                NULL|  22|      Houston|           NULL|  Grace|
|                NULL|  27|        Miami|           NULL| Hannah|
|                NULL|  31|       Denver|           NULL|    Ian|
|{"name": "Jack", ...|NULL|         NULL|           NULL|   NULL|
+--------------------+----+-------------+---------------+-------+



### Reading a nested JSON file


In [65]:
# Load a multi-line JSON document with nested structures (the output shows
# `company` and `employees` columns holding nested values).
# multiLine=True is needed because the records are not laid out one per line.
# No "header"/"inferSchema" options are passed: those belong to the CSV reader
# and are ignored for JSON.
df = spark.read.json(
    path=r"F:\Git\NucleusTeq Training PySpark\dataset\json\nested_json.json",
    multiLine=True,
)

In [68]:
# truncate=False prints each cell in full instead of shortening long values,
# so the nested `company` and `employees` contents are fully visible.
df.show(truncate = False)

+---------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|company                                                              |employees                                                                                                                                                                                                                                                             |
+---------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------