### Read a JSON document while enforcing a precise schema, using FAILFAST mode, in order to improve data pipeline reliability. The earlier in the data lifecycle that data quality problems are detected and handled, the better. A schema constraint creates a strict contract between the data provider and the data consumer. This type of schema restriction is especially useful when ingesting data from schemaless data sources, such as JSON documents and data served by schema-agnostic storage systems.

In [12]:
import json

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [3]:
# Session handle used by every read in this notebook.
spark = (
    SparkSession.builder
    .appName('Data Analysis with Python and Pyspark - Chapter 06 Examples')
    .getOrCreate()
)

23/01/01 21:09:46 WARN Utils: Your hostname, karlos-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 192.168.10.22 instead (on interface enp1s0)
23/01/01 21:09:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/01 21:09:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/01 21:09:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/01/01 21:09:50 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [14]:
# Schema of the `_embedded` object, assembled bottom-up from its nested structs.

# `_links` of one episode: a `self` struct holding a single `href` string.
_episode_links_schema = T.StructType(
    [
        T.StructField(
            'self',
            T.StructType([T.StructField('href', T.StringType())]),
        )
    ]
)

# `image` of one episode: two string fields.
_episode_image_schema = T.StructType(
    [
        T.StructField('medium', T.StringType()),
        T.StructField('original', T.StringType()),
    ]
)

# One element of the `episodes` array.
_episode_schema = T.StructType(
    [
        T.StructField('id', T.IntegerType()),
        T.StructField('url', T.StringType()),
        T.StructField('name', T.StringType()),
        T.StructField('season', T.IntegerType()),
        T.StructField('number', T.IntegerType()),
        T.StructField('airdate', T.DateType()),
        T.StructField('airtime', T.StringType()),
        T.StructField('airstamp', T.TimestampType()),
        T.StructField('runtime', T.IntegerType()),
        T.StructField('_links', _episode_links_schema),
        T.StructField('image', _episode_image_schema),
        T.StructField('summary', T.StringType()),
    ]
)

# `_embedded` itself only wraps the array of episodes.
_embedded_schema = T.StructType(
    [T.StructField('episodes', T.ArrayType(_episode_schema))]
)

In [15]:
# Schema of the `network` object nested in each show document.
network_schema = T.StructType(
    [
        T.StructField('id', T.IntegerType()),
        T.StructField('name', T.StringType()),
        T.StructField('country', T.StructType(
            [
                T.StructField('name', T.StringType()),
                T.StructField('code', T.StringType()),
                # The TVmaze payload names this key `timezone` (singular). A
                # schema field whose name matches no JSON key is not reported as
                # an error; the column is simply read as null.
                # NOTE(review): confirm the key against ../../data/shows/*.json.
                T.StructField('timezone', T.StringType())
            ]
        ))
    ]
)

In [16]:
# Top-level schema of one show document. It is the `schema=` argument of the
# FAILFAST read below and is also what gets dumped to shows_schema.json.
schema = T.StructType(
    [
        T.StructField('_embedded', _embedded_schema),
        T.StructField('id', T.IntegerType()),
        T.StructField('url', T.StringType()),
        T.StructField('name', T.StringType()),
        T.StructField('type', T.StringType()),
        T.StructField('language', T.StringType()),
        T.StructField('languages', T.ArrayType(T.StringType())),
        T.StructField('status', T.StringType()),
        T.StructField('runtime', T.IntegerType()),
        T.StructField('premiered', T.DateType()),
        T.StructField('officialSite', T.StringType()),
        T.StructField('genres', T.ArrayType(T.StringType())),
        T.StructField('schedule', T.StructType(
            [
                T.StructField('time', T.StringType()),
                T.StructField('days', T.ArrayType(T.StringType()))
            ]
        )),
        T.StructField('rating', T.StructType(
            [
                T.StructField('average', T.DoubleType())
            ]
        )),
        T.StructField('weight', T.IntegerType()),
        T.StructField('network', network_schema),
        # NOTE(review): `country` and `id` are declared as plain strings here;
        # verify against the data whether `country` is an object and `id` a number.
        T.StructField('webChannel', T.StructType(
            [
                T.StructField('country', T.StringType()),
                T.StructField('id', T.StringType()),
                T.StructField('name', T.StringType())
            ]
        )),
        T.StructField('externals', T.StructType(
            [
                T.StructField('tvrage', T.IntegerType()),
                T.StructField('thetvdb', T.IntegerType()),
                # IMDb identifiers are strings of the form "tt1234567", not
                # integers. Declared as IntegerType, a FAILFAST read raises as
                # soon as this column is actually parsed.
                T.StructField('imdb', T.StringType()),
            ]
        )),
        T.StructField('image', T.StructType(
            [
                T.StructField('medium', T.StringType()),
                T.StructField('original', T.StringType())
            ]
        )),
        T.StructField('summary', T.StringType()),
        T.StructField('updated', T.TimestampType()),
        T.StructField('_links', T.StructType(
            [
                T.StructField('self', T.StructType(
                    [
                        T.StructField('href', T.StringType())
                    ]
                )),
                T.StructField('previousepisode', T.StructType(
                    [
                        T.StructField('href', T.StringType())
                    ]
                ))
            ]
        ))
    ]
)

In [8]:
# Read every show document against the hand-written `schema`, in multi-line
# mode, with FAILFAST so that a record that does not fit the schema raises
# instead of being silently nulled or dropped.
all_shows = (
    spark.read
    .schema(schema)
    .option('multiLine', True)
    .option('mode', 'FAILFAST')
    .json('../../data/shows/*.json')
)

In [8]:
# Print the schema tree of the DataFrame that was read with the explicit schema.
all_shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: integer (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- season: integer (nullable = true)
 |    |    |    |-- number: integer (nullable = true)
 |    |    |    |-- airdate: date (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- airstamp: timestamp (nullable = true)
 |    |    |    |-- runtime: integer (nullable = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- summary: string (nu

In [9]:
# Number of rows read under the explicit schema.
# NOTE(review): a bare count may not parse every column, so it may not trigger
# FAILFAST on a bad field - confirm with an action that materialises all columns.
all_shows.count()

                                                                                

3

In [10]:
# Same files, read without a schema so Spark infers one; used below for
# comparison against the explicit-schema DataFrame.
all_shows_inferred_schema = spark.read.format('json').load('../../data/shows/*.json')

In [11]:
# Row count under the inferred schema, to compare with the explicit-schema read.
all_shows_inferred_schema.count()

3

In [12]:
# Print the schema tree Spark inferred when no schema was supplied.
all_shows_inferred_schema.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [13]:
# Column types of two episode fields under the explicit schema.
all_shows.select(
    '_embedded.episodes.airdate',
    '_embedded.episodes.airstamp',
)

DataFrame[airdate: array<date>, airstamp: array<timestamp>]

In [14]:
# Column types of two episode fields under the inferred schema.
# NOTE(review): the explicit-schema cell selects `airdate`, this one selects
# `airtime` - confirm which pair was meant for the side-by-side comparison.
all_shows_inferred_schema.select(
    ['_embedded.episodes.airtime', '_embedded.episodes.airstamp']
)

DataFrame[airtime: array<string>, airstamp: array<string>]

#### `DataFrame.schema` returns a DataFrame schema as a StructType.

In [15]:
# Display the StructType that describes all_shows.
all_shows.schema

StructType([StructField('_embedded', StructType([StructField('episodes', ArrayType(StructType([StructField('id', IntegerType(), True), StructField('url', StringType(), True), StructField('name', StringType(), True), StructField('season', IntegerType(), True), StructField('number', IntegerType(), True), StructField('airdate', DateType(), True), StructField('airtime', StringType(), True), StructField('airstamp', TimestampType(), True), StructField('runtime', IntegerType(), True), StructField('_links', StructType([StructField('self', StructType([StructField('href', StringType(), True)]), True)]), True), StructField('image', StructType([StructField('medium', StringType(), True), StructField('original', StringType(), True)]), True), StructField('summary', StringType(), True)]), True), True)]), True), StructField('id', IntegerType(), True), StructField('url', StringType(), True), StructField('name', StringType(), True), StructField('type', StringType(), True), StructField('language', StringT

### Using JSON to represent schemas.

- `T.StructType.fromJson()` and `T.StructField.fromJson()`: build a schema from a dict that represents a JSON object.
- `DataFrame.schema.json()`: return the schema as a string containing a JSON object.
- `DataFrame.schema.jsonValue()`: return a dict representing the schema.

In [16]:
# Schema of the `_embedded` selection, serialised as a JSON string.
all_shows.select('_embedded').schema.json()

'{"fields":[{"metadata":{},"name":"_embedded","nullable":true,"type":{"fields":[{"metadata":{},"name":"episodes","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"integer"},{"metadata":{},"name":"url","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"season","nullable":true,"type":"integer"},{"metadata":{},"name":"number","nullable":true,"type":"integer"},{"metadata":{},"name":"airdate","nullable":true,"type":"date"},{"metadata":{},"name":"airtime","nullable":true,"type":"string"},{"metadata":{},"name":"airstamp","nullable":true,"type":"timestamp"},{"metadata":{},"name":"runtime","nullable":true,"type":"integer"},{"metadata":{},"name":"_links","nullable":true,"type":{"fields":[{"metadata":{},"name":"self","nullable":true,"type":{"fields":[{"metadata":{},"name":"href","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{

In [17]:
# Schema of the `_embedded` selection, as a Python dict.
all_shows.select('_embedded').schema.jsonValue()

{'type': 'struct',
 'fields': [{'name': '_embedded',
   'type': {'type': 'struct',
    'fields': [{'name': 'episodes',
      'type': {'type': 'array',
       'elementType': {'type': 'struct',
        'fields': [{'name': 'id',
          'type': 'integer',
          'nullable': True,
          'metadata': {}},
         {'name': 'url', 'type': 'string', 'nullable': True, 'metadata': {}},
         {'name': 'name', 'type': 'string', 'nullable': True, 'metadata': {}},
         {'name': 'season',
          'type': 'integer',
          'nullable': True,
          'metadata': {}},
         {'name': 'number',
          'type': 'integer',
          'nullable': True,
          'metadata': {}},
         {'name': 'airdate', 'type': 'date', 'nullable': True, 'metadata': {}},
         {'name': 'airtime',
          'type': 'string',
          'nullable': True,
          'metadata': {}},
         {'name': 'airstamp',
          'type': 'timestamp',
          'nullable': True,
          'metadata': {}},
 

In [18]:
# Check that parsing the JSON string gives the same dict that jsonValue() returns.
json.loads(all_shows.schema.json()) == all_shows.schema.jsonValue()

True

In [19]:
# Round trip: a StructType rebuilt from the dict compares equal to the original schema.
T.StructType.fromJson(all_shows.schema.jsonValue()) == all_shows.schema

True

In [18]:
# Persist the hand-written schema as JSON text so it can be reloaded later
# with T.StructType.fromJson(). all_shows.schema.jsonValue() could be dumped
# instead; presumably it is equivalent, since all_shows was read with `schema`.
with open('../../shows_schema.json', mode='w') as schema_file:
    schema_file.write(json.dumps(schema.jsonValue()))

In [21]:
# spark.stop()