# Multidimensional data frames: Using PySpark with JSON data.

PySpark DataFrame columns are not limited to scalar values: a column can also hold an array, a map, or a struct.

In [1]:
# reading json with python
import json

# A small show record kept as raw JSON text. It nests an array ("genres")
# and objects inside objects ("network" -> "country").
sample_json = """{
  "id": 143,
  "name": "Silicon Valley",
  "type": "Scripted",
  "language": "English",
  "genres": [
    "Comedy"
  ],
  "network": {
    "id": 8,
    "name": "HBO",
    "country": {
      "name": "United States",
      "code": "US",
      "timezone": "America/New_York"
    }
  }
}"""

# Parse the JSON text into plain Python containers and echo the result.
document = json.loads(sample_json)
print(document)
# Last expression of the cell, so the notebook displays the parsed object's type.
type(document)

{'id': 143, 'name': 'Silicon Valley', 'type': 'Scripted', 'language': 'English', 'genres': ['Comedy'], 'network': {'id': 8, 'name': 'HBO', 'country': {'name': 'United States', 'code': 'US', 'timezone': 'America/New_York'}}}


dict

In [8]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F 

# Attach to an existing Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Load the JSON Lines file holding a single show, using the JSON reader's
# default options, then preview the resulting DataFrame.
shows = spark.read.format("json").load("./data/shows/shows-silicon-valley.json")
shows.show()

+--------------------+--------------------+--------------------+--------+---+--------------------+--------+--------------+--------------------+--------------------+----------+------+-------+-----------------+------+--------------------+--------+----------+--------------------+----------+------+
|           _embedded|              _links|           externals|  genres| id|               image|language|          name|             network|        officialSite| premiered|rating|runtime|         schedule|status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+--------+---+--------------------+--------+--------------+--------------------+--------------------+----------+------+-------+-----------------+------+--------------------+--------+----------+--------------------+----------+------+
|{[{{{http://api.t...|{{http://api.tvma...|{tt2575988, 27716...|[Comedy]|143|{http://static.tv...| English|Silic

In [4]:
# Read every file matching the glob into one DataFrame; multiLine=True is
# passed so that a JSON document may span several lines of its file.
three_shows = spark.read.json("./data/shows/shows-*.json", multiLine=True)

# count() is a Spark action that runs a job on every call, so evaluate it
# once and reuse the value in both the check and its failure message.
three_shows_count = three_shows.count()
assert three_shows_count == 3, f"expected 3 shows, found {three_shows_count}"

In [5]:
# Print the DataFrame's schema as an indented tree of field names and types,
# which makes the nesting of structs and arrays visible.
shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [6]:
# List the top-level column names only; fields nested inside structs are not included.
print(shows.columns)

['_embedded', '_links', 'externals', 'genres', 'id', 'image', 'language', 'name', 'network', 'officialSite', 'premiered', 'rating', 'runtime', 'schedule', 'status', 'summary', 'type', 'updated', 'url', 'webChannel', 'weight']


In [12]:
# Project a single column out of each DataFrame and display it.
shows.select("name").show()
three_shows.select("type").show()

+--------------+
|          name|
+--------------+
|Silicon Valley|
+--------------+

+--------+
|    type|
+--------+
|Scripted|
|Scripted|
|Scripted|
+--------+



In [19]:
# "genres" is a complex-typed column: each value holds another data structure
# (here an array) instead of a single scalar.
array_subset = three_shows.select("name", "genres")
array_subset.show()

+----------------+--------------------+
|            name|              genres|
+----------------+--------------------+
|The Golden Girls|     [Drama, Comedy]|
|    Breaking Bad|[Drama, Crime, Th...|
|  Silicon Valley|            [Comedy]|
+----------------+--------------------+



In [20]:
# Show the schema object itself (a StructType of StructFields) rather than the printed tree.
array_subset.schema

StructType([StructField('name', StringType(), True), StructField('genres', ArrayType(StringType(), True), True)])

In [21]:
# Four equivalent ways to pull the first element out of the "genres" array:
# attribute access vs. F.col(), each combined with [] indexing vs. getItem().
#
# The projection is built from `three_shows` rather than from `array_subset`
# itself, so this cell can be re-run safely: its input is never overwritten.
# The result is still bound to `array_subset` with the same columns, so any
# later cell that reads `array_subset` sees what it expects.
array_subset = three_shows.select(
    "name",
    three_shows.genres[0].alias("dot_and_index"),
    F.col("genres")[0].alias("col_and_index"),
    three_shows.genres.getItem(0).alias("dot_and_method"),
    F.col("genres").getItem(0).alias("col_and_method"),
)
array_subset.show()

+----------------+-------------+-------------+--------------+--------------+
|            name|dot_and_index|col_and_index|dot_and_method|col_and_method|
+----------------+-------------+-------------+--------------+--------------+
|The Golden Girls|        Drama|        Drama|         Drama|         Drama|
|    Breaking Bad|        Drama|        Drama|         Drama|         Drama|
|  Silicon Valley|       Comedy|       Comedy|        Comedy|        Comedy|
+----------------+-------------+-------------+--------------+--------------+

