# Multidimensional data frames: Using PySpark with JSON data.

DataFrames can contain arrays, maps, and structs

In [1]:
# reading json with python
import json

sample_json = """{
  "id": 143,
  "name": "Silicon Valley",
  "type": "Scripted",
  "language": "English",
  "genres": [
    "Comedy"
  ],
  "network": {
    "id": 8,
    "name": "HBO",
    "country": {
      "name": "United States",
      "code": "US",
      "timezone": "America/New_York"
    }
  }
}"""

document = json.loads(sample_json)
print(document)
type(document)

{'id': 143, 'name': 'Silicon Valley', 'type': 'Scripted', 'language': 'English', 'genres': ['Comedy'], 'network': {'id': 8, 'name': 'HBO', 'country': {'name': 'United States', 'code': 'US', 'timezone': 'America/New_York'}}}


dict

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F 

spark = SparkSession.builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
24/09/22 06:46:04 WARN Utils: Your hostname, LAPTOP-CDHH1LA0 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/22 06:46:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/22 06:46:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# read a show file from a jsonLine doc with a single show
shows = spark.read.json("./data/shows/shows-silicon-valley.json")
shows.show()

+--------------------+--------------------+--------------------+--------+---+--------------------+--------+--------------+--------------------+--------------------+----------+------+-------+-----------------+------+--------------------+--------+----------+--------------------+----------+------+
|           _embedded|              _links|           externals|  genres| id|               image|language|          name|             network|        officialSite| premiered|rating|runtime|         schedule|status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+--------+---+--------------------+--------+--------------+--------------------+--------------------+----------+------+-------+-----------------+------+--------------------+--------+----------+--------------------+----------+------+
|{[{{{http://api.t...|{{http://api.tvma...|{tt2575988, 27716...|[Comedy]|143|{http://static.tv...| English|Silic

In [4]:
# read three shows
three_shows = spark.read.json("./data/shows/shows-*.json", multiLine=True)
three_shows.count()
assert three_shows.count() == 3

In [5]:
shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [6]:
print(shows.columns)

['_embedded', '_links', 'externals', 'genres', 'id', 'image', 'language', 'name', 'network', 'officialSite', 'premiered', 'rating', 'runtime', 'schedule', 'status', 'summary', 'type', 'updated', 'url', 'webChannel', 'weight']


In [7]:
shows.select(F.col('name')).show()
three_shows.select(F.col('type')).show()

+--------------+
|          name|
+--------------+
|Silicon Valley|
+--------------+

+--------+
|    type|
+--------+
|Scripted|
|Scripted|
|Scripted|
+--------+



In [8]:
# here "genres" is a pyspark complex type (meaning) it is a type containing another datastructure (in this case a list/array)
array_subset = three_shows.select(F.col('name'), F.col('genres'))
array_subset.show()

+----------------+--------------------+
|            name|              genres|
+----------------+--------------------+
|The Golden Girls|     [Drama, Comedy]|
|    Breaking Bad|[Drama, Crime, Th...|
|  Silicon Valley|            [Comedy]|
+----------------+--------------------+



In [9]:
array_subset.schema

StructType([StructField('name', StringType(), True), StructField('genres', ArrayType(StringType(), True), True)])

In [10]:
# you can access elements in a nested list via several routes
array_subset = array_subset.select(
    "name",
    array_subset.genres[0].alias("dot_and_index"),
    F.col("genres")[0].alias("col_and_index"),
    array_subset.genres.getItem(0).alias("dot_and_method"),
    F.col('genres').getItem(0).alias("col_and_method"),
)
array_subset.show()

+----------------+-------------+-------------+--------------+--------------+
|            name|dot_and_index|col_and_index|dot_and_method|col_and_method|
+----------------+-------------+-------------+--------------+--------------+
|The Golden Girls|        Drama|        Drama|         Drama|         Drama|
|    Breaking Bad|        Drama|        Drama|         Drama|         Drama|
|  Silicon Valley|       Comedy|       Comedy|        Comedy|        Comedy|
+----------------+-------------+-------------+--------------+--------------+



In [26]:
# performing multiple operations on an array column
array_subset_repeated = array_subset.select(
    "name",
    F.lit("Comedy").alias("one"),
    F.lit("Horror").alias("two"),
    F.lit("Drama").alias("three"),
    F.col("dot_and_index"),
)

array_subset_repeated.show(3, False)

array_subset_repeated = array_subset_repeated.select(
    "name",
    F.array("one", "two", "three").alias("Some_Genres"),
    F.array_repeat("dot_and_index", 5).alias("Repeated_Genres")
)

array_subset_repeated.show(3, False)

+----------------+------+------+-----+-------------+
|name            |one   |two   |three|dot_and_index|
+----------------+------+------+-----+-------------+
|The Golden Girls|Comedy|Horror|Drama|Drama        |
|Breaking Bad    |Comedy|Horror|Drama|Drama        |
|Silicon Valley  |Comedy|Horror|Drama|Comedy       |
+----------------+------+------+-----+-------------+

+----------------+-----------------------+----------------------------------------+
|name            |Some_Genres            |Repeated_Genres                         |
+----------------+-----------------------+----------------------------------------+
|The Golden Girls|[Comedy, Horror, Drama]|[Drama, Drama, Drama, Drama, Drama]     |
|Breaking Bad    |[Comedy, Horror, Drama]|[Drama, Drama, Drama, Drama, Drama]     |
|Silicon Valley  |[Comedy, Horror, Drama]|[Comedy, Comedy, Comedy, Comedy, Comedy]|
+----------------+-----------------------+----------------------------------------+



In [15]:
array_subset_repeated.select(
    "name",
    F.size("Some_Genres"),
    F.size("Repeated_Genres"),
).show()

+----------------+-----------------+---------------------+
|            name|size(Some_Genres)|size(Repeated_Genres)|
+----------------+-----------------+---------------------+
|The Golden Girls|                3|                    5|
|    Breaking Bad|                3|                    5|
|  Silicon Valley|                3|                    5|
+----------------+-----------------+---------------------+



In [19]:
array_subset_repeated.select(
    "name",
    F.array_distinct("Some_Genres"),
    F.array_distinct("Repeated_Genres"),
).show(truncate = False)

+----------------+---------------------------+-------------------------------+
|name            |array_distinct(Some_Genres)|array_distinct(Repeated_Genres)|
+----------------+---------------------------+-------------------------------+
|The Golden Girls|[Comedy, Horror, Drama]    |[Drama]                        |
|Breaking Bad    |[Comedy, Horror, Drama]    |[Drama]                        |
|Silicon Valley  |[Comedy, Horror, Drama]    |[Comedy]                       |
+----------------+---------------------------+-------------------------------+



In [29]:
# get common values by intersecting two arrays
array_subset_repeated = array_subset_repeated.select(
    "name",
    F.array_intersect("Some_Genres", "Repeated_Genres").alias("Genres")
)

array_subset_repeated.show(truncate=False)

+----------------+--------+
|name            |Genres  |
+----------------+--------+
|The Golden Girls|[Drama] |
|Breaking Bad    |[Drama] |
|Silicon Valley  |[Comedy]|
+----------------+--------+



In [30]:
# you can get the index of an array via
array_subset_repeated.select(
    "Genres",
    F.array_position("Genres", "Comedy")
).show()

+--------+------------------------------+
|  Genres|array_position(Genres, Comedy)|
+--------+------------------------------+
| [Drama]|                             0|
| [Drama]|                             0|
|[Comedy]|                             1|
+--------+------------------------------+



## 6.2.2 The map type: keys and values within a column

In [39]:
# construct a dataframe with a column containing a map
columns = ["name", "language", "type"]

shows_map = shows.select(
    *[F.lit(column) for column in columns],
    F.array(*columns).alias("values")
)

shows_map.show(truncate=False)

print(*[1,2,3,4])
print([1,2,3,4])

shows_map = shows_map.select(
    F.array(*columns).alias("keys"),
    "values"
)
shows_map.show(1, truncate=False)

shows_map = shows_map.select(
    F.map_from_arrays("keys", "values").alias("mapped")
)
shows_map.printSchema()
shows_map.show(1, False)



+----+--------+----+-----------------------------------+
|name|language|type|values                             |
+----+--------+----+-----------------------------------+
|name|language|type|[Silicon Valley, English, Scripted]|
+----+--------+----+-----------------------------------+

1 2 3 4
[1, 2, 3, 4]
+----------------------+-----------------------------------+
|keys                  |values                             |
+----------------------+-----------------------------------+
|[name, language, type]|[Silicon Valley, English, Scripted]|
+----------------------+-----------------------------------+

root
 |-- mapped: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+---------------------------------------------------------------+
|mapped                                                         |
+---------------------------------------------------------------+
|{name -> Silicon Valley, language -> English, type -> Scripted}|
+-------

In [42]:
# map property selection

shows_map.select(
    F.col("mapped.name"),
    F.col("mapped")["name"],
    shows_map.mapped["name"]
).show()

+--------------+--------------+--------------+
|          name|  mapped[name]|  mapped[name]|
+--------------+--------------+--------------+
|Silicon Valley|Silicon Valley|Silicon Valley|
+--------------+--------------+--------------+



In [45]:
sampleJson = spark.read.json("""
    {"name": "Sample name","keywords": ["PySpark", "Python", "Data"]}
""")


IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: 
    {"name":%20%22Sample%20name%22,%22keywords%22:%20%5B%22PySpark%22,%20%22Python%22,%20%22Data%22%5D%7D%0A