In [4]:
import datetime
import json

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [6]:
# Reuse the active Spark session if there is one, otherwise start a new one
# under this notebook's application name.
spark = (
    SparkSession.builder
    .appName('Data Analysis with Python and PySpark - Chapter 06 Exercises')
    .getOrCreate()
)

In [7]:
# Rebuild the shows schema from its JSON dump. The encoding is pinned to UTF-8
# so that decoding the file does not depend on the platform's default locale
# encoding.
with open('shows_schema.json', mode='r', encoding='utf-8') as f:
    schema = T.StructType.fromJson(json.load(f))

# Bare expression so the notebook displays the resulting StructType.
schema

StructType([StructField('_embedded', StructType([StructField('episodes', ArrayType(StructType([StructField('id', IntegerType(), True), StructField('url', StringType(), True), StructField('name', StringType(), True), StructField('season', IntegerType(), True), StructField('number', IntegerType(), True), StructField('airdate', DateType(), True), StructField('airtime', StringType(), True), StructField('airstamp', TimestampType(), True), StructField('runtime', IntegerType(), True), StructField('_links', StructType([StructField('self', StructType([StructField('href', StringType(), True)]), True)]), True), StructField('image', StructType([StructField('medium', StringType(), True), StructField('original', StringType(), True)]), True), StructField('summary', StringType(), True)]), True), True)]), True), StructField('id', IntegerType(), True), StructField('url', StringType(), True), StructField('name', StringType(), True), StructField('type', StringType(), True), StructField('language', StringT

In [8]:
# Read every show document using the explicit schema loaded above rather than
# schema inference. multiLine allows a JSON document to span several lines, and
# FAILFAST makes the read raise on a malformed record.
all_shows = (
    spark.read
    .schema(schema)
    .options(multiLine=True, mode='FAILFAST')
    .json('../../data/shows/*.json')
)

all_shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: integer (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- season: integer (nullable = true)
 |    |    |    |-- number: integer (nullable = true)
 |    |    |    |-- airdate: date (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- airstamp: timestamp (nullable = true)
 |    |    |    |-- runtime: integer (nullable = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- summary: string (nu

#### Exercise 6.5
Although much less common, you can create a data frame from a dictionary. Since
dictionaries are so close to JSON documents, build the schema for ingesting the
following dictionary. (Either a JSON or a PySpark schema is valid here.)

```python
dict_schema = ???
spark.createDataFrame([{"one": 1, "two": [1,2,3]}], schema=dict_schema)
```

In [9]:
# Schema for {"one": <int>, "two": [<int>, ...]}, built field by field.
dict_schema = (
    T.StructType()
    .add('one', T.IntegerType())
    .add('two', T.ArrayType(T.IntegerType()))
)

spark.createDataFrame([{'one': 1, 'two': [1, 2, 3]}], schema=dict_schema)

DataFrame[one: int, two: array<int>]

#### Exercise 6.6 
Using `all_shows`, compute the time between the first and last episodes for each
show. Which show had the longest tenure?

In [39]:
# A show's tenure is its latest episode air date minus its earliest one.
# `_embedded.episodes.airdate` resolves to an array of dates (one per episode),
# so array_max / array_min pick the last and first air dates directly.
interval_between_first_and_last_episodes = all_shows.withColumn(
    'interval',
    F.array_max(F.col('_embedded.episodes.airdate'))
    - F.array_min(F.col('_embedded.episodes.airdate')),
)

# Longest tenure first.
(
    interval_between_first_and_last_episodes
    .select('id', 'name', 'interval')
    .orderBy(F.col('interval').desc())
    .show()
)

+---+----------------+-------------------+
| id|            name|           interval|
+---+----------------+-------------------+
|722|The Golden Girls|INTERVAL '2429' DAY|
|169|    Breaking Bad|INTERVAL '2079' DAY|
|143|  Silicon Valley|INTERVAL '2072' DAY|
+---+----------------+-------------------+



#### Exercise 6.7
Take the shows data frame and extract the air date and name of each episode in two
array columns.

In [11]:
# Selecting a field through the episodes array yields one array column per
# field: all air dates in one column, all episode names in the other.
(
    all_shows
    .select(
        F.col('_embedded.episodes.airdate'),
        F.col('_embedded.episodes.name'),
    )
    .show(truncate=60)
)

+------------------------------------------------------------+------------------------------------------------------------+
|                                                     airdate|                                                        name|
+------------------------------------------------------------+------------------------------------------------------------+
|[1985-09-14, 1985-09-21, 1985-09-28, 1985-10-05, 1985-10-...|[The Engagement, Guess Who's Coming to the Wedding?, Rose...|
|[2008-01-20, 2008-01-27, 2008-02-10, 2008-02-17, 2008-02-...|[Pilot, Cat's in the Bag..., ...and the Bag's in the Rive...|
|[2014-04-06, 2014-04-13, 2014-04-20, 2014-04-27, 2014-05-...|[Minimum Viable Product, The Cap Table, Articles of Incor...|
+------------------------------------------------------------+------------------------------------------------------------+



In [12]:
# Another option: zip the two arrays into an array of (airdate, name) structs,
# explode it to one row per episode, then expand the struct into columns.
(
    all_shows
    .select(
        F.explode(
            F.arrays_zip(
                F.col('_embedded.episodes.airdate'),
                F.col('_embedded.episodes.name'),
            )
        ).alias('airdate_and_name')
    )
    .select('airdate_and_name.*')
    .show(truncate=False)
)

+----------+----------------------------------+
|airdate   |name                              |
+----------+----------------------------------+
|1985-09-14|The Engagement                    |
|1985-09-21|Guess Who's Coming to the Wedding?|
|1985-09-28|Rose the Prude                    |
|1985-10-05|Transplant                        |
|1985-10-19|The Triangle                      |
|1985-10-26|On Golden Girls                   |
|1985-11-02|The Competition                   |
|1985-11-09|Break-In                          |
|1985-11-16|Blanche and the Younger Man       |
|1985-11-23|Heart Attack                      |
|1985-11-30|The Return of Dorothy's Ex        |
|1985-12-07|The Custody Battle                |
|1985-12-14|A Little Romance                  |
|1985-12-21|That Was No Lady                  |
|1986-01-11|In a Bed of Rose's                |
|1986-01-18|The Truth Will Out                |
|1986-02-01|Nice and Easy                     |
|1986-02-08|The Operation               

#### Exercise 6.8

Given the following data frame, create a new data frame that contains a single map from one to square:

```python
exo6_8 = spark.createDataFrame([[1, 2], [2, 4], [3, 9]], ["one", "square"])
```

In [25]:
# Input frame copied verbatim from the exercise statement (note that the first
# row pairs 1 with 2, exactly as given there).
exo6_8 = spark.createDataFrame(
    data=[[1, 2], [2, 4], [3, 9]],
    schema=['one', 'square'],
)

exo6_8.show()

+---+------+
|one|square|
+---+------+
|  1|     2|
|  2|     4|
|  3|     9|
+---+------+



In [26]:
# Build a single map of one -> square for the whole frame.
#
# Each key is paired with its value *before* aggregating. collect_list() drops
# null values, so two independent collect_list() calls (one per column) can
# return arrays of different lengths, or with shifted pairs, as soon as either
# column contains a null. Collecting (one, square) structs keeps every pair
# together, and map_from_entries turns the array of pairs into the map.
exo6_8.select(
    F.map_from_entries(F.collect_list(F.struct('one', 'square')))
        .alias('map')
).show(truncate=False)

+------------------------+
|map                     |
+------------------------+
|{1 -> 2, 2 -> 4, 3 -> 9}|
+------------------------+



In [None]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()