In [None]:
%%capture
# Install PySpark into the runtime; %%capture above hides pip's output.
!pip install pyspark

In [None]:
import json

import pyspark.sql.functions as f
import pyspark.sql.types as t

from pprint import pprint
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.utils import AnalysisException

from google.colab import drive
# Mount Google Drive: every JSON path read below lives under /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Create the Spark session, or reuse one that already exists in this runtime.
session_builder = SparkSession.builder.appName("app")
spark = session_builder.getOrCreate()

# Set the Spark log level to WARN.
spark.sparkContext.setLogLevel("WARN")

#### Создание JSON

In [None]:
# A small hand-written JSON document: scalar fields, an array (`genres`)
# and two levels of nested objects (`network` -> `country`).
sample_json = """{
  "id": 143,
  "name": "Silicon Valley",
  "type": "Scripted",
  "language": "English",
  "genres": [
    "Comedy"
    ],
  "network": {
    "id": 8,
    "name": "HBO",
    "country": {
      "name": "United States",
      "code": "US",
      "timezone": "America/New_York"
      }
   }
}"""

# Parse the text into plain Python dicts/lists and pretty-print the result.
document = json.loads(sample_json)
pprint(document)

{'genres': ['Comedy'],
 'id': 143,
 'language': 'English',
 'name': 'Silicon Valley',
 'network': {'country': {'code': 'US',
                         'name': 'United States',
                         'timezone': 'America/New_York'},
             'id': 8,
             'name': 'HBO'},
 'type': 'Scripted'}


#### Чтение JSON   
В одном JSON-документе всегда будет одна строка, и выглядит она вот так:

In [None]:
# Read one show file without a schema, so Spark infers the schema from the data.
shows = spark.read.json("/content/gdrive/MyDrive/shows-silicon-valley.json")

shows.show()
# Last expression of the cell: the notebook displays the row count.
shows.count()

+--------------------+--------------------+--------------------+--------+---+--------------------+--------+--------------+--------------------+--------------------+----------+------+-------+-----------------+------+--------------------+--------+----------+--------------------+----------+------+
|           _embedded|              _links|           externals|  genres| id|               image|language|          name|             network|        officialSite| premiered|rating|runtime|         schedule|status|             summary|    type|   updated|                 url|webChannel|weight|
+--------------------+--------------------+--------------------+--------+---+--------------------+--------+--------------+--------------------+--------------------+----------+------+-------+-----------------+------+--------------------+--------+----------+--------------------+----------+------+
|{[{{{http://api.t...|{{http://api.tvma...|{tt2575988, 27716...|[Comedy]|143|{http://static.tv...| English|Silic

1

In [None]:
# Read every JSON file matched by the glob; multiLine=True is passed to the reader.
three_shows = (
    spark
    .read
    .json(
        "/content/gdrive/MyDrive/Colab Notebooks/pyspark/ch6/*.json",
        multiLine=True,
    )
)

three_shows.show()
# Last expression of the cell: the notebook displays the row count.
three_shows.count()

+--------------------+--------------------+--------------------+--------------------+---+--------------------+--------+----------------+--------------------+--------------------+----------+------+-------+-------------------+------+--------------------+--------+----------+--------------------+------------------+------+
|           _embedded|              _links|           externals|              genres| id|               image|language|            name|             network|        officialSite| premiered|rating|runtime|           schedule|status|             summary|    type|   updated|                 url|        webChannel|weight|
+--------------------+--------------------+--------------------+--------------------+---+--------------------+--------+----------------+--------------------+--------------------+----------+------+-------+-------------------+------+--------------------+--------+----------+--------------------+------------------+------+
|{[{{{http://api.t...|{{http://api.tvma.

3

In [None]:
# Print the nested schema Spark inferred for the single-show frame.
shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [None]:
# Display the three-show frame again (same call as right after it was read).
three_shows.show()

+--------------------+--------------------+--------------------+--------------------+---+--------------------+--------+----------------+--------------------+--------------------+----------+------+-------+-------------------+------+--------------------+--------+----------+--------------------+------------------+------+
|           _embedded|              _links|           externals|              genres| id|               image|language|            name|             network|        officialSite| premiered|rating|runtime|           schedule|status|             summary|    type|   updated|                 url|        webChannel|weight|
+--------------------+--------------------+--------------------+--------------------+---+--------------------+--------+----------------+--------------------+--------------------+----------+------+-------+-------------------+------+--------------------+--------+----------+--------------------+------------------+------+
|{[{{{http://api.t...|{{http://api.tvma.

In [None]:
# Keep only the show name and its array-typed `genres` column.
array_subset = three_shows.select(["name", "genres"])

array_subset.show(truncate=False)

+----------------+------------------------+
|name            |genres                  |
+----------------+------------------------+
|The Golden Girls|[Drama, Comedy]         |
|Breaking Bad    |[Drama, Crime, Thriller]|
|Silicon Valley  |[Comedy]                |
+----------------+------------------------+



несколько методов выбора значения из списка

In [None]:
# Four equivalent ways to take the first element of an array column.
#
# The projection is rebuilt from `three_shows` inside this cell, so the cell
# can be re-run safely. Selecting from the previous `array_subset` would fail
# on a second run, because this select drops the `genres` column.
genres_source = three_shows.select("name", "genres")

array_subset = genres_source.select(
    "name",  # keep the show name
    # attribute access on the DataFrame + index
    genres_source.genres[0].alias("dot_and_index"),
    # f.col + index (Python-style slicing such as [:3] cannot be used here)
    f.col("genres")[0].alias("col_and_index"),
    # attribute access on the DataFrame + getItem
    genres_source.genres.getItem(0).alias("dot_and_method"),
    # f.col + getItem
    f.col("genres").getItem(0).alias("col_and_method"),
)
array_subset.show()

+----------------+-------------+-------------+--------------+--------------+
|            name|dot_and_index|col_and_index|dot_and_method|col_and_method|
+----------------+-------------+-------------+--------------+--------------+
|The Golden Girls|        Drama|        Drama|         Drama|         Drama|
|    Breaking Bad|        Drama|        Drama|         Drama|         Drama|
|  Silicon Valley|       Comedy|       Comedy|        Comedy|        Comedy|
+----------------+-------------+-------------+--------------+--------------+



#### Массивы
с помощью lit() создаем литеральные столбцы  
с помощью array() можем создать столбец, содержащий массив  
array_repeat создаст массив, где повторяется 5 раз заданное значение  
size() может вывести длину массива или другой структуры  
array_distinct удалит дубликаты в массиве, являющемся элементом столбца

In [None]:
# Step 1: three literal columns (the same word in every row) plus the first genre.
literal_genres = array_subset.select(
    "name",
    f.lit("Comedy").alias("one"),
    f.lit("Horror").alias("two"),
    f.lit("Drama").alias("three"),
    f.col("dot_and_index"),
)

# Step 2: pack the literals into one array and repeat the first genre 5 times.
array_subset_repeated = literal_genres.select(
    "name",
    f.array("one", "two", "three").alias("Some_Genres"),
    f.array_repeat("dot_and_index", 5).alias("Repeated_Genres"),
)
array_subset_repeated.show(truncate=False)

array_columns = ("Some_Genres", "Repeated_Genres")

# size: number of elements in each array
array_subset_repeated.select(
    "name",
    *[f.size(array_column) for array_column in array_columns],
).show()

# array_distinct: each array with duplicates removed
array_subset_repeated.select(
    "name",
    *[f.array_distinct(array_column) for array_column in array_columns],
).show(truncate=False)


+----------------+-----------------------+----------------------------------------+
|name            |Some_Genres            |Repeated_Genres                         |
+----------------+-----------------------+----------------------------------------+
|The Golden Girls|[Comedy, Horror, Drama]|[Drama, Drama, Drama, Drama, Drama]     |
|Breaking Bad    |[Comedy, Horror, Drama]|[Drama, Drama, Drama, Drama, Drama]     |
|Silicon Valley  |[Comedy, Horror, Drama]|[Comedy, Comedy, Comedy, Comedy, Comedy]|
+----------------+-----------------------+----------------------------------------+

+----------------+-----------------+---------------------+
|            name|size(Some_Genres)|size(Repeated_Genres)|
+----------------+-----------------+---------------------+
|The Golden Girls|                3|                    5|
|    Breaking Bad|                3|                    5|
|  Silicon Valley|                3|                    5|
+----------------+-----------------+---------------------

array_intersect выведет пересечение обоих массивов, то есть элементы, которые есть в обоих массивах  
array_position возвратит положение элемента в массиве, индексация с 1, если заданного элемента нет, то возвращает 0  
хотя в методе getItem индексация с 0

In [None]:
# array_intersect: elements present in both arrays.
#
# The result gets its own name instead of overwriting `array_subset_repeated`.
# Overwriting would drop `Some_Genres` / `Repeated_Genres`, so re-running this
# cell would fail on the missing columns.
genres_intersection = array_subset_repeated.select(
    "name",
    f.array_intersect("Some_Genres", "Repeated_Genres").alias("Genres"),
)
genres_intersection.show()

# array_position: 1-based position of the value in the array, 0 when absent
genres_intersection.select(
    "name",
    f.array_position('Genres', 'Comedy'),
).show()

+----------------+--------+
|            name|  Genres|
+----------------+--------+
|The Golden Girls| [Drama]|
|    Breaking Bad| [Drama]|
|  Silicon Valley|[Comedy]|
+----------------+--------+

+----------------+------------------------------+
|            name|array_position(Genres, Comedy)|
+----------------+------------------------------+
|The Golden Girls|                             0|
|    Breaking Bad|                             0|
|  Silicon Valley|                             1|
+----------------+------------------------------+



#### Задание
примените метод explode к колонке со списком, а затем верните df в первоначальный вид

In [None]:
# Every column except the show name goes into one array column.
columns = [column_name for column_name in array_subset.columns if column_name != 'name']
packed_columns = f.array(columns)

# 1. The packed array, one row per show.
array_subset.select('name', packed_columns.alias('col_list')).show()

# 2. explode: one row per array element (the generated column is named `col`).
exploded = array_subset.select('name', f.explode(packed_columns))
exploded.show()

# 3. Back to one row per show: gather the exploded values into a list again.
(
    exploded
    .groupby('name')
    .agg(f.collect_list('col').alias('col_list'))
).show()

+----------------+--------------------+
|            name|            col_list|
+----------------+--------------------+
|The Golden Girls|[Drama, Drama, Dr...|
|    Breaking Bad|[Drama, Drama, Dr...|
|  Silicon Valley|[Comedy, Comedy, ...|
+----------------+--------------------+

+----------------+------+
|            name|   col|
+----------------+------+
|The Golden Girls| Drama|
|The Golden Girls| Drama|
|The Golden Girls| Drama|
|The Golden Girls| Drama|
|    Breaking Bad| Drama|
|    Breaking Bad| Drama|
|    Breaking Bad| Drama|
|    Breaking Bad| Drama|
|  Silicon Valley|Comedy|
|  Silicon Valley|Comedy|
|  Silicon Valley|Comedy|
|  Silicon Valley|Comedy|
+----------------+------+

+----------------+--------------------+
|            name|            col_list|
+----------------+--------------------+
|    Breaking Bad|[Drama, Drama, Dr...|
|The Golden Girls|[Drama, Drama, Dr...|
|  Silicon Valley|[Comedy, Comedy, ...|
+----------------+--------------------+



#### Map  

In [None]:
columns = ["name", "language", "type"]
three_shows.select(columns).show()

# Stage 1: one literal column per key (each named after its own value)
# plus an array holding the matching values.
print('создаем столбцы с названиями колонок и столбец с массивом значений')
key_literals = [f.lit(column) for column in columns]
keys_and_values = three_shows.select(
    *key_literals,
    f.array(*columns).alias("values"),
)
keys_and_values.show(truncate=False)

# Stage 2: fold the literal key columns into a single `keys` array.
print('столбцы с названиями колонок объединяем в массив и называем keys')
key_value_arrays = keys_and_values.select(
    f.array(*columns).alias("keys"),
    "values",
)
key_value_arrays.show(truncate=False)

# Stage 3: build the map column from the two arrays.
print('из массивов ключей и значений создаем map')
shows_map = key_value_arrays.select(
    f.map_from_arrays("keys", "values").alias("mapped")
)
shows_map.show(truncate=False)
shows_map.printSchema()

# Three ways to read the value stored under the key "name".
print('из колонки типа mapped извлекаем значения по ключу name')
shows_map.select(
    f.col("mapped.name"),
    f.col("mapped")["name"],
    shows_map.mapped["name"],
).show()



+----------------+--------+--------+
|            name|language|    type|
+----------------+--------+--------+
|The Golden Girls| English|Scripted|
|    Breaking Bad| English|Scripted|
|  Silicon Valley| English|Scripted|
+----------------+--------+--------+

создаем столбцы с названиями колонок и столбец с массивом значений
+----+--------+----+-------------------------------------+
|name|language|type|values                               |
+----+--------+----+-------------------------------------+
|name|language|type|[The Golden Girls, English, Scripted]|
|name|language|type|[Breaking Bad, English, Scripted]    |
|name|language|type|[Silicon Valley, English, Scripted]  |
+----+--------+----+-------------------------------------+

столбцы с названиями колонок объединяем в массив и называем keys
+----------------------+-------------------------------------+
|keys                  |values                               |
+----------------------+-------------------------------------+
|[nam

#### Задание
распаковать столбец mapped с помощью posexplode

In [None]:
shows_map.show(truncate=False)

# posexplode on a map yields three columns: position, key and value,
# aliased here as position / id / name.
unpacked_map = shows_map.select(
    f.posexplode("mapped").alias("position", "id", "name")
)
unpacked_map.show()

# Gather the values back into one list per key.
(
    unpacked_map
    .groupby("id")
    .agg(f.collect_list("name").alias("name"))
).show(truncate=False)

+-----------------------------------------------------------------+
|mapped                                                           |
+-----------------------------------------------------------------+
|{name -> The Golden Girls, language -> English, type -> Scripted}|
|{name -> Breaking Bad, language -> English, type -> Scripted}    |
|{name -> Silicon Valley, language -> English, type -> Scripted}  |
+-----------------------------------------------------------------+

+--------+--------+----------------+
|position|      id|            name|
+--------+--------+----------------+
|       0|    name|The Golden Girls|
|       1|language|         English|
|       2|    type|        Scripted|
|       0|    name|    Breaking Bad|
|       1|language|         English|
|       2|    type|        Scripted|
|       0|    name|  Silicon Valley|
|       1|language|         English|
|       2|    type|        Scripted|
+--------+--------+----------------+

+--------+-------------------------------

В структуре таблицы видим, что в столбце _embedded есть лишняя обёртка (лишние фигурные скобки, так как тип данных — struct) над массивом episodes. Следовательно, стоит очистить данные и создать новый столбец без этой обёртки.

In [None]:
three_shows.select('_embedded').printSchema()

# Lift the `episodes` array out of its `_embedded` wrapper, then drop the wrapper.
with_episodes = three_shows.withColumn("episodes", f.col("_embedded.episodes"))
shows_clean = with_episodes.drop("_embedded")

shows_clean.show()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

episodes - массив структур, в каждой структуре есть name.  
Мы можем получить массив name для каждой структуры в episodes следующим образом:

In [None]:
# Selecting a field through an array of structs gives an array of that field.
episodes_name = shows_clean.select(f.col("episodes.name"))

episodes_name.show()
episodes_name.printSchema()

# One row per episode name.
episode_names_exploded = episodes_name.select(f.explode('name'))
episode_names_exploded.show(5, truncate=False)

# Dots can descend through any level of nesting.
first_medium_image = f.col('episodes.image.medium')[0]
shows_clean.select(first_medium_image).show(truncate=False)

+--------------------+
|                name|
+--------------------+
|[The Engagement, ...|
|[Pilot, Cat's in ...|
|[Minimum Viable P...|
+--------------------+

root
 |-- name: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------------------------+
|col                               |
+----------------------------------+
|The Engagement                    |
|Guess Who's Coming to the Wedding?|
|Rose the Prude                    |
|Transplant                        |
|The Triangle                      |
+----------------------------------+
only showing top 5 rows

+----------------------------------------------------------------------+
|episodes.image.medium[0]                                              |
+----------------------------------------------------------------------+
|http://static.tvmaze.com/uploads/images/medium_landscape/34/86531.jpg |
|http://static.tvmaze.com/uploads/images/medium_landscape/23/59145.jpg |
|http://static.tvmaze.com/

#### Схема данных
В pyspark можно программно создавать схему. Это может быть полезно для ускорения чтения данных (при объявлении схемы заранее Spark не будет тратить время на её автоматическое определение)
  
Чтение данных с использованием явной схемы может сразу уберечь от ошибок, если пайплайн требует строгих типов данных
  
Также схема нужна, когда мы создаем df со сложной структурой

схема для колонки summary программно задается следующим образом:

In [None]:
# Schema of the `summary` column as it appears in the frame read without a schema.
three_shows.select(f.col('summary')).printSchema()
# The same field declared by hand; as the cell's last expression its repr is displayed.
t.StructField("summary", t.StringType())

root
 |-- summary: string (nullable = true)



StructField('summary', StringType(), True)

#### Создание схемы данных для столбца episodes
StructType представляет собой коллекцию StructField для объединения нескольких столбцов  
StructField - класс для определения конкретного столбца в схеме  
StructField также может содержать несколько столбцов, объединенных в StructType


StructField _embedded - лишняя обертка, которую мы удаляли ранее

In [None]:
# _links -> self -> href
href_struct = t.StructType([t.StructField("href", t.StringType())])
episode_links_schema = t.StructType([t.StructField("self", href_struct)])

# image -> medium / original (both strings)
episode_image_schema = t.StructType(
    [
        t.StructField(image_size, t.StringType())
        for image_size in ("medium", "original")
    ]
)

# One episode. airdate / airstamp are declared as date / timestamp types.
episode_schema = t.StructType(
    [
        t.StructField("_links", episode_links_schema),
        t.StructField("airdate", t.DateType()),
        t.StructField("airstamp", t.TimestampType()),
        t.StructField("airtime", t.StringType()),
        t.StructField("id", t.StringType()),
        t.StructField("image", episode_image_schema),
        t.StructField("name", t.StringType()),
        t.StructField("number", t.LongType()),
        t.StructField("runtime", t.LongType()),
        t.StructField("season", t.LongType()),
        t.StructField("summary", t.StringType()),
        t.StructField("url", t.StringType()),
    ]
)

# Top level: _embedded -> episodes (array of episode structs).
episodes_field = t.StructField("episodes", t.ArrayType(episode_schema))
embedded_schema = t.StructType(
    [t.StructField("_embedded", t.StructType([episodes_field]))]
)

display(embedded_schema)

StructType([StructField('_embedded', StructType([StructField('episodes', ArrayType(StructType([StructField('_links', StructType([StructField('self', StructType([StructField('href', StringType(), True)]), True)]), True), StructField('airdate', DateType(), True), StructField('airstamp', TimestampType(), True), StructField('airtime', StringType(), True), StructField('id', StringType(), True), StructField('image', StructType([StructField('medium', StringType(), True), StructField('original', StringType(), True)]), True), StructField('name', StringType(), True), StructField('number', LongType(), True), StructField('runtime', LongType(), True), StructField('season', LongType(), True), StructField('summary', StringType(), True), StructField('url', StringType(), True)]), True), True)]), True)])

#### Повторное чтение json с явным указанием схемы

mode FAILFAST вызовет ошибку при несовпадении данных со схемой  
поскольку в схеме мы описали только _embedded, то и полученный df будет содержать только столбец _embedded

In [None]:
# Read with the hand-written schema; mode="FAILFAST" is passed to the reader.
# NOTE(review): this glob covers the Drive root, unlike the ch6 folder used
# for `three_shows` - confirm it matches only the show files.
shows_with_schema = (
    spark
    .read
    .json(
        "/content/gdrive/MyDrive/*.json",
        schema=embedded_schema,
        mode="FAILFAST",
    )
)

shows_with_schema.show()

+--------------------+
|           _embedded|
+--------------------+
|{[{{{http://api.t...|
|{[{{{http://api.t...|
|{[{{{http://api.t...|
+--------------------+



При создании схемы мы указали временные типы данных для airdate и airstamp. При чтении без схемы тип данных у этих полей оказался строковым.

In [None]:
# Episodes read with the explicit schema ...
shows_with_schema.select('_embedded.episodes').printSchema()
# ... versus episodes read without a schema, for comparison.
three_shows.select('_embedded.episodes').printSchema()

root
 |-- episodes: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |-- airdate: date (nullable = true)
 |    |    |-- airstamp: timestamp (nullable = true)
 |    |    |-- airtime: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- image: struct (nullable = true)
 |    |    |    |-- medium: string (nullable = true)
 |    |    |    |-- original: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- runtime: long (nullable = true)
 |    |    |-- season: long (nullable = true)
 |    |    |-- summary: string (nullable = true)
 |    |    |-- url: string (nullable = true)

root
 |-- episodes: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _links: struct (nu

In [None]:
# Show the first values of the two fields declared as date / timestamp.
for column in ["airdate", "airstamp"]:
    # Array of this field across all episodes of each show.
    nested_values = shows_with_schema.select(f"_embedded.episodes.{column}")
    # One row per episode value.
    nested_values.select(f.explode(column)).show(5)

+----------+
|       col|
+----------+
|1985-09-14|
|1985-09-21|
|1985-09-28|
|1985-10-05|
|1985-10-19|
+----------+
only showing top 5 rows

+-------------------+
|                col|
+-------------------+
|1985-09-15 01:00:00|
|1985-09-22 01:00:00|
|1985-09-29 01:00:00|
|1985-10-06 01:00:00|
|1985-10-20 01:00:00|
+-------------------+
only showing top 5 rows



искусственное загрязнение схемы данных, вызываем ошибку pyspark  
в ошибке Caused by будет написано имя поля, примеры его значений и тип, который выставлен для него в схеме

In [None]:
# Deliberately break the schema: `summary` is declared as LongType instead of
# StringType. The broken variants get their own `_wrong` names so the valid
# `episode_schema` / `embedded_schema` are not overwritten.
episode_schema_wrong = t.StructType(
    [
        t.StructField(field.name, t.LongType()) if field.name == "summary" else field
        for field in episode_schema.fields
    ]
)
embedded_schema_wrong = t.StructType(
    [
        t.StructField(
            "_embedded",
            t.StructType(
                [
                    t.StructField(
                        "episodes", t.ArrayType(episode_schema_wrong)
                    )
                ]
            ),
        )
    ]
)

shows_with_schema_wrong = spark.read.json(
    "/content/gdrive/MyDrive/*.json",
    schema=embedded_schema_wrong,
    mode="FAILFAST",
)
shows_with_schema_wrong.printSchema()

# The failure is the point of this cell, so it is caught and displayed rather
# than left to stop "Run All". The original run reported Py4JJavaError; the
# exact class may differ between PySpark versions, hence the broad catch.
try:
    shows_with_schema_wrong.show()
except Exception as error:
    error_text = str(error)
    # The "Caused by" lines name the offending field and its declared type.
    caused_by_lines = [
        line for line in error_text.splitlines() if "Caused by" in line
    ]
    print(f"{type(error).__name__} raised as expected")
    print("\n".join(caused_by_lines) if caused_by_lines else error_text)

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: date (nullable = true)
 |    |    |    |-- airstamp: timestamp (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: long (nullable = true)
 |    |    |    |-- url: string (nullable = tru

Py4JJavaError: ignored

схема может быть описана не только в терминах StructType, но и в формате json

In [None]:
# One row per episode, keeping only the `airtime` field.
episode_airtime = (
    shows_with_schema
    .select(f.explode('_embedded.episodes').alias('episode'))
    .select('episode.airtime')
)

episode_airtime.show(5)

episode_airtime.printSchema()

# The same schema as a JSON-compatible dict.
pprint(episode_airtime.schema.jsonValue())

+-------+
|airtime|
+-------+
|  21:00|
|  21:00|
|  21:00|
|  21:00|
|  21:00|
+-------+
only showing top 5 rows

root
 |-- airtime: string (nullable = true)

{'fields': [{'metadata': {},
             'name': 'airtime',
             'nullable': True,
             'type': 'string'}],
 'type': 'struct'}


программно описанную схему также можно перевести в json

In [None]:
# Two sample fields: an array of strings and a string -> long map.
array_field = t.StructField("array_example", t.ArrayType(t.StringType()))
map_field = t.StructField(
    "map_example", t.MapType(t.StringType(), t.LongType())
)

# JSON form of each field on its own ...
pprint(array_field.jsonValue())
print()
pprint(map_field.jsonValue())
print()
# ... and of a struct holding both (map first, then array).
pprint(t.StructType([map_field, array_field]).jsonValue())

{'metadata': {},
 'name': 'array_example',
 'nullable': True,
 'type': {'containsNull': True, 'elementType': 'string', 'type': 'array'}}

{'metadata': {},
 'name': 'map_example',
 'nullable': True,
 'type': {'keyType': 'string',
          'type': 'map',
          'valueContainsNull': True,
          'valueType': 'long'}}

{'fields': [{'metadata': {},
             'name': 'map_example',
             'nullable': True,
             'type': {'keyType': 'string',
                      'type': 'map',
                      'valueContainsNull': True,
                      'valueType': 'long'}},
            {'metadata': {},
             'name': 'array_example',
             'nullable': True,
             'type': {'containsNull': True,
                      'elementType': 'string',
                      'type': 'array'}}],
 'type': 'struct'}


In [None]:
# Round trip: schema -> JSON string -> dict -> schema.
schema_as_dict = json.loads(shows_with_schema.schema.json())
other_shows_schema = t.StructType.fromJson(schema_as_dict)

# The rebuilt schema compares equal to the original.
print(other_shows_schema == shows_with_schema.schema) # True

True


можно создавать структуры столбцов в отдельном столбце

In [None]:
# Pack several columns into a single struct column named `info`.
info_struct = f.struct(
    f.col('status'),
    f.col('weight'),
    f.lit(True).alias('has_watched'),
).alias('info')

# The flat literal is named `has_watched`, like its counterpart inside the
# struct. Naming it `info` as well would give two columns with the same name,
# and a later reference to `info` would be ambiguous.
shows_info = three_shows.select(
    f.col('status'),
    f.col('weight'),
    f.lit(True).alias('has_watched'),
    info_struct,
)

# Built once, then shown as data and as schema.
shows_info.show()
shows_info.printSchema()

+------+------+----+-----------------+
|status|weight|info|             info|
+------+------+----+-----------------+
| Ended|    68|true|{Ended, 68, true}|
| Ended|    98|true|{Ended, 98, true}|
| Ended|    96|true|{Ended, 96, true}|
+------+------+----+-----------------+

root
 |-- status: string (nullable = true)
 |-- weight: long (nullable = true)
 |-- info: boolean (nullable = false)
 |-- info: struct (nullable = false)
 |    |-- status: string (nullable = true)
 |    |-- weight: long (nullable = true)
 |    |-- has_watched: boolean (nullable = false)



#### Задание
построить схему для создания df

In [None]:
# Schema for rows shaped like {"one": <long>, "two": [<long>, ...]}.
field_one = t.StructField("one", t.LongType())
field_two = t.StructField("two", t.ArrayType(t.LongType()))
dict_schema = t.StructType([field_one, field_two])

# Build a one-row frame with that schema and print the resulting schema.
sample_rows = [{"one": 1, "two": [1,2,3]}]
spark.createDataFrame(sample_rows, schema=dict_schema).printSchema()

root
 |-- one: long (nullable = true)
 |-- two: array (nullable = true)
 |    |-- element: long (containsNull = true)



посчитайте разницу между датой выхода первого и последнего эпизодов

In [None]:
# Array of air dates (strings in `three_shows`) for every episode of a show.
airdates = f.col('_embedded.episodes.airdate')

# First and last elements; element_at with -1 reads from the end of the array,
# which replaces the manual `size(...) - 1` index.
# NOTE(review): this treats array order as air order - confirm the episodes
# are stored chronologically.
first_airdate = airdates[0]
last_airdate = f.element_at(airdates, -1)

# Default truncation: printing every air date untruncated gives one enormous line.
three_shows.select(airdates).show()

three_shows.select(
    first_airdate.alias('first'),
    last_airdate.alias('last'),
).show(truncate=False)

# The values are strings here, so cast both to date before subtracting.
three_shows.select(
    (last_airdate.cast('date') - first_airdate.cast('date')).alias('delta')
).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------