In [68]:
from pyspark.sql import SparkSession
import json
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [5]:
spark = SparkSession.builder.getOrCreate()

In [8]:
shows = spark.read.json("../Data/shows/shows-silicon-valley.json")

In [28]:
array_subset = shows.select("name","genres")

array_subset = array_subset.select(
    "name",
    array_subset.genres[0].alias("dot and index"),
    F.col("genres")[0].alias("col and index"),
    array_subset.genres.getItem(0).alias("method and index"),
    F.col("genres").getItem(0).alias("col and item")
    
)

array_subset.show()


+--------------+-------------+-------------+----------------+------------+
|          name|dot and index|col and index|method and index|col and item|
+--------------+-------------+-------------+----------------+------------+
|Silicon Valley|       Comedy|       Comedy|          Comedy|      Comedy|
+--------------+-------------+-------------+----------------+------------+



In [47]:
array_subset_repeated = array_subset.select(
    "name",
    F.lit("Comedy").alias("One"),
    F.lit("Horror").alias("Two"),
    F.lit("Drama").alias("Three"),
    F.col("Dot and index")
).select(
    "name",
    F.array("One", "Two", "Three").alias("Some geners"),
    F.array_repeat("Dot and index",5).alias("Repeted geners")
).select(
    "Name",
    F.size("Some geners").alias("Some geners count"),
    F.array_distinct("Repeted geners")[0].alias("Distinct")
)

array_subset_repeated.show(1,False)

+--------------+-----------------+--------+
|Name          |Some geners count|Distinct|
+--------------+-----------------+--------+
|Silicon Valley|3                |Comedy  |
+--------------+-----------------+--------+



In [51]:
shows.select("schedule").printSchema()
shows.select("schedule").show()

root
 |-- schedule: struct (nullable = true)
 |    |-- days: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- time: string (nullable = true)

+-----------------+
|         schedule|
+-----------------+
|{[Sunday], 22:00}|
+-----------------+



In [53]:
shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [60]:
shows_clean = shows.withColumn(
    "episode",
    F.col("_embedded.episodes")).drop("_embedded")

shows_clean.show(1,False)
shows_clean.printSchema()

+-----------------------------------------------------------------------------+--------------------------+--------+---+---------------------------------------------------------------------------------------------------------------------------------------------------+--------+--------------+-----------------------------------------------+----------------------------------+----------+------+-------+-----------------+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+----------+----------------------------------------------+----------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [67]:
episode_name = shows_clean.select(F.col("episode.name"))
episode_name.printSchema()
episode_name.select(F.explode("name").alias("name")).show(5,False)

root
 |-- name: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------------------------+
|name                     |
+-------------------------+
|Minimum Viable Product   |
|The Cap Table            |
|Articles of Incorporation|
|Fiduciary Duties         |
|Signaling Risk           |
+-------------------------+
only showing top 5 rows



In [70]:
episode_link_schema = T.StructType(
    "self", T.StructType((T.StructField("href", T.StringType()))))

TypeError: 'StructField' object is not iterable