# Spark Schema Definition

In [1]:
# Prerequisites
from pyspark.sql import SparkSession 
from pyspark.sql.types import *

In [2]:
# Start a Spark session against the "local" master, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
print("Spark Version: ", spark.version)

Spark Version:  3.5.0


### Define Schema Programmatically

In [3]:
# Build the schema in code, one field at a time.
# Every column is declared non-nullable (nullable=False).
schema_A = (
    StructType()
    .add("author", StringType(), nullable=False)
    .add("title", StringType(), nullable=False)
    .add("pages", IntegerType(), nullable=False)
)

# Small in-memory sample: one [author, title, pages] row per book.
data_A = [
    ["George Orwell", "1984", 352],
    ["Jane Austen", "Pride and Prejudice", 456],
    ["Harper Lee", "To Kill a Mockingbird", 399],
    ["Herman Melville", "Moby-Dick", 511],
]

df_books_A = spark.createDataFrame(data=data_A, schema=schema_A)
df_books_A.printSchema()


root
 |-- author: string (nullable = false)
 |-- title: string (nullable = false)
 |-- pages: integer (nullable = false)



In [4]:
# Print the rows without truncating long column values.
df_books_A.show(n=20, truncate=False)

+---------------+---------------------+-----+
|author         |title                |pages|
+---------------+---------------------+-----+
|George Orwell  |1984                 |352  |
|Jane Austen    |Pride and Prejudice  |456  |
|Harper Lee     |To Kill a Mockingbird|399  |
|Herman Melville|Moby-Dick            |511  |
+---------------+---------------------+-----+



### Define Schema with Data Definition Language (DDL)

In [5]:
# The same book table, with the schema written as a DDL string.
# Columns declared this way come out nullable, as the printed schema shows.
schema_B = "author STRING, title STRING, pages INT"

# One [author, title, pages] row per book.
data_B = [
    ["George Orwell", "1984", 352],
    ["Jane Austen", "Pride and Prejudice", 456],
    ["Harper Lee", "To Kill a Mockingbird", 399],
    ["Herman Melville", "Moby-Dick", 511],
]

df_books_B = spark.createDataFrame(data=data_B, schema=schema_B)
df_books_B.printSchema()

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- pages: integer (nullable = true)



In [6]:
# Print the rows without truncating long column values.
df_books_B.show(n=20, truncate=False)

+---------------+---------------------+-----+
|author         |title                |pages|
+---------------+---------------------+-----+
|George Orwell  |1984                 |352  |
|Jane Austen    |Pride and Prejudice  |456  |
|Harper Lee     |To Kill a Mockingbird|399  |
|Herman Melville|Moby-Dick            |511  |
+---------------+---------------------+-----+



### More complex Schema

In [7]:
# A wider DDL schema: back-quoted column names plus an array-of-strings column.
# The literal is split across two lines; the pieces concatenate to one DDL string.
schema_C = (
    "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, "
    "`Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
)

# One row per blog author: [Id, First, Last, Url, Published, Hits, Campaigns].
data_C = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

df_blogs = spark.createDataFrame(data=data_C, schema=schema_C)
df_blogs.printSchema()



root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [8]:
# Print the rows without truncating long column values.
df_blogs.show(n=20, truncate=False)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+



### Read JSON Data with an Explicit Schema

In [11]:

# Load the blog records from a JSON file, applying the schema_C DDL string
# defined in an earlier cell.
df_blogs_2 = (
    spark.read
    .schema(schema_C)
    .json("data/blogs.json")
)

df_blogs_2.printSchema()


root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [21]:
# Print the rows without truncating long column values.
df_blogs_2.show(n=20, truncate=False)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+

