In [None]:
# Bootstrap Spark for this notebook.
# NOTE: the order matters here - findspark.init() is called before pyspark is
# imported (presumably so the local Spark installation can be found; see the
# findspark docs).
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# `spark` is the session object the rest of the notebook uses (spark.sql(...)).
spark = SparkSession.builder.getOrCreate()

### Load the sparksql magic to execute SQL with Spark

In [2]:
# Load the sparksql extension; the %%sparksql cells in this notebook rely on it.
%load_ext sparksql

In [4]:
%%sparksql
-- Demo table with nested STRUCT / ARRAY / MAP columns, queried by the cells
-- that follow. IF NOT EXISTS keeps this cell re-runnable: without it a second
-- execution fails because the table already exists.
CREATE TABLE IF NOT EXISTS student (
    id INT,
    name STRING,
    `nestedwithspaces` STRUCT<`sub field`:STRUCT<`sub field2`:STRING>>,
    age INT,
    books ARRAY<STRUCT<`title`:STRING, `chapters`:ARRAY<STRUCT<`paragraph`:STRING>>>>,
    struct_col STRUCT<`address`:STRUCT<`streetName`:STRING, `streetNumber`:BIGINT>>,
    map_col MAP<STRING, MAP<STRING, STRUCT<`start`:BIGINT,`end`:BIGINT>>>
) USING PARQUET

### Compose and execute a query using PySpark SQL

In [None]:
%%sparksql

-- `books` is an ARRAY of structs, so it can only be subscripted by position,
-- and map_keys() only accepts a MAP. Filter on the book titles with
-- array_contains() and use the same chapters[3].paragraph expression as the
-- Python-string example later in this notebook.
SELECT
    s.books.chapters[3].paragraph AS a,
    s.nestedwithspaces.`sub field`.`sub field2`
FROM
    student AS s
WHERE
    array_contains(s.books.title, 'The_Odyssey')

In [14]:
%%sparksql

-- Book titles plus the full `books` array for every student that has a book
-- titled 'x'. (`books` is the only column of `student` starting with "boo".)
SELECT
    s.books.title,
    s.books
FROM
    student AS s
WHERE
    array_contains(s.books.title, 'x')

0
title


### Compose and syntax-highlight SQL in a Python string

In [21]:
# SQL kept in a plain Python string. The --start-sql-syntax / --end-sql-syntax
# lines are ordinary SQL comments inside the literal; presumably they mark the
# region the editor should highlight as SQL - confirm against the extension docs.
sql_statement = '''
--start-sql-syntax
SELECT
    s.books.chapters[3].paragraph as a,
    s.id
FROM
    student AS s
WHERE
    array_contains(s.books.title, 'The_Odyssey')
--end-sql-syntax
'''

# Echo the statement so its exact text is visible in the cell output.
print(sql_statement)


--start-sql-syntax
SELECT
    s.books.chapters[3].paragraph as a,
    s.id
FROM
    student AS s
WHERE
    array_contains(s.books.title, 'The_Odyssey')
--end-sql-syntax



In [22]:
# Execute the SQL string through the SparkSession and print the result as a text table.
spark.sql(sql_statement).show()

+---+---+
|  a| id|
+---+---+
+---+---+



In [None]:
%%sparksql
-- Higher-order function with a one-argument lambda: adds 1 to every element
-- of the literal array. The select list must not end with a comma, and the
-- table name has to be spelled out in full.
SELECT
    transform(array(1, 2, 3), x -> x + 1)
FROM
    student

In [None]:
%%sparksql
-- Two-argument lambda form of transform(): x is the element, i its index.
SELECT transform(array(1, 2, 3), (x, i) -> x + i)
FROM student

In [None]:
%%sparksql
-- Pivot on (name, age) pairs, producing SUM(age) and AVG(class) per pair.
-- No trailing ';' - the other cells omit it and not every Spark parser
-- accepts a statement terminator in spark.sql().
-- NOTE(review): `person` is not created anywhere in the visible part of this
-- notebook - confirm the table exists before running this cell.
SELECT * FROM person
    PIVOT (
        SUM(age) AS a, AVG(class) AS c
        FOR (name, age) IN (('John', 30) AS c1, ('Mike', 40) AS c2)
    )

In [None]:
%%sparksql
-- Function-call form DATE(...) applied to a string literal.
SELECT DATE('2020-01-01')

In [None]:
%%sparksql
-- Literal forms: a string cast to INT, a hexadecimal binary literal, NULL,
-- and a BIGINT literal (L suffix). Each output column has its own alias so
-- the result has no duplicate or auto-generated column names.
SELECT
    CAST('12' AS INT) AS int_col,
    X'123456' AS binary_col,
    NULL AS null_col,
    9223372036854775807L AS bigint_col

In [None]:
%%sparksql

-- Common table expression whose columns are named in the CTE header (x, y)
WITH t(x, y) AS (
    SELECT 1, 2
)
SELECT *
FROM t
WHERE x = 1
  AND y = 2