JupyterLab allows writing SQL queries directly in-cell, thanks to the `%sparksql` magic command (use two percent signs `%%sparksql` to span code over multiple lines). An amazing feature is that PySpark can also interact with this environment. This means all local files can be read as Hive tables.

In [1]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
# Build a SparkSession with Hive support enabled, reusing an existing session
# if one is already running (getOrCreate).
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# Enable eager evaluation in the REPL -- presumably so DataFrames are rendered
# as tables in notebook output without an explicit .show(); confirm against
# the Spark configuration docs.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
%load_ext sparksql_magic

# 1. Managing tables

## 1.1. Creating tables

#### Manually creating tables

In [9]:
%%sparksql

-- Create tbl_product (only if it does not already exist) as a Parquet table
-- stored under data/spark_db/tbl_product with snappy compression.
-- The trailing SELECT supplies the rows: this is the CREATE TABLE ... AS SELECT
-- form with the AS keyword left out.
CREATE TABLE IF NOT EXISTS tbl_product
STORED AS PARQUET
LOCATION 'data/spark_db/tbl_product'
TBLPROPERTIES ('parquet.compression'='snappy')

-- Inline sample rows; price is given as a string literal and is NULL for 'USB'.
SELECT *
FROM VALUES
    ('Laptop', '1000', 15),
    ('Mouse', '20', 100),
    ('Headphone', '50', 50),
    ('USB', NULL, 100)
AS (product, price, stock)

In [10]:
%%sparksql

-- List the tables visible in the current database.
SHOW TABLES

0,1,2
database,tableName,isTemporary
default,tbl_product,False
default,test,False


In [11]:
%%sparksql

-- Show the columns of table `test` followed by its detailed table metadata.
DESC FORMATTED test

0,1,2
col_name,data_type,comment
product,string,
price,string,
stock,bigint,
,,
# Detailed Table Information,,
Database,default,
Table,test,
Owner,hungpq,
Created Time,Wed Sep 15 00:05:53 ICT 2021,


In [15]:
%%sparksql

-- Drop the tmp2 table.
DROP TABLE tmp2

In [10]:
%%sparksql

-- List the available databases (namespaces).
SHOW DATABASES

0
namespace
default


In [9]:
%%sparksql

-- Return every row of tmp2.
SELECT *
FROM tmp2

0
a
1


In [4]:
%%sparksql

-- Drop the tbl_product table.
DROP TABLE tbl_product

In [3]:
%%sparksql

-- Declare tbl_product with an explicit, commented schema (no rows are inserted
-- by this statement). The table is stored as Parquet under
-- data/spark_db/tbl_product with snappy compression; IF NOT EXISTS makes the
-- statement a no-op when the table is already defined.
CREATE TABLE IF NOT EXISTS tbl_product (
    product STRING COMMENT 'name of product',
    price INT COMMENT 'price of product',
    stock INT COMMENT 'number of products left'
)
STORED AS PARQUET
LOCATION 'data/spark_db/tbl_product'
TBLPROPERTIES ('parquet.compression'='snappy')

In [30]:
%%sparksql

-- Return every row and column of tbl_product.
SELECT *
FROM tbl_product

0,1,2
product,price,stock
Headphone,50,50
Laptop,1000,15
Mouse,20,100
USB,,100


In [40]:
%%sparksql

-- Declare tbl_test with the product/price/stock schema, stored as Parquet
-- with snappy compression.
-- NOTE(review): LOCATION is the tbl_product directory, not a tbl_test one, so
-- this table would sit on top of tbl_product's files -- confirm this is
-- intentional and not a copy-paste leftover.
CREATE TABLE IF NOT EXISTS tbl_test (
    product STRING COMMENT 'name of product',
    price INT COMMENT 'price of product',
    stock INT COMMENT 'number of products left'
)
STORED AS PARQUET
LOCATION 'data/spark_db/tbl_product'
TBLPROPERTIES ('parquet.compression' = 'snappy')

## 1.2. Importing local files

In [None]:
%%sparksql --cache --view result df

In [7]:
# Column names, in the same order as the fields of each row tuple below.
columns = ['product', 'price', 'stock']

# Sample rows: price is a '$'-prefixed string and is missing (None) for 'USB'.
data = [
    ('Laptop', '$1000', 15),
    ('Mouse', '$20', 100),
    ('Headphone', '$50', 50),
    ('USB', None, 100),
]

# Build a DataFrame from the in-memory rows, naming the columns explicitly.
df = spark.createDataFrame(data, schema=columns)

In [8]:
# Save df as the table `test`: Parquet files with snappy compression written
# under data/spark_db/test, replacing whatever the table held before.
(
    df.write
    .format('parquet')
    .option('path', 'data/spark_db/test')
    .option('compression', 'snappy')
    .mode('overwrite')
    .saveAsTable('test')
)

#### Create tables

In [19]:
%%sparksql

-- Preview the inline sample rows without creating any table.
-- price is a '$'-prefixed string here and NULL for 'USB'.
SELECT *
FROM VALUES
    ('Laptop', '$1000', 15),
    ('Mouse', '$20', 100),
    ('Headphone', '$50', 50),
    ('USB', NULL, 100)
AS (product, price, stock)

0,1,2
product,price,stock
Laptop,$1000,15
Mouse,$20,100
Headphone,$50,50
USB,,100


#### Save tables

In [8]:
%%sparksql
-- List the tables visible in the current database.
SHOW TABLES

0,1,2
database,tableName,isTemporary
default,tmp2,False


In [7]:
%%sparksql
-- Create tmp2 from a one-row query with a single column a = 1.
CREATE TABLE tmp2 AS (SELECT 1 AS a)

In [32]:
%%sparksql

-- Define the temporary view tmp exposing a single column pi.
CREATE TEMPORARY VIEW tmp AS (SELECT pi() AS pi)
-- The view's query on its own would be: SELECT pi() AS pi

In [11]:
%%sparksql
-- Evaluate pi() and return it as a one-row result named pi.
SELECT pi() AS pi

0
pi
3.141592653589793


In [4]:
%%sparksql
-- List the tables visible in the current database.
SHOW TABLES

0,1,2
database,tableName,isTemporary


In [44]:
%%sparksql

-- Show the columns of tmp2 followed by its detailed metadata.
DESC FORMATTED tmp2

0,1,2
col_name,data_type,comment
a,int,
,,
# Detailed Table Information,,
Database,default,
Table,tmp2,
Created Time,Sat Sep 11 00:51:08 ICT 2021,
Last Access,UNKNOWN,
Created By,Spark 3.1.2,
Type,VIEW,
