In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType, DateType
import sys
import os
from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException
from delta.tables import *
import io
import json

In [2]:
def create_spark_session():
    spark_packages_list = [
            'io.delta:delta-core_2.12:2.4.0',
            'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0',
            'org.mongodb.spark:mongo-spark-connector:10.0.2'
        ]
    warehouse_location = '/mnt/datalake/warehouse'
    #bin/spark-sql --packages io.delta:delta-core_2.12:2.4.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"

    spark_packages = ",".join(spark_packages_list)
#     derby_location = '/mnt/datalake/derby'
#     .config("spark.driver.extraJavaOptions", f"Dderby.system.home='{derby_location}'") \
    return SparkSession \
        .builder \
        .appName("File Streaming Demo") \
        .master("local[3]") \
        .config("spark.databricks.delta.schema.autoMerge.enabled", "true")\
        .config("spark.sql.warehouse.dir", warehouse_location) \
        .config("spark.jars.packages", spark_packages) \
        .config("spark.streaming.stopGracefullyOnShutdown", "true") \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
        .config("delta.deletedFileRetentionDuration",7)\
        .config("spark.databricks.delta.retentionDurationCheck.enabled","false") \
        .config("spark.databricks.delta.schema.autoMerge.enabled","true") \
        .config("spark.sql.legacy.allowNonEmptyLocationInCTAS","true")\
        .config("hive.stats.jdbc.timeout", 30) \
        .config("hive.stats.retries.wait", 3000) \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .enableHiveSupport()\
        .getOrCreate()

In [3]:
spark = create_spark_session()



:: loading settings :: url = jar:file:/usr/local/spark-3.4.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.mongodb.spark#mongo-spark-connector added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2b90c6c1-46de-4efc-a149-b127b84419cd;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;

O catálogo de metadados do Spark pode ser acessado pelo objeto

`SparkSession.catalog` 

As principais funcionalidades são:

* `listDatabases()`: lista todas os databases disponíveis;
* `listTables()`: lista todas as tabelas disponíveis em um determinado database;
* `listFucntions()`: lista as funções disponíveis em um determinado database;
* `refreshTable()`: atualiza os metadados de uma determinada tabela
* `uncacheTable()`: remove uma tabela salva em memória
* `clearCache()`: remove todas as tabelas salvas em memória

In [4]:
spark.catalog.listDatabases()

24/01/19 23:35:31 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/01/19 23:35:31 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/01/19 23:36:05 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/01/19 23:36:05 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore jovyan@172.22.0.2
24/01/19 23:36:05 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException


[Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='file:/mnt/datalake/warehouse')]

In [5]:
spark.catalog.listTables('default')

24/01/19 23:36:12 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


[]

### Show e Create Databases

Os databases do Spark são uma ferramenta para organizar tabelas. Eles podem e devem ser vistos como algo muito próximo dos databases de servidores de bancos de dados relacionais. O Spark utiliza por padrão um database chamado default, que serve para criar tabelas, views e realizar consultas caso o usuário não tenha definido o seu próprio. Um ponto importante é que essas estruturas persistem em diferentes sessões: se o usuário mudar de database, todas as tabelas permanecerão no database anterior e vão precisar ser consultadas de maneira diferente.

Existem alguns comandos do SQL importantes na hora de se trabalhar com databases. Else são:

* `SHOW DATABASES`: lista todas os databases disponíveis, de forma análoga ao Catalog ;
* `CREATE DATABASE <nome_do_db>`: cria um database
* `USE <nome_do_db>`: define o database como o atual para a realização de queries
    * **Obs**: ao se mudar de database, é possível acessar tabelas de um database anterior usando o prefixo “nome_do_db.” antes do nome da tabela. Exemplo:
        ```
        USE db2
        SELECT * FROM db1.table
        ```
* `SELECT current_database()`: retorna qual o database definido como o atual
* `DROP DATABASE IF EXISTS <nome_do_db>`: deleta determinado database dentre aqueles que foram definidos. Atenção: nunca delete o database default do Spark.


In [6]:
spark.sql("""
CREATE DATABASE IF NOT EXISTS department;
""")

24/01/19 23:36:13 WARN ObjectStore: Failed to get database department, returning NoSuchObjectException
24/01/19 23:36:13 WARN ObjectStore: Failed to get database department, returning NoSuchObjectException
24/01/19 23:36:13 WARN ObjectStore: Failed to get database department, returning NoSuchObjectException
chgrp: changing ownership of 'file:///mnt/datalake/warehouse/department.db': chown: changing group of '/mnt/datalake/warehouse/department.db': Operation not permitted


DataFrame[]

In [7]:
spark.catalog.listDatabases()

[Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='file:/mnt/datalake/warehouse'),
 Database(name='department', catalog='spark_catalog', description='', locationUri='file:/mnt/datalake/warehouse/department.db')]

In [8]:
spark.catalog.listTables('department')

[]

In [11]:
path_countries = '../../datalake/transient/departments/countries/'
df_countries = spark.read.format('csv')\
.option("header", True)\
.option("sep", ",")\
.option("quote","\'")\
.option("inferSchema",True)\
.load(path_countries)

In [14]:
path_regions = '../../datalake/transient/departments/regions/'
df_regions = spark.read.format('csv')\
.option("header", True)\
.option("sep", ",")\
.option("quote","\'")\
.option("inferSchema",True)\
.load(path_regions)

### Tables

* **Managed Tables**: o Spark administra tanto os dados quanto os metadados das tabelas, de forma que operações como DROP TABLE afetam também os dados escritos em disco;
* **Unmanaged Tables**: o Spark administra somente os metadados da tabela, e os dados escritos em disco não são alterados em nenhum momento

In [15]:
spark.sql("""
USE department;
""")

DataFrame[]

In [16]:
spark.catalog.listTables('department')

[]

**Criando Unmanaged Tables**

In [17]:
df_regions.write.option('path', '../../datalake/bronze/sql_db/departments/regions').saveAsTable("regions")

24/01/19 23:39:09 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
24/01/19 23:39:09 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
24/01/19 23:39:09 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/01/19 23:39:09 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
chgrp: changing ownership of 'file:///mnt/datalake/datalake': chown: changing group of '/mnt/datalake/datalake': Operation not permitted


In [20]:
spark.catalog.listTables('department')

[Table(name='regions', catalog='spark_catalog', namespace=['department'], description=None, tableType='EXTERNAL', isTemporary=False)]

In [21]:
spark.sql("""
select * from regions limit 5;
""").show()   

+---------+-----------+
|region_id|region_name|
+---------+-----------+
+---------+-----------+



In [22]:
spark.sql("""
DROP TABLE regions;
""")


DataFrame[]

In [23]:
spark.catalog.listTables('department')

[]

In [24]:
! ls -la ../../datalake/bronze/sql_db/departments/regions/

total 20
drwxr-xr-x 2 jovyan users 4096 Jan 19 23:39 .
drwxr-xr-x 3 jovyan users 4096 Jan 19 23:39 ..
-rw-r--r-- 1 jovyan users  823 Jan 19 23:39 part-00000-cf1ce8a0-9a75-4327-95bd-b42cba29349e-c000.snappy.parquet
-rw-r--r-- 1 jovyan users   16 Jan 19 23:39 .part-00000-cf1ce8a0-9a75-4327-95bd-b42cba29349e-c000.snappy.parquet.crc
-rw-r--r-- 1 jovyan users    0 Jan 19 23:39 _SUCCESS
-rw-r--r-- 1 jovyan users    8 Jan 19 23:39 ._SUCCESS.crc


In [25]:
spark.sql("""
select * from regions limit 5;
""").show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `regions` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 2 pos 14;
'GlobalLimit 5
+- 'LocalLimit 5
   +- 'Project [*]
      +- 'UnresolvedRelation [regions], [], false


**Criando Managed Tables**

In [26]:
df_regions.write.saveAsTable("regions")

In [27]:
! ls -la ../../datalake/warehouse/department.db/regions/

total 20
drwxr-xr-x 2 jovyan users 4096 Jan 19 23:40 .
drwxr-xr-x 3 jovyan users 4096 Jan 19 23:40 ..
-rw-r--r-- 1 jovyan users  823 Jan 19 23:40 part-00000-f89201d6-0120-467c-aeed-c07e66df8de0-c000.snappy.parquet
-rw-r--r-- 1 jovyan users   16 Jan 19 23:40 .part-00000-f89201d6-0120-467c-aeed-c07e66df8de0-c000.snappy.parquet.crc
-rw-r--r-- 1 jovyan users    0 Jan 19 23:40 _SUCCESS
-rw-r--r-- 1 jovyan users    8 Jan 19 23:40 ._SUCCESS.crc


In [28]:
spark.sql("""
DROP TABLE regions;
""")


DataFrame[]

In [31]:
! ls -la ../../datalake/warehouse/department.db/regions

ls: cannot access '../../datalake/warehouse/department.db/regions': No such file or directory


In [32]:
df_countries.printSchema()

root
 |-- country_id: string (nullable = true)
 |-- country_name: string (nullable = true)
 |-- region_id: integer (nullable = true)



**Criando Views**

In [33]:
df_countries.createOrReplaceTempView('countries_view')

In [34]:
spark.sql("""
select * from countries_view;
""").show()


+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|        AR|   Argentina|        2|
|        AU|   Australia|        3|
|        BE|     Belgium|        1|
|        BR|      Brazil|        2|
|        CA|      Canada|        2|
|        CH| Switzerland|        1|
|        CN|       China|        3|
|        DE|     Germany|        1|
|        DK|     Denmark|        1|
|        EG|       Egypt|        4|
|        FR|      France|        1|
|        HK|    HongKong|        3|
|        IL|      Israel|        4|
|        IN|       India|        3|
|        IT|       Italy|        1|
|        JP|       Japan|        3|
|        KW|      Kuwait|        4|
|        MX|      Mexico|        2|
|        NG|     Nigeria|        4|
|        NL| Netherlands|        1|
+----------+------------+---------+
only showing top 20 rows



In [35]:
df_countries.createOrReplaceGlobalTempView('countries_global_view')

**Utilizando a interface SQL**

In [37]:
spark.sql("""
  DROP TABLE countries ;
""")

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `spark_catalog`.`department`.`countries` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.

In [None]:
spark.sql("""
  CREATE TABLE countries (
  country_id STRING, 
  country_name STRING,
  region_id INTEGER
) 
""")

In [None]:
spark.catalog.listTables('department')

In [None]:
### inserindo registros na tabela

In [None]:
spark.sql(""" 
            insert into department.countries values
            ('100','Never Land',99)
        """)

In [None]:
spark.sql("SELECT * FROM department.countries limit 5").show()

In [None]:
spark.sql(""" 
        insert into countries
        select        
              country_id , 
              country_name ,       
              region_id      
        from countries_view
        """)

In [None]:
spark.sql("SELECT * FROM department.countries limit 50").show()

In [None]:
spark.sql('SHOW TABLES;').show()

In [None]:
spark.catalog.listDatabases()

In [None]:
spark.catalog.listTables('department')