In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType, DateType
import sys
import os
from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException
from delta.tables import *
import io
import json

In [0]:
def create_spark_session():
    return  SparkSession.builder\
    .config('spark.executor.memory', '8G')\
.   getOrCreate()

In [0]:
spark = create_spark_session()

O catálogo de metadados do Spark pode ser acessado pelo objeto

`SparkSession.catalog` 

As principais funcionalidades são:

* `listDatabases()`: lista todas os databases disponíveis;
* `listTables()`: lista todas as tabelas disponíveis em um determinado database;
* `listFucntions()`: lista as funções disponíveis em um determinado database;
* `refreshTable()`: atualiza os metadados de uma determinada tabela
* `uncacheTable()`: remove uma tabela salva em memória
* `clearCache()`: remove todas as tabelas salvas em memória

In [0]:
spark.catalog.listDatabases()

Out[6]: [Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='dbfs:/user/hive/warehouse')]

In [0]:
spark.catalog.listTables('default')

Out[7]: []

### Show e Create Databases

Os databases do Spark são uma ferramenta para organizar tabelas. Eles podem e devem ser vistos como algo muito próximo dos databases de servidores de bancos de dados relacionais. O Spark utiliza por padrão um database chamado default, que serve para criar tabelas, views e realizar consultas caso o usuário não tenha definido o seu próprio. Um ponto importante é que essas estruturas persistem em diferentes sessões: se o usuário mudar de database, todas as tabelas permanecerão no database anterior e vão precisar ser consultadas de maneira diferente.

Existem alguns comandos do SQL importantes na hora de se trabalhar com databases. Else são:

* `SHOW DATABASES`: lista todas os databases disponíveis, de forma análoga ao Catalog ;
* `CREATE DATABASE <nome_do_db>`: cria um database
* `USE <nome_do_db>`: define o database como o atual para a realização de queries
    * **Obs**: ao se mudar de database, é possível acessar tabelas de um database anterior usando o prefixo “nome_do_db.” antes do nome da tabela. Exemplo:
        ```
        USE db2
        SELECT * FROM db1.table
        ```
* `SELECT current_database()`: retorna qual o database definido como o atual
* `DROP DATABASE IF EXISTS <nome_do_db>`: deleta determinado database dentre aqueles que foram definidos. Atenção: nunca delete o database default do Spark.


In [0]:
spark.sql("""
CREATE DATABASE IF NOT EXISTS department;
""")

Out[8]: DataFrame[]

In [0]:
spark.catalog.listDatabases()

Out[9]: [Database(name='default', catalog='spark_catalog', description='Default Hive database', locationUri='dbfs:/user/hive/warehouse'),
 Database(name='department', catalog='spark_catalog', description='', locationUri='dbfs:/user/hive/warehouse/department.db')]

In [0]:
spark.catalog.listTables('department')

Out[11]: []

In [0]:
path_countries = '/FileStore/transient/departments/countries'
df_countries = spark.read.format('csv')\
.option("header", True)\
.option("sep", ",")\
.option("quote","\'")\
.option("inferSchema",True)\
.load(path_countries)

In [0]:
path_regions = '/FileStore/transient/departments/regions'
df_regions = spark.read.format('csv')\
.option("header", True)\
.option("sep", ",")\
.option("quote","\'")\
.option("inferSchema",True)\
.load(path_regions)

### Tables

* **Managed Tables**: o Spark administra tanto os dados quanto os metadados das tabelas, de forma que operações como DROP TABLE afetam também os dados escritos em disco;
* **Unmanaged Tables**: o Spark administra somente os metadados da tabela, e os dados escritos em disco não são alterados em nenhum momento

In [0]:
spark.sql("""
USE department;
""")

Out[14]: DataFrame[]

In [0]:
spark.catalog.listTables('department')

Out[15]: []

**Criando Unmanaged Tables**

In [0]:
df_regions.write.option('path', '/FileStore/transient/bronze/sql_db/departments/regions').saveAsTable("regions")

In [0]:
spark.catalog.listTables('department')

Out[17]: [Table(name='regions', catalog='spark_catalog', namespace=['department'], description=None, tableType='EXTERNAL', isTemporary=False)]

In [0]:
spark.sql("""
select * from regions limit 5;
""").show()   

+---------+--------------------+
|region_id|         region_name|
+---------+--------------------+
|        1|              Europe|
|        2|            Americas|
|        3|                Asia|
|        4|Middle East and A...|
+---------+--------------------+



In [0]:
spark.sql("""
DROP TABLE regions;
""")


Out[19]: DataFrame[]

In [0]:
spark.catalog.listTables('department')

Out[20]: []

In [0]:
dbutils.fs.ls('dbfs:/FileStore/transient/sql_db/departments/regions/')


[0;31m---------------------------------------------------------------------------[0m
[0;31mExecutionError[0m                            Traceback (most recent call last)
File [0;32m<command-582003277780622>:1[0m
[0;32m----> 1[0m [43mdbutils[49m[38;5;241;43m.[39;49m[43mfs[49m[38;5;241;43m.[39;49m[43mls[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mdbfs:/FileStore/transient/sql_db/departments/regions/[39;49m[38;5;124;43m'[39;49m[43m)[49m

File [0;32m/databricks/python_shell/dbruntime/dbutils.py:362[0m, in [0;36mDBUtils.FSHandler.prettify_exception_message.<locals>.f_with_exception_handling[0;34m(*args, **kwargs)[0m
[1;32m    360[0m exc[38;5;241m.[39m__context__ [38;5;241m=[39m [38;5;28;01mNone[39;00m
[1;32m    361[0m exc[38;5;241m.[39m__cause__ [38;5;241m=[39m [38;5;28;01mNone[39;00m
[0;32m--> 362[0m [38;5;28;01mraise[39;00m exc

[0;31mExecutionError[0m: An error occurred while calling o937.ls.
: java.io.FileNotFoundException: /FileS

In [0]:
spark.sql("""
select * from regions limit 5;
""").show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-582003277780623>:1[0m
[0;32m----> 1[0m [43mspark[49m[38;5;241;43m.[39;49m[43msql[49m[43m([49m[38;5;124;43m"""[39;49m
[1;32m      2[0m [38;5;124;43mselect * from regions limit 5;[39;49m
[1;32m      3[0m [38;5;124;43m"""[39;49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlo

**Criando Managed Tables**

In [0]:
df_regions.write.saveAsTable("regions")

In [0]:
spark.sql("""
select * from regions limit 5;
""").show()

+---------+--------------------+
|region_id|         region_name|
+---------+--------------------+
|        1|              Europe|
|        2|            Americas|
|        3|                Asia|
|        4|Middle East and A...|
+---------+--------------------+



In [0]:
spark.sql("""
DROP TABLE regions;
""")


Out[29]: DataFrame[]

**Criando Views**

In [0]:
df_countries.createOrReplaceTempView('countries_view')

In [0]:
spark.sql("""
select * from countries_view;
""").show()


+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|        AR|   Argentina|        2|
|        AU|   Australia|        3|
|        BE|     Belgium|        1|
|        BR|      Brazil|        2|
|        CA|      Canada|        2|
|        CH| Switzerland|        1|
|        CN|       China|        3|
|        DE|     Germany|        1|
|        DK|     Denmark|        1|
|        EG|       Egypt|        4|
|        FR|      France|        1|
|        HK|    HongKong|        3|
|        IL|      Israel|        4|
|        IN|       India|        3|
|        IT|       Italy|        1|
|        JP|       Japan|        3|
|        KW|      Kuwait|        4|
|        MX|      Mexico|        2|
|        NG|     Nigeria|        4|
|        NL| Netherlands|        1|
+----------+------------+---------+
only showing top 20 rows



In [0]:
df_countries.createOrReplaceGlobalTempView('countries_global_view')

**Utilizando a interface SQL**

In [0]:
spark.sql("""
  DROP TABLE countries ;
""")

Out[39]: DataFrame[]

In [0]:
spark.sql("""
  CREATE TABLE countries (
  country_id STRING, 
  country_name STRING,
  region_id INTEGER
) ;
""")

Out[40]: DataFrame[]

In [0]:
spark.catalog.listTables('department')

Out[41]: [Table(name='countries', catalog='spark_catalog', namespace=['department'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='countries_view', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [0]:
### inserindo registros na tabela

In [0]:
spark.sql(""" 
            insert into department.countries values
            ('100','Never Land',99)
        """)

Out[42]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
spark.sql("SELECT * FROM department.countries limit 5").show()

+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|       100|  Never Land|       99|
+----------+------------+---------+



In [0]:
spark.sql(""" 
        insert into countries
        select        
              country_id , 
              country_name ,       
              region_id      
        from countries_view
        """)

Out[44]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
spark.sql("SELECT * FROM department.countries limit 50").show()

+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|        AR|   Argentina|        2|
|        AU|   Australia|        3|
|        BE|     Belgium|        1|
|        BR|      Brazil|        2|
|        CA|      Canada|        2|
|        CH| Switzerland|        1|
|        CN|       China|        3|
|        DE|     Germany|        1|
|        DK|     Denmark|        1|
|        EG|       Egypt|        4|
|        FR|      France|        1|
|        HK|    HongKong|        3|
|        IL|      Israel|        4|
|        IN|       India|        3|
|        IT|       Italy|        1|
|        JP|       Japan|        3|
|        KW|      Kuwait|        4|
|        MX|      Mexico|        2|
|        NG|     Nigeria|        4|
|        NL| Netherlands|        1|
+----------+------------+---------+
only showing top 20 rows



In [0]:
spark.sql('SHOW TABLES;').show()

+----------+--------------+-----------+
|  database|     tableName|isTemporary|
+----------+--------------+-----------+
|department|     countries|      false|
|          |countries_view|       true|
+----------+--------------+-----------+



In [0]:
spark.catalog.listDatabases()

In [0]:
spark.catalog.listTables('department')

Out[47]: [Table(name='countries', catalog='spark_catalog', namespace=['department'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='countries_view', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]