In [1]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Install dependencies

import sys
!{sys.executable} -m pip install hdfs

/opt/conda/bin/python: No module named pip install hdfs


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/Gold'

# Session builder: Hive metastore for the catalog plus the Delta Lake SQL
# extension and catalog. The chain is parenthesized so no backslash
# continuation can silently break when a line is added, removed or reordered.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# NOTE(review): configure_spark_with_delta_pip may set spark.jars.packages
# itself from the installed delta-spark version, overriding the explicit
# 1.0.0 coordinate above - confirm the two versions agree.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# Rebuild the gold database from scratch: drop it (CASCADE also drops its
# tables), then create it again at its explicit HDFS location.
drop_database_sql = """
    DROP DATABASE IF EXISTS ProjetoGold CASCADE
    """
create_database_sql = """
    create database ProjetoGold location 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db'
    """

spark.sql(drop_database_sql)
spark.sql(create_database_sql)

DataFrame[]

In [5]:
# Remove any previous definition of the gold table before re-declaring it.
drop_gold_table_sql = """
    DROP TABLE IF EXISTS ProjetoGold.gold_ResgatesAnimais
    """

# Declare the gold table as a Delta table stored at an explicit HDFS location.
create_gold_table_sql = """
    CREATE EXTERNAL TABLE ProjetoGold.gold_ResgatesAnimais (
        Date_and_Time_of_initial_call string,
		Date_and_time_of_Ranger_response string,
		Property string,
		Location string,
		Call_Source string,
		Species_Status string,
		Animal_Condition string,
		Duration_of_Response DOUBLE,
		Animal_Class string,
		311SR_Number string,
		Final_Ranger_Action string,
		Number_of_Animals INT,
		PEP_Response BOOLEAN,
		Animal_Monitored BOOLEAN,
		Police_Response BOOLEAN,
		ESU_Response BOOLEAN,
		Data string,
        Year INT,
        Borough string,
        Species_Description string,
		Age string
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/gold_ResgatesAnimais/'
    """

spark.sql(drop_gold_table_sql)

spark.sql(create_gold_table_sql)

DataFrame[]

In [6]:
from pyspark.sql.functions import substring, avg, sum

# Silver-layer location of Urban_Park_Ranger_Animal_Condition_Response
hdfs_path = 'hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Urban_Park_Ranger_Animal_Condition_Response'

# Load the silver data; no explicit format is passed to the reader.
gold_ResgatesAnimais = spark.read.load(hdfs_path)

# Preview the rows, then print the column types.
gold_ResgatesAnimais.show()
gold_ResgatesAnimais.printSchema()

+-----------------------------+--------------------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+--------------------+---------------+--------------------+-------------+--------------------+-----------------+------------+----------------+---------------+------------+----------+----+
|Date_and_Time_of_initial_call|Date_and_time_of_Ranger_response|      Borough|            Property|            Location| Species_Description|         Call_Source|Species_Status|Animal_Condition|Duration_of_Response|            Age|        Animal_Class| 311SR_Number| Final_Ranger_Action|Number_of_Animals|PEP_Response|Animal_Monitored|Police_Response|ESU_Response|      Data|Year|
+-----------------------------+--------------------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+--------------------+---------------+-------

In [7]:
# Columns persisted to the gold Delta table, in the order they are selected.
gold_columns = [
    "Date_and_Time_of_initial_call",
    "Date_and_time_of_Ranger_response",
    "Borough",
    "Property",
    "Location",
    "Species_Description",
    "Call_Source",
    "Species_Status",
    "Animal_Condition",
    "Duration_of_Response",
    "Age",
    "Animal_Class",
    "311SR_Number",
    "Final_Ranger_Action",
    "Number_of_Animals",
    "PEP_Response",
    "Animal_Monitored",
    "Police_Response",
    "ESU_Response",
    "Data",
    "Year",
]
gold_target_path = "hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/gold_ResgatesAnimais/"

# Overwrite the Delta table at the gold location with the selected columns.
(
    gold_ResgatesAnimais
    .select(*gold_columns)
    .write
    .format("delta")
    .mode("overwrite")
    .save(gold_target_path)
)

In [8]:
# Generate the symlink-format manifest for the gold Delta table, addressed by path.
manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/gold_ResgatesAnimais/`
"""
manifest_result = spark.sql(manifest_sql)
manifest_result.show()

++
||
++
++



In [9]:
# Remove any previous definition of the manifest-backed table.
drop_presto_table_sql = """
DROP TABLE IF EXISTS ProjetoGold.gold_ResgatesAnimais_presto 
"""

# Hive-format table whose LOCATION is the _symlink_format_manifest directory of
# the gold Delta table (presumably for querying from Presto, per the table name).
create_presto_table_sql = """
    CREATE EXTERNAL TABLE ProjetoGold.gold_ResgatesAnimais_presto (
       Date_and_Time_of_initial_call string,
		Date_and_time_of_Ranger_response string,
		Property string,
		Location string,
		Call_Source string,
		Species_Status string,
		Animal_Condition string,
		Duration_of_Response DOUBLE,
		Animal_Class string,
		311SR_Number string,
		Final_Ranger_Action string,
		Number_of_Animals INT,
		PEP_Response BOOLEAN,
		Animal_Monitored BOOLEAN,
		Police_Response BOOLEAN,
		ESU_Response BOOLEAN,
		Data string,
        Year INT,
        Borough string,
        Species_Description string,
		Age string
    )
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/gold_ResgatesAnimais/_symlink_format_manifest/'
"""

drop_result = spark.sql(drop_presto_table_sql)
drop_result.show()

create_result = spark.sql(create_presto_table_sql)
create_result.show()

++
||
++
++

++
||
++
++

