In [1]:
# Install the `hdfs` package with pip.
# `sys` is needed only to obtain the path of the interpreter running this kernel.
import sys
# Running pip as a module of sys.executable installs into the same interpreter
# that executes this notebook, rather than whichever `pip` is first on PATH.
!{sys.executable} -m pip install hdfs




In [2]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/Gold'

# The builder chain is wrapped in parentheses instead of using backslash
# continuations: the original ended with a dangling "\" after
# .enableHiveSupport(), which only parsed because a blank line followed it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    # Managed tables are stored under the Gold warehouse directory.
    .config("spark.sql.warehouse.dir", warehouse_location)
    # Table metadata is kept in the shared Hive metastore.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Enable Delta Lake SQL commands and the Delta-aware session catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip() appears to set
    # spark.jars.packages itself from the installed delta-spark version, so
    # this value is probably overridden - confirm and drop it if so.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

# Create (or reuse) the session; the original assigned `spark = spark = ...`.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [4]:
# Rebuild the ProjetoGold database from scratch: CASCADE also drops every
# table it contains, so re-running this cell always starts from an empty database.
drop_database_sql = """
    DROP DATABASE IF EXISTS ProjetoGold CASCADE
    """
create_database_sql = """
    create database ProjetoGold location 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db'
    """

spark.sql(drop_database_sql)
spark.sql(create_database_sql)

DataFrame[]

In [5]:
# Define the Gold_Mordidas_Caes Delta table in the gold database.
# Any previous definition is dropped first so the cell can be re-run.
drop_gold_table_sql = """
    DROP TABLE IF EXISTS ProjetoGold.Gold_Mordidas_Caes
    """

# External Delta table: the data lives at the explicit HDFS LOCATION below.
create_gold_table_sql = """
    CREATE EXTERNAL TABLE ProjetoGold.Gold_Mordidas_Caes (
        Breed string,
        Gender string,
        SpayNeuter boolean,
        Borough string,
        Year INT,
        ZipCode INT
            
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Mordidas_Caes/'
    """

spark.sql(drop_gold_table_sql)
spark.sql(create_gold_table_sql)

DataFrame[]

In [6]:
from pyspark.sql.functions import substring, avg, sum

# Location of the Mordidas_Caes table in the silver layer.
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Mordidas_Caes"

# NOTE(review): no format is passed to the reader, so Spark falls back to its
# default data source - confirm that this matches how the silver table is stored.
Gold_Mordidas_Caes = spark.read.load(hdfs_path)

# Preview the rows and the column types before writing them to gold.
Gold_Mordidas_Caes.show()
Gold_Mordidas_Caes.printSchema()

+---------+---------------+---------+------+----------+-------+----+-------+
|Unique_ID|     DateOfBite|    Breed|Gender|SpayNeuter|ZipCode|Year|Borough|
+---------+---------------+---------+------+----------+-------+----+-------+
|     9032|January 01 2019|  Pitbull|     U|     false|     -1|2019| Queens|
|     9033|January 03 2019|Shiba Inu|     M|      true|  11691|2019| Queens|
|     9034|January 03 2019|  Unknown|     M|     false|  11355|2019| Queens|
|     9035|January 03 2019|  Maltese|     F|      true|  11414|2019| Queens|
|     9036|January 05 2019|  Unknown|     U|     false|  11375|2019| Queens|
|     9037|January 04 2019|  Pitbull|     M|     false|  11105|2019| Queens|
|     9039|January 03 2019|  Unknown|     U|     false|     -1|2019| Queens|
|     9040|January 06 2019|Chow Chow|     U|     false|  11365|2019| Queens|
|     9042|January 10 2019|  Unknown|     U|     false|     -1|2019| Queens|
|     9043|January 11 2019|  Bulldog|     M|      true|  11101|2019| Queens|

In [7]:
# Write the gold data as Delta files, replacing whatever is already stored
# at the table location. Only the listed columns are kept, in this order.
gold_columns = ["Breed", "Gender", "SpayNeuter", "Borough", "Year", "ZipCode"]

(
    Gold_Mordidas_Caes
    .select(*gold_columns)
    .write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Mordidas_Caes/")
)

In [8]:
# Generate the symlink-format manifest for the Delta table at this path;
# the Presto-facing table in this notebook points at the manifest directory.
generate_manifest_sql = """
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Mordidas_Caes/`
"""
spark.sql(generate_manifest_sql).show()

++
||
++
++



In [9]:
# Define a second external table over the same data for Presto. It reads the
# Parquet files listed in the _symlink_format_manifest directory of the Delta
# table. Any previous definition is dropped first so the cell can be re-run.
drop_presto_table_sql = """
DROP TABLE IF EXISTS ProjetoGold.Gold_Mordidas_Caes_Presto 
"""

create_presto_table_sql = """
    CREATE EXTERNAL TABLE ProjetoGold.Gold_Mordidas_Caes_Presto (
        Breed string,
        Gender string,
        SpayNeuter boolean,
        Borough string,
        Year INT,
        ZipCode INT
        
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Mordidas_Caes/_symlink_format_manifest/'
"""

spark.sql(drop_presto_table_sql).show()

spark.sql(create_presto_table_sql).show()


++
||
++
++

++
||
++
++

