In [1]:
import sys
!{sys.executable} -m pip install hdfs



In [2]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *

In [4]:
from pyspark.sql import Row
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/Gold'

builder = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [19]:
spark.sql(
    """
    DROP DATABASE IF EXISTS ProjetoGold CASCADE
    """
)
spark.sql(
    """
    create database ProjetoGold location 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db'
    """
)

DataFrame[]

In [20]:
spark.sql(
    """
    DROP DATABASE IF EXISTS Projeto_gold CASCADE
    """
)

DataFrame[]

In [21]:
from pyspark.sql.functions import substring, avg, sum

# read fontes from the silver tables
hdfs_path = "hdfs://hdfs-nn:9000//Projeto/Silver/Projeto.db/Fontes"

Fontes = spark\
             .read\
             .load(hdfs_path)

Fontes.show()
Fontes.printSchema()


+------------+--------------------+--------+--------------------+--------------------+-------+----------+--------------------+----------+---------+------------+----------+----------+----+--------+
|  FountainTy|            the_geom|OBJECTID|            Position|          Collection|Painted|GISPROPNUM|            SIGNNAME|FountainCo| GISOBJID|      SYSTEM|DEPARTMENT|      Data|Year| Borough|
+------------+--------------------+--------+--------------------+--------------------+-------+----------+--------------------+----------+---------+------------+----------+----------+----+--------+
|  F High Low|POINT (-73.986591...|    1589|         Out in Open|02/10/2018 12:00:...|     no|      B100|Seth Low Playgrou...|         2|100038957| B100-DF0647|      B-11|02/10/2018|2018|Brooklyn|
|           C|POINT (-73.946392...|    2917|In Shade, In Play...|05/29/2018 12:00:...|   null|      B302|     Charlie's Place|         1|100040304| B302-DF0897|      B-03|05/29/2018|2018|Brooklyn|
|E Wheelchair|P

In [8]:
# join to make a flat table with product information
#flat_mordidas_fontes_df = Fontes.join(Mordidas_Caes, (Fontes.Year == Mordidas_Caes.Year) & (Fontes.Borough == Mordidas_Caes.Borough), "inner") \
#   .drop(Fontes.Borough) \
#   .drop(Mordidas_Caes.Year) \
    



In [22]:
# create fontes table in gold
spark.sql(
    """
    DROP TABLE IF EXISTS ProjetoGold.Gold_Fontes
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE ProjetoGold.Gold_Fontes (
        FountainTy string,
        the_geom string,
        OBJECTID INT,
        Position string,
        Collection string,
        Painted string,
        GISPROPNUM string,
        SIGNNAME string,
        Borough string,
        FountainCo INT,
        GISOBJID string,
        SYSTEM string,
        DEPARTMENT string,
        PARENTID string, 
        Data string,
        Year INT
            
    )
    USING DELTA
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Fontes/'
    """
)


DataFrame[]

In [23]:
from pyspark.sql.functions import substring, avg, sum

# read fontes from the silver tables
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Fontes"

Gold_Fontes = spark\
             .read\
             .load(hdfs_path)

Gold_Fontes.show()
Gold_Fontes.printSchema()

+------------+--------------------+--------+--------------------+--------------------+-------+----------+--------------------+----------+---------+------------+----------+----------+----+--------+
|  FountainTy|            the_geom|OBJECTID|            Position|          Collection|Painted|GISPROPNUM|            SIGNNAME|FountainCo| GISOBJID|      SYSTEM|DEPARTMENT|      Data|Year| Borough|
+------------+--------------------+--------+--------------------+--------------------+-------+----------+--------------------+----------+---------+------------+----------+----------+----+--------+
|  F High Low|POINT (-73.986591...|    1589|         Out in Open|02/10/2018 12:00:...|     no|      B100|Seth Low Playgrou...|         2|100038957| B100-DF0647|      B-11|02/10/2018|2018|Brooklyn|
|           C|POINT (-73.946392...|    2917|In Shade, In Play...|05/29/2018 12:00:...|   null|      B302|     Charlie's Place|         1|100040304| B302-DF0897|      B-03|05/29/2018|2018|Brooklyn|
|E Wheelchair|P

In [24]:
Gold_Fontes.toPandas()

Unnamed: 0,FountainTy,the_geom,OBJECTID,Position,Collection,Painted,GISPROPNUM,SIGNNAME,FountainCo,GISOBJID,SYSTEM,DEPARTMENT,Data,Year,Borough
0,F High Low,POINT (-73.98659181365889 40.60753207315604),1589,Out in Open,02/10/2018 12:00:00 AM +0000,no,B100,Seth Low Playground/ Bealin Square,2,100038957,B100-DF0647,B-11,02/10/2018,2018,Brooklyn
1,C,POINT (-73.94639259466328 40.69933443616386),2917,"In Shade, In Playground",05/29/2018 12:00:00 AM +0000,,B302,Charlie's Place,1,100040304,B302-DF0897,B-03,05/29/2018,2018,Brooklyn
2,E Wheelchair,POINT (-73.92412001948213 40.703148631662614),23,Out in Open,01/11/2018 12:00:00 AM +0000,yes,B016,Maria Hernandez Park,1,100037411,B016-DF0013,B-04,01/11/2018,2018,Brooklyn
3,C,POINT (-74.01152114448477 40.630582530228764),1898,In Playground,02/15/2018 12:00:00 AM +0000,no,B052,Leif Ericson Park,1,100039289,B052-DF0762,B-10,02/15/2018,2018,Brooklyn
4,C,POINT (-73.98647089425904 40.660201255822955),2894,In Shade,05/29/2018 12:00:00 AM +0000,,B255G,Butterfly Gardens,1,100040265,B255G-DF0895,B-07,05/29/2018,2018,Brooklyn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3115,E Wheelchair,POINT (-74.0792508385633 40.62266148329357),1533,In Playground,02/08/2018 12:00:00 AM +0000,yes,R061,Stapleton Playground,1,100038927,R061-DF0068,R-01,02/08/2018,2018,Staten Island
3116,D,POINT (-74.21031591361488 40.533181277394675),1868,Near Ballfield,02/15/2018 12:00:00 AM +0000,yes,R106,Bloomingdale Park,1,100039195,R106-DF0138,R-03,02/15/2018,2018,Staten Island
3117,D,POINT (-74.09878669564895 40.613386552308874),1762,Out in Open,02/13/2018 12:00:00 AM +0000,no,R069,Terrace Playground,1,100039136,R069-DF0110,R-01,02/13/2018,2018,Staten Island
3118,A,POINT (-74.16297563648502 40.616399379236725),1427,Near Ballfield,02/05/2018 12:00:00 AM +0000,yes,R075A,Father Macris Park,1,100038865,R075A-DF0021,R-02,02/05/2018,2018,Staten Island


In [25]:
# write to delta table
Gold_Fontes \
    .select ("FountainTy", "the_geom", "OBJECTID", "Position", "Collection", "Painted", "GISPROPNUM", "SIGNNAME", "Borough", "FountainCo", "GISOBJID", "SYSTEM", "DEPARTMENT", "Data", "Year") \
    .write  \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Fontes/")

In [26]:
spark.sql("""
GENERATE symlink_format_manifest FOR TABLE delta.`hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Fontes/`
""").show()

++
||
++
++



In [29]:
spark.sql("""
DROP TABLE IF EXISTS ProjetoGold.Gold_Fontes_Presto 
""").show()

spark.sql("""
    CREATE EXTERNAL TABLE ProjetoGold.Gold_Fontes_Presto (
       FountainTy string,
        the_geom string,
        OBJECTID INT,
        Position string,
        Collection string,
        Painted string,
        GISPROPNUM string,
        SIGNNAME string,
        Borough string,
        FountainCo INT,
        GISOBJID string,
        SYSTEM string,
        DEPARTMENT string,
        PARENTID string, 
        Data string,
        Year INT
        
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://hdfs-nn:9000/Projeto/Gold/ProjetoGold.db/Gold_Fontes/_symlink_format_manifest/'
""").show()






++
||
++
++

++
||
++
++

