In [3]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [4]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/Silver'

# Session builder: local master with two threads, Hive metastore support and
# the Delta SQL extension / catalog.
#
# "spark.jars.packages" is deliberately not set here:
# configure_spark_with_delta_pip() below sets that key itself, using the Delta
# artifact that matches the installed delta-spark package, so a hard-coded
# Maven coordinate in this chain would be overwritten and only mislead readers.
#
# The chain is wrapped in parentheses so it does not depend on trailing
# backslashes (the last one used to be followed by a blank line).
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .enableHiveSupport()
)

# Let delta-spark add its matching JAR coordinates, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
# HDFS location of the raw squirrel census CSV in the Bronze area.
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Bronze/Central_Park_Squirrel_Census_Squirrel_Data_2018.csv"

# (column name, Spark type) pairs, listed in the order in which the schema is
# applied to the file. Every field is declared nullable.
_census_fields = [
    ("X", DoubleType()),
    ("Y", DoubleType()),
    ("Unique_Squirrel_ID", StringType()),
    ("Hectare", StringType()),
    ("Shift", StringType()),
    ("Date", IntegerType()),
    ("Hectare_Squirrel_Number", IntegerType()),
    ("Age", StringType()),
    ("Primary_Fur_Color", StringType()),
    ("Highlight_Fur_Color", StringType()),
    ("Combination_of_Primary_and_Highlight_Color", StringType()),
    ("Color_Notes", StringType()),
    ("Location", StringType()),
    ("Above_Ground_Sighter", StringType()),
    ("Specific_Location", StringType()),
    ("Running", BooleanType()),
    ("Chasing", BooleanType()),
    ("Climbing", BooleanType()),
    ("Eating", BooleanType()),
    ("Foraging", BooleanType()),
    ("Other_Activities", StringType()),
    ("Kuks", BooleanType()),
    ("Quaas", BooleanType()),
    ("Moans", BooleanType()),
    ("Tail_flags", BooleanType()),
    ("Tail_twitches", BooleanType()),
    ("Approaches", BooleanType()),
    ("Indifferent", BooleanType()),
    ("Runs_from", BooleanType()),
    ("Other_Interactions", StringType()),
    ("Lat/Long", StringType()),
]

customSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _census_fields]
)

# Read the comma-delimited file, treating its first line as a header and
# applying the explicit schema instead of inferring one.
_census_reader = (
    spark.read
    .options(delimiter=",", header="true")
    .schema(customSchema)
)
Central_Park_Squirrel_Census_Squirrel_Data_2018 = _census_reader.csv(hdfs_path)

# Last expression of the cell: render the loaded data as a pandas frame.
Central_Park_Squirrel_Census_Squirrel_Data_2018.toPandas()

Unnamed: 0,X,Y,Unique_Squirrel_ID,Hectare,Shift,Date,Hectare_Squirrel_Number,Age,Primary_Fur_Color,Highlight_Fur_Color,...,Kuks,Quaas,Moans,Tail_flags,Tail_twitches,Approaches,Indifferent,Runs_from,Other_Interactions,Lat/Long
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9561344937861 40.7940823884086)
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9688574691102 40.7837825208444)
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,,Gray,,...,False,False,False,False,False,False,False,False,,POINT (-73.97428114848522 40.775533619083)
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9596413903948 40.7903128889029)
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3018,-73.963943,40.790868,30B-AM-1007-04,30B,AM,10072018,4,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9639431360458 40.7908677445466)
3019,-73.970402,40.782560,19A-PM-1013-05,19A,PM,10132018,5,Adult,Gray,White,...,False,False,False,False,False,False,True,False,,POINT (-73.9704015859639 40.7825600069973)
3020,-73.966587,40.783678,22D-PM-1012-07,22D,PM,10122018,7,Adult,Gray,"Black, Cinnamon, White",...,False,False,False,False,False,False,True,False,,POINT (-73.9665871993517 40.7836775064883)
3021,-73.963994,40.789915,29B-PM-1010-02,29B,PM,10102018,2,,Gray,"Cinnamon, White",...,False,False,False,False,False,False,True,False,,POINT (-73.9639941227864 40.7899152327912)


In [6]:
# The previous cell already defines hdfs_path and customSchema and reads the
# Bronze CSV into Central_Park_Squirrel_Census_Squirrel_Data_2018. This cell
# used to repeat that schema and read verbatim; it now reuses the loaded
# DataFrame and only displays it.
Central_Park_Squirrel_Census_Squirrel_Data_2018.toPandas()

Unnamed: 0,X,Y,Unique_Squirrel_ID,Hectare,Shift,Date,Hectare_Squirrel_Number,Age,Primary_Fur_Color,Highlight_Fur_Color,...,Kuks,Quaas,Moans,Tail_flags,Tail_twitches,Approaches,Indifferent,Runs_from,Other_Interactions,Lat/Long
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9561344937861 40.7940823884086)
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,,,,...,False,False,False,False,False,False,False,False,,POINT (-73.9688574691102 40.7837825208444)
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,,Gray,,...,False,False,False,False,False,False,False,False,,POINT (-73.97428114848522 40.775533619083)
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9596413903948 40.7903128889029)
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,False,False,False,False,False,False,False,False,,POINT (-73.9702676472613 40.7762126854894)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3018,-73.963943,40.790868,30B-AM-1007-04,30B,AM,10072018,4,Adult,Gray,,...,False,False,False,False,False,False,False,True,,POINT (-73.9639431360458 40.7908677445466)
3019,-73.970402,40.782560,19A-PM-1013-05,19A,PM,10132018,5,Adult,Gray,White,...,False,False,False,False,False,False,True,False,,POINT (-73.9704015859639 40.7825600069973)
3020,-73.966587,40.783678,22D-PM-1012-07,22D,PM,10122018,7,Adult,Gray,"Black, Cinnamon, White",...,False,False,False,False,False,False,True,False,,POINT (-73.9665871993517 40.7836775064883)
3021,-73.963994,40.789915,29B-PM-1010-02,29B,PM,10102018,2,,Gray,"Cinnamon, White",...,False,False,False,False,False,False,True,False,,POINT (-73.9639941227864 40.7899152327912)


In [7]:
# Columns removed from the census data; none of them is part of the table
# definition or of the column selection used when the data is written out.
columns_to_drop = [
    "Other_Activities",
    "Combination_of_Primary_and_Highlight_Color",
    "Lat/Long",
    "Color_Notes",
    "Specific_Location",
]

# A single drop() call replaces the former chain of one-column drops stored in
# numbered intermediates. The resulting name, Alteracao_esquilos5, is the one
# the following cell consumes.
Alteracao_esquilos5 = Central_Park_Squirrel_Census_Squirrel_Data_2018.drop(*columns_to_drop)


In [8]:
# String columns whose NULL values are replaced by the placeholder "Indefinido".
columns_with_placeholder = [
    "Age",
    "Primary_Fur_Color",
    "Highlight_Fur_Color",
    "Location",
    "Above_Ground_Sighter",
    "Other_Interactions",
]

# fillna() with a string value and an explicit subset does, in one call, what
# the six copy-pasted when(col(...).isNull(), ...).otherwise(col(...)) blocks
# did column by column. Non-null values are left untouched.
Alteracao_esquilos11 = Alteracao_esquilos5.fillna("Indefinido", subset=columns_with_placeholder)

# Last expression of the cell: render the cleaned data as a pandas frame.
Alteracao_esquilos11.toPandas()

Unnamed: 0,X,Y,Unique_Squirrel_ID,Hectare,Shift,Date,Hectare_Squirrel_Number,Age,Primary_Fur_Color,Highlight_Fur_Color,...,Foraging,Kuks,Quaas,Moans,Tail_flags,Tail_twitches,Approaches,Indifferent,Runs_from,Other_Interactions
0,-73.956134,40.794082,37F-PM-1014-03,37F,PM,10142018,3,Indefinido,Indefinido,Indefinido,...,False,False,False,False,False,False,False,False,False,Indefinido
1,-73.968857,40.783783,21B-AM-1019-04,21B,AM,10192018,4,Indefinido,Indefinido,Indefinido,...,False,False,False,False,False,False,False,False,False,Indefinido
2,-73.974281,40.775534,11B-PM-1014-08,11B,PM,10142018,8,Indefinido,Gray,Indefinido,...,False,False,False,False,False,False,False,False,False,Indefinido
3,-73.959641,40.790313,32E-PM-1017-14,32E,PM,10172018,14,Adult,Gray,Indefinido,...,True,False,False,False,False,False,False,False,True,Indefinido
4,-73.970268,40.776213,13E-AM-1017-05,13E,AM,10172018,5,Adult,Gray,Cinnamon,...,True,False,False,False,False,False,False,False,False,Indefinido
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3018,-73.963943,40.790868,30B-AM-1007-04,30B,AM,10072018,4,Adult,Gray,Indefinido,...,True,False,False,False,False,False,False,False,True,Indefinido
3019,-73.970402,40.782560,19A-PM-1013-05,19A,PM,10132018,5,Adult,Gray,White,...,True,False,False,False,False,False,False,True,False,Indefinido
3020,-73.966587,40.783678,22D-PM-1012-07,22D,PM,10122018,7,Adult,Gray,"Black, Cinnamon, White",...,True,False,False,False,False,False,False,True,False,Indefinido
3021,-73.963994,40.789915,29B-PM-1010-02,29B,PM,10102018,2,Indefinido,Gray,"Cinnamon, White",...,False,False,False,False,False,False,False,True,False,Indefinido


In [9]:


# Drop any previous metastore entry so the table definition below is always
# the one in effect.
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.Central_Park_Squirrel_Census_Squirrel_Data_2018
    """
)

# Register an external table over the Silver location.
#
# STORED AS PARQUET is stated explicitly: a CREATE EXTERNAL TABLE without a
# storage clause is created in Hive's default (text) format, whereas the files
# placed at this LOCATION are written by DataFrameWriter.save() without an
# explicit format, i.e. with Spark's default data source (Parquet unless
# spark.sql.sources.default is overridden).
# NOTE(review): if this layer is meant to be a Delta table, the table
# definition and the writer have to be switched to Delta together.
spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto.Central_Park_Squirrel_Census_Squirrel_Data_2018 (
        X DOUBLE,
        Y DOUBLE,
        Unique_Squirrel_ID string,
        Hectare string,
        Shift string,
        Date INT,
        Hectare_Squirrel_Number INT,
        Age string,
        Primary_Fur_Color string,
        Highlight_Fur_Color string,
        Location string,
        Above_Ground_Sighter string,
        Running BOOLEAN,
        Chasing BOOLEAN,
        Climbing BOOLEAN,
        Eating BOOLEAN,
        Foraging BOOLEAN,
        Kuks BOOLEAN,
        Quaas BOOLEAN,
        Moans BOOLEAN,
        Tail_Flags BOOLEAN,
        Tail_Twitches BOOLEAN,
        Approaches BOOLEAN,
        Indifferent BOOLEAN,
        Runs_From BOOLEAN,
        Other_Interactions string
    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Central_Park_Squirrel_Census_Squirrel_Data_2018'
    """
)

DataFrame[]

In [10]:
# Columns written to the Silver location, in the order in which the
# CREATE EXTERNAL TABLE statement declares them.
silver_columns = [
    "X", "Y", "Unique_Squirrel_ID", "Hectare", "Shift", "Date",
    "Hectare_Squirrel_Number", "Age", "Primary_Fur_Color", "Highlight_Fur_Color",
    "Location", "Above_Ground_Sighter", "Running", "Chasing", "Climbing",
    "Eating", "Foraging", "Kuks", "Quaas", "Moans", "Tail_Flags",
    "Tail_Twitches", "Approaches", "Indifferent", "Runs_From", "Other_Interactions",
]

# Directory backing Projeto.Central_Park_Squirrel_Census_Squirrel_Data_2018.
silver_location = "hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Central_Park_Squirrel_Census_Squirrel_Data_2018/"

# The output format is pinned to Parquet. Without format(), save() uses the
# session's default data source (spark.sql.sources.default), so the file
# format would silently depend on cluster configuration.
# mode("overwrite") replaces whatever a previous run left at the location.
(
    Alteracao_esquilos11
    .select(*silver_columns)
    .write
    .format("parquet")
    .mode("overwrite")
    .save(silver_location)
)
