In [1]:
from pyspark.sql.types import StructType

In [2]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [3]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/Silver'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Bronze/Urban_Park_Ranger_Animal_Condition_Response.csv"

customSchema = StructType([
    StructField("Date_and_Time_of_initial_call", StringType(), True),
    StructField("Date_and_time_of_Ranger_response", StringType(), True),
    StructField("Borough", StringType(), True),
    StructField("Property", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Species_Description", StringType(), True),
    StructField("Call_Source", StringType(), True),
    StructField("Species_Status", StringType(), True), 
    StructField("Animal_Condition", StringType(), True),
	StructField("Duration_of_Response", DoubleType(), True),
	StructField("Age", StringType(), True),
	StructField("Animal_Class", StringType(), True),
	StructField("311SR_Number", StringType(), True),
	StructField("Final_Ranger_Action", StringType(), True),
	StructField("Number_of_Animals", IntegerType(), True),
	StructField("PEP_Response", BooleanType(), True),
	StructField("Animal_Monitored", BooleanType(), True),
	StructField("Rehabiliator", StringType(), True),
	StructField("Hours_spent_monitoring", DoubleType(), True),
	StructField("Police_Response", BooleanType(), True),
	StructField("ESU_Response", BooleanType(), True),
	StructField("ACC_Intake Number", IntegerType(), True)
])
Urban_Park_Ranger_Animal_Condition_Response = spark \
            .read\
            .option("delimiter",",")\
            .option("header","true")\
            .schema(customSchema) \
            .csv(hdfs_path)
Urban_Park_Ranger_Animal_Condition_Response.toPandas()


Unnamed: 0,Date_and_Time_of_initial_call,Date_and_time_of_Ranger_response,Borough,Property,Location,Species_Description,Call_Source,Species_Status,Animal_Condition,Duration_of_Response,...,311SR_Number,Final_Ranger_Action,Number_of_Animals,PEP_Response,Animal_Monitored,Rehabiliator,Hours_spent_monitoring,Police_Response,ESU_Response,ACC_Intake Number
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6.00,...,311-06712416,ACC,6.0,False,False,,,False,False,163537.0
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,...,311-06714879,Rehabilitator,4.0,False,False,,,False,False,
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1.00,...,,Unfounded,0.0,False,False,,,False,False,
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2.00,...,,Unfounded,0.0,False,False,,,False,False,
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,...,311-06699415,ACC,1.0,False,False,,,False,False,119833.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,,raccoon,Central,Native,Healthy,0.75,...,,Relocated/Condition Corrected,1.0,False,True,,1.0,False,False,
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,...,1-1-1568786600,ACC,1.0,False,False,,,False,False,36061.0
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,,Raccoon,Employee,Native,DOA,1.50,...,,ACC,2.0,False,True,,0.5,False,False,28316.0
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,...,,Unfounded,0.0,,,,,,False,


In [5]:
Alteracao_resgates_inicial = Urban_Park_Ranger_Animal_Condition_Response.drop("ACC_Intake Number")
Alteracao_resgates_inicial.toPandas()

Alteracao_resgates = Alteracao_resgates_inicial.drop("Rehabiliator")
Alteracao_resgates.toPandas()

Unnamed: 0,Date_and_Time_of_initial_call,Date_and_time_of_Ranger_response,Borough,Property,Location,Species_Description,Call_Source,Species_Status,Animal_Condition,Duration_of_Response,Age,Animal_Class,311SR_Number,Final_Ranger_Action,Number_of_Animals,PEP_Response,Animal_Monitored,Hours_spent_monitoring,Police_Response,ESU_Response
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6.00,Adult,Birds,311-06712416,ACC,6.0,False,False,,False,False
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,Infant,Birds,311-06714879,Rehabilitator,4.0,False,False,,False,False
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1.00,Juvenile,Deer,,Unfounded,0.0,False,False,,False,False
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2.00,Adult;#Juvenile,Small Mammals-RVS,,Unfounded,0.0,False,False,,False,False
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,Juvenile,Small Mammals-non RVS,311-06699415,ACC,1.0,False,False,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,,raccoon,Central,Native,Healthy,0.75,Adult,Small Mammals-RVS,,Relocated/Condition Corrected,1.0,False,True,1.0,False,False
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,Juvenile,Small Mammals-RVS,1-1-1568786600,ACC,1.0,False,False,,False,False
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,,Raccoon,Employee,Native,DOA,1.50,Juvenile,Small Mammals-RVS,,ACC,2.0,False,True,0.5,False,False
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,Adult,Small Mammals-RVS,,Unfounded,0.0,,,,,False


In [6]:
Alteracao_resgates1 = Alteracao_resgates.drop("Hours_spent_monitoring")
Alteracao_resgates1.toPandas()

Alteracao_resgates2 = Alteracao_resgates1.withColumn('Data', split(Alteracao_resgates1['Date_and_Time_of_initial_call'],' ').getItem(0))
Alteracao_resgates2.toPandas()

Alteracao_resgates3 = Alteracao_resgates2.withColumn('Year', split(Alteracao_resgates2['Data'],'/').getItem(2))
Alteracao_resgates3.toPandas()

Unnamed: 0,Date_and_Time_of_initial_call,Date_and_time_of_Ranger_response,Borough,Property,Location,Species_Description,Call_Source,Species_Status,Animal_Condition,Duration_of_Response,...,Animal_Class,311SR_Number,Final_Ranger_Action,Number_of_Animals,PEP_Response,Animal_Monitored,Police_Response,ESU_Response,Data,Year
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6.00,...,Birds,311-06712416,ACC,6.0,False,False,False,False,06/23/2021,2021
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,...,Birds,311-06714879,Rehabilitator,4.0,False,False,False,False,06/24/2021,2021
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1.00,...,Deer,,Unfounded,0.0,False,False,False,False,06/23/2021,2021
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2.00,...,Small Mammals-RVS,,Unfounded,0.0,False,False,False,False,06/23/2021,2021
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,...,Small Mammals-non RVS,311-06699415,ACC,1.0,False,False,False,False,06/23/2021,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,,raccoon,Central,Native,Healthy,0.75,...,Small Mammals-RVS,,Relocated/Condition Corrected,1.0,False,True,False,False,06/05/2018,2018
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,...,Small Mammals-RVS,1-1-1568786600,ACC,1.0,False,False,False,False,06/01/2018,2018
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,,Raccoon,Employee,Native,DOA,1.50,...,Small Mammals-RVS,,ACC,2.0,False,True,False,False,05/16/2018,2018
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,...,Small Mammals-RVS,,Unfounded,0.0,,,,False,05/02/2018,2018


In [7]:
Alteracao_resgates4 = Alteracao_resgates3.withColumn(
    "Duration_of_Response",
    when(
        (col("Duration_of_Response").isNull()), 
        "Indefinido"
    ).otherwise(col("Duration_of_Response")))


Alteracao_resgates5 = Alteracao_resgates4.withColumn(
    "Number_of_Animals",
    when(
        (col("Number_of_Animals").isNull()), 
        "Indefinido"
    ).otherwise(col("Number_of_Animals")))

Alteracao_resgates5.toPandas()


Unnamed: 0,Date_and_Time_of_initial_call,Date_and_time_of_Ranger_response,Borough,Property,Location,Species_Description,Call_Source,Species_Status,Animal_Condition,Duration_of_Response,...,Animal_Class,311SR_Number,Final_Ranger_Action,Number_of_Animals,PEP_Response,Animal_Monitored,Police_Response,ESU_Response,Data,Year
0,06/23/2021 04:45:00 PM,06/24/2021 08:00:00 AM,Brooklyn,Sternberg Park,Inside locked athletic field under construction,Chukar,Other,Exotic,Healthy,6.0,...,Birds,311-06712416,ACC,6,False,False,False,False,06/23/2021,2021
1,06/24/2021 10:00:00 AM,06/24/2021 11:00:00 AM,Bronx,Haffen Park,Haffen Pool,Sparrow,Central,Native,Healthy,1.75,...,Birds,311-06714879,Rehabilitator,4,False,False,False,False,06/24/2021,2021
2,06/23/2021 02:30:00 PM,06/23/2021 02:30:00 PM,Bronx,Pelham Bay Park,Pelham Bay South,White-tailed Deer,Employee,Native,,1.0,...,Deer,,Unfounded,0,False,False,False,False,06/23/2021,2021
3,06/23/2021 01:00:00 PM,06/23/2021 01:10:00 PM,Staten Island,Willowbrook Park,The carousel,Raccoon,Employee,Native,,2.0,...,Small Mammals-RVS,,Unfounded,0,False,False,False,False,06/23/2021,2021
4,06/23/2021 09:20:00 AM,06/23/2021 09:20:00 AM,Queens,Judge Moses Weinstein Playground,Garbage can,Virginia Opossum,Central,Native,Healthy,2.25,...,Small Mammals-non RVS,311-06699415,ACC,1,False,False,False,False,06/23/2021,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,06/05/2018 12:00:00 AM,06/05/2018 12:01:00 AM,Manhattan,Abingdon Square,,raccoon,Central,Native,Healthy,0.75,...,Small Mammals-RVS,,Relocated/Condition Corrected,1,False,True,False,False,06/05/2018,2018
2605,06/01/2018 12:00:00 PM,06/01/2018 12:30:00 PM,Manhattan,Central Park,park,Raccoon,Employee,Native,Injured,1.25,...,Small Mammals-RVS,1-1-1568786600,ACC,1,False,False,False,False,06/01/2018,2018
2606,05/16/2018 09:00:00 AM,05/17/2018 10:10:00 AM,Manhattan,Morningside Park,,Raccoon,Employee,Native,DOA,1.5,...,Small Mammals-RVS,,ACC,2,False,True,False,False,05/16/2018,2018
2607,05/02/2018 09:30:00 AM,05/02/2018 12:00:00 PM,Manhattan,Central Park,park sewer drain,Raccoon,Public,Native,Healthy,0.75,...,Small Mammals-RVS,,Unfounded,0,,,,False,05/02/2018,2018


In [8]:
Alteracao_resgates6 = Alteracao_resgates5.withColumn(
    "Location",
    when(
        (col("Location").isNull()), 
        "Indefinido"
    ).otherwise(col("Location")))
	
Alteracao_resgates7 = Alteracao_resgates6.withColumn(
    "311SR_Number",
    when(
        (col("311SR_Number").isNull()), 
        "Indefinido"
    ).otherwise(col("311SR_Number")))
    
Alteracao_resgates8 = Alteracao_resgates7.withColumn("Duration_of_Response",round(Alteracao_resgates7.Duration_of_Response.cast(DoubleType()),2))

Alteracao_resgates9 = Alteracao_resgates8.withColumn("Number_of_Animals",Alteracao_resgates8.Number_of_Animals.cast(IntegerType()))

Alteracao_resgates10 = Alteracao_resgates9.withColumn("Year",Alteracao_resgates9.Year.cast(IntegerType()))


In [9]:
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.Urban_Park_Ranger_Animal_Condition_Response
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto.Urban_Park_Ranger_Animal_Condition_Response (
    
        Date_and_Time_of_initial_call string,
		Date_and_time_of_Ranger_response string,
		Property string,
		Location string,
		Call_Source string,
		Species_Status string,
		Animal_Condition string,
		Duration_of_Response DOUBLE,
		Animal_Class string,
		311SR_Number string,
		Final_Ranger_Action string,
		Number_of_Animals INT,
		PEP_Response BOOLEAN,
		Animal_Monitored BOOLEAN,
		Police_Response BOOLEAN,
		ESU_Response BOOLEAN,
		Data string,
        Year INT,
        Borough string,
        Species_Description string,
		Age string
		
       
    )
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Urban_Park_Ranger_Animal_Condition_Response'
    """
    )


AnalysisException: Database 'projeto' not found

In [10]:
Alteracao_resgates10.show()
Alteracao_resgates10.printSchema()

+-----------------------------+--------------------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+--------------------+---------------+--------------------+-------------+--------------------+-----------------+------------+----------------+---------------+------------+----------+----+
|Date_and_Time_of_initial_call|Date_and_time_of_Ranger_response|      Borough|            Property|            Location| Species_Description|         Call_Source|Species_Status|Animal_Condition|Duration_of_Response|            Age|        Animal_Class| 311SR_Number| Final_Ranger_Action|Number_of_Animals|PEP_Response|Animal_Monitored|Police_Response|ESU_Response|      Data|Year|
+-----------------------------+--------------------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------+----------------+--------------------+---------------+-------

In [11]:
Alteracao_resgates10 \
    .select("Date_and_Time_of_initial_call", "Date_and_time_of_Ranger_response", "Borough", "Property", "Location", "Species_Description", "Call_Source", 
            "Species_Status", "Animal_Condition", "Duration_of_Response", "Age", "Animal_Class", "311SR_Number", "Final_Ranger_Action", 
            "Number_of_Animals", "PEP_Response", "Animal_Monitored", "Police_Response", "ESU_Response", "Data", "Year") \
    .write \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Urban_Park_Ranger_Animal_Condition_Response/")