In [1]:
from pyspark.sql.types import StructType

In [2]:
pip install delta-spark

Note: you may need to restart the kernel to use updated packages.


In [3]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/Projeto/Silver'

# Session builder: local master with two worker threads, Hive metastore access and the
# Delta Lake SQL extension/catalog. The chain is wrapped in parentheses rather than
# backslash continuations, so it no longer ends in a dangling "\" whose validity
# depended on the blank line that followed it.
# NOTE(review): configure_spark_with_delta_pip may set spark.jars.packages itself from
# the installed delta-spark version - confirm whether the explicit entry is still needed.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0")
    .enableHiveSupport()
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# Bronze-layer CSV with the raw dog-bite records.
hdfs_path = "hdfs://hdfs-nn:9000/Projeto/Bronze/DOHMH_Dog_Bite_Data.csv"

# Explicit schema for the nine CSV columns, in file order; every field is nullable.
bite_fields = [
    ("Unique_ID", StringType()),
    ("DateOfBite", StringType()),
    ("Species", StringType()),
    ("Breed", StringType()),
    ("Age", IntegerType()),
    ("Gender", StringType()),
    ("SpayNeuter", BooleanType()),
    ("Borough", StringType()),
    ("ZipCode", IntegerType()),
]
customSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in bite_fields]
)

# Comma-delimited file whose first line is a header; the schema is applied as given
# instead of being inferred.
csv_reader = spark.read.options(delimiter=",", header="true").schema(customSchema)
DOHMH_Dog_Bite_Data = csv_reader.csv(hdfs_path)


# Preview the loaded data as a pandas frame.
DOHMH_Dog_Bite_Data.toPandas()

Unnamed: 0,Unique_ID,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,1,January 01 2018,DOG,UNKNOWN,,U,False,Brooklyn,11220.0
1,2,January 04 2018,DOG,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0
3,4,January 08 2018,DOG,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,5,January 09 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0
...,...,...,...,...,...,...,...,...,...
22658,10276,December 24 2017,DOG,CHIWEENIE MIX,7.0,M,True,Staten Island,10303.0
22659,10277,December 24 2017,DOG,DUNKER,5.0,F,True,Staten Island,10303.0
22660,10278,December 21 2017,DOG,"Schnauzer, Miniature",,M,True,Staten Island,10312.0
22661,10279,December 28 2017,DOG,Mixed/Other,,F,False,Staten Island,10308.0


In [5]:
# Step 1: remove the Species column, then preview the result.
species_column = "Species"
Alteracao_mordidas = DOHMH_Dog_Bite_Data.drop(species_column)
Alteracao_mordidas.toPandas()

Unnamed: 0,Unique_ID,DateOfBite,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,1,January 01 2018,UNKNOWN,,U,False,Brooklyn,11220.0
1,2,January 04 2018,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,Pit Bull,,U,False,Brooklyn,11224.0
3,4,January 08 2018,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,5,January 09 2018,Pit Bull,,U,False,Brooklyn,11224.0
...,...,...,...,...,...,...,...,...
22658,10276,December 24 2017,CHIWEENIE MIX,7.0,M,True,Staten Island,10303.0
22659,10277,December 24 2017,DUNKER,5.0,F,True,Staten Island,10303.0
22660,10278,December 21 2017,"Schnauzer, Miniature",,M,True,Staten Island,10312.0
22661,10279,December 28 2017,Mixed/Other,,F,False,Staten Island,10308.0


In [6]:
# Step 2: derive an integer Year column from DateOfBite.
# The text is split on single spaces and the third token (index 2) is cast to an
# integer; in the previewed rows ("January 01 2018") that token is the year.
date_tokens = split(col("DateOfBite"), ' ')
bite_year = date_tokens[2].cast("Int")
Alteracao2_mordidas = Alteracao_mordidas.withColumn('Year', bite_year)
Alteracao2_mordidas.toPandas()

Unnamed: 0,Unique_ID,DateOfBite,Breed,Age,Gender,SpayNeuter,Borough,ZipCode,Year
0,1,January 01 2018,UNKNOWN,,U,False,Brooklyn,11220.0,2018
1,2,January 04 2018,UNKNOWN,,U,False,Brooklyn,,2018
2,3,January 06 2018,Pit Bull,,U,False,Brooklyn,11224.0,2018
3,4,January 08 2018,Mixed/Other,4.0,M,False,Brooklyn,11231.0,2018
4,5,January 09 2018,Pit Bull,,U,False,Brooklyn,11224.0,2018
...,...,...,...,...,...,...,...,...,...
22658,10276,December 24 2017,CHIWEENIE MIX,7.0,M,True,Staten Island,10303.0,2017
22659,10277,December 24 2017,DUNKER,5.0,F,True,Staten Island,10303.0,2017
22660,10278,December 21 2017,"Schnauzer, Miniature",,M,True,Staten Island,10312.0,2017
22661,10279,December 28 2017,Mixed/Other,,F,False,Staten Island,10308.0,2017


In [7]:
# Step 3: rows without a Unique_ID receive the placeholder "Indefinido";
# rows that already have an ID keep it unchanged.
unique_id = col("Unique_ID")
Alteracao3_mordidas = Alteracao2_mordidas.withColumn(
    "Unique_ID",
    when(unique_id.isNotNull(), unique_id).otherwise("Indefinido"),
)

In [8]:
# Step 4: a missing Breed becomes the literal "unknown"; existing values are kept.
raw_breed = col("Breed")
Alteracao4_mordidas = Alteracao3_mordidas.withColumn(
    "Breed",
    when(raw_breed.isNotNull(), raw_breed).otherwise("unknown"),
)

In [9]:
# Step 5: remove the Age column from the working DataFrame.
columns_to_discard = ["Age"]
Alteracao5_mordidas = Alteracao4_mordidas.drop(*columns_to_discard)

In [10]:
# Step 6: a missing ZipCode is stored as the sentinel -1; known zip codes are kept.
zip_code = col("ZipCode")
Alteracao6_mordidas = Alteracao5_mordidas.withColumn(
    "ZipCode",
    when(zip_code.isNotNull(), zip_code).otherwise(-1),
)

In [11]:
# Step 7: lower-case Breed so the substring rules applied in the following cells
# (all written in lower case) match regardless of the original capitalisation.
breed_lowercase = lower(col("Breed"))
Alteracao7_mordidas = Alteracao6_mordidas.withColumn("Breed", breed_lowercase)

In [12]:
# Step 8: collapse every pit-bull spelling ("pit bull", "pitbull", "pit ...") into the
# single label "Pitbull"; all other breeds are left untouched.
# The match is anchored at the start of a word (regex \bpit) instead of testing for the
# bare substring "pit": the substring test also hit "spitz", so spitz dogs were
# relabelled "Pitbull" and the dedicated Spitz rule in the next cell could never apply.
breed_value = col("Breed")
Alteracao8_mordidas = Alteracao7_mordidas.withColumn(
    "Breed",
    when(breed_value.rlike(r"\bpit"), "Pitbull").otherwise(breed_value))

In [13]:
# Step 9: map free-text breed descriptions onto canonical labels.
#
# BREED_RULES is an ordered list of (label, substrings). A row receives the label of the
# FIRST rule for which its (lower-cased) Breed contains any of the substrings; rows that
# match no rule keep their current Breed value.
#
# Order matters. Compared with the previous hand-written when-chain, the rules are in the
# same order except that patterns which could never fire - because an earlier, more
# general substring always matched first - now sit immediately before the rule that
# shadowed them:
#   "airedale terrier"                       before "terr"
#   "coonhound", "basset hound", "daschound" before "hound"
#   "border collie"                          before "collie"
#   "puggle"                                 before "pug"
# Exact duplicates (a second "hound" rule, repeated "breed" and "shpherd" entries) were
# dropped. Labels are unchanged, including their original spellings.
BREED_RULES = [
    ("Sharpei", ["sharpei", "shar-pei"]),
    ("Other", [
        "westie", "mouth cur", "lapso", "schichi", "mexican hairless", "water",
        "canario", "shorky", "persian", "miki", "lathese", "podengo", "markey",
        "refused", "portuguese", "blue tick hound", "marquis chinese tien yun chu",
        "turvuren", "brussel", "bernese", "wild dog", "large", "dogo", "dogue",
        "breed", "medium size dog", "basenji", "mutt", "hot dog", "big size dog",
        "westy whippet", "small", "australian herd dog",
    ]),
    ("Airedale Terrier", ["airedale terrier"]),  # hoisted: contains "terr"
    ("Terrier", ["terr", "terrier"]),
    ("Borzoi", ["borzoi"]),
    ("Briard", ["briard"]),
    ("Kelpie", ["kelpie"]),
    ("Morkie", ["morkie"]),
    ("Papillon", ["papillon"]),
    ("Schnoodle", ["schnoodle"]),
    ("Bloodhound", ["bloodhound"]),
    ("Lacy", ["lacy"]),
    ("Setter", ["setter"]),
    ("Cavapoo", ["cavapoo"]),
    ("Cavashu", ["cavashu"]),
    ("Griffon", ["griffon"]),
    ("Bouvier", ["bouvier"]),
    ("Carolina", ["carolina"]),
    ("Chiweenie", ["chiweenie"]),
    ("Chorkie", ["chorkie"]),
    ("Xolo", ["xolo"]),
    ("Dalmata", ["dalma"]),
    ("Coonhound", ["coonhound"]),                          # hoisted: contains "hound"
    ("Basset Hound", ["basset hound", "basset hound x"]),  # hoisted: contains "hound"
    ("Daschschund", ["daschound"]),                        # hoisted: contains "hound"
    ("Hound", ["hound"]),
    ("Havanese", ["havanese"]),
    ("Jindo", ["jindo"]),
    ("Potcake", ["potcake"]),
    ("Queensland Heeler", ["queensland"]),
    ("Weimaraner", ["weimaraner"]),
    ("Springer", ["springer"]),
    ("Doberman", ["doberman"]),
    ("Malinois", ["malinois"]),
    ("Malamute", ["malamute"]),
    ("Spitz", ["spitz"]),
    ("Spaniel", ["spaniel"]),
    ("Border Collie", ["border collie"]),  # hoisted: contains "collie"
    ("Collie", ["collie"]),
    ("Beauceron", ["beauceron"]),
    ("Charles Cavalier", ["charles"]),
    ("Crested", ["crested"]),
    ("Corso", ["corso"]),
    ("Pekingese", ["pekingese"]),
    ("Boerboel", ["boerboel"]),
    ("Great Dane", ["great dane"]),
    ("Akita", ["akita"]),
    ("Lhasa", ["lhasa"]),
    ("Catahoula", ["catahoula"]),
    ("Husky", ["husky"]),
    ("Klee Kai", ["klee kai"]),
    ("Labradoodle", ["labradoodle"]),
    ("Eskimo", ["eskimo"]),
    ("Arubian Cunucu", ["arubian cunucu"]),
    ("Puggle", ["puggle"]),  # hoisted: contains "pug"
    ("Pug", ["pug"]),
    ("Scotia", ["scotia"]),
    ("Zuchon", ["zuchon"]),
    ("Yorkipoo", ["yorkipoo"]),
    ("Vizsla", ["vizsla"]),
    ("Boxer", ["boxer"]),
    ("Schnauzer", ["schnauzer"]),
    ("Bichon", ["bichon"]),
    ("Formosan Mountain", ["formosan mountain dog"]),
    ("Multipoo", ["multi poo", "multipoo"]),
    ("Ridgeback", ["ridgeback", "ridge back"]),
    ("Coton de Tulear", ["coton de tulear", "cotton de tulear", "cotton de tullear"]),
    ("Pyrenees", ["pyrenees", "pyreenes"]),
    ("Sapsali", ["sapsal", "sapsali"]),
    ("Saint Bernard", ["saint bernard", "st bernard"]),
    ("Corgi", ["corgi", "corgie"]),
    ("Cockapoo", ["cock", "cook"]),
    ("Pointer", ["pointer", "pionter"]),
    ("Pomski", ["pomski", "pomsky"]),
    ("Pincher", ["pincher", "pinscher"]),
    ("Healer", ["healer", "heeler"]),
    ("Chow Chow", ["chow chow", "chow"]),
    ("Labrador", ["labrador", "lab"]),
    ("Beagle", ["beagle", "beasle"]),
    ("Chihuahua", ["chihuahua", "chi hua hua"]),
    ("Cattle", ["cattle", "cattledog"]),
    ("Maltese", ["maltese", "malti"]),
    ("Poodle", ["poodle", "poodlde"]),
    ("Border Collie", ["border"]),
    ("Mastiff", ["masif", "masiff", "mastiff"]),
    ("Daschschund", ["dachschund", "daschund", "daushand", "dachshund", "dachsund"]),
    ("Golden Doodle", ["golden doodle", "golden doddle", "doodle", "goldendoodle"]),
    ("Bulldog", [
        "bulldog", "frenchie", "bull dog", "buldog", "american bull", "bully",
        "american bully",
    ]),
    ("Staffordshire Terrier", ["staffordshire", "staff", "stafford"]),
    ("Shepherd", [
        "shepherd", "shpherd", "sheep", "shep", "aussie", "shephered", "sherphard",
        "shehperd", "sheherd", "shepard", "sheperd",
    ]),
    ("Shih Tzu", ["shih tzu", "shih poo", "schi-po", "schitzu", "shihtzu"]),
    # NOTE(review): "newfoundland" is mapped to "Unknown" here exactly as before; it looks
    # like a real breed name - confirm whether that mapping is intentional.
    ("Unknown", [
        "uknown", "not certain", "n/a", "newfoundland", "not sure, rescue", "not given",
        "unc", "uncertain", "unknown 2 dogs", "unsure", "unnkown", "unknown",
    ]),
    ("Rottweiler", ["rottweiler", "rotweiler", "rotweiller"]),
    ("Yorkie", ["yorkie", "yorkshire", "yorky"]),
    ("Shiba Inu", ["shiba inu", "inu", "shiba"]),
    ("Pomerian", ["pomeranian", "pomerian", "pom"]),
    ("Jack Russel", ["jack russell", "jsck", "jack russ"]),
    ("Golden Retriever", [
        "golden retriever", "retrever", "retriever", "retreiver", "golden retriver",
        "golden retr", "golden retreiver",
    ]),
    ("Mixed", ["mix", "mxied", "mixed", "cross"]),
]


def _breed_label_expr(breed_column, rules):
    """Build the first-match-wins label expression for a breed column.

    Args:
        breed_column: Spark Column holding the breed text to test.
        rules: ordered iterable of (label, substrings) pairs; a rule matches when the
            column contains at least one of its substrings.

    Returns:
        A Column that yields the label of the first matching rule, or the unchanged
        value of ``breed_column`` when no rule matches (also when ``rules`` is empty).
    """
    label_expr = None
    for label, patterns in rules:
        # OR together the substring tests that belong to this rule.
        rule_matches = None
        for pattern in patterns:
            hit = breed_column.contains(pattern)
            rule_matches = hit if rule_matches is None else (rule_matches | hit)
        if rule_matches is None:
            # A rule without patterns can never match; skip it.
            continue
        if label_expr is None:
            label_expr = when(rule_matches, label)
        else:
            label_expr = label_expr.when(rule_matches, label)
    if label_expr is None:
        return breed_column
    return label_expr.otherwise(breed_column)


Alteracao9_mordidas = Alteracao8_mordidas.withColumn(
    "Breed",
    _breed_label_expr(col("Breed"), BREED_RULES),
)

Alteracao9_mordidas.toPandas()

Unnamed: 0,Unique_ID,DateOfBite,Breed,Gender,SpayNeuter,Borough,ZipCode,Year
0,1,January 01 2018,Unknown,U,False,Brooklyn,11220,2018
1,2,January 04 2018,Unknown,U,False,Brooklyn,-1,2018
2,3,January 06 2018,Pitbull,U,False,Brooklyn,11224,2018
3,4,January 08 2018,Mixed,M,False,Brooklyn,11231,2018
4,5,January 09 2018,Pitbull,U,False,Brooklyn,11224,2018
...,...,...,...,...,...,...,...,...
22658,10276,December 24 2017,Chiweenie,M,True,Staten Island,10303,2017
22659,10277,December 24 2017,dunker,F,True,Staten Island,10303,2017
22660,10278,December 21 2017,Schnauzer,M,True,Staten Island,10312,2017
22661,10279,December 28 2017,Mixed,F,False,Staten Island,10308,2017


In [14]:
# Make sure the target database exists before touching its tables, so this cell also
# works on an environment where no other notebook has created it yet. The statement is
# a no-op when the database is already present.
# NOTE(review): the database location is taken from the parent directory of the table
# LOCATION used below - confirm it matches how Projeto was originally created.
spark.sql(
    """
    CREATE DATABASE IF NOT EXISTS Projeto
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db'
    """
)

# Remove any previous table definition so the CREATE below starts clean.
spark.sql(
    """
    DROP TABLE IF EXISTS Projeto.Mordidas_Caes
    """
)

# Register the Delta table, partitioned by Year and Borough, on top of the Silver
# directory that the following cell writes to.
spark.sql(
    """
    CREATE EXTERNAL TABLE Projeto.Mordidas_Caes (
    
        Unique_ID string,
        DateOfBite string,
        Breed string,
        Gender string,
        SpayNeuter boolean,
        ZipCode INT
        )
        USING DELTA
        PARTITIONED BY(
         Year INT,
         Borough string
         
    )
    
    
    
    
    LOCATION 'hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Mordidas_Caes'
    """
)

DataFrame[]

In [15]:
# Write the cleaned DataFrame as Delta files into the directory that the
# Projeto.Mordidas_Caes table definition points at, replacing what is there.
silver_columns = [
    "Unique_ID", "DateOfBite", "Breed", "Gender", "SpayNeuter", "Year", "Borough", "ZipCode",
]
partition_columns = ["Year", "Borough"]

delta_writer = (
    Alteracao9_mordidas
    .select(*silver_columns)
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy(*partition_columns)
)
delta_writer.save("hdfs://hdfs-nn:9000/Projeto/Silver/Projeto.db/Mordidas_Caes/")
