# Delta Lake fundamentals

This notebook was built to run on the following Docker image: `jupyter/pyspark-notebook:spark-3.3.1`

## Connect to Spark

In [1]:
!pip install delta-spark



In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType

from delta.pip_utils import configure_spark_with_delta_pip

# Build the session in one expression so that `spark` only ever refers to the
# SparkSession itself, never to the intermediate builder.
# The builder points at the standalone master and enables the Delta SQL extension
# and the Delta catalog; per the delta-spark docs, configure_spark_with_delta_pip
# then attaches the Delta Lake package matching the pip-installed version.
spark = configure_spark_with_delta_pip(
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("DeltaLakeFundamentals")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
).getOrCreate()

## Delta Lake fundamentals

### 1. Create a Delta Table

Let's load the data

In [2]:
# Schema for the accidents CSV: one nullable field per column, listed in file order
# as (column name, Spark type) pairs.
SCHEMA = StructType(
    [
        StructField(column, spark_type(), True)
        for column, spark_type in (
            ("id", StringType),
            ("data_inversa", StringType),
            ("dia_semana", StringType),
            ("horario", StringType),
            ("uf", StringType),
            ("br", StringType),
            ("km", StringType),
            ("municipio", StringType),
            ("causa_acidente", StringType),
            ("tipo_acidente", StringType),
            ("classificacao_acidente", StringType),
            ("fase_dia", StringType),
            ("sentido_via", StringType),
            ("condicao_metereologica", StringType),
            ("tipo_pista", StringType),
            ("tracado_via", StringType),
            ("uso_solo", StringType),
            ("pessoas", IntegerType),
            ("mortos", IntegerType),
            ("feridos_leves", IntegerType),
            ("feridos_graves", IntegerType),
            ("ilesos", IntegerType),
            ("ignorados", IntegerType),
            ("feridos", IntegerType),
            ("veiculos", StringType),
            ("latitude", DoubleType),
            ("longitude", DoubleType),
            ("regional", StringType),
            ("delegacia", StringType),
            ("uop", StringType),
        )
    ]
)

In [3]:
# Load the 2020 accidents CSV: semicolon-delimited, ISO-8859-1 encoded, with a header row.
# The schema has to be passed through .schema(); "schema" is not a CSV reader option,
# so .option("schema", SCHEMA) was silently ignored and every column was read as a string.
# NOTE(review): with an explicit schema, values that cannot be parsed as the declared
# type are loaded as null in the reader's default (permissive) mode - confirm that the
# numeric columns (e.g. latitude/longitude) parse as expected for this file.
df_acidentes = (
    spark
    .read.format("csv")
    .schema(SCHEMA)
    .option("delimiter", ";")
    .option("header", "true")
    .option("encoding", "ISO-8859-1")
    .load("/data/acidentes/datatran2020.csv")
)

# Preview the first rows to check the load.
df_acidentes.show(5)

+------+------------+------------+--------+---+---+-----+--------------------+--------------------+--------------------+----------------------+---------+-----------+----------------------+----------+-----------+--------+-------+------+-------------+--------------+------+---------+-------+--------+------------+------------+--------+---------+--------------+
|    id|data_inversa|  dia_semana| horario| uf| br|   km|           municipio|      causa_acidente|       tipo_acidente|classificacao_acidente| fase_dia|sentido_via|condicao_metereologica|tipo_pista|tracado_via|uso_solo|pessoas|mortos|feridos_leves|feridos_graves|ilesos|ignorados|feridos|veiculos|    latitude|   longitude|regional|delegacia|           uop|
+------+------------+------------+--------+---+---+-----+--------------------+--------------------+--------------------+----------------------+---------+-----------+----------------------+----------+-----------+--------+-------+------+-------------+--------------+------+---------+-

Writing a Delta table is simple:

In [6]:
# Save the DataFrame in Delta format at the target path; "overwrite" mode replaces
# whatever is already stored there.
(
    df_acidentes
    .write
    .format("delta")
    .mode("overwrite")
    .save("/data/delta/acidentes/")
)