# Packages

In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, IntegerType, DoubleType, ByteType, ShortType, StringType, DecimalType, LongType, BinaryType
from pyspark.sql.functions import to_date
# Legger på nødvendige metadata og tar i bruk DASK sin metadata-validering  
from dask_felleskomponenter.governance.main import Metadata
from sedona.spark import *

# Constants

In [0]:
# Sub-folder for the AIS data; it is appended to the landing-zone prefix and the location prefix below.
BUCKET_FOLDER = "/AIS/"

# Config

In [0]:
%run ./config

{'catalog_name': 'land_techtroll_dev',
 'landing_zone_prefix': '/Volumes/land_techtroll_dev/external_dev/landing_zone',
 'location_prefix': '/Volumes/land_techtroll_dev/external_dev/static_data/cloudFiles',
 'static_data_prefix': '/Volumes/land_techtroll_dev/external_dev/static_data',
 'env': 'dev'}

# Drop table
Hvis man endrer strukturen eller partisjoneringen, er det greit å droppe tabellen for å unngå feilmeldinger. Dette går fint siden de statiske dataene som leses inn i databasen ikke endrer seg.

In [0]:

%sql
-- Remove any existing bronze.ais table so that a changed structure or partitioning
-- does not conflict with the old table definition when the table is rewritten.
-- NOTE(review): this cell runs before the `USE CATALOG` cell further down, so the
-- unqualified name resolves against whatever catalog is current at this point — confirm.
DROP TABLE IF EXISTS bronze.ais;


# Schema
Strukturerer dataene fra AIS slik at de blir lest og lagt riktig inn i databasen.

In [0]:
# Explicit schema for the AIS parquet files. Columns are added in file order and
# every column is declared nullable.
parquet_schema = (
    StructType()
    # Timestamp and vessel identity
    .add("date_time_utc", TimestampType(), True)
    .add("mmsi", IntegerType(), True)
    # Position
    .add("longitude", DoubleType(), True)
    .add("latitude", DoubleType(), True)
    # Navigational state
    .add("status", ByteType(), True)
    .add("course_over_ground", DoubleType(), True)
    .add("speed_over_ground", DoubleType(), True)
    .add("rate_of_turn", ShortType(), True)
    .add("maneuvre", ShortType(), True)
    # Static vessel information
    .add("imo", IntegerType(), True)
    .add("callsign", StringType(), True)
    .add("ship_name", StringType(), True)
    .add("ship_type", ByteType(), True)
    .add("length", ShortType(), True)
    .add("draught", DecimalType(10, 2), True)
    # Source / message class
    .add("data_source", StringType(), True)
    .add("ais_class", StringType(), True)
    # Spatial index cells and binary geometry
    .add("hex_7", LongType(), True)
    .add("hex_14", LongType(), True)
    .add("geometry", BinaryType(), True)
)

# Setup of catalog

In [0]:
# Switch to the catalog named in the notebook configuration
catalog_name = spark.conf.get("conf.catalog_name")
spark.sql(f"USE CATALOG {catalog_name}")

# Create the bronze schema if it is missing, then switch to it
spark.sql("CREATE SCHEMA IF NOT EXISTS bronze")
spark.sql("USE SCHEMA bronze")

DataFrame[]

# Read bucket
Leser data som er lagt inn i bucket hos Google Cloud.

In [0]:
# Path to the landing zone, which is registered as an external volume
landing_zone = spark.conf.get("conf.landing_zone_prefix")

# Location for storing schema information and the record of files already processed
location_prefix = spark.conf.get("conf.location_prefix")
location = location_prefix + BUCKET_FOLDER

In [0]:
# Batch-read all parquet files currently in the AIS folder of the landing zone.
# This is a plain `spark.read` of parquet, not an Auto Loader stream: Auto Loader
# options (`cloudFiles.*`) only apply to `format('cloudFiles')` streams and are
# therefore not set here.
df = (spark.read
    .format('parquet')
    .schema(parquet_schema)  # Apply the explicit schema
    .option('mergeSchema', 'true')
    .load(landing_zone + BUCKET_FOLDER)
)

In [0]:
# Render the loaded DataFrame in the notebook
display(df)

date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
2024-12-30T17:59:38Z,231523000,8.48022833,57.84725666,0,96.3,7.1,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389538987351,AAAAAAFAIPXgfNfs5kBM7HLn/wJU
2024-12-30T17:59:28Z,231523000,8.47960166,57.84727,0,95.7,6.9,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389538971951,AAAAAAFAIPWOWUmdWEBM7HNX5nDj
2024-12-30T17:59:18Z,231523000,8.47892333,57.84731833,0,91.7,6.8,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389538990623,AAAAAAFAIPU1cE9IaEBM7HTtUlNf
2024-12-30T17:58:59Z,231523000,8.477845,57.84735333,0,97.4,7.4,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389539102879,AAAAAAFAIPSoGY8dP0BM7HYS7EDb
2024-12-30T17:58:39Z,231523000,8.47657333,57.84740833,0,96.6,7.0,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540066695,AAAAAAFAIPQBa2T93EBM7HfgS9qf
2024-12-30T17:58:18Z,231523000,8.47523666,57.84751,0,93.3,7.1,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540070055,AAAAAAFAIPNSODEe3EBM7Hs1KoQ4
2024-12-30T17:57:59Z,231523000,8.47410666,57.84755,0,99.4,6.9,0,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389539965775,AAAAAAFAIPK+G68IFkBM7HyEtdzG
2024-12-30T17:57:39Z,231523000,8.47288333,57.84762166,0,94.4,7.1,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389539973327,AAAAAAFAIPIdw4pY30BM7H7d1opm
2024-12-30T17:57:28Z,231523000,8.47220666,57.84765166,0,94.2,7.2,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540096631,AAAAAAFAIPHFEkNOiUBM7H/ZfwzQ
2024-12-30T17:57:18Z,231523000,8.47151666,57.84767666,0,95.2,7.1,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540089207,AAAAAAFAIPFqobRwNUBM7ICrNiQp


In [0]:
# Show ten rows to check that the data was loaded correctly
display(df.limit(10))

date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
2024-12-30T17:59:38Z,231523000,8.48022833,57.84725666,0,96.3,7.1,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389538987351,AAAAAAFAIPXgfNfs5kBM7HLn/wJU
2024-12-30T17:59:28Z,231523000,8.47960166,57.84727,0,95.7,6.9,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389538971951,AAAAAAFAIPWOWUmdWEBM7HNX5nDj
2024-12-30T17:59:18Z,231523000,8.47892333,57.84731833,0,91.7,6.8,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389538990623,AAAAAAFAIPU1cE9IaEBM7HTtUlNf
2024-12-30T17:58:59Z,231523000,8.477845,57.84735333,0,97.4,7.4,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389539102879,AAAAAAFAIPSoGY8dP0BM7HYS7EDb
2024-12-30T17:58:39Z,231523000,8.47657333,57.84740833,0,96.6,7.0,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540066695,AAAAAAFAIPQBa2T93EBM7HfgS9qf
2024-12-30T17:58:18Z,231523000,8.47523666,57.84751,0,93.3,7.1,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540070055,AAAAAAFAIPNSODEe3EBM7Hs1KoQ4
2024-12-30T17:57:59Z,231523000,8.47410666,57.84755,0,99.4,6.9,0,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389539965775,AAAAAAFAIPK+G68IFkBM7HyEtdzG
2024-12-30T17:57:39Z,231523000,8.47288333,57.84762166,0,94.4,7.1,127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389539973327,AAAAAAFAIPIdw4pY30BM7H7d1opm
2024-12-30T17:57:28Z,231523000,8.47220666,57.84765166,0,94.2,7.2,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540096631,AAAAAAFAIPHFEkNOiUBM7H/ZfwzQ
2024-12-30T17:57:18Z,231523000,8.47151666,57.84767666,0,95.2,7.1,-127,0,8417259,OZ2063,EIKEFJORD,70,64,3.5,G,A,608155192159895551,639680389540089207,AAAAAAFAIPFqobRwNUBM7ICrNiQp


# Setup for database (delta lake)
Oppsettet følger medaljongarkitekturen. Koden støtter hovedsakelig bare bronsedata, altså de ubehandlede dataene som leses inn fra bucket til databasen. Det kan vurderes å lage en sølvtabell senere.

Relevante lenker:
- https://www.databricks.com/glossary/medallion-architecture


In [0]:

# Derive a date column from the timestamp so the table can be partitioned on it
df = df.withColumn("date", to_date("date_time_utc"))

# Overwrite the bronze table, partitioned by the derived date
(
    df.write
    .mode("overwrite")
    .partitionBy("date")
    .saveAsTable("bronze.ais")
)

# Setup for metadata
Setter tags for å beskrive tabellen som blir laget.

In [0]:
%sql
-- Tag and describe the AIS table written by this notebook.
-- The table is addressed as bronze.ais, the same name used when it is dropped and
-- written, so the statements follow the configured catalog instead of a hardcoded one.
ALTER TABLE bronze.ais SET TAGS ( 'medaljongnivaa' = 'bronze');
COMMENT ON TABLE bronze.ais IS 'ais data ifra kystverket';
ALTER TABLE bronze.ais SET TAGS ( 'tilgangsnivaa' = 'Ugradert');

In [0]:
# Check that the AIS table written by this notebook satisfies the requirements
# for bronze products.
catalog_name = spark.conf.get('conf.catalog_name')
metadata = Metadata(catalog=catalog_name, schema='bronze', table='ais')
metadata.validate()

[]