In [0]:
# Start-of-run marker: this bare string is the cell's only expression, so the
# notebook echoes it back as the cell output.
'''
Iniciando build_airport_dim
'''

'\nIniciando build_airport_dim\n'

Importando funções comuns para uso no notebook

In [0]:
%run
./shared_gold_functions

Inicializando a sessão Spark, definindo o schema da tabela final e importando os dados da tabela silver

In [0]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.sql import functions as F
import urllib

# Get (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("accidents_analysis")
    .getOrCreate()
)

# Declared shape of the final airport dimension: three non-nullable string
# columns. NOTE(review): `schema` is not referenced again in the cells visible
# here, so nothing in this notebook enforces it.
schema = StructType([
    StructField(column_name, StringType(), False)
    for column_name in ("airport_ICAO_code", "airport_IATA_code", "airport_name")
])

# Load the silver-layer accidents table straight from its Delta location.
accidents_silver_df = spark.read.load(
    'dbfs:/user/hive/warehouse/silver_database.db/silver_accidents',
    format="delta",
)

accidents_silver_df.display()

index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,other_fatalities,aircraft_damage,category,approx_location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday,nb_fatalities,nb_occupants,country_or_region,departure_airport_IATA,departure_airport_ICAO,destination_airport_IATA,destination_airport_ICAO
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,0,Substantial,Accident,"Erechim Airport (ERM), RS",Landing,Private,"Chapecó Airport, SC","Erechim-Comandante Kraemer Airport, RS",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday,0.0,5.0,Brazil,XAP,SBCH,ERM,SSER
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA",Approach,Private,"Fort Lauderdale International Airport, FL","Hot Springs-Ingalls Field, VA",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday,5.0,5.0,USA,FLL,KFLL,HSP,KHSP
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,0,"Destroyed, written off",Accident,near Kalongo Airstrip,Landing,Military,,,,"Information is only available from news, social media or unofficial sources",SC7,Friday,0.0,2.0,Uganda,,,,
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,0,Substantial,Accident,Vårgårda,Take off,Private,,,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday,0.0,5.0,Sweden,,,,
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,0,"Destroyed, written off",Accident,"near Montebello, VA",En route,Private,,"Islip-Long Island MacArthur Airport, NY",NTSB,Information verified through data from accident investigation authorities,C560,Sunday,4.0,4.0,USA,,,ISP,KISP
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ),Standing,Passenger - Scheduled,N'Djamena Airport,Abuja-Nnamdi Azikiwe International Airport,,Information verified through data from accident investigation authorities,A332,Friday,0.0,,Chad,NDJ,FTTJ,ABV,DNAA
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ)",Standing,Military,,,,"Information is only available from news, social media or unofficial sources",C130,Wednesday,0.0,,Canada,,,,
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,0,Unknown,Accident,E of Florida,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY",Bogotá-Eldorado Airport,Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday,0.0,,Atlantic Ocean,JFK,KJFK,BOG,SKBO
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,0,"Destroyed, written off",OT,Khartoum International Airport (KRT),Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday,0.0,0.0,Sudan,,,,
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport,Saint-Barthélémy-Rémy de Haenen Airport,BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday,0.0,6.0,,PTP,TFFR,SBH,TFFJ


## Processamento

Na etapa de processamento, vamos:
- Selecionar apenas as colunas da base silver_accidents que contenham os códigos (ICAO e IATA) e os nomes dos aeroportos
  - Primeiro selecionamos as colunas relacionadas a "departure"
  - Depois selecionamos as colunas relacionadas a "destination"
- Removemos as linhas em que todas as colunas selecionadas são nulas
- Pegamos os registros distintos
- Fazemos um "union" para juntar as duas listas ("departure" e "destination")
- Dropamos duplicatas


Tratando aeroportos de departure

In [0]:
# Departure side of each accident: keep the ICAO code, IATA code and airport
# name, discard rows where all three are null (how="all" keeps partially
# filled rows), then collapse repeated airports.
departure_airports_df = (
    accidents_silver_df
    .select(
        'departure_airport_ICAO',
        'departure_airport_IATA',
        'departure_airport',
    )
    .na.drop(how="all")
    .distinct()
)

departure_airports_df.display()

departure_airport_ICAO,departure_airport_IATA,departure_airport
YBRS,,"Barwon Heads Airport, VIC"
FZEA,MDK,Mbandaka Airport
HSSP,PZU,Port Sudan Airport
MMTG,TGZ,Tuxtla Gutiérrez-Francisco Sarabia National Airport
RJAA,NRT,Tokyo-Narita Airport
KLAN,LAN,"Lansing-Capital Region International Airport, MI"
KOSU,OSU,"Columbus-Ohio State University Airport, OH"
CYYC,YYC,"Calgary International Airport, AB"
KFLL,FLL,"Fort Lauderdale International Airport, FL"
MYES,TYM,Staniel Cay Airport


Tratando aeroportos de destination

In [0]:
# Destination side of each accident: keep the ICAO code, IATA code and airport
# name, discard rows where all three are null (how="all" keeps partially
# filled rows), then collapse repeated airports.
destination_airports_df = (
    accidents_silver_df
    .select(
        'destination_airport_ICAO',
        'destination_airport_IATA',
        'destination_airport',
    )
    .na.drop(how="all")
    .distinct()
)

destination_airports_df.display()

destination_airport_ICAO,destination_airport_IATA,destination_airport
KISP,ISP,"Islip-Long Island MacArthur Airport, NY"
CYHZ,YHZ,"Halifax-Stanfield International Airport, NS"
YBRS,,"Barwon Heads Airport, VIC"
KPWA,PWA,"Oklahoma City-Wiley Post Airport, OK"
LEMD,MAD,Madrid-Barajas Adolfo Suárez Airport
RJAA,NRT,Tokyo-Narita Airport
CYYC,YYC,"Calgary International Airport, AB"
LTCC,DIY,Diyarbakir Airport
MMCB,CVJ,Cuernavaca Airport
NZAA,AKL,Auckland International Airport


Unindo as duas listas e eliminando duplicatas

In [0]:
# Stack the departure and destination airport lists and remove duplicates.
# union() matches columns by position, so the combined frame keeps the
# departure_* column names; those are then renamed to the dimension's names.
airports_df_dim = (
    departure_airports_df
    .union(destination_airports_df)
    .distinct()
    .withColumnRenamed('departure_airport_ICAO', 'airport_ICAO_code')
    .withColumnRenamed('departure_airport_IATA', 'airport_IATA_code')
    .withColumnRenamed('departure_airport', 'airport_name')
)

airports_df_dim.display()



airport_ICAO_code,airport_IATA_code,airport_name
YBRS,,"Barwon Heads Airport, VIC"
FZEA,MDK,Mbandaka Airport
HSSP,PZU,Port Sudan Airport
MMTG,TGZ,Tuxtla Gutiérrez-Francisco Sarabia National Airport
RJAA,NRT,Tokyo-Narita Airport
KLAN,LAN,"Lansing-Capital Region International Airport, MI"
KOSU,OSU,"Columbus-Ohio State University Airport, OH"
CYYC,YYC,"Calgary International Airport, AB"
KFLL,FLL,"Fort Lauderdale International Airport, FL"
MYES,TYM,Staniel Cay Airport


## Avaliando a qualidade dos dados
- Testar se os códigos IATA têm 3 caracteres
- Testar se os códigos ICAO têm 4 caracteres

In [0]:
# Data-quality checks on the airport dimension.
# NOTE(review): test_IATA_codes / test_ICAO_codes are not defined in this
# notebook; presumably they come from ./shared_gold_functions, loaded via %run
# in an earlier cell. Each is passed the dimension frame and the name of the
# column to check; what happens on a failed check is not visible from here.

test_IATA_codes(airports_df_dim,'airport_IATA_code')
test_ICAO_codes(airports_df_dim,'airport_ICAO_code')

Avaliando a condição airport_IATA_code contem 3 caracteres
Avaliando a condição airport_ICAO_code contem 4 caracteres


## Registrando dados no banco
Após aprovação, salvamos o dado sanitizado no banco e verificamos se tivemos sucesso

In [0]:
# Target database and table for the airport dimension.
database_name = 'gold_database'
table_name = 'gold_airport_dim'

# Create the database if it does not exist yet.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")

# Save the DataFrame as a Delta table, overwriting any previous contents,
# with the mergeSchema write option enabled.
(
    airports_df_dim.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{database_name}.{table_name}")
)

In [0]:
%sql
-- Read back the table that was just written to confirm the save succeeded.
SELECT *
FROM gold_database.gold_airport_dim

airport_ICAO_code,airport_IATA_code,airport_name
YBRS,,"Barwon Heads Airport, VIC"
FZEA,MDK,Mbandaka Airport
HSSP,PZU,Port Sudan Airport
MMTG,TGZ,Tuxtla Gutiérrez-Francisco Sarabia National Airport
RJAA,NRT,Tokyo-Narita Airport
KLAN,LAN,"Lansing-Capital Region International Airport, MI"
KOSU,OSU,"Columbus-Ohio State University Airport, OH"
CYYC,YYC,"Calgary International Airport, AB"
KFLL,FLL,"Fort Lauderdale International Airport, FL"
MYES,TYM,Staniel Cay Airport


In [0]:
# End-of-run marker: this bare string is the cell's only expression, so the
# notebook echoes it back as the cell output.
'''
Finalizando build_airport_dim
'''

'\nFinalizando build_airport_dim\n'