In [0]:
'''
Iniciando clean_bronze_data_accidents
'''

'\nIniciando clean_bronze_data_accidents\n'

Importando funções comuns para uso no notebook

In [0]:
%run
./shared_silver_functions

Inicializando sessão spark e importando dados do banco bronze

In [0]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import functions as F
import urllib

# Inicializa a sessão Spark
spark = SparkSession.builder.appName("accidents_analysis").getOrCreate()

# Pega os dados bronze
spark.catalog.listDatabases()
accidents_bronze_df = spark.read.format("delta").load('dbfs:/user/hive/warehouse/bronze_database.db/bronze_accidents')

accidents_bronze_df.display()

index,date,time,type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,type_code
305652,Sunday 1 January 2023,07:00,Boeing 737 MAX 8,Flair Airlines,C-FLRS,61808/7870,2022.0,CFMI LEAP 1B27,,,Fatalities: 0 / Occupants: 154,0,Substantial,Accident,"Kitchener-Region of Waterloo International Airport, ON (YKF/CYKF) - Canada",Standing,Passenger - Scheduled,"Kitchener-Region of Waterloo International Airport, ON (YKF/CYKF)",Cancún Airport (CUN/MMUN),TSB,Information verified through data from accident investigation authorities,B38M
305778,Monday 2 January 2023,23:25 UTC,Bombardier CRJ-900LR,"Endeavor Air, opf Delta Connection",N928XJ,15190,2008.0,,,,Fatalities: 0 / Occupants:,0,Minor,Accident,"New York-John F. Kennedy International Airport, NY (JFK/KJFK) - United States of America",Taxi,Passenger - Scheduled,"Detroit-Metropolitan Wayne County Airport, MI (DTW/KDTW)","New York-John F. Kennedy International Airport, NY (JFK/KJFK)",NTSB,"Information is only available from news, social media or unofficial sources",CRJ9
305779,Monday 2 January 2023,23:25 UTC,Airbus A330-202,ITA Airways,EI-EJM,1308,2012.0,GE CF6-80E1A4,,,Fatalities: 0 / Occupants:,0,Minor,Accident,"New York-John F. Kennedy International Airport, NY (JFK/KJFK) - United States of America",Taxi,Passenger - Scheduled,Roma-Fiumicino Airport (FCO/LIRF),"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",NTSB,"Information is only available from news, social media or unofficial sources",A332
314139,Monday 2 January 2023,21:40,Boeing 737-8K2 (WL),Transavia Airlines,PH-HXJ,62159/6363,2017.0,,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,Rotterdam/The Hague Airport (RTM/EHRD) - Netherlands,Standing,Passenger - Scheduled,Roma-Fiumicino Airport (FCO/LIRF),Rotterdam/The Hague Airport (RTM/EHRD),Dutch Safety Board,Accident investigation report completed and information captured,B738
318710,Monday 2 January 2023,11:35,Embraer EMB-505 Phenom 300,Eagle Jet 300 LLC,N555NR,50500327,,Pratt & Whitney Canada PW545E,,,Fatalities: 1 / Occupants: 4,0,"Destroyed, written off",Accident,"Provo Airport, UT (PVU) - United States of America",Take off,Passenger - Non-Scheduled/charter/Air Taxi,"Provo Airport, UT (PVU/KPVU)","Chino Airport, CA (CNO/KCNO)",NTSB,"Information is only available from news, social media or unofficial sources",E55P
305729,Tuesday 3 January 2023,12:30,Boeing 767-300F,"Amerijet International, lst Maersk Air Cargo",N496MM,67027/1272,2022.0,,,,Fatalities: 0 / Occupants: 3,0,Minor,Accident,near Seoul-Incheon International Airport (ICN/RKSI) - South Korea,Initial climb,Cargo,"Seoul-Incheon International Airport (ICN/RKSI), South Korea","Anchorage-Ted Stevens International Airport, AK (ANC/PANC)",,Information verified through data from accident investigation authorities,B763
305630,Saturday 7 January 2023,07:18 LT,Airbus A320-232 (WL),JetStar Japan,JA14JJ,5695,2013.0,IAE V2527-A5,,,Fatalities: 0 / Occupants: 142,0,,Accident,Nagoya/Chubu Centrair International Airport (NGO/RJGG) - Japan,En route,Passenger - Scheduled,Tokyo-Narita Airport (NRT/RJAA),Fukuoka Airport (FUK/RJFF),JTSB,Information verified through data from accident investigation authorities,A320
307267,Saturday 7 January 2023,,Boeing 737-846 (WL),Japan Airlines,JA307J,35336/2450,2007.0,,,,Fatalities: 0 / Occupants: 140,0,,Accident,E of Miyazaki - Japan,En route,Passenger - Scheduled,Tokyo-Haneda Airport (HND/RJTT),Miyazaki Airport (KMI/RJFM),JTSB,Information verified through data from accident investigation authorities,B738
318709,Monday 9 January 2023,14:30,Antonov An-2R,Naryan-Mar Air Enterprise,RA-71165,1G200-08,1983.0,Shvetsov ASh-62IR,12705 hours,,Fatalities: 2 / Occupants: 12,0,"Destroyed, written off",Accident,10 km SW of Karatayka - Russia,En route,Passenger - Scheduled,Ust-Kara Airstrip,Karatayka Airport (ULDT),MAK,Information verified through data from accident investigation authorities,AN2
318708,Wednesday 11 January 2023,,Antonov An-2R,Stavropol City Aviation Sports Club,RA-17789,1G203-50,1983.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,"near Novomaryevskaya, Stavropol Region - Russia",Standing,-,-,-,,"Information is only available from news, social media or unofficial sources",AN2


## Processamento
Seção para realizar a limpeza padrão de dados:
- Fazer o trim para remover espaços antes e ao final dos valores
- Transformar valores vazios ou sem caracteres em null
- Dropar duplicatas

In [0]:
from functools import reduce

# Lista de colunas que desejamos fazer o trim
all_columns = accidents_bronze_df.columns


# Função lambda para aplicar o trim em mais de uma coluna
accidents_bronze_df = reduce(
    lambda df, col: df.withColumn(col, F.trim(df[col])),
    all_columns,
    accidents_bronze_df
)

# Valores vazios transformar em null

# Função lambda para deixar como null colunas que não possuam nem texto, nem números, ou seja que tenham apenas caracteres especiais
accidents_bronze_df = reduce(
    lambda df, col: df.withColumn(col, F.when(F.regexp_like(col,F.lit('^[^\w]*$')), None).otherwise(F.col(col))),
    all_columns,
    accidents_bronze_df
)

# Função lambda para substituir valores vazios por null em uma coluna
accidents_bronze_df = reduce(
    lambda df, col: df.withColumn(col, F.when(F.col(col) == "", None).otherwise(F.col(col))),
    all_columns,
    accidents_bronze_df
)

# Drop em duplicatas
accidents_bronze_df = accidents_bronze_df.dropDuplicates()

accidents_bronze_df.display()


index,date,time,type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,type_code
386881,Tuesday 23 April 2024,c. 13:45,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75
354203,Sunday 10 March 2024,14:52,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,Fatalities: 5 / Occupants: 5,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR
318650,Friday 19 May 2023,ca 11,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,Fatalities: 0 / Occupants: 2,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7
318643,Saturday 8 July 2023,c. 15,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2
318648,Sunday 4 June 2023,c. 15,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,Fatalities: 4 / Occupants: 4,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560
389190,Friday 31 May 2024,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332
343377,Wednesday 9 August 2023,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130
351839,Sunday 18 February 2024,20:09 UTC,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763
318674,Monday 17 April 2023,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12
344872,Thursday 24 August 2023,15:42 UTC,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,Fatalities: 0 / Occupants: 6,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6


### Limpeza da Coluna por tipo
Agora vamos limpar cada uma das colunas conforme o seu tipo 
- Começando pela coluna __index__, onde vamos remover as linhas nulas e deixar o tipo como inteiro

In [0]:
# Limpa cada coluna por tipo

# Avaliando index

accidents_bronze_df.select('index').summary().show()

# Removendo as linhas onde 'index' é null
accidents_bronze_df = accidents_bronze_df.filter(accidents_bronze_df["index"].isNotNull())

# Convertendo para int
accidents_bronze_df = accidents_bronze_df.withColumn("index", accidents_bronze_df["index"].cast("int"))

accidents_bronze_df.select('index').summary().show()

accidents_bronze_df.display()

# Salva no banco Silver

+-------+------------------+
|summary|             index|
+-------+------------------+
|  count|               263|
|   mean| 337185.9087452472|
| stddev|22267.527254840756|
|    min|            305630|
|    25%|          318658.0|
|    50%|          344883.0|
|    75%|          349611.0|
|    max|            389861|
+-------+------------------+

+-------+------------------+
|summary|             index|
+-------+------------------+
|  count|               263|
|   mean| 337185.9087452472|
| stddev|22267.527254840756|
|    min|            305630|
|    25%|            318658|
|    50%|            344883|
|    75%|            349611|
|    max|            389861|
+-------+------------------+



index,date,time,type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,type_code
386881,Tuesday 23 April 2024,c. 13:45,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75
354203,Sunday 10 March 2024,14:52,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,Fatalities: 5 / Occupants: 5,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR
318650,Friday 19 May 2023,ca 11,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,Fatalities: 0 / Occupants: 2,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7
318643,Saturday 8 July 2023,c. 15,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2
318648,Sunday 4 June 2023,c. 15,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,Fatalities: 4 / Occupants: 4,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560
389190,Friday 31 May 2024,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332
343377,Wednesday 9 August 2023,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130
351839,Sunday 18 February 2024,20:09 UTC,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763
318674,Monday 17 April 2023,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12
344872,Thursday 24 August 2023,15:42 UTC,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,Fatalities: 0 / Occupants: 6,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6


Passando para a coluna __Date__ vamos:
- Separar a coluna em duas, uma para o dia da semana, em inglês
- Outra com a data no formato AAAA-MM-DD
- Precisamos verificar também quais são os valores distintos de dia de semana, para garantir que não haja duplicatas

In [0]:
# Avaliando a coluna Date
accidents_bronze_df.select('date').summary().show()

# Quebrando a coluna Date em Date e Weekday
accidents_bronze_df = accidents_bronze_df.withColumn("date", F.trim(accidents_bronze_df["date"]))

accidents_bronze_df = accidents_bronze_df.withColumn("weekday", F.split(accidents_bronze_df["date"]," ")[0])
accidents_bronze_df = accidents_bronze_df.withColumn("date", F.split(accidents_bronze_df["date"],"^\w+\s")[1])

accidents_bronze_df = accidents_bronze_df.withColumn("weekday", F.trim(accidents_bronze_df["weekday"]))
accidents_bronze_df = accidents_bronze_df.withColumn("date", F.trim(accidents_bronze_df["date"]))

# Verificando os valores distintos de weekday
distinct_weekday = accidents_bronze_df.select("weekday").distinct().collect()
for value in distinct_weekday:
    print(value)

# Transformando a date em um formato de data
accidents_bronze_df = accidents_bronze_df.withColumn("date", F.to_date(accidents_bronze_df["date"],'[d][dd] MMMM yyyy'))

accidents_bronze_df.select('date').summary().show()
accidents_bronze_df.display()

+-------+--------------------+
|summary|                date|
+-------+--------------------+
|  count|                 263|
|   mean|                NULL|
| stddev|                NULL|
|    min|Friday 1 Septembe...|
|    25%|                NULL|
|    50%|                NULL|
|    75%|                NULL|
|    max|Wednesday 9 Augus...|
+-------+--------------------+

Row(weekday='Wednesday')
Row(weekday='Tuesday')
Row(weekday='Friday')
Row(weekday='Thursday')
Row(weekday='Saturday')
Row(weekday='Monday')
Row(weekday='Sunday')
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    25%|
|    50%|
|    75%|
|    max|
+-------+



index,date,time,type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,type_code,weekday
386881,2024-04-23,c. 13:45,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday
354203,2024-03-10,14:52,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,Fatalities: 5 / Occupants: 5,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday
318650,2023-05-19,ca 11,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,Fatalities: 0 / Occupants: 2,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7,Friday
318643,2023-07-08,c. 15,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday
318648,2023-06-04,c. 15,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,Fatalities: 4 / Occupants: 4,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332,Friday
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130,Wednesday
351839,2024-02-18,20:09 UTC,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday
344872,2023-08-24,15:42 UTC,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,Fatalities: 0 / Occupants: 6,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday


Passando para a coluna __Time__ vamos:
- Remover os caracteres que aparecem antes e após os números
- Remover caracter intermediario entre os números (":")
- Salvar no padrão de 4 dígitos

In [0]:
# Tratando a coluna time
accidents_bronze_df.select('time').summary().show()


# Removendo tudo que venha antes ou depois dos números
accidents_bronze_df = accidents_bronze_df.withColumn("time", F.regexp_replace(accidents_bronze_df["time"],"^[^\d]*\D",""))
accidents_bronze_df = accidents_bronze_df.withColumn("time", F.reverse(accidents_bronze_df["time"]))
accidents_bronze_df = accidents_bronze_df.withColumn("time", F.regexp_replace(accidents_bronze_df["time"],"^[^\d]*\D",""))
accidents_bronze_df = accidents_bronze_df.withColumn("time", F.reverse(accidents_bronze_df["time"]))

# Removendo ':'
accidents_bronze_df = accidents_bronze_df.withColumn("time", F.trim(accidents_bronze_df["time"]))
accidents_bronze_df = accidents_bronze_df.withColumn("time", F.replace(accidents_bronze_df["time"],F.lit(":")))

# Deixando no padrao de 4 digitos
accidents_bronze_df = accidents_bronze_df.withColumn("time", F.rpad(F.col("time"), 4, "0"))

accidents_bronze_df.display()


+-------+--------+
|summary|    time|
+-------+--------+
|  count|     214|
|   mean|    NULL|
| stddev|    NULL|
|    min|00:03 LT|
|    25%|    NULL|
|    50%|    NULL|
|    75%|    NULL|
|    max|    noon|
+-------+--------+



index,date,time,type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,type_code,weekday
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,Fatalities: 5 / Occupants: 5,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,Fatalities: 0 / Occupants: 2,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7,Friday
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,Fatalities: 4 / Occupants: 4,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332,Friday
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130,Wednesday
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,Fatalities: 0 / Occupants: 6,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday


In [0]:
accidents_bronze_df.select('time').summary().show()


+-------+------------------+
|summary|              time|
+-------+------------------+
|  count|               214|
|   mean|1276.3177570093458|
| stddev| 555.6485800988221|
|    min|              0000|
|    25%|             900.0|
|    50%|            1250.0|
|    75%|            1706.0|
|    max|              2347|
+-------+------------------+



Passando para a coluna __Type__ vamos:
- Renomear para aircraft_type

In [0]:

# Renomeando type
accidents_bronze_df = accidents_bronze_df.withColumnRenamed("type","aircraft_type")

accidents_bronze_df.display()



index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,type_code,weekday
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,Fatalities: 5 / Occupants: 5,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,Fatalities: 0 / Occupants: 2,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7,Friday
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,Fatalities: 4 / Occupants: 4,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332,Friday
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130,Wednesday
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,Fatalities: 0 / Occupants: 6,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday


Passando para a coluna __Type Code__ vamos:
- Renomear para aircraft_type_code
- Verificar se seus valores estão dentro dos possíveis de aircraft_code em silver_aircrafts
- Valores que não estejam na lista permitida serão colocados como Null

In [0]:
# Renomeando type_code
accidents_bronze_df = accidents_bronze_df.withColumnRenamed("type_code","aircraft_type_code")

# Verificando se aircraft_type_code está dentro dos valores possíveis de aircraft_type_code. Caso contrário, colocar null

# Obter os valores distintos da coluna
aircrafts_df = spark.read.format("delta").load('dbfs:/user/hive/warehouse/silver_database.db/silver_aircrafts')
distinct_aircrafts_code = aircrafts_df.select("aircraft_code").distinct().collect()

# Converter os valores distintos para uma lista de strings
distinct_values_list = [row["aircraft_code"] for row in distinct_aircrafts_code]

# Identificar valores que não estão na lista permitida
accidents_with_invalid_aircraft_codes = accidents_bronze_df.filter(~F.col("aircraft_type_code").isin(distinct_values_list))

# Printar valores que não estão na lista permitida
print("Valores que não estão na lista permitida:")
accidents_with_invalid_aircraft_codes.display()

# Transformar valores não permitidos em null
accidents_bronze_df = accidents_bronze_df.withColumn("aircraft_type_code", F.when(F.col("aircraft_type_code").isin(distinct_values_list), F.col("aircraft_type_code")).otherwise(None))

accidents_bronze_df.display()

Valores que não estão na lista permitida:


index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday


index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,Fatalities: 5 / Occupants: 5,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,Fatalities: 0 / Occupants: 2,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7,Friday
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,Fatalities: 4 / Occupants: 4,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332,Friday
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130,Wednesday
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,Fatalities: 0 / Occupants: 6,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday


Passando para as colunas do tipo __Número Inteiro__: Total Airframe Hrs, Cycles, Other Fatalities e Year Of Manufacture vamos:

- Realizar a mudança de tipo (cast) de texto para int

In [0]:
# Tratando total_airframe_hrs, cycles, other_fatalities, year_of_manufacture

# Aplicar a função às colunas desejadas
columns_to_clean = ["total_airframe_hrs", "cycles", "other_fatalities","year_of_manufacture"]

for column in columns_to_clean:
    accidents_bronze_df = clean_and_cast_to_int(accidents_bronze_df, column)

accidents_bronze_df.display()



index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,fatalities,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,Fatalities: 5 / Occupants: 5,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,Fatalities: 0 / Occupants: 2,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7,Friday
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,Fatalities: 0 / Occupants: 5,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,Fatalities: 4 / Occupants: 4,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,Fatalities: 0 / Occupants:,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332,Friday
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130,Wednesday
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,Fatalities: 0 / Occupants:,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,Fatalities: 0 / Occupants: 0,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,Fatalities: 0 / Occupants: 6,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday


Passando para __Fatalities__ vamos:

- Separar os valores em duas colunas, 'nb_fatalities' para o número de vítimas e 'nb_occupants' para o número de passageiros

In [0]:
# Transformando a coluna fatalities em 2: uma com as vítimas e outras com o número de ocupantes

# Expressão regular para extrair os valores de Fatalities e Occupants
fatalities_expr = F.regexp_extract(accidents_bronze_df['fatalities'], 'Fatalities: (\d+)', 1)
occupants_expr = F.regexp_extract(accidents_bronze_df['fatalities'], 'Occupants: (\d+)', 1)

# Criar as novas colunas
accidents_bronze_df = accidents_bronze_df.withColumn('nb_fatalities', fatalities_expr.cast('int'))
accidents_bronze_df = accidents_bronze_df.withColumn('nb_occupants', occupants_expr.cast('int'))

# Drop em fatalities que se tornou obsoleta
accidents_bronze_df = accidents_bronze_df.drop('fatalities')

accidents_bronze_df.display()


index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday,nb_fatalities,nb_occupants
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,0,Substantial,Accident,"Erechim Airport (ERM), RS - Brazil",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday,0.0,5.0
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA - United States of America",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday,5.0,5.0
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,0,"Destroyed, written off",Accident,near Kalongo Airstrip - Uganda,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7,Friday,0.0,2.0
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,0,Substantial,Accident,Vårgårda - Sweden,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday,0.0,5.0
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,0,"Destroyed, written off",Accident,"near Montebello, VA - United States of America",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday,4.0,4.0
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ) - Chad,Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332,Friday,0.0,
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ) - Canada",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130,Wednesday,0.0,
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,0,Unknown,Accident,E of Florida - Atlantic Ocean,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday,0.0,
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,0,"Destroyed, written off",OT,Khartoum International Airport (KRT) - Sudan,Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday,0.0,0.0
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ) - Saint Barth�lemy,Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday,0.0,6.0


Passando para __Location__ vamos:

- Extrair o país ou região do acidente e salvar em "country_or_region"
- Verificar país ou região cujo nome não seja um dos permitidos em silver_countries
- Implementar algumas traduções para que os nomes dos países e regiões não conformes fiquem conformes (caso dos USA e do Congo)
- Os demais valores que permanecem inválidos serão colocados como null
- Renomear a coluna "location" para "approx_location", dado que agora ela não contém mais o país ou região, apenas o nome da localidade

In [0]:
# Extraindo a região ou país do local do acidente

# Invertendo a string de location para pegar o texto após o último "-"
accidents_bronze_df = accidents_bronze_df.withColumn("location", F.reverse(accidents_bronze_df["location"]))
accidents_bronze_df = accidents_bronze_df.withColumn("text_split", F.split(accidents_bronze_df["location"], "-",2))

# Remover o campo extraido de location, tirar o hifen ao final, reverter e limpar
accidents_bronze_df = accidents_bronze_df.withColumn("location", F.trim(accidents_bronze_df["text_split"][1]))
accidents_bronze_df = accidents_bronze_df.withColumn("location", F.reverse(accidents_bronze_df["location"]))
accidents_bronze_df = accidents_bronze_df.withColumn("location", F.trim(accidents_bronze_df["location"]))


# Atribuir a primeira parte a country_or_region e reverter e fazer trim
accidents_bronze_df = accidents_bronze_df.withColumn("country_or_region", F.trim(accidents_bronze_df["text_split"][0]))
accidents_bronze_df = accidents_bronze_df.withColumn("country_or_region", F.reverse(accidents_bronze_df["country_or_region"]))
accidents_bronze_df = accidents_bronze_df.withColumn("country_or_region", F.trim(accidents_bronze_df["country_or_region"]))

# Avaliar quais countries não estão na lista de países válidos com base em silver_countries

# Obter os valores distintos da coluna
countries_df = spark.read.format("delta").load('dbfs:/user/hive/warehouse/silver_database.db/silver_countries')
distinct_countries_code = countries_df.select("country_name").distinct().collect()

# Converter os valores distintos para uma lista de strings
distinct_values_list = [row["country_name"] for row in distinct_countries_code]

# Identificar valores que não estão na lista permitida
countries_with_invalid_country_names = accidents_bronze_df.filter(~F.col("country_or_region").isin(distinct_values_list))

# Printar valores que não estão na lista permitida
print("Valores que não estão na lista permitida:")
countries_with_invalid_country_names.display()

# Definindo um dicionário com traduções
countries_traductions = [
    ('United States of America','USA'),
    ('Congo (Democratic Republic)','Congo')
]

# Traduzir os valores do dicionário do primeiro valor para o segundo valor
for old_value, new_value in countries_traductions:
    accidents_bronze_df = accidents_bronze_df.withColumn(
        "country_or_region",
        F.when(F.col("country_or_region") == old_value, new_value).otherwise(F.col("country_or_region"))
    )

# Transformar valores não permitidos em null
accidents_bronze_df = accidents_bronze_df.withColumn("country_or_region", F.when(F.col("country_or_region").isin(distinct_values_list), F.col("country_or_region")).otherwise(None))

# Renomeando a coluna de localização para approx_location
accidents_bronze_df = accidents_bronze_df.withColumnRenamed("location","approx_location")

# Drop em text_split
accidents_bronze_df = accidents_bronze_df.drop('text_split')

accidents_bronze_df.display()



Valores que não estão na lista permitida:


index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,other_fatalities,aircraft_damage,category,location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday,nb_fatalities,nb_occupants,text_split,country_or_region
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday,5,5.0,"List(aciremA fo setatS detinU , AV ,sgnirpS toH ,)PSHK( tropriA dleiF sllagnI raen)",United States of America
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,0,"Destroyed, written off",Accident,"near Montebello, VA",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday,4,4.0,"List(aciremA fo setatS detinU , AV ,ollebetnoM raen)",United States of America
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday,0,6.0,"List(ymel�htraB tniaS , )JFFT/HBS( tropriA neneaH ed yméR-yméléhtraB-tniaS)",Saint Barth�lemy
318645,2023-06-28,858.0,Boeing 717-2BD,Delta Air Lines,N955AT,55017/5040,2000.0,BMW RR BR715,,,0,Substantial,Accident,"Charlotte-Douglas International Airport, NC (CLT/KCLT)",Landing,Passenger - Scheduled,"Atlanta Hartsfield-Jackson International Airport, GA (ATL/KATL)","Charlotte-Douglas International Airport, NC (CLT/KCLT)",NTSB,Information verified through data from accident investigation authorities,B712,Wednesday,0,104.0,"List(aciremA fo setatS detinU , )TLCK/TLC( CN ,tropriA lanoitanretnI salguoD-ettolrahC)",United States of America
348182,2023-11-20,1359.0,Boeing P-8A Poseidon,US Navy (USN),169561,66094/8026,2020.0,CFMI CFM56-7B27E,,,0,Substantial,Accident,"Kaneohe Bay MCAS (Marion E. Carl Field) (NGF/PHNG), Kaneohe, HI",Landing,Military,,"Kaneohe Bay MCAS (Marion E. Carl Field) Airport, HI (NGF/PHNG)",,"Information is only available from news, social media or unofficial sources",P8,Monday,0,9.0,"List(aciremA fo setatS detinU , IH ,ehoenaK ,)GNHP/FGN( )dleiF lraC .E noiraM( SACM yaB ehoenaK)",United States of America
345513,2023-07-03,1350.0,Cessna 208B Supervan 900,Arne Aviation LLC,N716MM,208B0746,1999.0,Honeywell TPE331-12JR,9519.0,,0,Substantial,Accident,"Suffolk Executive Airport, VA (KSFQ)",Landing,Parachuting,"Suffolk Executive Airport, VA (KSFQ)","Suffolk Executive Airport, VA (KSFQ)",NTSB,Accident investigation report completed and information captured,C208,Monday,0,1.0,"List(aciremA fo setatS detinU , )QFSK( AV ,tropriA evitucexE kloffuS)",United States of America
308996,2023-03-05,1809.0,Honda HA-420 HondaJet Elite,Nesama LLC dba Volato,N118CX,42000231,2022.0,General Electric HF-120,486.0,,0,Substantial,Accident,"Buena Vista-Central Colorado Regional Airport (AEJ/KAEJ), CO",Landing,Private,"Monterey Regional Airport, CA (MRY/KMRY)","Buena Vista-Central Colorado Regional Airport, CO (KAEJ)",NTSB,Accident investigation report completed and information captured,HDJT,Sunday,0,2.0,"List(aciremA fo setatS detinU , OC ,)JEAK/JEA( tropriA lanoigeR odaroloC lartneC-atsiV aneuB)",United States of America
343043,2023-07-29,1034.0,Boeing 767-322ER (WL),United Airlines,N641UA,25091/360,1991.0,,,,0,Substantial,Accident,"Houston-George Bush Intercontinental Airport, TX (IAH/KIAH)",Landing,Passenger - Scheduled,"Newark-Liberty International Airport, NJ (EWR/KEWR)","Houston-George Bush Intercontinental Airport, TX (IAH/KIAH)",NTSB,Information verified through data from accident investigation authorities,B763,Saturday,0,202.0,"List(aciremA fo setatS detinU , )HAIK/HAI( XT ,tropriA latnenitnocretnI hsuB egroeG-notsuoH)",United States of America
318699,2023-02-08,1000.0,Cirrus SF50 Vision Jet G2,"Mag Aviation, LLC",N426SJ,0148,2019.0,Williams International FJ33-5A,680.0,,0,Substantial,Accident,"Waukesha Airport, WI (UES)",Landing,Private,"Waukesha Airport, WI (UES/KUES)","Waukesha Airport, WI (UES/KUES)",NTSB,Accident investigation report completed and information captured,SF50,Wednesday,0,2.0,"List(aciremA fo setatS detinU , )SEU( IW ,tropriA ahsekuaW)",United States of America
351079,2024-01-20,,Cessna 208B Grand Caravan,IMI,5Y-SPZ,,,,,,0,Substantial,Accident,Kasese Airport,Landing,Passenger,Goma Airport (GOM/FZNA),Kasese Airport,,"Information is only available from news, social media or unofficial sources",C208,Saturday,0,3.0,"List()cilbupeR citarcomeD( ognoC , tropriA esesaK)",Congo (Democratic Republic)


index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,other_fatalities,aircraft_damage,category,approx_location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday,nb_fatalities,nb_occupants,country_or_region
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,0,Substantial,Accident,"Erechim Airport (ERM), RS",Landing,Private,"Chapecó Airport, SC (XAP/SBCH)","Erechim-Comandante Kraemer Airport, RS (ERM/SSER)",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday,0.0,5.0,Brazil
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA",Approach,Private,"Fort Lauderdale International Airport, FL (FLL/KFLL)","Hot Springs-Ingalls Field, VA (HSP/KHSP)",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday,5.0,5.0,USA
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,0,"Destroyed, written off",Accident,near Kalongo Airstrip,Landing,Military,Nakasongola Air Base,Kalongo Airstrip,,"Information is only available from news, social media or unofficial sources",SC7,Friday,0.0,2.0,Uganda
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,0,Substantial,Accident,Vårgårda,Take off,Private,,Kattleberg Airport,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday,0.0,5.0,Sweden
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,0,"Destroyed, written off",Accident,"near Montebello, VA",En route,Private,"Elizabethton Municipal Airport, TN","Islip-Long Island MacArthur Airport, NY (ISP/KISP)",NTSB,Information verified through data from accident investigation authorities,C560,Sunday,4.0,4.0,USA
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ),Standing,Passenger - Scheduled,N'Djamena Airport (NDJ/FTTJ),Abuja-Nnamdi Azikiwe International Airport (ABV/DNAA),,Information verified through data from accident investigation authorities,A332,Friday,0.0,,Chad
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ)",Standing,Military,"CFB Comox, BC",,,"Information is only available from news, social media or unofficial sources",C130,Wednesday,0.0,,Canada
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,0,Unknown,Accident,E of Florida,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY (JFK/KJFK)",Bogotá-Eldorado Airport (BOG/SKBO),Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday,0.0,,Atlantic Ocean
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,0,"Destroyed, written off",OT,Khartoum International Airport (KRT),Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday,0.0,0.0,Sudan
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport (PTP/TFFR),Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday,0.0,6.0,


Passando para __Departure Airport e Destination Airport__ vamos:

- Extrair os códigos dos aeroportos e salvar nas colunas departure_airport_ICAO, departure_airport_IATA, destination_airport_ICAO ou destination_airport_IATA
- Códigos podem aparecer no formato IATA (3 letras) ou ICAO (4 letras) ou ambos (separados por uma /)

In [0]:
from pyspark.sql.types import StringType, ArrayType


# Extraindo código do aeroporto de origem e do aeroporto de destino, quando houver

airport_cols = ['departure_airport','destination_airport']

# Alocanco o código IATA (3) e ICAO (4) em suas respectivas colunas
@udf(returnType=StringType())
def extrair_codigo(array, tamanho):
    print(array)
    if array is None:
        return None
    for elemento in array:
        if len(elemento) == tamanho:
            return elemento
    return None

for col in airport_cols:

    extracted_codes_col = "extracted_codes_dep_" + col
    extracted_codes_list_col = "extracted_codes_dep_list_" + col

    # Procurando por códigos ICAO e IATA dentro de parentesis no aeroporto
    accidents_bronze_df = accidents_bronze_df.withColumn(extracted_codes_col, F.regexp_substr(accidents_bronze_df[col],F.lit('\(\/?([A-Z]{3,4})\/?(?:\/[A-Z]{3,4})?\)')))

    # Removendo a extração do campo original
    accidents_bronze_df = accidents_bronze_df.withColumn(col, F.replace(accidents_bronze_df[col],accidents_bronze_df[extracted_codes_col]))
    # Limpando o campo original
    accidents_bronze_df = accidents_bronze_df.withColumn(col, F.trim(accidents_bronze_df[col]))

    # Limpando a extração
    accidents_bronze_df = accidents_bronze_df.withColumn(extracted_codes_col, F.regexp_replace(accidents_bronze_df[extracted_codes_col],"\(?\)?",""))
    # Quebrando o código em 2 quando há os 2
    accidents_bronze_df = accidents_bronze_df.withColumn(extracted_codes_list_col, F.split(accidents_bronze_df[extracted_codes_col],'/'))

    

    # Aplicar a função para extrair elementos de tamanho 3 e 4
    accidents_bronze_df = accidents_bronze_df.withColumn(col + "_IATA", extrair_codigo(accidents_bronze_df[extracted_codes_list_col], F.lit(3)))
    accidents_bronze_df = accidents_bronze_df.withColumn(col + "_ICAO", extrair_codigo(accidents_bronze_df[extracted_codes_list_col], F.lit(4)))

    # Dropando colunas intermediarias
    accidents_bronze_df = accidents_bronze_df.drop(extracted_codes_col, extracted_codes_list_col)

accidents_bronze_df.display()


index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,other_fatalities,aircraft_damage,category,approx_location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday,nb_fatalities,nb_occupants,country_or_region,departure_airport_IATA,departure_airport_ICAO,destination_airport_IATA,destination_airport_ICAO
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,0,Substantial,Accident,"Erechim Airport (ERM), RS",Landing,Private,"Chapecó Airport, SC","Erechim-Comandante Kraemer Airport, RS",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday,0.0,5.0,Brazil,XAP,SBCH,ERM,SSER
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA",Approach,Private,"Fort Lauderdale International Airport, FL","Hot Springs-Ingalls Field, VA",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday,5.0,5.0,USA,FLL,KFLL,HSP,KHSP
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,0,"Destroyed, written off",Accident,near Kalongo Airstrip,Landing,Military,,,,"Information is only available from news, social media or unofficial sources",SC7,Friday,0.0,2.0,Uganda,,,,
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,0,Substantial,Accident,Vårgårda,Take off,Private,,,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday,0.0,5.0,Sweden,,,,
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,0,"Destroyed, written off",Accident,"near Montebello, VA",En route,Private,,"Islip-Long Island MacArthur Airport, NY",NTSB,Information verified through data from accident investigation authorities,C560,Sunday,4.0,4.0,USA,,,ISP,KISP
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ),Standing,Passenger - Scheduled,N'Djamena Airport,Abuja-Nnamdi Azikiwe International Airport,,Information verified through data from accident investigation authorities,A332,Friday,0.0,,Chad,NDJ,FTTJ,ABV,DNAA
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ)",Standing,Military,,,,"Information is only available from news, social media or unofficial sources",C130,Wednesday,0.0,,Canada,,,,
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,0,Unknown,Accident,E of Florida,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY",Bogotá-Eldorado Airport,Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday,0.0,,Atlantic Ocean,JFK,KJFK,BOG,SKBO
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,0,"Destroyed, written off",OT,Khartoum International Airport (KRT),Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday,0.0,0.0,Sudan,,,,
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport,Saint-Barthélémy-Rémy de Haenen Airport,BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday,0.0,6.0,,PTP,TFFR,SBH,TFFJ


##Avaliando a qualidade dos dados
- Testar se não há nulos em index
- Testar se a maior data é menor que o dia atual (dado que não podem ter acidentes no futuro)
- Testar se a coluna tempo está com os valores entre 0000 e 2359 (dado que esses são os limites de um dia)
- Testar se o ano de fabricação de uma aeronave é menor ou igual ao ano atual ou então maior ou igual a 1903 (dado que não existia avião antes disso)
- Testar se os valores total_airframe_hrs, cycles, other_fatalities, nb_fatalities e nb_occupants são não negativos ou nulos (não podemos ter número negativo de vítimas, por exemplo)
- Testar se os valores de weekday estão entre os 7 valores aceitáveis
- Testar se os códigos IATA tem 3 caracteres
- Testar se os códigos ICAO tem 4 caracteres
- Testar se os números de ocupantes de um acidente é maior igual ao número de vítimas (dado que não podemos ter mais vítimas do que ocupantes)

In [0]:
# Testando a qualidade dos dados
test_col_not_null(accidents_bronze_df,"index")
test_biggest_date_before_current_date(accidents_bronze_df,"date")
test_value_range(accidents_bronze_df,"time",">= 0000 or time is null")
test_value_range(accidents_bronze_df,"time","<= 2359 or time is null")

import datetime as dt
current_year = dt.date.today().year
test_value_range(accidents_bronze_df,"year_of_manufacture",f"<= {current_year} or year_of_manufacture is null")
test_value_range(accidents_bronze_df,"year_of_manufacture",">= 1903 or year_of_manufacture is null")
test_value_range(accidents_bronze_df,"total_airframe_hrs","> 0 or total_airframe_hrs is null ")
test_value_range(accidents_bronze_df,"cycles","> 0 or cycles is null")
test_value_range(accidents_bronze_df,"other_fatalities",">= 0 or other_fatalities is null")
test_value_range(accidents_bronze_df,"nb_fatalities",">= 0 or nb_fatalities is null")
test_value_range(accidents_bronze_df,"nb_occupants",">= 0 or nb_occupants is null")
test_weekday(accidents_bronze_df,"weekday")
test_IATA_codes(accidents_bronze_df,"departure_airport_IATA")
test_IATA_codes(accidents_bronze_df,"destination_airport_IATA")
test_ICAO_codes(accidents_bronze_df,"departure_airport_ICAO")
test_ICAO_codes(accidents_bronze_df,"destination_airport_ICAO")
test_value_range(accidents_bronze_df,"nb_occupants",">= nb_fatalities or nb_occupants is null or nb_fatalities is null")





Avaliando a condição index não contem nulos
Avaliando a condição date menor que a data atual
Avaliando a condição time >= 0000 or time is null
Avaliando a condição time <= 2359 or time is null
Avaliando a condição year_of_manufacture <= 2024 or year_of_manufacture is null
Avaliando a condição year_of_manufacture >= 1903 or year_of_manufacture is null
Avaliando a condição total_airframe_hrs > 0 or total_airframe_hrs is null 
Avaliando a condição cycles > 0 or cycles is null
Avaliando a condição other_fatalities >= 0 or other_fatalities is null
Avaliando a condição nb_fatalities >= 0 or nb_fatalities is null
Avaliando a condição nb_occupants >= 0 or nb_occupants is null
Avaliando a condição weekday está dentre os valores aceitos para o dia de semana
Avaliando a condição departure_airport_IATA contem 3 caracteres
Avaliando a condição destination_airport_IATA contem 3 caracteres
Avaliando a condição departure_airport_ICAO contem 4 caracteres
Avaliando a condição destination_airport_ICAO co

## Registrando dados no banco
Após aprovação, salvamos o dado sanitizado no banco e verificamos se tivemos sucesso

In [0]:
# Cria o banco de dados se ele não existir
database_name = 'silver_database'
table_name = 'silver_accidents'


spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")

# Salva o DataFrame como tabela Delta
accidents_bronze_df.write.mode("overwrite").format("delta").option("mergeSchema", "true").saveAsTable(
    f"{database_name}.{table_name}"
)

In [0]:
%sql
SELECT * from silver_database.silver_accidents

index,date,time,aircraft_type,owner_operator,registration,msn,year_of_manufacture,engine_model,total_airframe_hrs,cycles,other_fatalities,aircraft_damage,category,approx_location,phase,nature,departure_airport,destination_airport,investigating_agency,confidence_rating,aircraft_type_code,weekday,nb_fatalities,nb_occupants,country_or_region,departure_airport_IATA,departure_airport_ICAO,destination_airport_IATA,destination_airport_ICAO
386881,2024-04-23,1345.0,Learjet 75,Cimed & Co SA,PP-DYB,45-565,,,,,0,Substantial,Accident,"Erechim Airport (ERM), RS",Landing,Private,"Chapecó Airport, SC","Erechim-Comandante Kraemer Airport, RS",CENIPA,"Information is only available from news, social media or unofficial sources",LJ75,Tuesday,0.0,5.0,Brazil,XAP,SBCH,ERM,SSER
354203,2024-03-10,1452.0,IAI 1125 Astra SP,SkyJet Elite,N1125A,051,1990.0,,,,0,Destroyed,Accident,"near Ingalls Field Airport (KHSP), Hot Springs, VA",Approach,Private,"Fort Lauderdale International Airport, FL","Hot Springs-Ingalls Field, VA",NTSB,Information verified through data from accident investigation authorities,ASTR,Sunday,5.0,5.0,USA,FLL,KFLL,HSP,KHSP
318650,2023-05-19,1100.0,Shorts SC.7 Skyvan 3A-100,Uganda Peoples Defence Force,AF-519,SH.1901,1972.0,,,,0,"Destroyed, written off",Accident,near Kalongo Airstrip,Landing,Military,,,,"Information is only available from news, social media or unofficial sources",SC7,Friday,0.0,2.0,Uganda,,,,
318643,2023-07-08,1500.0,Antonov An-2R,Aviatörsföreningen Antonov 2,SE-KCE,1G189-59,1981.0,Shvetsov ASh-62IR,,,0,Substantial,Accident,Vårgårda,Take off,Private,,,SHK,"Information is only available from news, social media or unofficial sources",AN2,Saturday,0.0,5.0,Sweden,,,,
318648,2023-06-04,1500.0,Cessna 560 Citation V,Encore Motors of Melbourne Inc,N611VG,560-0091,1990.0,Pratt & Whitney Canada JT15D-5,,,0,"Destroyed, written off",Accident,"near Montebello, VA",En route,Private,,"Islip-Long Island MacArthur Airport, NY",NTSB,Information verified through data from accident investigation authorities,C560,Sunday,4.0,4.0,USA,,,ISP,KISP
389190,2024-05-31,,Airbus A330-203,Air France,F-GZCL,519,2003.0,GE CF6-80E1A3,,,0,Substantial,Accident,N'Djamena Airport (NDJ/FTTJ),Standing,Passenger - Scheduled,N'Djamena Airport,Abuja-Nnamdi Azikiwe International Airport,,Information verified through data from accident investigation authorities,A332,Friday,0.0,,Chad,NDJ,FTTJ,ABV,DNAA
343377,2023-08-09,,Lockheed CC-130H Hercules,Royal Canadian Air Force (RCAF),130337,382-4584,,,,,0,Unknown,Accident,"Comox Airport, British Columbia. (YQQ/CYQQ)",Standing,Military,,,,"Information is only available from news, social media or unofficial sources",C130,Wednesday,0.0,,Canada,,,,
351839,2024-02-18,2009.0,Boeing 767-332ER (WL),Delta Air Lines,N176DZ,29697/745,1999.0,,,,0,Unknown,Accident,E of Florida,En route,Passenger - Scheduled,"New York-John F. Kennedy International Airport, NY",Bogotá-Eldorado Airport,Aerocivil,"Information is only available from news, social media or unofficial sources",B763,Sunday,0.0,,Atlantic Ocean,JFK,KJFK,BOG,SKBO
318674,2023-04-17,,Antonov An-12BK,Al Quwwat al-Jawwiya As-Sudaniya (Sudanese Air Force),9977,9346302,,,,,0,"Destroyed, written off",OT,Khartoum International Airport (KRT),Standing,Military,,,,"Information is only available from news, social media or unofficial sources",AN12,Monday,0.0,0.0,Sudan,,,,
344872,2023-08-24,1542.0,Viking Air DHC-6 Twin Otter 400,Air Antilles,F-OMYS,971,,,,,0,Substantial,Accident,Saint-Barthélémy-Rémy de Haenen Airport (SBH/TFFJ),Landing,Passenger - Scheduled,Pointe-à-Pitre-Le Raizet Airport,Saint-Barthélémy-Rémy de Haenen Airport,BEA,"Information is only available from news, social media or unofficial sources",DHC6,Thursday,0.0,6.0,,PTP,TFFR,SBH,TFFJ
