## Configuração de Env e Sparks

In [14]:
import os
os.environ["JAVA_HOME"] = "/usr/local/openjdk-8"
os.environ["SPARK_HOME"] = "/user_data/spark-3.3.0-bin-hadoop2"

import findspark
findspark.init('spark-3.3.0-bin-hadoop2')

## Imports

In [15]:
# Importando bibliotecas necessárias
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = (
    SparkSession.builder.appName("spark_flight")
    .config("spark.sql.warehouse.dir", "hdfs:///user/hive/warehouse")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)

## Dataset escolhido

O dataset escolhido foi o [Flight Status Prediction (link do Kaggle)](https://www.kaggle.com/datasets/robikscube/flight-delay-dataset-20182022/data). Esse dataset possui diversas informações sobre voos realizados, incluindo dados sobre cancelamento e atrasos.

Apesar da disponibilidade de dados adquiridos desde 2018, selecionamos o arquivo CSV referente ao ano de 2022, que contém 1.42 GB de dados.

## Leitura do dataset

In [16]:
dataframe = spark.read.csv("hdfs://spark-master:9000/datasets/flights/Combined_Flights_2022.csv", header=True, inferSchema=True)
num_linhas = dataframe.count()
print(f"Número de linhas no DataFrame: {num_linhas}")



Número de linhas no DataFrame: 4078318


                                                                                

## Análise Exploratória de Dados (EDA)

In [17]:
# Visualizando o esquema dos dados
dataframe.printSchema()

root
 |-- FlightDate: timestamp (nullable = true)
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- ArrDelayMinutes: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Marketing_Airline_Network: string (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = tru

In [18]:
# Remover depois, é apenas para conseguir rodar
dataframe_sample = dataframe.sample(fraction=0.1, seed=3)
print(f"Número de linhas no DataFrame Sample: {dataframe_sample.count()}")

# Exibindo algumas informações dos atributos
dataframe_sample.summary().show()

                                                                                

Número de linhas no DataFrame Sample: 408034


[Stage 10:>                                                         (0 + 1) / 1]

+-------+--------------------+------+------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+--------------------+------------------+------------------+------------------+------------------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+------------------+------------------+------------------+--------------+-----------+------------------+---------------+------------------+------------------+------------------+------------------+------------+---------+------------------+-------------+------------------+-------------------+--------------------+----------+------------------+------------------+------------------+-----------------+---------

                                                                                

In [19]:
# Mostrando as primeiras linhas do DataFrame
dataframe.show(5)

+-------------------+--------------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+-------------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+
|         FlightDate|             Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|

In [21]:
# Checagem por dados nulos
Dict_Null = {col:dataframe_sample.filter(dataframe_sample[col].isNull()).count() for col in dataframe_sample.columns}
Dict_Null

                                                                                

{'FlightDate': 0,
 'Airline': 0,
 'Origin': 0,
 'Dest': 0,
 'Cancelled': 0,
 'Diverted': 0,
 'CRSDepTime': 0,
 'DepTime': 12107,
 'DepDelayMinutes': 12120,
 'DepDelay': 12120,
 'ArrTime': 12479,
 'ArrDelayMinutes': 13392,
 'AirTime': 13392,
 'CRSElapsedTime': 0,
 'ActualElapsedTime': 13392,
 'Distance': 0,
 'Year': 0,
 'Quarter': 0,
 'Month': 0,
 'DayofMonth': 0,
 'DayOfWeek': 0,
 'Marketing_Airline_Network': 0,
 'Operated_or_Branded_Code_Share_Partners': 0,
 'DOT_ID_Marketing_Airline': 0,
 'IATA_Code_Marketing_Airline': 0,
 'Flight_Number_Marketing_Airline': 0,
 'Operating_Airline': 0,
 'DOT_ID_Operating_Airline': 0,
 'IATA_Code_Operating_Airline': 0,
 'Tail_Number': 2770,
 'Flight_Number_Operating_Airline': 0,
 'OriginAirportID': 0,
 'OriginAirportSeqID': 0,
 'OriginCityMarketID': 0,
 'OriginCityName': 0,
 'OriginState': 0,
 'OriginStateFips': 0,
 'OriginStateName': 0,
 'OriginWac': 0,
 'DestAirportID': 0,
 'DestAirportSeqID': 0,
 'DestCityMarketID': 0,
 'DestCityName': 0,
 'DestStat

## Pré-processamento

In [24]:
# Remover colunas do DataFrame
colunas_para_remover = ['DOT_ID_Marketing_Airline', 'DOT_ID_Operating_Airline', 'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID']
dataframe_sem_colunas = dataframe_sample.drop(*colunas_para_remover)

# Mostrar somente o nome das colunas
print("Nome das colunas:")
for coluna in dataframe_sem_colunas.columns:
    print(coluna)

# Exibindo algumas estatísticas dos atributos
dataframe_sample.summary().show()

+-------------------+--------------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+---------------------------+-------------------------------+-----------------+---------------------------+-----------+-------------------------------+------------------+-----------+---------------+---------------+---------+--------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+
|         FlightDate|             Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|CRSElapsedTime|ActualElapsedTime|Distance|Year|Quarter|Month|DayofMonth|DayOfWeek|Marketing_Airline_Ne

[Stage 230:>                                                        (0 + 1) / 1]

+-------+--------------------+------+------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+--------------------+------------------+------------------+------------------+------------------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+------------------+------------------+------------------+--------------+-----------+------------------+---------------+------------------+------------------+------------------+------------------+------------+---------+------------------+-------------+------------------+-------------------+--------------------+----------+------------------+------------------+------------------+-----------------+---------

                                                                                

In [6]:
# Tratando valores nulos
dataframe = dataframe.dropna()

In [7]:
# Lista de tipos de dados permitidos
allowed_types = ['int', 'double']

# Filtrando colunas com tipos permitidos
selected_columns = [col_name for col_name, col_type in dataframe.dtypes if any(data_type in col_type for data_type in allowed_types)]

# Selecionando apenas as colunas permitidas
df_filtered = dataframe.select(*selected_columns)

# Mostrando o esquema do DataFrame resultante
df_filtered.printSchema()

root
 |-- Severity: integer (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- End_Lat: double (nullable = true)
 |-- End_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- Visibility(mi): double (nullable = true)
 |-- Wind_Speed(mph): double (nullable = true)
 |-- Precipitation(in): double (nullable = true)



## Data Preprocessing

In [10]:
# Adicionando coluna Severity4 e definindo valores
df_filtered = df_filtered.withColumn('Severity4', col('Severity').cast('int'))
df_filtered = df_filtered.withColumn('Severity4', (col('Severity4') == 4).cast('int'))
df_filtered = df_filtered.drop('Severity')
df_filtered.groupBy('Severity4').count().show()



+---------+-------+
|Severity4|  count|
+---------+-------+
|        1|  94421|
|        0|3264408|
+---------+-------+



                                                                                

In [11]:
# Criando um vetor de features para o modelo
target = 'Severity4'
feature_columns = df_filtered.columns
feature_columns.remove(target)

vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_vector = vector_assembler.transform(df_filtered)

In [12]:
# Dividindo o conjunto de dados em treino e teste
train_data, test_data = df_vector.randomSplit([0.8, 0.2], seed=42)

## Model

### Random Forest

In [13]:
# Criando e treinando um modelo de classificação (Random Forest)
rf_classifier = RandomForestClassifier(labelCol=target, featuresCol="features", numTrees=10)
pipeline = Pipeline(stages=[rf_classifier])
model = pipeline.fit(train_data)



24/02/16 00:07:12 WARN MemoryStore: Not enough space to cache rdd_64_16 in memory! (computed 19.7 MiB so far)
24/02/16 00:07:12 WARN BlockManager: Persisting block rdd_64_16 to disk instead.
24/02/16 00:07:12 WARN MemoryStore: Not enough space to cache rdd_64_17 in memory! (computed 29.6 MiB so far)
24/02/16 00:07:12 WARN BlockManager: Persisting block rdd_64_17 to disk instead.




24/02/16 00:07:15 WARN MemoryStore: Not enough space to cache rdd_64_19 in memory! (computed 13.1 MiB so far)
24/02/16 00:07:15 WARN BlockManager: Persisting block rdd_64_19 to disk instead.




24/02/16 00:07:18 WARN MemoryStore: Not enough space to cache rdd_64_21 in memory! (computed 13.1 MiB so far)
24/02/16 00:07:18 WARN BlockManager: Persisting block rdd_64_21 to disk instead.




24/02/16 00:07:22 WARN MemoryStore: Not enough space to cache rdd_64_20 in memory! (computed 29.6 MiB so far)
24/02/16 00:07:22 WARN BlockManager: Persisting block rdd_64_20 to disk instead.


                                                                                

24/02/16 00:07:24 WARN MemoryStore: Not enough space to cache rdd_64_12 in memory! (computed 19.7 MiB so far)
24/02/16 00:07:24 WARN MemoryStore: Not enough space to cache rdd_64_11 in memory! (computed 19.7 MiB so far)
24/02/16 00:07:24 WARN MemoryStore: Not enough space to cache rdd_64_13 in memory! (computed 13.1 MiB so far)
24/02/16 00:07:24 WARN MemoryStore: Not enough space to cache rdd_64_10 in memory! (computed 29.6 MiB so far)


                                                                                

24/02/16 00:07:25 WARN MemoryStore: Not enough space to cache rdd_64_12 in memory! (computed 3.7 MiB so far)
24/02/16 00:07:25 WARN MemoryStore: Not enough space to cache rdd_64_10 in memory! (computed 8.3 MiB so far)
24/02/16 00:07:25 WARN MemoryStore: Not enough space to cache rdd_64_13 in memory! (computed 13.1 MiB so far)
24/02/16 00:07:25 WARN MemoryStore: Not enough space to cache rdd_64_11 in memory! (computed 19.7 MiB so far)


                                                                                

24/02/16 00:07:27 WARN MemoryStore: Not enough space to cache rdd_64_10 in memory! (computed 13.1 MiB so far)
24/02/16 00:07:27 WARN MemoryStore: Not enough space to cache rdd_64_13 in memory! (computed 13.1 MiB so far)
24/02/16 00:07:27 WARN MemoryStore: Not enough space to cache rdd_64_12 in memory! (computed 8.3 MiB so far)
24/02/16 00:07:27 WARN MemoryStore: Not enough space to cache rdd_64_11 in memory! (computed 13.1 MiB so far)


                                                                                

24/02/16 00:07:29 WARN MemoryStore: Not enough space to cache rdd_64_12 in memory! (computed 8.3 MiB so far)
24/02/16 00:07:29 WARN MemoryStore: Not enough space to cache rdd_64_13 in memory! (computed 3.7 MiB so far)
24/02/16 00:07:29 WARN MemoryStore: Not enough space to cache rdd_64_10 in memory! (computed 13.1 MiB so far)
24/02/16 00:07:29 WARN MemoryStore: Not enough space to cache rdd_64_11 in memory! (computed 19.7 MiB so far)


                                                                                

In [14]:
# Fazendo previsões no conjunto de teste
predictions = model.transform(test_data)

In [15]:
# Avaliando o desempenho do modelo
evaluator = MulticlassClassificationEvaluator(labelCol="Severity4", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)



Accuracy: 0.972008995725541


                                                                                

In [None]:
# Salvando o DataFrame em formato Parquet
parquet_output_path="hdfs://spark-master:9000/datasets/accidents_output"
df_filtered.write.parquet(parquet_output_path)