## Configuração de Env e Sparks

In [1]:
import os
os.environ["JAVA_HOME"] = "/usr/local/openjdk-8"
os.environ["SPARK_HOME"] = "/user_data/spark-3.3.0-bin-hadoop2"

import findspark
findspark.init('spark-3.3.0-bin-hadoop2')

## Imports

In [2]:
# Importando bibliotecas necessárias
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.stat import Correlation

spark = (
    SparkSession.builder.appName("spark_flight")
    .config("spark.sql.warehouse.dir", "hdfs:///user/hive/warehouse")
    .config("spark.sql.catalogImplementation", "hive")
    .getOrCreate()
)

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/user_data/spark-3.3.0-bin-hadoop2/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/hadoop-2.7.3/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/05/04 14:43:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Dataset escolhido

O dataset escolhido foi o [Flight Status Prediction (link do Kaggle)](https://www.kaggle.com/datasets/robikscube/flight-delay-dataset-20182022/data). Esse dataset possui diversas informações sobre voos realizados, incluindo dados sobre cancelamento e atrasos.

Apesar da disponibilidade de dados adquiridos desde 2018, selecionamos o arquivo CSV referente ao ano de 2022, que contém 1.42 GB de dados.

## Leitura do dataset

In [3]:
dataframe = spark.read.csv("hdfs://spark-master:9000/datasets/taxi/yellow_tripdata_2016-03.csv", header=True, inferSchema=True)
num_linhas = dataframe.count()
print(f"Número de linhas no DataFrame: {num_linhas}")

[Stage 2:>                                                        (0 + 16) / 16]

Número de linhas no DataFrame: 12210952


                                                                                

## Análise Exploratória de Dados (EDA)

In [4]:
# Visualizando o esquema dos dados
dataframe.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)



In [14]:
# 70% do dataset para rodar
dataframe_sample = dataframe.sample(fraction=0.7, seed=3)
print(f"Número de linhas no DataFrame Sample: {dataframe_sample.count()}")

# Exibindo algumas informações dos atributos
dataframe_sample.summary().show(truncate=False, vertical=True)

                                                                                

Número de linhas no DataFrame Sample: 8546588


[Stage 78:>                                                         (0 + 1) / 1]

-RECORD 0-------------------------------------
 summary               | count                
 VendorID              | 8546588              
 passenger_count       | 8546588              
 trip_distance         | 8546588              
 pickup_longitude      | 8546588              
 pickup_latitude       | 8546588              
 RatecodeID            | 8546588              
 store_and_fwd_flag    | 8546588              
 dropoff_longitude     | 8546588              
 dropoff_latitude      | 8546588              
 payment_type          | 8546588              
 fare_amount           | 8546588              
 extra                 | 8546588              
 mta_tax               | 8546588              
 tip_amount            | 8546588              
 tolls_amount          | 8546588              
 improvement_surcharge | 8546588              
 total_amount          | 8546588              
-RECORD 1-------------------------------------
 summary               | mean                 
 VendorID    

                                                                                

In [19]:
# Mostrando as primeiras linhas do DataFrame
dataframe.show(n=5, truncate=False, vertical=True)

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2016-03-01 00:00:00 
 tpep_dropoff_datetime | 2016-03-01 00:07:55 
 passenger_count       | 1                   
 trip_distance         | 2.5                 
 pickup_longitude      | -73.97674560546875  
 pickup_latitude       | 40.76515197753906   
 RatecodeID            | 1                   
 store_and_fwd_flag    | N                   
 dropoff_longitude     | -74.00426483154297  
 dropoff_latitude      | 40.74612808227539   
 payment_type          | 1                   
 fare_amount           | 9.0                 
 extra                 | 0.5                 
 mta_tax               | 0.5                 
 tip_amount            | 2.05                
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 12.35               
-RECORD 1------------------------------------
 VendorID              | 1        

In [16]:
# Checagem por dados nulos
Dict_Null = {col:dataframe_sample.filter(dataframe_sample[col].isNull()).count() for col in dataframe_sample.columns}
Dict_Null

                                                                                

{'VendorID': 0,
 'tpep_pickup_datetime': 0,
 'tpep_dropoff_datetime': 0,
 'passenger_count': 0,
 'trip_distance': 0,
 'pickup_longitude': 0,
 'pickup_latitude': 0,
 'RatecodeID': 0,
 'store_and_fwd_flag': 0,
 'dropoff_longitude': 0,
 'dropoff_latitude': 0,
 'payment_type': 0,
 'fare_amount': 0,
 'extra': 0,
 'mta_tax': 0,
 'tip_amount': 0,
 'tolls_amount': 0,
 'improvement_surcharge': 0,
 'total_amount': 0}

Pudemos visualizar que existem alguns dados nulos principalmente nas colunas referentes ao tempo de viagem, como o delay na partida ou chegada e tempo de voo.

## Pré-processamento

In [20]:
# Remover colunas de ID e nome do DataFrame
colunas_para_remover = [
'VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'RatecodeID',
]

dataframe_sem_colunas_de_id = dataframe_sample.drop(*colunas_para_remover)

# Mostrar somente o nome das colunas
print("Nome das colunas:")
for coluna in dataframe_sem_colunas_de_id.columns:
    print(coluna)


Nome das colunas:
passenger_count
trip_distance
pickup_longitude
pickup_latitude
store_and_fwd_flag
dropoff_longitude
dropoff_latitude
payment_type
fare_amount
extra
mta_tax
tip_amount
tolls_amount
improvement_surcharge
total_amount


In [21]:
from pyspark.sql.functions import when

dataframe_sem_valores_nulos = dataframe_sem_valores_nulos.withColumn("store_and_fwd_flag", when(col("store_and_fwd_flag") == 'Y', 1).otherwise(0))
dataframe_sem_valores_nulos.show(n=5, truncate=False, vertical=True)

# Correlação
assembler = VectorAssembler(inputCols=dataframe_sem_valores_nulos.columns, outputCol="features")
dataframe_vetorizado = assembler.transform(dataframe_sem_valores_nulos)

correlation = Correlation.corr(dataframe_vetorizado, "features", method="pearson").collect()[0][0]

rows = correlation.toArray().tolist()
spark.createDataFrame(rows,dataframe_sem_valores_nulos.columns).show(n=30, truncate=False, vertical=True)

-RECORD 0-----------------------------------
 passenger_count       | 1                  
 trip_distance         | 2.5                
 pickup_longitude      | -73.97674560546875 
 pickup_latitude       | 40.76515197753906  
 store_and_fwd_flag    | 0                  
 dropoff_longitude     | -74.00426483154297 
 dropoff_latitude      | 40.74612808227539  
 payment_type          | 1                  
 fare_amount           | 9.0                
 extra                 | 0.5                
 mta_tax               | 0.5                
 tip_amount            | 2.05               
 tolls_amount          | 0.0                
 improvement_surcharge | 0.3                
 total_amount          | 12.35              
-RECORD 1-----------------------------------
 passenger_count       | 1                  
 trip_distance         | 2.0                
 pickup_longitude      | -74.00672912597656 
 pickup_latitude       | 40.730716705322266 
 store_and_fwd_flag    | 0                  
 dropoff_l

                                                                                

24/05/04 15:02:23 WARN PearsonCorrelation: Pearson correlation matrix contains NaN values.
-RECORD 0---------------------------------------
 passenger_count       | 1.0                    
 trip_distance         | -5.263318328016018E-4  
 pickup_longitude      | -0.01756957887859424   
 pickup_latitude       | 0.017537438727294703   
 store_and_fwd_flag    | NaN                    
 dropoff_longitude     | -0.015379834179621667  
 dropoff_latitude      | 0.015350113919973317   
 payment_type          | 0.02175554777893588    
 fare_amount           | 0.008612146645432929   
 extra                 | 0.0012962471610263589  
 mta_tax               | 0.005080171589824292   
 tip_amount            | -0.024930887828952437  
 tolls_amount          | -2.9409326707183393E-4 
 improvement_surcharge | 0.0030607063591546833  
 total_amount          | -0.0011767825074115372 
-RECORD 1---------------------------------------
 passenger_count       | -5.263318328016018E-4  
 trip_distance         | 1.

In [28]:
# Criando um vetor de features para o modelo
target = 'tip_amount'
feature_columns = dataframe_sem_valores_nulos.columns
feature_columns.remove(target)

vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
dataframe_vetorizado_sem_target = vector_assembler.transform(dataframe_sem_valores_nulos)

In [29]:
# Dividindo o conjunto de dados em treino e teste
train_data, test_data = dataframe_vetorizado_sem_target.randomSplit([0.8, 0.2], seed=3)

## Regressão linear

In [37]:
from pyspark.ml.regression import LinearRegression

regressor = LinearRegression(featuresCol = 'features', labelCol = 'tip_amount', loss='squaredError', maxIter=50, regParam = 0.05)
regressor = regressor.fit(train_data)

                                                                                

In [35]:
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show(n=5, truncate=False, vertical=True)

# Printing the results
print('MAE:', pred_results.meanAbsoluteError)
print('MSE:', pred_results.meanSquaredError)
print('RMSE:', pred_results.rootMeanSquaredError)
print('R2-Score:', pred_results.r2)

                                                                                

24/05/04 15:23:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[Stage 177:>                                                        (0 + 1) / 1]

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------
 passenger_count       | 1                                                                                                                     
 trip_distance         | 0.0                                                                                                                   
 pickup_longitude      | -74.01785278320312                                                                                                    
 pickup_latitude       | 40.706382751464844                                                                                                    
 store_and_fwd_flag    | 0                                                                                                                     
 dropoff_longitude     | -74.00518798828125                                                                                             

                                                                                

### Árvore de decisão

In [39]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol = 'features', labelCol = 'tip_amount', maxDepth=5)
model = dt.fit(train_data)

                                                                                

In [46]:
pred_results = model.transform(test_data)
pred_results.show(n=5, truncate=False, vertical=True)

# Printing the results
evaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'tip_amount')

mae = evaluator.evaluate(pred_results,
{evaluator.metricName: "mae"})

mse = evaluator.evaluate(pred_results,
{evaluator.metricName: "mse"})

rmse = evaluator.evaluate(pred_results,
{evaluator.metricName: "rmse"})

r2 = evaluator.evaluate(pred_results,
{evaluator.metricName: "r2"})

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('R2-Score:', r2)

                                                                                

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------
 passenger_count       | 1                                                                                                                     
 trip_distance         | 0.0                                                                                                                   
 pickup_longitude      | -74.01785278320312                                                                                                    
 pickup_latitude       | 40.706382751464844                                                                                                    
 store_and_fwd_flag    | 0                                                                                                                     
 dropoff_longitude     | -74.00518798828125                                                                                             

Exception ignored in: <function JavaWrapper.__del__ at 0x7f14859bd1f0>          
Traceback (most recent call last):
  File "spark-3.3.0-bin-hadoop2/python/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'RegressionEvaluator' object has no attribute '_java_obj'
[Stage 225:>                                                      (0 + 16) / 16]

MAE: 0.1567148249167305
MSE: 0.08861165792549609
RMSE: 0.29767710346194937
R2-Score: 0.8807107671872798


                                                                                