Configurações Iniciais e Carregamento dos Dados

In [1]:
# Bootstrap findspark first: init() is called before the pyspark imports that follow in this cell.
import findspark
findspark.init()
# The return value is not used here; presumably kept as a check that a Spark
# installation can be located — TODO confirm it is still needed.
findspark.find()

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.sql.functions import col

# Create (or reuse) the notebook's Spark session: local master, Hive support
# enabled, explicit memory limits and off-heap memory switched off.
spark = (
    SparkSession.builder
    .appName("Projeto_AJP")
    .master("local[*]")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.driver.maxResultSize", "1g")
    .config("spark.memory.offHeap.enabled", False)
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Load the transformed dataset (Parquet).
flights = spark.read.format("parquet").load("../data/processed/flights_features.parquet")

# Preview the first five rows to check what was loaded.
flights.show(n=5)

+--------------------+---------+
|      scaledFeatures|IsDelayed|
+--------------------+---------+
|(807,[0,1,2,3,4,5...|        1|
|(807,[0,1,2,3,4,5...|        0|
|(807,[0,1,2,3,4,5...|        0|
|(807,[0,1,2,3,4,5...|        0|
|(807,[0,1,2,3,4,5...|        0|
+--------------------+---------+
only showing top 5 rows



Separar os Dados em Treinamento e Teste

In [3]:
# 80% / 20% train/test split with a fixed seed.
train_data, test_data = flights.randomSplit(weights=[0.8, 0.2], seed=1234)

# Report the size of each partition.
print("Train data count: ", train_data.count())
print("Test data count: ", test_data.count())

# Persist the test partition for later evaluation, replacing any previous copy.
test_data.write.parquet("../data/processed/test_data.parquet", mode="overwrite")

Train data count:  22678840
Test data count:  5667738


Balanceamento dos Dados

In [4]:
# Number of training rows per IsDelayed value, collected to the driver.
class_counts = train_data.groupBy('IsDelayed').count().collect()

# Sizes of the smallest and the largest class.
minority_class_count = min(row['count'] for row in class_counts)
majority_class_count = max(row['count'] for row in class_counts)

In [5]:
# Size of the smallest class in the training split.
print(minority_class_count)

7313754


In [6]:
# Size of the largest class in the training split.
print(majority_class_count)

15365086


In [7]:
# Undersampling rate: ratio of the minority class size to the majority class
# size, i.e. the fraction of majority-class rows to keep.
undersample_ratio = minority_class_count / majority_class_count

In [8]:
# Work out which label is the minority / majority class from the per-class
# counts in `class_counts` instead of hard-coding 0 and 1, so the undersampling
# cannot silently hit the wrong class if the label distribution changes.
# Rows with a null label are ignored when picking the two classes.
labelled_counts = sorted(
    (row for row in class_counts if row['IsDelayed'] is not None),
    key=lambda row: row['count'],
)
minority_label = labelled_counts[0]['IsDelayed']
majority_label = labelled_counts[-1]['IsDelayed']
assert minority_label != majority_label, "Expected two distinct IsDelayed classes in train_data"

# Undersample the majority class (without replacement, fixed seed).
undersampled_majority = (
    train_data
    .filter(col('IsDelayed') == majority_label)
    .sample(withReplacement=False, fraction=undersample_ratio, seed=1234)
)

# Combine the untouched minority class with the undersampled majority class.
balanced_train_data = (
    train_data
    .filter(col('IsDelayed') == minority_label)
    .union(undersampled_majority)
)

print(balanced_train_data.count())

14627193


In [10]:
# Recount the rows per IsDelayed value, this time on the balanced training set.
class_counts_balanced = balanced_train_data.groupBy('IsDelayed').count().collect()
minority_class_count_balanced = min(row['count'] for row in class_counts_balanced)
majority_class_count_balanced = max(row['count'] for row in class_counts_balanced)

# Minority/majority ratio after balancing; a value close to 1 means the two
# classes now have nearly the same number of rows.
undersample_ratio_balanced = minority_class_count_balanced / majority_class_count_balanced

print(undersample_ratio_balanced)

0.9999569304627965


Definição e Treinamento dos Modelos

In [11]:
## LOGISTIC REGRESSION

# Configure the estimator with the feature and label columns.
lr = LogisticRegression(labelCol='IsDelayed', featuresCol='scaledFeatures')

# Fit it on the balanced training set.
lr_model = lr.fit(dataset=balanced_train_data)

# Persist the fitted model, overwriting any previous copy at this path.
lr_model.write().overwrite().save(path="../models/logistic_regression_model")

In [12]:
## DECISION TREE

# Configure the estimator. The seed is pinned to the same value used for the
# train/test split and the undersampling, so any seed-dependent step of
# training is reproducible rather than relying on the library's default seed.
dt = DecisionTreeClassifier(featuresCol='scaledFeatures', labelCol='IsDelayed', seed=1234)

# Fit on the balanced training set.
dt_model = dt.fit(balanced_train_data)

# Persist the fitted model, overwriting any previous copy at this path.
dt_model.write().overwrite().save("../models/decision_tree_model")

In [13]:
## RANDOM FOREST

# Configure the estimator. A random forest is a stochastic learner, so the seed
# is pinned (same value as the train/test split and the undersampling) to make
# retraining on a fresh kernel reproducible instead of relying on the library's
# default seed.
rf = RandomForestClassifier(featuresCol='scaledFeatures', labelCol='IsDelayed', seed=1234)

# Fit on the balanced training set.
rf_model = rf.fit(balanced_train_data)

# Persist the fitted model, overwriting any previous copy at this path.
rf_model.write().overwrite().save("../models/random_forest_model")

Encerrar Sessão

In [14]:
# Stop the Spark session now that all models have been trained and saved.
spark.stop()