## Importing Libraries

In [None]:
import math

from datetime import datetime

# SparkSession is imported from pyspark.sql below; the top-level `pyspark`
# package does not export it.
from pyspark import SparkContext

from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.linalg import Vectors

from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, when, regexp_replace, monotonically_increasing_id, udf, count, sum, when, format_string, lpad, substring, hour, dayofmonth, month, year, dayofweek
from pyspark.sql.functions import to_timestamp

## Data Uploading

In [None]:
# Stop a session left over from a previous run of this cell so that re-running
# the cell starts from a clean SparkSession.
try:
    spark.stop()
except NameError:
    # First run on a fresh kernel: `spark` does not exist yet, nothing to stop.
    pass

spark = (
    SparkSession.builder
    .appName("MyApp")
    .master("local[*]")
    .getOrCreate()
)

print("Spark session started successfully!")

24/12/09 10:36:14 WARN Utils: Your hostname, MacBook-Pro-di-Marco-6.local resolves to a loopback address: 127.0.0.1; using 131.114.174.33 instead (on interface en0)
24/12/09 10:36:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/09 10:36:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session started successfully!


In [None]:
# Load the flights + weather CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True, sep=",")
    .csv('df_final_with_weather.csv')
)

                                                                                

## Data Manipulation

### Remove Attributes

In [None]:
# Keep only flights that were neither diverted nor cancelled

completed_flight = (col("Flight_Diverted") == 0) & (col("Flight_Cancelled") == 0)
df = df.filter(completed_flight)

In [None]:
# The diverted/cancelled flags are constant after the filter, so drop both

df = df.drop("Flight_Diverted", "Flight_Cancelled")

In [None]:
# Preview the first rows of the filtered data
df.show(n=5)

24/12/06 18:21:19 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----------+-----------------+-------------+--------------+-------------------+------------------------+---------------------+-----------------------+-------------+------------+------------+------------+----------------------+-------------------+---------------------+------------------------+-------------------------+----------------------+-------------+---------------+---------------------+---------------------+-----------------+----------------------+---------------------------+---------+----------+-------------+--------------+----------+----------+---+-----------------+--------+-----------+----+----+-----+------+
|Flight_Date|Operating_Carrier|Flight_Number|Origin_Airport|Destination_Airport|Scheduled_Departure_Time|Actual_Departure_Time|Departure_Delay_Minutes|Taxi_Out_Time|Takeoff_Time|Landing_Time|Taxi_In_Time|Scheduled_Arrival_Time|Actual_Arrival_Time|Arrival_Delay_Minutes|Cancellation_Reason_Code|Scheduled_Flight_Duration|Actual_Flight_Duration|Airborne_Time|Flight_Distance|Ca

In [None]:
# Remove other columns that are not used in the rest of the notebook.
# The names match the header casing exactly, so the drop does not depend on
# Spark's default case-insensitive column resolution.

columns_to_drop = [
    "Actual_Departure_Time",
    "Actual_Arrival_Time",
    "Operating_Carrier",
    "Taxi_Out_Time",
    "Taxi_In_Time",
    "Takeoff_Time",
    "Landing_Time",
    "Actual_Flight_Duration",
    "Airborne_Time",
]

df = df.drop(*columns_to_drop)

### Dealing with Missing Values

In [None]:
# Replace missing departure/arrival delays with 0 (non-null values are kept as they are)

for delay_name in ("Departure_Delay_Minutes", "Arrival_Delay_Minutes"):
    delay_value = col(delay_name)
    df = df.withColumn(
        delay_name,
        when(delay_value.isNull(), 0).otherwise(delay_value)
    )

In [None]:
# Render the integer HHMM scheduled departure time as a zero-padded "HH:MM" string

scheduled_hhmm = col("Scheduled_Departure_Time")
hour_part = (scheduled_hhmm / 100).cast("int")    # leading digits -> hours
minute_part = (scheduled_hhmm % 100).cast("int")  # last two digits -> minutes

df = df.withColumn(
    "Formatted_Departure_Time",
    format_string("%02d:%02d", hour_part, minute_part),
)

In [None]:
# Parse the "HH:MM" string into a timestamp (no date part is supplied), then
# extract the hour of day from it

df = (
    df
    .withColumn("Departure_Timestamp", to_timestamp(col("Formatted_Departure_Time"), "HH:mm"))
    .withColumn("Hour", hour(col("Departure_Timestamp")))
)

In [None]:
# Build one "count of nulls" aggregate per column and evaluate them in a single pass

null_count_exprs = []
for column_name in df.columns:
    # when(...) yields a non-null value only for null cells; count() counts those
    null_marker = when(col(column_name).isNull(), column_name)
    null_count_exprs.append(count(null_marker).alias(column_name))

null_counts = df.select(null_count_exprs)

null_counts.show()



+-----------+-------------+--------------+-------------------+------------------------+-----------------------+----------------------+---------------------+------------------------+-------------------------+---------------+---------------------+---------------------+-----------------+----------------------+---------------------------+---------+----------+-------------+--------------+---------+----------+---+-----------+--------+---------+----+----+----+----+------------------------+-------------------+----+
|Flight_Date|Flight_Number|Origin_Airport|Destination_Airport|Scheduled_Departure_Time|Departure_Delay_Minutes|Scheduled_Arrival_Time|Arrival_Delay_Minutes|Cancellation_Reason_Code|Scheduled_Flight_Duration|Flight_Distance|Carrier_Delay_Minutes|Weather_Delay_Minutes|NAS_Delay_Minutes|Security_Delay_Minutes|Late_Aircraft_Delay_Minutes|city_orig|state_orig|latitude_dest|longitude_dest|city_dest|state_dest| id|distance_km|latitude|longitude|tavg|wspd|wdir|pres|Formatted_Departure_Time|

                                                                                

### Remove Outliers and Unhelpful Features

In [None]:
# IQR-based outlier removal on the delay columns.
# Bounds for each column are computed on the unfiltered `df`; the filters are
# applied cumulatively, so df_cleaned keeps only rows that are within bounds
# for ALL delay columns (not just the last one in the list).

delay_columns = ["Departure_Delay_Minutes", "Arrival_Delay_Minutes"]

df_cleaned = df

for column in delay_columns:
    q1, q3 = df.approxQuantile(column, [0.25, 0.75], 0.01) # Approximate Q1 and Q3 (relative error 0.01)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Chain onto df_cleaned (not df) so earlier columns' filters are preserved
    df_cleaned = df_cleaned.filter((col(column) >= lower_bound) & (col(column) <= upper_bound))

                                                                                

In [None]:
# Preview the first rows after outlier removal
df_cleaned.show(n=5)

+-----------+-------------+--------------+-------------------+------------------------+-----------------------+----------------------+---------------------+------------------------+-------------------------+---------------+---------------------+---------------------+-----------------+----------------------+---------------------------+---------+----------+-------------+--------------+----------+----------+---+-----------------+--------+-----------+----+----+-----+------+------------------------+-------------------+----+
|Flight_Date|Flight_Number|Origin_Airport|Destination_Airport|Scheduled_Departure_Time|Departure_Delay_Minutes|Scheduled_Arrival_Time|Arrival_Delay_Minutes|Cancellation_Reason_Code|Scheduled_Flight_Duration|Flight_Distance|Carrier_Delay_Minutes|Weather_Delay_Minutes|NAS_Delay_Minutes|Security_Delay_Minutes|Late_Aircraft_Delay_Minutes|city_orig|state_orig|latitude_dest|longitude_dest| city_dest|state_dest| id|      distance_km|latitude|  longitude|tavg|wspd| wdir|  pres|F

In [None]:
# Print the column names and types of the cleaned DataFrame
df_cleaned.printSchema()

root
 |-- Flight_Date: date (nullable = true)
 |-- Flight_Number: integer (nullable = true)
 |-- Origin_Airport: string (nullable = true)
 |-- Destination_Airport: string (nullable = true)
 |-- Scheduled_Departure_Time: integer (nullable = true)
 |-- Departure_Delay_Minutes: double (nullable = true)
 |-- Scheduled_Arrival_Time: integer (nullable = true)
 |-- Arrival_Delay_Minutes: double (nullable = true)
 |-- Cancellation_Reason_Code: string (nullable = true)
 |-- Scheduled_Flight_Duration: double (nullable = true)
 |-- Flight_Distance: double (nullable = true)
 |-- Carrier_Delay_Minutes: double (nullable = true)
 |-- Weather_Delay_Minutes: double (nullable = true)
 |-- NAS_Delay_Minutes: double (nullable = true)
 |-- Security_Delay_Minutes: double (nullable = true)
 |-- Late_Aircraft_Delay_Minutes: double (nullable = true)
 |-- city_orig: string (nullable = true)
 |-- state_orig: string (nullable = true)
 |-- latitude_dest: double (nullable = true)
 |-- longitude_dest: double (nullab

In [None]:
# Drop the two helper columns that were only needed to derive Hour

df_cleaned = df_cleaned.drop("Departure_Timestamp", "Formatted_Departure_Time")

# Binary label: 1 when the departure delay exceeds 15 minutes, 0 otherwise
is_delayed = col("Departure_Delay_Minutes") > 15
df_cleaned = df_cleaned.withColumn("Delayed", when(is_delayed, 1).otherwise(0))

In [None]:
# Class distribution of the Delayed label

delayed_counts = df_cleaned.groupBy("Delayed").count()
delayed_counts.show()



+-------+-------+
|Delayed|  count|
+-------+-------+
|      1| 651872|
|      0|5774535|
+-------+-------+



                                                                                

In [None]:
# Derive calendar features from Flight_Date; dict order fixes the column order

date_part_extractors = {
    "Flight_Day": dayofmonth,
    "Flight_Month": month,
    "Flight_Year": year,
    "Flight_DayOfWeek": dayofweek,
}

for new_column, extract_part in date_part_extractors.items():
    df_cleaned = df_cleaned.withColumn(new_column, extract_part("Flight_Date"))


In [None]:
# show() prints the table itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" to the output).
df_cleaned.show(5)

+-----------+-------------+--------------+-------------------+------------------------+-----------------------+----------------------+---------------------+------------------------+-------------------------+---------------+---------------------+---------------------+-----------------+----------------------+---------------------------+---------+----------+-------------+--------------+----------+----------+---+-----------------+--------+-----------+----+----+-----+------+----+-------+----------+------------+-----------+----------------+
|Flight_Date|Flight_Number|Origin_Airport|Destination_Airport|Scheduled_Departure_Time|Departure_Delay_Minutes|Scheduled_Arrival_Time|Arrival_Delay_Minutes|Cancellation_Reason_Code|Scheduled_Flight_Duration|Flight_Distance|Carrier_Delay_Minutes|Weather_Delay_Minutes|NAS_Delay_Minutes|Security_Delay_Minutes|Late_Aircraft_Delay_Minutes|city_orig|state_orig|latitude_dest|longitude_dest| city_dest|state_dest| id|      distance_km|latitude|  longitude|tavg|wsp

In [None]:
# Flight_Date has been decomposed into day/month/day-of-week; the year is dropped too
columns_to_drop = ["Flight_Date", 'Flight_Year']
for unused_column in columns_to_drop:
    df_cleaned = df_cleaned.drop(unused_column)

### Transform Categorical Columns into Numeric Indexes

In [None]:
# Encode the airport codes as numeric indexes with StringIndexer.
# Each indexer is fitted and applied in turn; the fitted models are kept so the
# index -> airport mapping stays available.

# Origin airport
origin_airport_indexer = StringIndexer(
    inputCol="Origin_Airport",
    outputCol="Origin_Airport_Indexed",
)
origin_airport_indexer_model = origin_airport_indexer.fit(df_cleaned)
df_cleaned = origin_airport_indexer_model.transform(df_cleaned)

# Destination airport
destination_airport_indexer = StringIndexer(
    inputCol="Destination_Airport",
    outputCol="Destination_Airport_Indexed",
)
destination_airport_indexer_model = destination_airport_indexer.fit(df_cleaned)
df_cleaned = destination_airport_indexer_model.transform(df_cleaned)

                                                                                

In [None]:
# Drop the raw airport codes (replaced by their indexed versions) and the raw
# delay columns (the label Delayed was derived from the departure delay)
df_cleaned = df_cleaned.drop(
    "Origin_Airport",
    "Destination_Airport",
    "Departure_Delay_Minutes",
    "Arrival_Delay_Minutes",
)

In [None]:
# Columns kept for modelling, in the order they will appear in df_cleaned
columns = [
    "Scheduled_Departure_Time",
    "Scheduled_Arrival_Time",
    "latitude_dest",
    "longitude_dest",
    "distance_km",
    "latitude",
    "longitude",
    "tavg",
    "wspd",
    "wdir",
    "pres",
    "Hour",
    "Delayed",
    "Flight_Day",
    "Flight_Month",
    "Flight_DayOfWeek",
    "Origin_Airport_Indexed",
    "Destination_Airport_Indexed",
]

In [None]:
# Restrict the DataFrame to the modelling columns and preview the result
df_cleaned = df_cleaned.select(*columns)

df_cleaned.show(n=5)

+------------------------+----------------------+-------------+--------------+-----------------+--------+-----------+----+----+-----+------+----+-------+----------+------------+----------------+----------------------+---------------------------+
|Scheduled_Departure_Time|Scheduled_Arrival_Time|latitude_dest|longitude_dest|      distance_km|latitude|  longitude|tavg|wspd| wdir|  pres|Hour|Delayed|Flight_Day|Flight_Month|Flight_DayOfWeek|Origin_Airport_Indexed|Destination_Airport_Indexed|
+------------------------+----------------------+-------------+--------------+-----------------+--------+-----------+----+----+-----+------+----+-------+----------+------------+----------------+----------------------+---------------------------+
|                     820|                  1258|    21.972065|    -159.33672|4300.254682582812|32.73336|-117.192245|15.8| 7.2|337.0|1019.5|   8|      0|         2|           1|               3|                  23.0|                       80.0|
|               

In [None]:
# Number of columns currently in df_cleaned
len(df_cleaned.columns)

18

## Models Implementation

### Random Forest for Feature Evaluation

In [None]:
# Model inputs (assembled into one feature vector) and the target column

feature_columns = [
    "Scheduled_Departure_Time",
    "Scheduled_Arrival_Time",
    "distance_km",
    "tavg",
    "wspd",
    "wdir",
    "pres",
    "Hour",
    "Flight_Month",
    "Flight_DayOfWeek",
    "Origin_Airport_Indexed",
    "Destination_Airport_Indexed",
]

label_column = "Delayed"

In [None]:
# Pack the feature columns into a single "features" vector column
vector_assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Keep only what the models need: the feature vector and the label
data = vector_assembler.transform(df_cleaned).select("features", label_column)

In [None]:
# 70/30 train/test split with a fixed seed

split_frames = data.randomSplit(weights=[0.7, 0.3], seed=42)
train_data, test_data = split_frames

In [None]:
# Class distribution in the training split
train_class_counts = train_data.groupBy("Delayed").count()
train_class_counts.show()

24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


CodeCache: size=131072Kb used=43628Kb max_used=45050Kb free=87443Kb
 bounds [0x00000001089f8000, 0x000000010b638000, 0x00000001109f8000]
 total_blobs=15106 nmethods=14094 adapters=922
 compilation: disabled (not enough contiguous free space left)


24/12/06 18:22:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:07 WARN RowBasedKeyValueBatch: Calling spill() on

+-------+-------+
|Delayed|  count|
+-------+-------+
|      1| 456356|
|      0|4042205|
+-------+-------+



                                                                                

In [None]:
# Class distribution in the test split
test_class_counts = test_data.groupBy("Delayed").count()
test_class_counts.show()

24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:22:26 WARN RowBasedKeyValueBatch: Calling spill() on

+-------+-------+
|Delayed|  count|
+-------+-------+
|      1| 195516|
|      0|1732330|
+-------+-------+



                                                                                

In [None]:
# Balancing classes in the training set by downsampling the majority class

# A single aggregation yields both the class labels and their sizes, largest
# class first (previously this took two groupBy jobs plus two count jobs).
class_counts = train_data.groupBy(label_column).count().orderBy(col("count").desc()).collect()
if len(class_counts) < 2:
    raise ValueError(
        f"Expected at least two classes in '{label_column}', found {len(class_counts)}"
    )

major_class, major_count = class_counts[0][0], class_counts[0][1]
minor_class, minor_count = class_counts[-1][0], class_counts[-1][1]

major_class_data = train_data.filter(col(label_column) == major_class)
minor_class_data = train_data.filter(col(label_column) == minor_class)

# Target roughly 1.5 majority rows per minority row. The fraction is capped at
# 1.0 because sampling without replacement cannot return more than the whole set.
downsample_fraction = min(1.0, (minor_count / major_count) * 1.5)

major_class_downsampled = major_class_data.sample(fraction=downsample_fraction, seed=42)
balanced_train_data = major_class_downsampled.union(minor_class_data)

                                                                                

In [None]:
# Class distribution after downsampling the majority class
balanced_class_counts = balanced_train_data.groupBy("Delayed").count()
balanced_class_counts.show()

24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:16 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/06 18:40:19 WARN RowBasedKeyValueBatch: Calling spill() on

+-------+------+
|Delayed| count|
+-------+------+
|      0|912645|
|      1|456356|
+-------+------+



                                                                                

In [None]:
# Configure the Random Forest (100 trees, up to 500 bins per feature, fixed seed)

rf = RandomForestClassifier(
    labelCol=label_column,
    featuresCol="features",
    numTrees=100,
    maxBins=500,
    seed=42,
)

# Train on the class-balanced training set
rf_model = rf.fit(balanced_train_data)

24/12/09 11:15:32 WARN DAGScheduler: Broadcasting large task binary with size 1597.0 KiB
                                                                                

In [None]:
# Score the test set with the trained Random Forest and report accuracy

predictions = rf_model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=label_column,
)
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy}")

24/12/09 11:15:41 WARN DAGScheduler: Broadcasting large task binary with size 1281.3 KiB

Accuracy: 0.8129007449782982


                                                                                

In [None]:
# Evaluator for area under the ROC curve; it reads the "rawPrediction" column

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", rawPredictionCol="rawPrediction", labelCol=label_column)


In [None]:
# Area under the ROC curve for the current predictions

auc = evaluator.evaluate(dataset=predictions)
print(f"AUC: {auc}")

24/12/09 11:16:09 WARN DAGScheduler: Broadcasting large task binary with size 1269.6 KiB
                                                                                

AUC: 0.6472168813989705


In [None]:
# F1 score of the current predictions

evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol=label_column,
)
f1 = evaluator.evaluate(predictions)

print(f"F1 Score: {f1}")

24/12/09 11:16:37 WARN DAGScheduler: Broadcasting large task binary with size 1281.3 KiB

F1 Score: 0.8243307509544141


                                                                                

In [None]:
# Rank the input features by the importance the Random Forest assigned to them

feature_importances = rf_model.featureImportances

# Pair each feature name with its importance (same order as feature_columns)
importances = list(zip(feature_columns, feature_importances))

# Highest importance first
sorted_importances = sorted(importances, key=lambda pair: pair[1], reverse=True)

print("Feature Importances:")
for name, score in sorted_importances:
    print(f"{name}: {score}")

Feature Importances:
Scheduled_Departure_Time: 0.350535832816442
Hour: 0.32927898884726686
Scheduled_Arrival_Time: 0.21896632183060402
Origin_Airport_Indexed: 0.034550436113975784
Destination_Airport_Indexed: 0.02799876612039216
distance_km: 0.013855899560884401
pres: 0.012303578966549692
Flight_Month: 0.007479674536862777
tavg: 0.004016655502810999
wspd: 0.0005972188794976863
Flight_DayOfWeek: 0.00032192146182303074
wdir: 9.470536289092241e-05


In [None]:
# Confusion matrix: row counts per (true label, predicted label) pair

confusion_counts = predictions.groupBy("Delayed", "prediction").count()
confusion_counts.show()

24/12/09 11:17:04 WARN DAGScheduler: Broadcasting large task binary with size 1279.3 KiB

+-------+----------+-------+
|Delayed|prediction|  count|
+-------+----------+-------+
|      1|       0.0| 148080|
|      0|       0.0|1518651|
|      1|       1.0|  47838|
|      0|       1.0| 212467|
+-------+----------+-------+



24/12/09 11:17:27 WARN DAGScheduler: Broadcasting large task binary with size 1257.1 KiB
                                                                                

## Multilayer Perceptron

In [None]:
# Class distribution of the train and test splits.
# show() prints the table and returns None, so it is not wrapped in print()
# (that would emit a stray "None" after each table).
train_data.groupBy("Delayed").count().show()
test_data.groupBy("Delayed").count().show()

                                                                                

+-------+-------+
|Delayed|  count|
+-------+-------+
|      1| 521566|
|      0|4619611|
+-------+-------+

None




+-------+-------+
|Delayed|  count|
+-------+-------+
|      1| 130306|
|      0|1154924|
+-------+-------+

None


                                                                                

In [None]:
# Class distribution of the balanced training set.
# show() prints the table and returns None, so it is not wrapped in print().
balanced_train_data.groupBy("Delayed").count().show()



+-------+------+
|Delayed| count|
+-------+------+
|      0|782629|
|      1|521566|
+-------+------+

None


                                                                                

In [None]:
# Network shape: one input per feature, two hidden layers (5 and 4 units) and
# one output unit per distinct label value in the training data
# NOTE(review): the features are fed to the network unscaled (StandardScaler is
# imported but not used) - confirm whether that is intended.

n_inputs = len(feature_columns)
n_classes = train_data.select(label_column).distinct().count()
layers = [n_inputs, 5, 4, n_classes]

mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol=label_column,
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=42,
)

# Train on the class-balanced training set
mlp_model = mlp.fit(balanced_train_data)

24/12/09 11:20:53 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [None]:
# Area under the ROC curve for the neural network on the test set

predictions = mlp_model.transform(test_data)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", rawPredictionCol="rawPrediction", labelCol=label_column)

auc = evaluator.evaluate(dataset=predictions)
print(f"AUC: {auc}")

                                                                                

AUC: 0.6041342091852437


In [None]:
# Score the test set with the trained neural network and report accuracy

predictions = mlp_model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=label_column,
)
accuracy = evaluator.evaluate(predictions)

print(f"Neural Network Accuracy: {accuracy}")



Neural Network Accuracy: 0.8783323196868144


                                                                                

In [None]:
# F1 score of the neural network predictions

evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol=label_column,
)
f1 = evaluator.evaluate(predictions)

print(f"Neural Network F1 Score: {f1}")



Neural Network F1 Score: 0.8471819080781542


                                                                                

In [None]:
# Confusion matrix: row counts per (true label, predicted label) pair

confusion_counts = predictions.groupBy("Delayed", "prediction").count()
confusion_counts.show()



+-------+----------+-------+
|Delayed|prediction|  count|
+-------+----------+-------+
|      1|       0.0| 186805|
|      0|       0.0|1683465|
|      1|       1.0|   9113|
|      0|       1.0|  47653|
+-------+----------+-------+



                                                                                

## Gradient Boosting Classifier

In [None]:
# Configure the gradient-boosted trees classifier (50 iterations, up to 500 bins, fixed seed)

gbt = GBTClassifier(
    labelCol="Delayed",
    featuresCol="features",
    maxIter=50,
    maxBins=500,
    seed=42,
)

# Train on the class-balanced training set
gbt_model = gbt.fit(balanced_train_data)

24/12/09 11:26:33 WARN DAGScheduler: Broadcasting large task binary with size 1014.0 KiB
24/12/09 11:26:33 WARN DAGScheduler: Broadcasting large task binary with size 1035.0 KiB
24/12/09 11:26:34 WARN DAGScheduler: Broadcasting large task binary with size 1035.5 KiB
24/12/09 11:26:34 WARN DAGScheduler: Broadcasting large task binary with size 1038.0 KiB
24/12/09 11:26:35 WARN DAGScheduler: Broadcasting large task binary with size 1049.5 KiB
24/12/09 11:26:35 WARN DAGScheduler: Broadcasting large task binary with size 1067.4 KiB
24/12/09 11:26:35 WARN DAGScheduler: Broadcasting large task binary with size 1089.3 KiB
24/12/09 11:26:36 WARN DAGScheduler: Broadcasting large task binary with size 1089.8 KiB
24/12/09 11:26:37 WARN DAGScheduler: Broadcasting large task binary with size 1093.1 KiB
24/12/09 11:26:37 WARN DAGScheduler: Broadcasting large task binary with size 1096.9 KiB
24/12/09 11:26:37 WARN DAGScheduler: Broadcasting large task binary with size 1111.1 KiB
24/12/09 11:26:38 WAR

In [None]:
# Area under the ROC curve for the gradient-boosted model on the test set

predictions = gbt_model.transform(test_data)

evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="Delayed")
auc = evaluator.evaluate(dataset=predictions)

print(f"AUC: {auc}")

24/12/09 11:27:53 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
                                                                                

AUC: 0.685194373945657


In [None]:
# F1 score of the gradient-boosted model predictions

evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    predictionCol="prediction",
    labelCol=label_column,
)
f1 = evaluator.evaluate(predictions)

print(f"F1 Score: {f1}")

24/12/09 11:28:19 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB

F1 Score: 0.8059697730935206


                                                                                

In [None]:
# Confusion matrix: row counts per (true label, predicted label) pair

confusion_counts = predictions.groupBy("Delayed", "prediction").count()
confusion_counts.show()

24/12/09 11:30:17 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB

+-------+----------+-------+
|Delayed|prediction|  count|
+-------+----------+-------+
|      1|       0.0| 116559|
|      0|       0.0|1413558|
|      1|       1.0|  79359|
|      0|       1.0| 317560|
+-------+----------+-------+



24/12/09 11:30:41 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
                                                                                

In [None]:
# Accuracy of the gradient-boosted model predictions

evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol=label_column,
)
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy}")

24/12/09 11:29:01 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB

Accuracy: 0.7747219045207251


                                                                                