Flight Delay Status Classification

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StandardScaler, Imputer
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql import SQLContext
import pyspark.ml.feature as ftr
import pyspark.ml as ml
 
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml import PipelineModel
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import PCA
 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder.appName("Flight_Status").getOrCreate()

file_paths = [
    "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2018.parquet",
    "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2019.parquet",
    "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2020.parquet",
    "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2021.parquet",
    "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2022.parquet"]

In [3]:
# Use read.parquet to read Parquet files
df = spark.read.parquet(file_paths[0])

for file_path in file_paths[1:]:
    df_temp = spark.read.parquet(file_path)
    df = df.union(df_temp)

# Write the merged DataFrame to Parquet
df.write.parquet("gs://msca-bdp-student-gcs/Group4_Final_Project/archive/merged-file.parquet")

23/11/14 23:31:24 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [4]:
df.show()



+-------------------+-----------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------------+
|         FlightDate|          Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|D

                                                                                

In [5]:
df.printSchema()

root
 |-- FlightDate: timestamp (nullable = true)
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: long (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- ArrDelayMinutes: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: long (nullable = true)
 |-- Quarter: long (nullable = true)
 |-- Month: long (nullable = true)
 |-- DayofMonth: long (nullable = true)
 |-- DayOfWeek: long (nullable = true)
 |-- Marketing_Airline_Network: string (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |-- DOT_ID_Mar

In [6]:
airlines = spark.read.parquet("gs://msca-bdp-student-gcs/Group4_Final_Project/archive/merged-file.parquet")

airlines.printSchema()

# Check the number of records loaded
f'{airlines.count():,}'

root
 |-- FlightDate: timestamp (nullable = true)
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: long (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- ArrDelayMinutes: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: long (nullable = true)
 |-- Quarter: long (nullable = true)
 |-- Month: long (nullable = true)
 |-- DayofMonth: long (nullable = true)
 |-- DayOfWeek: long (nullable = true)
 |-- Marketing_Airline_Network: string (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |-- DOT_ID_Mar

                                                                                

'29,193,782'

In [8]:
yearly_counts = airlines.groupBy("Year").agg(F.count("*").alias("Count of Records")).orderBy("Year")

yearly_counts.show()



+----+----------------+
|Year|Count of Records|
+----+----------------+
|2018|         5689512|
|2019|         8091684|
|2020|         5022397|
|2021|         6311871|
|2022|         4078318|
+----+----------------+



                                                                                

In [9]:
target_features = ["CRSDepTime", "DepDelayMinutes", "CRSArrTime", "ArrDelay", "Distance"]

# Select the target features from the DataFrame
selected_df = airlines.select(target_features)

# Convert the selected DataFrame to Pandas
selected_pandas_df = selected_df.toPandas()

# Calculate the correlation matrix
corr_matrix = selected_pandas_df.corr()

# Display the correlation matrix
print(corr_matrix)

23/11/14 23:38:16 ERROR org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint: Sending KillExecutors(ArrayBuffer(5)) to AM was unsuccessful
org.apache.spark.rpc.RpcTimeoutException: Cannot receive any reply from hub-msca-bdp-dphub-students-jiaweixu-m.c.msca-bdp-student-ap.internal:39427 in 120 seconds. This timeout is controlled by spark.rpc.askTimeout
	at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:47)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:62)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:58)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at scala.util.Failure.recover(Try.scala:234)
	at scala.concurrent.Future.$anonfun$recover$1(Future.scala:395)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$tra

Py4JJavaError: An error occurred while calling o133.collectToPython.
: java.lang.OutOfMemoryError: GC overhead limit exceeded


In [None]:
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(240, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=.5)