# Flight Data Exploration
Jean-Sebastien Gaultier

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Load the datasets

In [2]:
# Obtain the Spark session for this notebook; getOrCreate() returns the
# already-active session when there is one instead of starting another.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# 2018 flight records: the first CSV row supplies the column names and the
# column types are inferred from the data.
datapath_18 = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2018.csv"
df_18 = spark.read.csv(
    path=datapath_18,
    inferSchema=True,
    header=True,
)

23/11/28 16:06:41 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/11/28 16:06:56 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/11/28 16:07:11 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/11/28 16:07:26 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
23/11/28 16:07:41 WARN org.apache.spark.scheduler.cluster.YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are regist

In [None]:
# 2019 flight records: the first CSV row supplies the column names and the
# column types are inferred from the data.
datapath_19 = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2019.csv"
df_19 = spark.read.csv(
    path=datapath_19,
    inferSchema=True,
    header=True,
)

In [None]:
# 2020 flight records: the first CSV row supplies the column names and the
# column types are inferred from the data.
datapath_20 = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2020.csv"
df_20 = spark.read.csv(
    path=datapath_20,
    inferSchema=True,
    header=True,
)

In [None]:
# 2021 flight records: the first CSV row supplies the column names and the
# column types are inferred from the data.
datapath_21 = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2021.csv"
df_21 = spark.read.csv(
    path=datapath_21,
    inferSchema=True,
    header=True,
)

In [None]:
# 2022 flight records: the first CSV row supplies the column names and the
# column types are inferred from the data.
datapath_22 = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2022.csv"
df_22 = spark.read.csv(
    path=datapath_22,
    inferSchema=True,
    header=True,
)

In [None]:
# Airlines table: the first CSV row supplies the column names and the column
# types are inferred from the data.
datapath_air = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Airlines.csv"
df_airlines = spark.read.csv(
    path=datapath_air,
    inferSchema=True,
    header=True,
)

In [None]:
# Stack the five yearly frames into a single DataFrame.
# unionByName aligns columns by name rather than by position, so a yearly file
# whose columns are ordered differently is still combined correctly, and a
# file with a different set of column names raises instead of silently mixing
# unrelated columns together (which positional union() would do).
df_all = (
    df_18
    .unionByName(df_19)
    .unionByName(df_20)
    .unionByName(df_21)
    .unionByName(df_22)
)

In [None]:
# Inspect the column names of the 2018 frame; as the last expression of the
# cell, the list is rendered as the cell's output.
df_18.columns

## Visualizing

In [None]:
# Ten tail numbers with the most cancelled flights.
# F.count on a column counts only its non-null values, so rows without a
# tail number do not contribute to CancellationCount.
top_cancelled_tailnumbers = (
    df_all
    .where(F.col("Cancelled") == True)
    .groupBy("Tail_Number")
    .agg(F.count("Tail_Number").alias("CancellationCount"))
    .orderBy(F.col("CancellationCount").desc())
    .limit(10)
)

top_cancelled_tailnumbers.show()

In [None]:
# Ten tail numbers with the highest mean departure delay across all years.
top_avg_delay_tailnumbers = (
    df_all
    .groupBy("Tail_Number")
    .agg(F.avg("DepDelay").alias("AvgDepDelay"))
    .orderBy(F.col("AvgDepDelay").desc())
    .limit(10)
)

# Print the ranking; the same frame is converted to pandas for plotting below.
top_avg_delay_tailnumbers.show()

In [None]:
# Ten tail numbers that appear on the most flight records.
# F.count on a column counts only its non-null values.
top_flown_tailnumbers = (
    df_all
    .groupBy("Tail_Number")
    .agg(F.count("Tail_Number").alias("FlightCount"))
    .orderBy(F.col("FlightCount").desc())
    .limit(10)
)

# Print the ranking; the same frame is converted to pandas for plotting below.
top_flown_tailnumbers.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Bring the ten-row Spark result to the driver as a pandas frame.
top_cancelled_pd = top_cancelled_tailnumbers.toPandas()

# Bar chart: one bar per tail number, bar height = number of cancellations.
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(top_cancelled_pd['Tail_Number'], top_cancelled_pd['CancellationCount'])
ax.set_xlabel('TailNumber')
ax.set_ylabel('CancellationCount')
ax.set_title('Top 10 TailNumbers with the most cancellations')
plt.show()

In [None]:

# Bring the ten-row Spark result to the driver as a pandas frame.
top_avg_delay_tailnumbers_pd = top_avg_delay_tailnumbers.toPandas()

# Bar chart: one bar per tail number, bar height = mean departure delay.
# The plotted column is AvgDepDelay (an average, not a count), so the axis
# label and title say so explicitly.
plt.figure(figsize=(10, 8))
plt.bar(top_avg_delay_tailnumbers_pd['Tail_Number'], top_avg_delay_tailnumbers_pd['AvgDepDelay'])
plt.xlabel('TailNumber')
plt.ylabel('Average Departure Delay')
plt.title('Top 10 TailNumbers with the highest average departure delay')
plt.show()

In [None]:
# Bring the ten-row Spark result to the driver as a pandas frame.
top_flown_tailnumbers_pd = top_flown_tailnumbers.toPandas()

# Bar chart: one bar per tail number, bar height = number of flight records.
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(top_flown_tailnumbers_pd['Tail_Number'], top_flown_tailnumbers_pd['FlightCount'])
ax.set_xlabel('TailNumber')
ax.set_ylabel('Flight Count')
ax.set_title('Top 10 TailNumbers with the most flights')
plt.show()

In [None]:
# These names are used below but were never imported anywhere in the notebook,
# so the cell failed with NameError before doing any work.
import seaborn as sns
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.stat import Correlation

# Encode each tail number as a numeric index.
# NOTE(review): TailNumberIndex is not part of selected_columns, so it does not
# enter the correlation computed in this cell — confirm whether it is needed
# elsewhere or whether this fit over the full dataset can be dropped.
indexer = StringIndexer(inputCol="Tail_Number", outputCol="TailNumberIndex")
flights_indexed = indexer.fit(df_all).transform(df_all)

# Cast the cancellation flag to an integer so it can be assembled into a
# numeric feature vector alongside the delay columns.
flights_indexed = flights_indexed.withColumn("Cancelled", flights_indexed["Cancelled"].cast("integer"))
selected_columns = ["DepDelay", "Cancelled", "ArrDelay"]

# Keep only rows where all three columns are present.
# NOTE(review): if cancelled flights carry no ArrDelay/DepDelay, this drop
# leaves Cancelled constant and its correlations undefined — verify on the data.
flights_selected = flights_indexed.select(selected_columns).na.drop()
assembler = VectorAssembler(inputCols=selected_columns, outputCol="features")
flights_assembled = assembler.transform(flights_selected)

# Calculate correlation matrix
try:
    correlation_matrix = Correlation.corr(flights_assembled, "features").head()

    # Extract the correlation matrix as a NumPy array
    correlation_array = correlation_matrix[0].toArray()

    # Convert correlation matrix to Pandas DataFrame, labelled by column name
    correlation_df = pd.DataFrame(correlation_array, columns=selected_columns, index=selected_columns)

    # Use seaborn to create a heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_df, annot=True, cmap="coolwarm", fmt=".2f", linewidths=.5)
    plt.title('Correlation Matrix Heatmap')
    plt.show()

except Exception as e:
    # Surface a short message in the cell output, then re-raise with the
    # original traceback intact so the failure stays fully diagnosable.
    print(f"Error: {str(e)}")
    raise
