In [2]:
# Initial Load - let's just do 2022 data. 2020 might be skewed, covid.

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns

spark = SparkSession.builder.appName("FlightDataAnalysis").config("spark.driver.memory", "4g").getOrCreate()

# Load your flight data into a Spark DataFrame
data_path = "gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2022.csv"
data = spark.read.csv(data_path, header=True, inferSchema=True)

# Data preprocessing (replace this with your specific cleaning logic)
# Example: Remove rows with missing values
data_cleaned = data.dropna()

# Summary statistics for numeric variables
numeric_vars = ['DepDelayMinutes', 'ArrDelayMinutes']
summary_numeric = data_cleaned.select(numeric_vars).describe()

# Summary statistics for categorical variables
categorical_vars = ['Airline', 'Origin', 'Dest', 'DepartureDelayGroups', 'ArrivalDelayGroups', 'Month', 'DayofMonth', 'DayOfWeek', 'Marketing_Airline_Network']
summary_categorical = data_cleaned.select(categorical_vars).describe()




                                                                                

In [13]:
# Output and print the summary statistics
print("Summary statistics for numeric variables:")
summary_numeric.show()

print("Summary statistics for categorical variables:")
summary_categorical.show()

Summary statistics for numeric variables:
+-------+------------------+------------------+
|summary|   DepDelayMinutes|   ArrDelayMinutes|
+-------+------------------+------------------+
|  count|           3944916|           3944916|
|   mean|15.940134086505264|15.783071426615928|
| stddev| 52.16239928825909| 51.98423581313936|
|    min|               0.0|               0.0|
|    max|            7223.0|            7232.0|
+-------+------------------+------------------+

Summary statistics for categorical variables:
+-------+--------------------+-------+-------+--------------------+-------------------+------------------+-----------------+------------------+-------------------------+
|summary|             Airline| Origin|   Dest|DepartureDelayGroups| ArrivalDelayGroups|             Month|       DayofMonth|         DayOfWeek|Marketing_Airline_Network|
+-------+--------------------+-------+-------+--------------------+-------------------+------------------+-----------------+---------------

In [12]:
# Perform time series analysis on departure delays
data_cleaned = data_cleaned.withColumn("FlightDate", F.to_date("FlightDate", "yyyy-MM-dd"))
data_cleaned = data_cleaned.withColumn("Year", F.year("FlightDate"))

# Group the data by year and calculate the average departure and arrival delays
time_series_dep_delay = data_cleaned.groupBy("Year").agg(F.avg("DepDelayMinutes").alias("AvgDepDelayMinutes"))
time_series_arr_delay = data_cleaned.groupBy("Year").agg(F.avg("ArrDelayMinutes").alias("AvgArrDelayMinutes"))

# Convert the result to Pandas DataFrame for plotting
time_series_dep_delay_df = time_series_dep_delay.toPandas()
time_series_arr_delay_df = time_series_arr_delay.toPandas()

# Plot the time series of departure and arrival delays
plt.figure(figsize=(12, 6))
plt.plot(time_series_dep_delay_df["Year"], time_series_dep_delay_df["AvgDepDelayMinutes"], label="Average Departure Delay")
plt.plot(time_series_arr_delay_df["Year"], time_series_arr_delay_df["AvgArrDelayMinutes"], label="Average Arrival Delay")
plt.xlabel("Year")
plt.ylabel("Average Delay (Minutes)")
plt.title("Time Series of Departure and Arrival Delays Over Years")
plt.legend()
plt.show()

# Stop the SparkSession when you're done
spark.stop()



Py4JJavaError: An error occurred while calling o227.collectToPython.
: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange hashpartitioning(Year#4272, 200), ENSURE_REQUIREMENTS, [id=#352]
+- *(1) HashAggregate(keys=[Year#4272], functions=[partial_avg(DepDelayMinutes#2478)], output=[Year#4272, sum#4466, count#4467L])
   +- *(1) Project [DepDelayMinutes#2478, year(cast(gettimestamp(cast(gettimestamp(FlightDate#2470, yyyy-MM-dd, Some(Etc/UTC), false) as date), yyyy-MM-dd, Some(Etc/UTC), false) as date)) AS Year#4272]
      +- *(1) Filter AtLeastNNulls(n, FlightDate#2470,Airline#2471,Origin#2472,Dest#2473,Cancelled#2474,Diverted#2475,CRSDepTime#2476,DepTime#2477,DepDelayMinutes#2478,DepDelay#2479,ArrTime#2480,ArrDelayMinutes#2481,AirTime#2482,CRSElapsedTime#2483,ActualElapsedTime#2484,Distance#2485,Year#2486,Quarter#2487,Month#2488,DayofMonth#2489,DayOfWeek#2490,Marketing_Airline_Network#2491,Operated_or_Branded_Code_Share_Partners#2492,DOT_ID_Marketing_Airline#2493,IATA_Code_Marketing_Airline#2494,Flight_Number_Marketing_Airline#2495,Operating_Airline#2496,DOT_ID_Operating_Airline#2497,IATA_Code_Operating_Airline#2498,Tail_Number#2499,Flight_Number_Operating_Airline#2500,OriginAirportID#2501,OriginAirportSeqID#2502,OriginCityMarketID#2503,OriginCityName#2504,OriginState#2505,OriginStateFips#2506,OriginStateName#2507,OriginWac#2508,DestAirportID#2509,DestAirportSeqID#2510,DestCityMarketID#2511,DestCityName#2512,DestState#2513,DestStateFips#2514,DestStateName#2515,DestWac#2516,DepDel15#2517,DepartureDelayGroups#2518,DepTimeBlk#2519,TaxiOut#2520,WheelsOff#2521,WheelsOn#2522,TaxiIn#2523,CRSArrTime#2524,ArrDelay#2525,ArrDel15#2526,ArrivalDelayGroups#2527,ArrTimeBlk#2528,DistanceGroup#2529,DivAirportLandings#2530)
         +- FileScan csv [FlightDate#2470,Airline#2471,Origin#2472,Dest#2473,Cancelled#2474,Diverted#2475,CRSDepTime#2476,DepTime#2477,DepDelayMinutes#2478,DepDelay#2479,ArrTime#2480,ArrDelayMinutes#2481,AirTime#2482,CRSElapsedTime#2483,ActualElapsedTime#2484,Distance#2485,Year#2486,Quarter#2487,Month#2488,DayofMonth#2489,DayOfWeek#2490,Marketing_Airline_Network#2491,Operated_or_Branded_Code_Share_Partners#2492,DOT_ID_Marketing_Airline#2493,... 37 more fields] Batched: false, DataFilters: [AtLeastNNulls(n, FlightDate#2470,Airline#2471,Origin#2472,Dest#2473,Cancelled#2474,Diverted#2475..., Format: CSV, Location: InMemoryFileIndex[gs://msca-bdp-student-gcs/Group4_Final_Project/archive/Combined_Flights_2022.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<FlightDate:string,Airline:string,Origin:string,Dest:string,Cancelled:boolean,Diverted:bool...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:163)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:321)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:387)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3532)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3700)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3698)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3529)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.lang.Thread.run(Thread.java:750)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1514)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:102)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:130)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:121)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:170)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:408)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:399)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:497)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:149)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:50)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:118)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:118)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:151)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:149)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.$anonfun$doExecute$1(ShuffleExchangeExec.scala:166)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 39 more


FlightDate: The date of the flight, which can be used to analyze trends and patterns over time.

Airline: The airline operating the flight, as different airlines may have varying on-time performance records.

Origin and Dest: The departure and arrival airports, which can be used to examine how on-time performance varies by location.

CRSDepTime and CRSArrTime: The scheduled departure and arrival times, which are important for calculating on-time performance.

DepTime and ArrTime: The actual departure and arrival times, which are needed to calculate the actual on-time performance.

DepDelayMinutes and ArrDelayMinutes: The delay in minutes for departure and arrival, which directly relate to on-time performance.

DepDel15 and ArrDel15: Binary indicators of whether a flight was delayed by 15 minutes or more. These can be used to categorize flights as on-time or delayed.

DepartureDelayGroups and ArrivalDelayGroups: Categorical groups that classify flights based on their departure and arrival delays.

Month, DayofMonth, and DayOfWeek: Date-related variables that allow you to analyze how on-time performance varies by month, day of the month, and day of the week.

Marketing_Airline_Network: Information about the airline's marketing network can provide insights into the airline's marketing strategies and how they affect on-time performance.



**Most rich: Marketing_airline_network -- can do k-means clustering?
DepDelayMinutes, ArrDelayMinutes, Origin, Destination, Month, Day of month, Day of week.**

Some very rough plots:

In [None]:
A lot of different approaches. We Can do
1/ seasonality? ACF/PACF, according to time? lets try that -- 