In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, rand, avg

In [11]:
# Obtain the notebook's SparkSession (application name "S4"); getOrCreate()
# hands back an existing session when one is already active.
spark = (
    SparkSession.builder
    .appName("S4")
    .getOrCreate()
)

In [12]:
# Location of the airlines CSV (first row is a header).
file_path = "/content/Airlines.csv"

# Read the CSV with the header row as column names and let Spark infer
# each column's type from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(file_path)

In [13]:
# Add a synthetic "DelayTime" column derived from the binary "Delay" flag:
#   * Delay == 1 -> a pseudo-random integer in 1..30
#     (rand() is in [0, 1), so int(rand * 30) is 0..29, then +1)
#   * otherwise  -> 0
# The values are simulated, not measured. A fixed seed is passed to rand()
# so that the generated column -- and every average/ranking computed from it
# below -- is repeatable across kernel restarts (for the same input
# partitioning) instead of changing on every run.
DELAY_SEED = 42
simulated_delay = (rand(seed=DELAY_SEED) * 30).cast("int") + 1

df_with_delay = df.withColumn(
    "DelayTime",
    when(col("Delay") == 1, simulated_delay).otherwise(0),
)
df_with_delay.show()

+---+-------+------+-----------+---------+---------+----+------+-----+---------+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|DelayTime|
+---+-------+------+-----------+---------+---------+----+------+-----+---------+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|       11|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|       22|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|       22|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|       15|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|        0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|        9|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|        0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|        0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|        6|
| 10|     AA|  2538|        

In [14]:
# Mean DelayTime per airline, exposed as the "Avg_Delay" column.
flights_by_airline = df_with_delay.groupBy("Airline")
mean_delay_column = avg("DelayTime").alias("Avg_Delay")

avg_delay = flights_by_airline.agg(mean_delay_column)
avg_delay.show()

+-------+------------------+
|Airline|         Avg_Delay|
+-------+------------------+
|     UA| 5.038850066982874|
|     AA|5.9863106711056595|
|     EV| 6.266483221956188|
|     B6| 7.327407243816254|
|     DL| 6.967902855267476|
|     OO| 6.999542324989056|
|     F9| 6.921003717472119|
|     YV|3.7616029143897998|
|     US| 5.216695652173913|
|     MQ| 5.436716295588035|
|     OH| 4.260095011876484|
|     HA| 5.018644675510936|
|     XE| 5.831909014971407|
|     AS| 5.217853718071659|
|     CO| 8.700776588692111|
|     FL| 4.647908964325155|
|     WN|10.820174925874364|
|     9E| 6.122256598665764|
+-------+------------------+



In [17]:
# Five airlines with the highest average delay, largest first.
top_5_avg = (
    avg_delay
    .orderBy("Avg_Delay", ascending=False)
    .limit(5)
)
top_5_avg.show()

+-------+------------------+
|Airline|         Avg_Delay|
+-------+------------------+
|     WN|10.820174925874364|
|     CO| 8.700776588692111|
|     B6| 7.327407243816254|
|     OO| 6.999542324989056|
|     DL| 6.967902855267476|
+-------+------------------+



In [18]:
# Average DelayTime per route, where a route is an
# (AirportFrom, AirportTo) pair.
route_keys = ["AirportFrom", "AirportTo"]

avg_route_delay = (
    df_with_delay
    .groupBy(*route_keys)
    .agg(avg("DelayTime").alias("Avg_Delay"))
)
avg_route_delay.show()

+-----------+---------+------------------+
|AirportFrom|AirportTo|         Avg_Delay|
+-----------+---------+------------------+
|        BQN|      MCO| 7.354838709677419|
|        AMA|      MEM| 3.473684210526316|
|        MCI|      IAH|5.2622377622377625|
|        TYS|      MIA|3.1774193548387095|
|        SPI|      ORD|3.5632183908045976|
|        PHL|      MCO|  6.38045738045738|
|        SNA|      PHX| 7.611111111111111|
|        SMF|      BUR|  9.81081081081081|
|        PBI|      DCA| 5.507462686567164|
|        ATL|      GSP| 6.242105263157895|
|        LAS|      LIT|13.225806451612904|
|        MCI|      MKE|11.725274725274724|
|        ORD|      PDX| 6.666666666666667|
|        MLI|      MCO| 6.055555555555555|
|        ROC|      CLE|2.5974025974025974|
|        EWR|      STT| 8.416666666666666|
|        CLE|      SJU| 8.833333333333334|
|        DSM|      MCO|               9.0|
|        CAE|      ATL|5.6767676767676765|
|        AUS|      ELP| 8.591397849462366|
+----------

In [20]:
# Five routes with the highest average delay, largest first.
#
# Stored under its own name so the per-airline ranking held in `top_5_avg`
# is not silently overwritten by a result of a different kind.
#
# The airport columns act as secondary sort keys: several routes can share
# the same Avg_Delay, and without a tie-break the rows that survive
# limit(5) (and their order) are not guaranteed to be the same on every run.
#
# NOTE(review): the leading routes have suspiciously round averages, which
# suggests they may be backed by very few flights -- consider also
# aggregating a flight count and filtering on it; confirm against the data.
top_5_routes = (
    avg_route_delay
    .sort(col("Avg_Delay").desc(), "AirportFrom", "AirportTo")
    .limit(5)
)
top_5_routes.show()

+-----------+---------+---------+
|AirportFrom|AirportTo|Avg_Delay|
+-----------+---------+---------+
|        LAS|      ORF|     28.0|
|        MSP|      SJU|     24.0|
|        ORD|      MHT|     22.5|
|        AVP|      JFK|     22.0|
|        BFL|      SBP|     22.0|
+-----------+---------+---------+



In [21]:
# Analysis finished: shut down the Spark session created at the top of the
# notebook. Keep this as the last cell.
spark.stop()