In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, avg, when

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("airlines2")
    .getOrCreate()
)

In [4]:
# Load the airlines CSV: the first row supplies the column names and the
# column types are inferred from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/Airlines.csv")
)
# Preview the loaded rows to sanity-check the parsed columns.
data.show()

+---+-------+------+-----------+---------+---------+----+------+-----+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|
+---+-------+------+-----------+---------+---------+----+------+-----+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|
| 10|     AA|  2538|        LAS|      ORD|        3|  40|   200|    1|
| 11|     CO|   223|        ANC|      SEA|        3|  49|   201|    1|
| 12| 

In [5]:
# Synthesize a "DelayTime" column: rows with Delay == 1 get a pseudo-random
# integer in 1..30 (rand() is in [0, 1), so int(rand * 30) + 1 spans 1..30);
# every other row gets 0.
#
# rand() is seeded so that a fresh "Restart & Run All" regenerates the same
# values (for the same input file and partitioning). Unseeded, every run
# produced different delay times and therefore different averages downstream.
DELAY_SEED = 42
random_delay = (rand(seed=DELAY_SEED) * 30).cast("int") + 1
data = data.withColumn(
    "DelayTime",
    when(col("Delay") == 1, random_delay).otherwise(0),
)
data.show()

+---+-------+------+-----------+---------+---------+----+------+-----+---------+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|DelayTime|
+---+-------+------+-----------+---------+---------+----+------+-----+---------+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|       14|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|       14|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|       22|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|       27|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|        0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|       21|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|        0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|        0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|       15|
| 10|     AA|  2538|        

In [8]:
# Mean DelayTime per airline. The mean is taken over all of an airline's rows,
# including those whose DelayTime is 0.
mean_delay_time = avg("DelayTime").alias("AverageDelayTime")
avg_delay_df = data.groupBy("Airline").agg(mean_delay_time)
avg_delay_df.show()

+-------+------------------+
|Airline|  AverageDelayTime|
+-------+------------------+
|     UA| 5.002498280169449|
|     AA| 6.069695111266865|
|     EV| 6.233713325947897|
|     B6|  7.25231890459364|
|     DL| 7.000738431243846|
|     OO| 7.002507263103435|
|     F9|  6.82187112763321|
|     YV|3.7739890710382515|
|     US| 5.185623188405797|
|     MQ| 5.372326184947411|
|     OH| 4.230878859857482|
|     HA| 4.920401577626389|
|     XE| 5.870943905416693|
|     AS| 5.288292215151251|
|     CO| 8.759304858414623|
|     FL|  4.70936764776492|
|     WN| 10.78018427792597|
|     9E| 6.161703567630282|
+-------+------------------+



In [11]:
# Rank airlines from highest to lowest average delay and keep the top five.
top_delay_df = avg_delay_df.orderBy("AverageDelayTime", ascending=False).limit(5)
top_delay_df.show()

+-------+-----------------+
|Airline| AverageDelayTime|
+-------+-----------------+
|     WN|10.78018427792597|
|     CO|8.759304858414623|
|     B6| 7.25231890459364|
|     OO|7.002507263103435|
|     DL|7.000738431243846|
+-------+-----------------+



In [12]:
# Shut down the Spark session now that the analysis is finished; re-run the
# session-creation cell before executing any Spark cell again.
spark.stop()