In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, avg, when

In [2]:
# Obtain the Spark session for this notebook under the application name
# "airlines2"; getOrCreate() hands back an already-active session if one exists.
spark = (
    SparkSession.builder
    .appName("airlines2")
    .getOrCreate()
)

In [3]:
# Read the airlines CSV: the first line is treated as the header row and the
# column types are inferred from the file contents rather than left as strings.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/Airlines.csv")
)
# Print a preview of the loaded rows.
data.show()

+---+-------+------+-----------+---------+---------+----+------+-----+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|
+---+-------+------+-----------+---------+---------+----+------+-----+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|
| 10|     AA|  2538|        LAS|      ORD|        3|  40|   200|    1|
| 11|     CO|   223|        ANC|      SEA|        3|  49|   201|    1|
| 12| 

In [4]:
# Add a simulated DelayTime column: rows with Delay == 1 receive a
# pseudo-random integer from 1 to 30, every other row receives 0.
#
# rand() is given an explicit seed so that a fresh-kernel re-run over the same
# input (with the same partitioning) reproduces the same DelayTime values.
# Without a seed, a new random seed is chosen each time this cell builds the
# column, so the per-airline averages computed later change on every run.
DELAY_SEED = 42
delay_data = data.withColumn(
    "DelayTime",
    when(
        col("Delay") == 1,
        # rand() is in [0, 1): scaling by 30 and truncating gives 0..29, +1 gives 1..30.
        (rand(seed=DELAY_SEED) * 30).cast("int") + 1,
    ).otherwise(0),
)
delay_data.show()

+---+-------+------+-----------+---------+---------+----+------+-----+---------+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|DelayTime|
+---+-------+------+-----------+---------+---------+----+------+-----+---------+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|       17|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|       29|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|       16|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|       14|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|        0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|        1|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|        0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|        0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|       18|
| 10|     AA|  2538|        

In [5]:
# Aggregate per airline carrier: one row per distinct Airline value holding the
# mean of its DelayTime values, exposed as the AverageDelayTime column.
avg_delay_df = (
    delay_data
    .groupBy("Airline")
    .agg(avg(col("DelayTime")).alias("AverageDelayTime"))
)
avg_delay_df.show()

+-------+------------------+
|Airline|  AverageDelayTime|
+-------+------------------+
|     UA|  5.04066041493175|
|     AA| 5.964101103907482|
|     EV|6.1898652753457455|
|     B6| 7.237632508833922|
|     DL|  7.01711519527404|
|     OO|  7.06397500696462|
|     F9| 6.890644361833953|
|     YV|3.7143169398907103|
|     US| 5.238782608695653|
|     MQ| 5.367490779948095|
|     OH|4.2386381631037215|
|     HA| 4.873072785944783|
|     XE| 5.909882413416437|
|     AS| 5.235463342341557|
|     CO|  8.75433279666635|
|     FL| 4.677629999519854|
|     WN|10.792926448239582|
|     9E| 6.172387121724838|
+-------+------------------+



In [6]:
# Shut down the Spark session now that the analysis is finished; cells that use
# `spark` or the DataFrames above will not work after this until the session is
# created again.
spark.stop()