In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, when

In [16]:
spark = SparkSession.builder.appName("airlines").getOrCreate()

In [17]:
file_path = "/content/Airlines.csv"
data = spark.read.csv(file_path, header=True, inferSchema=True)

In [18]:
data.show()

+---+-------+------+-----------+---------+---------+----+------+-----+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|
+---+-------+------+-----------+---------+---------+----+------+-----+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|
| 10|     AA|  2538|        LAS|      ORD|        3|  40|   200|    1|
| 11|     CO|   223|        ANC|      SEA|        3|  49|   201|    1|
| 12| 

In [19]:
delay_data = data.withColumn("DelayTime", when(col("Delay")==1, (rand()*30).cast("int")+1).otherwise(0))
delay_data.show()

+---+-------+------+-----------+---------+---------+----+------+-----+---------+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|DelayTime|
+---+-------+------+-----------+---------+---------+----+------+-----+---------+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|       21|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|       16|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|        8|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|       19|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|        0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|       22|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|        0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|        0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|        3|
| 10|     AA|  2538|        

In [20]:
#Filter flights that were more than 15 minutes delayed.
filtered_df = delay_data.filter(col("DelayTime")>15)
filtered_df.show()

+---+-------+------+-----------+---------+---------+----+------+-----+---------+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|DelayTime|
+---+-------+------+-----------+---------+---------+----+------+-----+---------+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|       21|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|       16|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|       19|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|       22|
| 11|     CO|   223|        ANC|      SEA|        3|  49|   201|    1|       30|
| 24|     HA|    17|        LAS|      HNL|        3| 100|   380|    1|       25|
| 29|     HA|   206|        HNL|      OGG|        3| 300|    36|    1|       27|
| 39|     OH|  6338|        GSO|      ATL|        3| 315|    93|    1|       26|
| 45|     9E|  3854|        DLH|      MSP|        3| 320|    58|    1|       19|
| 53|     OO|  4719|        

In [21]:
#Analyze whether there is any correla on between the flight length and the likelihood of a delay.
correlation = delay_data.corr("Length", "DelayTime")
print("Correlation is ", correlation)

Correlation is  0.031168906169925653


In [22]:
spark.stop()