In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, when

In [2]:
# Obtain the shared SparkSession for this notebook (reuses an existing one
# if present, otherwise creates it) under the application name "S3".
spark = (
    SparkSession.builder
    .appName("S3")
    .getOrCreate()
)

In [4]:
# Path of the airlines CSV file to load.
file_path = "/content/Airlines.csv"

# Read the CSV using its first row as column names and letting Spark infer
# each column's type from the data.
df = spark.read.options(header=True, inferSchema=True).csv(file_path)

In [5]:
# Preview the loaded data: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

+---+-------+------+-----------+---------+---------+----+------+-----+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|
+---+-------+------+-----------+---------+---------+----+------+-----+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|
| 10|     AA|  2538|        LAS|      ORD|        3|  40|   200|    1|
| 11|     CO|   223|        ANC|      SEA|        3|  49|   201|    1|
| 12| 

In [7]:
# Add a synthetic "DelayTime" column:
#   * rows with Delay == 1 get a pseudo-random integer in 1..MAX_DELAY_TIME
#   * every other row gets 0
#
# rand() is given a fixed seed so that the generated values -- and therefore
# the filter and correlation computed from them further down -- do not change
# on every fresh kernel run. Spark documents rand() as non-deterministic in
# the general case; the seed fixes the sequence for a given input/partitioning.
DELAY_SEED = 42
MAX_DELAY_TIME = 30

# rand() yields a double in [0.0, 1.0); scaling, truncating to int and adding 1
# maps it to an integer in 1..MAX_DELAY_TIME inclusive.
random_delay = (rand(seed=DELAY_SEED) * MAX_DELAY_TIME).cast("int") + 1

df_with_delay = df.withColumn(
    "DelayTime",
    when(col("Delay") == 1, random_delay).otherwise(0),
)
df_with_delay.show()

+---+-------+------+-----------+---------+---------+----+------+-----+---------+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|DelayTime|
+---+-------+------+-----------+---------+---------+----+------+-----+---------+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|        1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|       21|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|        1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|        7|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|        0|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|       28|
|  7|     DL|  1768|        LAX|      MSP|        3|  30|   220|    0|        0|
|  8|     DL|  2722|        PHX|      DTW|        3|  30|   228|    0|        0|
|  9|     DL|  2606|        SFO|      MSP|        3|  35|   216|    1|        3|
| 10|     AA|  2538|        

In [8]:
# Keep only the rows whose DelayTime is strictly greater than 15.
delay_over_15 = col("DelayTime") > 15
filter_15 = df_with_delay.filter(delay_over_15)
filter_15.show()

+---+-------+------+-----------+---------+---------+----+------+-----+---------+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|DelayTime|
+---+-------+------+-----------+---------+---------+----+------+-----+---------+
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|       21|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|       28|
| 29|     HA|   206|        HNL|      OGG|        3| 300|    36|    1|       19|
| 41|     US|   149|        SEA|      PHX|        3| 315|   166|    1|       29|
| 47|     OO|  4651|        FAR|      MSP|        3| 320|    62|    1|       22|
| 56|     9E|  3886|        DSM|      ATL|        3| 330|   135|    1|       24|
| 81|     US|  1732|        ATL|      CLT|        3| 330|    68|    1|       21|
| 84|     XE|  2263|        BTR|      IAH|        3| 330|    72|    1|       16|
| 94|     AA|   674|        ORD|      MIA|        3| 335|   185|    1|       18|
| 96|     CO|   463|        

In [9]:
# Correlation between the Length column and the DelayTime column,
# computed by Spark and returned to the driver as a Python float.
x_col, y_col = "Length", "DelayTime"
correlation = df_with_delay.corr(x_col, y_col)
print(f"Correlation is {correlation}")

Correlation is 0.030757521580530203


In [10]:
# Stop the SparkSession now that the analysis is finished; any later cell
# that uses `spark` or the DataFrames above will need the session recreated.
spark.stop()