In [2]:
# findspark must be initialised before any pyspark import in the cells below.
# NOTE(review): presumably this locates the local Spark install and adds it to
# sys.path — confirm against the environment this notebook runs in.
import findspark
findspark.init() 

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
# Session handle used by every later cell (spark.read, spark.sql, spark.table).
# getOrCreate() means re-running this cell reuses an already-running session.
spark = (
    SparkSession.builder
    .appName('SparkSqlExampleApp1')
    .getOrCreate()
)

In [4]:
# Explicit schema applied when reading departuredelays.csv.
# 'date' is read as a string; the sample output shows values such as
# '01031755', so presumably this is to keep the leading zero.
# Every field is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('date', StringType()),
        ('delay', IntegerType()),
        ('distance', IntegerType()),
        ('origin', StringType()),
        ('destination', StringType()),
    )
])

In [15]:
# Read the flight-delay CSV with the explicit schema (first line is a header),
# then expose it to spark.sql() under the name 'us_delay_flights_tbl'.
df = spark.read.csv(
    'departuredelays.csv',
    schema=schema,
    header=True,
)
df.createOrReplaceTempView('us_delay_flights_tbl')

In [16]:
# Sanity check: the column names should match the schema declared for the read.
df.columns

['date', 'delay', 'distance', 'origin', 'destination']

In [17]:
# Flights longer than 1000 (distance units as stored in the CSV), longest first.
long_haul_flights_df = spark.sql("""SELECT distance, origin, destination, date
    FROM us_delay_flights_tbl WHERE distance >1000
    ORDER BY distance DESC""")
long_haul_flights_df.show(10)

+--------+------+-----------+--------+
|distance|origin|destination|    date|
+--------+------+-----------+--------+
|    4330|   HNL|        JFK|03131530|
|    4330|   HNL|        JFK|03071625|
|    4330|   HNL|        JFK|03121530|
|    4330|   HNL|        JFK|03021625|
|    4330|   HNL|        JFK|03061625|
|    4330|   HNL|        JFK|03081530|
|    4330|   HNL|        JFK|03091530|
|    4330|   HNL|        JFK|03011625|
|    4330|   HNL|        JFK|03151530|
|    4330|   HNL|        JFK|03051625|
+--------+------+-----------+--------+
only showing top 10 rows



In [18]:
# SFO -> ORD flights whose delay exceeds 120, worst delay first.
# truncate=False prints each cell at full width.
sfo_ord_delays_df = spark.sql("""SELECT date, delay, origin, destination
    FROM us_delay_flights_tbl WHERE origin ='SFO' AND destination = 'ORD' AND delay>120 ORDER BY delay DESC""")
sfo_ord_delays_df.show(10, truncate=False)

+--------+-----+------+-----------+
|date    |delay|origin|destination|
+--------+-----+------+-----------+
|02190925|1638 |SFO   |ORD        |
|01031755|396  |SFO   |ORD        |
|01022330|326  |SFO   |ORD        |
|01051205|320  |SFO   |ORD        |
|01190925|297  |SFO   |ORD        |
|02171115|296  |SFO   |ORD        |
|01071040|279  |SFO   |ORD        |
|01051550|274  |SFO   |ORD        |
|03120730|266  |SFO   |ORD        |
|01261104|258  |SFO   |ORD        |
+--------+-----+------+-----------+
only showing top 10 rows



In [30]:
# Reformat the raw 8-digit 'date' string (e.g. '02190925') into 'MM/dd HH:mm'
# (e.g. '02/19 09:25') by regrouping its four 2-digit fields.
#
# The result overwrites the 'date' column instead of adding a separate
# 'date(MM/dd)' column, for two reasons:
#   * the recorded output of new_df.show() and the later cells
#     (groupBy('date'), to_timestamp(col('date'), ...)) all expect the
#     formatted value to live in 'date';
#   * a column name containing parentheses is likely to be rejected when the
#     frame is written with saveAsTable (default Parquet format) — verify on
#     the Spark version in use.
new_df = df.withColumn(
    'date',
    regexp_replace(
        col('date'),
        r"(\d{2})(\d{2})(\d{2})(\d{2})",
        "$1/$2 $3:$4",
    ),
)

In [40]:
# Re-register the SQL view name so that later spark.sql() cells read new_df.
# This replaces the view created from df: any earlier SQL cell re-run after
# this point will see new_df's data instead.
new_df.createOrReplaceTempView('us_delay_flights_tbl')
# Preview the first 10 rows of new_df.
new_df.show(10)

+-----------+-----+--------+------+-----------+
|       date|delay|distance|origin|destination|
+-----------+-----+--------+------+-----------+
|01/01 12:45|    6|     602|   ABE|        ATL|
|01/02 06:00|   -8|     369|   ABE|        DTW|
|01/02 12:45|   -2|     602|   ABE|        ATL|
|01/02 06:05|   -4|     602|   ABE|        ATL|
|01/03 12:45|   -4|     602|   ABE|        ATL|
|01/03 06:05|    0|     602|   ABE|        ATL|
|01/04 12:43|   10|     602|   ABE|        ATL|
|01/04 06:05|   28|     602|   ABE|        ATL|
|01/05 12:45|   88|     602|   ABE|        ATL|
|01/05 06:05|    9|     602|   ABE|        ATL|
+-----------+-----+--------+------+-----------+
only showing top 10 rows



In [53]:
# Number of non-null 'delay' values per distinct 'date' value, largest first.
# Note this counts every flight record for that date value, not only late ones
# (count('delay') does not filter on delay > 0).
delays_per_date_df = (
    new_df
    .select('date', 'delay', 'origin', 'destination')
    .groupBy('date')
    .agg(count('delay').alias('Delays_count'))
)
delays_per_date_df.orderBy('Delays_count', ascending=False).show()

+-----------+------------+
|       date|Delays_count|
+-----------+------------+
|03/17 06:00|         368|
|03/24 06:00|         368|
|03/10 06:00|         367|
|03/31 06:00|         367|
|03/28 06:00|         366|
|03/14 06:00|         365|
|03/21 06:00|         365|
|03/19 06:00|         363|
|03/20 06:00|         363|
|03/27 06:00|         363|
|03/13 06:00|         362|
|03/18 06:00|         362|
|03/12 06:00|         362|
|03/11 06:00|         361|
|03/26 06:00|         361|
|03/25 06:00|         361|
|03/03 06:00|         333|
|03/06 06:00|         326|
|02/21 06:00|         325|
|02/28 06:00|         324|
+-----------+------------+
only showing top 20 rows



In [43]:
# Parse the formatted 'date' string into a real timestamp column 'date_ts'
# and drop the string version.
#
# The pattern uses 'HH' (hour-of-day, 00-23), not 'hh' (clock-hour-of-am-pm,
# 01-12): the data holds 24-hour times such as '17:55' and '23:30', which 'hh'
# cannot parse correctly.
# Assumes new_df['date'] holds 'MM/dd HH:mm' strings, as shown in the recorded
# new_df.show() output. The data carries no year, so the parsed timestamps get
# Spark's default year.
delay_ts_df = (
    new_df
    .withColumn('date_ts', to_timestamp(col('date'), "MM/dd HH:mm"))
    .drop("date")
)

In [54]:
# Same SFO -> ORD / delay > 120 query as earlier in the notebook, run again
# now that the view has been re-registered; shows the top 20 rows.
sfo_ord_late_df = spark.sql("""SELECT date, delay, origin, destination
    FROM us_delay_flights_tbl WHERE origin ='SFO' AND destination = 'ORD' AND delay>120  ORDER BY delay DESC""")
sfo_ord_late_df.show(20)

+-----------+-----+------+-----------+
|       date|delay|origin|destination|
+-----------+-----+------+-----------+
|02/19 09:25| 1638|   SFO|        ORD|
|01/03 17:55|  396|   SFO|        ORD|
|01/02 23:30|  326|   SFO|        ORD|
|01/05 12:05|  320|   SFO|        ORD|
|01/19 09:25|  297|   SFO|        ORD|
|02/17 11:15|  296|   SFO|        ORD|
|01/07 10:40|  279|   SFO|        ORD|
|01/05 15:50|  274|   SFO|        ORD|
|03/12 07:30|  266|   SFO|        ORD|
|01/26 11:04|  258|   SFO|        ORD|
|01/16 12:10|  225|   SFO|        ORD|
|02/09 18:00|  223|   SFO|        ORD|
|01/22 10:40|  215|   SFO|        ORD|
|03/12 11:55|  203|   SFO|        ORD|
|02/11 12:56|  197|   SFO|        ORD|
|03/31 14:05|  196|   SFO|        ORD|
|01/03 19:20|  193|   SFO|        ORD|
|01/02 14:10|  190|   SFO|        ORD|
|03/17 12:15|  189|   SFO|        ORD|
|01/10 14:10|  184|   SFO|        ORD|
+-----------+-----+------+-----------+
only showing top 20 rows



In [73]:
# Label every flight with a human-readable delay bucket.
#
# CASE branches are evaluated top to bottom and the first match wins, so each
# branch only needs its lower bound. The previous form used strict bounds on
# both sides (e.g. "delay > 120 AND delay < 360"), which left delays of exactly
# 60, 120 and 360 unmatched — they fell through to ELSE and were labelled
# 'Early'. Buckets are now:
#   delay > 360        -> 'Very Long Delays'
#   120 < delay <= 360 -> 'Long Delays'
#   60  < delay <= 120 -> 'Short Delays'
#   0   < delay <= 60  -> 'Tolerable Delays'
#   delay = 0          -> 'No Delays'
#   otherwise          -> 'Early' (negative delay; a NULL delay also lands here)
spark.sql("""SELECT delay, origin, destination,
    CASE
    WHEN delay > 360 THEN 'Very Long Delays'
    WHEN delay > 120 THEN 'Long Delays'
    WHEN delay > 60 THEN 'Short Delays'
    WHEN delay > 0 THEN 'Tolerable Delays'
    WHEN delay = 0 THEN 'No Delays'
    ELSE 'Early'
    END AS Flight_Delays
    FROM us_delay_flights_tbl
    ORDER BY origin, delay DESC""").show(25)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
|  180|   ABE|        ATL|  Long Delays|
|  173|   ABE|        DTW|  Long Delays|
|  165|   ABE|        ATL|  Long Delays|
|  159|   ABE|        ATL|  Long Delays|
|  159|   ABE|        ORD|  Long Delays|
|  158|   ABE|        ATL|  Long Delays|
|  151|   ABE|        DTW|  Long Delays|
|  127|   ABE|        ATL|  Long Delays|
|  121|   ABE|        DTW|  Long Delays|
|  118|   ABE|        DTW| Short Delays|
|  116|   ABE|        ORD| Short Delays|
|  111|   ABE|  

In [56]:
# Create the database used for the managed table below.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE DATABASE raises once
# learn_spark_db already exists (e.g. on a second run of the notebook).
spark.sql("CREATE DATABASE IF NOT EXISTS learn_spark_db")

DataFrame[]

In [57]:
# Make learn_spark_db the current database, so the unqualified table name in
# the saveAsTable cell below is created inside it.
spark.sql("USE learn_spark_db")

DataFrame[]

In [58]:
# Persist new_df as a table named managed_us_delay_flights_tbl in the current
# database.
# mode("overwrite") makes the cell idempotent: the default save mode raises an
# error when the table already exists, which breaks every re-run of the
# notebook. The table is fully derived from new_df, so replacing it loses
# nothing.
(new_df.write
    .mode("overwrite")
    .saveAsTable("managed_us_delay_flights_tbl"))

In [59]:
# Two ways to get a DataFrame for the name 'us_delay_flights_tbl':
# a SQL "SELECT *" and the spark.table() shortcut.
# NOTE(review): a temp view with this name was registered earlier in the
# notebook, so presumably both calls resolve to that view rather than to the
# managed table managed_us_delay_flights_tbl — confirm this is intended.
us_flights_df = spark.sql("SELECT * FROM us_delay_flights_tbl")
# us_flights_df2 is not used in the cells visible here.
us_flights_df2 = spark.table("us_delay_flights_tbl")

In [60]:
# Preview the DataFrame obtained from the SQL query (show() prints the first
# 20 rows by default).
us_flights_df.show()

+-----------+-----+--------+------+-----------+
|       date|delay|distance|origin|destination|
+-----------+-----+--------+------+-----------+
|01/01 12:45|    6|     602|   ABE|        ATL|
|01/02 06:00|   -8|     369|   ABE|        DTW|
|01/02 12:45|   -2|     602|   ABE|        ATL|
|01/02 06:05|   -4|     602|   ABE|        ATL|
|01/03 12:45|   -4|     602|   ABE|        ATL|
|01/03 06:05|    0|     602|   ABE|        ATL|
|01/04 12:43|   10|     602|   ABE|        ATL|
|01/04 06:05|   28|     602|   ABE|        ATL|
|01/05 12:45|   88|     602|   ABE|        ATL|
|01/05 06:05|    9|     602|   ABE|        ATL|
|01/06 12:15|   -6|     602|   ABE|        ATL|
|01/06 17:25|   69|     602|   ABE|        ATL|
|01/06 12:30|    0|     369|   ABE|        DTW|
|01/06 06:25|   -3|     602|   ABE|        ATL|
|01/07 06:00|    0|     369|   ABE|        DTW|
|01/07 17:25|    0|     602|   ABE|        ATL|
|01/07 12:30|    0|     369|   ABE|        DTW|
|01/07 06:25|    0|     602|   ABE|     