In [1]:
from __future__ import print_function
import sys
import re
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as func
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import udf, expr, concat, col, split, explode, lit,max as max_, monotonically_increasing_id, array, size, sum as sum_, pandas_udf, PandasUDFType
from pyspark import SparkContext
from pyspark.sql import SQLContext


In [None]:
sc = SparkContext(appName="Flight Data Wrangling_Dataframe")
sqlContext = SQLContext(sc)

In [2]:
schema_flights =  StructType([
    StructField('YEAR', StringType(),True),
    StructField('MONTH', StringType(),True),
    StructField('DAY', StringType(),True),
    StructField('DAY_OF_WEEK', StringType(),True),
    StructField('AIRLINE', StringType(),True),
    StructField('FLIGHT_NUMBER', StringType(),True),
    StructField('TAIL_NUMBER', StringType(),True),
    StructField('ORIGIN_AIRPORT', StringType(),True),
    StructField('DESTINATION_AIRPORT', StringType(),True),
    StructField('SCHEDULED_DEPARTURE', StringType(),True),
    StructField('DEPARTURE_TIME', StringType(),True),
    StructField('DEPARTURE_DELAY', StringType(),True),
    StructField('CANCELLED', StringType(),True)
])

schema_airports =  StructType([
    StructField('IATA_CODE', StringType(),True),
    StructField('AIRPORT', StringType(),True),
    StructField('CITY', StringType(),True),
    StructField('STATE', StringType(),True),
    StructField('COUNTRY', StringType(),True),
    StructField('LATITUDE', StringType(),True),
    StructField('LONGITUDE', StringType(),True)
])

flights_file = 'flights.csv.bz2'
airports_file = 'airports.csv'

flights_lines = sc.textFile(flights_file)
flights_lines = flights_lines.zipWithIndex().filter(lambda kv: kv[1] > 2).keys()
flights_raw = flights_lines.map(lambda x: x.split(','))
flights = flights_raw.map(lambda p: (p[0], p[1] , p[2] , p[3], p[4] , p[5] , p[6], p[7] , p[8] , p[9], p[10], '0' if p[11] == '' else p[11], p[24] ))
flights = flights.toDF(schema_flights)
# flights = sqlContext.read.format('csv').options(header='True', infoerSchema = 'true', sep='|').load(flights_file, schema = schema_flights)
airports = sqlContext.read.format('csv').options(header='True', infoerSchema = 'true', sep=',').load(airports_file, schema = schema_airports)

In [3]:
flights = flights.withColumn("DEPARTURE_DELAY", flights["DEPARTURE_DELAY"].cast(IntegerType()))

In [4]:
# Question 1: Find a list of all origin Airports. Store the list of all origin airports in a single file
originAirports = flights.select('ORIGIN_AIRPORT').distinct()
originAirports.show()

+--------------+
|ORIGIN_AIRPORT|
+--------------+
|           BGM|
|           PSE|
|           INL|
|           DLG|
|         12888|
|           MSY|
|           PPG|
|         12003|
|         15041|
|           GEG|
|           SNA|
|           BUR|
|           GRB|
|           GTF|
|         14986|
|         13851|
|           IDA|
|         11150|
|         15412|
|           GRR|
+--------------+
only showing top 20 rows



In [5]:
# Question 2: Find a list of (Origin, Destination) pairs
Origin_Destination = flights.select('ORIGIN_AIRPORT','DESTINATION_AIRPORT').distinct()
Origin_Destination.show()

+--------------+-------------------+
|ORIGIN_AIRPORT|DESTINATION_AIRPORT|
+--------------+-------------------+
|           BQN|                MCO|
|           PHL|                MCO|
|           MCI|                IAH|
|           SPI|                ORD|
|           SNA|                PHX|
|           LBB|                DEN|
|           ORD|                PDX|
|           EWR|                STT|
|           ATL|                GSP|
|           MCI|                MKE|
|           PBI|                DCA|
|           SMF|                BUR|
|           MDW|                MEM|
|           LAS|                LIT|
|           TPA|                ACY|
|           DSM|                EWR|
|           FSD|                ATL|
|           SJC|                LIH|
|           CLE|                SJU|
|         11298|              11057|
+--------------+-------------------+
only showing top 20 rows



In [6]:
# Question 3: Which airport had the largest departure delay in January?
LargestDepartureDelay_January = flights.where("MONTH = '1'").orderBy("DEPARTURE_DELAY", ascending=False).limit(1)
LargestDepartureDelay_January.select("ORIGIN_AIRPORT").show()


+--------------+
|ORIGIN_AIRPORT|
+--------------+
|           BHM|
+--------------+



In [7]:
# Question 4: Which airline carrier had the largest delay on weekends (Saturdays and Sundays)?
LargestDelay_Weekends_airline = flights.where("DAY_OF_WEEK = '6'" or "DAY_OF_WEEK = '7'").orderBy("DEPARTURE_DELAY", ascending=False).limit(1)
LargestDepartureDelay_January.select("AIRLINE").show()

+-------+
|AIRLINE|
+-------+
|     AA|
+-------+



In [8]:
# Question 5: Which airport has the most cancellation of flights?
HighCancellation_Airport = flights.where("CANCELLED = '1'").withColumn('cnt',lit(1)).groupBy("ORIGIN_AIRPORT").agg(sum_("cnt").alias("count")).orderBy('count', ascending=False).limit(1)
HighCancellation_Airport.select("ORIGIN_AIRPORT").show()

+--------------+
|ORIGIN_AIRPORT|
+--------------+
|           ORD|
+--------------+



In [9]:
# Question 6: What are the flights cancellation percentage ratio for each carrier? Provide a printout
total_flight = flights.select(col('AIRLINE').alias("AIRLINES")).withColumn("cnt",lit(1)).groupBy("AIRLINES").agg(sum_("cnt").alias("total_count"))
cancelled_flight = flights.where("CANCELLED = '1'").withColumn("cnt",lit(1)).groupBy("AIRLINE").agg(sum_("cnt").alias("cancel_count"))
total_cancelled = total_flight.join(cancelled_flight, total_flight.AIRLINES == cancelled_flight.AIRLINE,'left')

carrier_cancellation = total_cancelled.withColumn('ratio', col('cancel_count')/col("total_count")).select("AIRLINE","ratio")
carrier_cancellation.show()

+-------+--------------------+
|AIRLINE|               ratio|
+-------+--------------------+
|     UA| 0.01274521400053905|
|     NK|0.017072900604026275|
|     AA|0.015040297086846387|
|     EV|0.026628693111785964|
|     B6|0.016012102693148796|
|     DL|0.004365889886868193|
|     OO| 0.01692861258462182|
|     F9|0.006473204456382932|
|     US|0.020466497244797825|
|     MQ| 0.05099581851258519|
|     HA|0.002241976085588...|
|     AS|0.003877811268258...|
|     VX|0.008626399366751207|
|     WN|0.012713822111098344|
+-------+--------------------+



In [10]:
# Question 7: Find the largest departure delay for each carrier
LargestDepartureDelay_carrier = flights.groupBy("AIRLINE").agg(max_("DEPARTURE_DELAY"))
LargestDepartureDelay_carrier.show()

+-------+--------------------+
|AIRLINE|max(DEPARTURE_DELAY)|
+-------+--------------------+
|     UA|                1314|
|     NK|                 836|
|     AA|                1988|
|     EV|                1274|
|     B6|                1006|
|     DL|                1289|
|     OO|                1378|
|     F9|                1112|
|     US|                 759|
|     MQ|                1544|
|     HA|                1433|
|     AS|                 963|
|     VX|                 644|
|     WN|                 665|
+-------+--------------------+



In [11]:
# Question 8: Find the largest departure delay for each carrier for each month
LargestDepartureDelay_carrier_month = flights.groupBy("AIRLINE","MONTH").agg(max_("DEPARTURE_DELAY"))
LargestDepartureDelay_carrier_month.show()

+-------+-----+--------------------+
|AIRLINE|MONTH|max(DEPARTURE_DELAY)|
+-------+-----+--------------------+
|     OO|    3|                 874|
|     F9|    6|                 813|
|     F9|    1|                 696|
|     WN|    4|                 498|
|     EV|    6|                1210|
|     VX|   11|                 288|
|     MQ|    5|                 956|
|     AA|    1|                1988|
|     UA|    7|                 824|
|     WN|   10|                 621|
|     WN|   11|                 505|
|     AS|   11|                 634|
|     NK|   12|                 723|
|     HA|    9|                1304|
|     MQ|    9|                1083|
|     MQ|    6|                1098|
|     AS|    7|                 830|
|     OO|    4|                 878|
|     VX|    7|                 410|
|     MQ|   10|                1544|
+-------+-----+--------------------+
only showing top 20 rows



In [12]:
# Question 9: For each carrier find the average departure delay.
TotalDepartureDelay_carrier = flights.groupBy("AIRLINE").agg(sum_("DEPARTURE_DELAY").alias("total_delay"))
cnt_carrier = flights.select(col('AIRLINE').alias("AIRLINES")).withColumn("cnt",lit(1)).groupBy("AIRLINES").agg(sum_("cnt").alias("total_cnt"))
TotalDepartureDelay_cnt = TotalDepartureDelay_carrier.join(cnt_carrier, TotalDepartureDelay_carrier.AIRLINE == cnt_carrier.AIRLINES,'left')

AverageDepartureDelay_carrier = TotalDepartureDelay_cnt.withColumn("average_delay",col("total_delay")/col("total_cnt"))

AverageDepartureDelay_carrier.select("AIRLINE","average_delay").show()

+-------+-------------------+
|AIRLINE|      average_delay|
+-------+-------------------+
|     UA|  14.26220664969373|
|     NK|  15.68327383944317|
|     AA|  8.773542906652084|
|     EV|   8.49219111957299|
|     B6| 11.333044995656211|
|     DL|  7.338090448359994|
|     OO|   7.67823058605973|
|     F9|  13.27060856928971|
|     US| 6.0209194071912036|
|     MQ|   9.63204268375465|
|     HA|0.48473882945248586|
|     AS| 1.7795501970785996|
|     VX|  8.947094648078446|
|     WN| 10.450107183471951|
+-------+-------------------+



In [13]:
# Question 10: For each carrier find the average departure delay for each month.
TotalDepartureDelay_carrier_month = flights.groupBy("AIRLINE","MONTH").agg(sum_("DEPARTURE_DELAY").alias("total_delay"))
cnt_carrier_month = flights.select(col("AIRLINE").alias("AIRLINES"),col("MONTH").alias("MONTHS")).withColumn('cnt',lit(1)).groupBy("AIRLINES","MONTHS").agg(sum_("cnt").alias("total_cnt"))

TotalDepartureDelay_cnt_month = TotalDepartureDelay_carrier_month.join(cnt_carrier_month, TotalDepartureDelay_carrier_month.AIRLINE == cnt_carrier_month.AIRLINES,'inner').where("MONTHS == MONTH")

AverageDepartureDelay_carrier_month = TotalDepartureDelay_cnt_month.withColumn('average_delay', col('total_delay')/col('total_cnt'))
AverageDepartureDelay_carrier_month.select("AIRLINE","MONTH","average_delay").show()


+-------+-----+------------------+
|AIRLINE|MONTH|     average_delay|
+-------+-----+------------------+
|     F9|    6|19.004434308881287|
|     OO|    3| 5.789528335796158|
|     EV|    6|14.712840245118997|
|     F9|    1|17.763215697759556|
|     WN|    4| 8.767129982050053|
|     VX|   11|5.7207240487624675|
|     MQ|    5|  9.21988911171405|
|     AA|    1|10.380929683598893|
|     UA|    7| 19.23004432204484|
|     WN|   10| 6.034080906272724|
|     WN|   11|  7.79386803786823|
|     AS|   11|1.7022939068100358|
|     NK|   12|14.084153312716708|
|     HA|    9| 2.089372765680858|
|     MQ|    9| 4.531930949910386|
|     MQ|    6|12.511945526823316|
|     AS|    7|2.5547689779407117|
|     OO|    4| 5.276855399460763|
|     MQ|   10|3.4174779364934946|
|     VX|    7|  9.72038440214378|
+-------+-----+------------------+
only showing top 20 rows



In [14]:
# Question 11: Which date of the year has the highest rate of flight cancellations?
# You should calculate the rate of flight cancellation by dividing number of canceled flights by total
# number of flights.
total_flight_date = flights.select(col("YEAR").alias('YEAR2'),col("MONTH").alias('MONTH2'),col("DAY").alias('DAY2')).withColumn("cnt",lit(1)).groupBy("YEAR2","MONTH2","DAY2").agg(sum_("cnt").alias("total_cnt"))
cancelled_flight_date = flights.where("CANCELLED = '1'").withColumn("cnt",lit(1)).groupBy("YEAR","MONTH","DAY").agg(sum_("cnt").alias("cancel_cnt"))

total_flight_date_cancelled_flight_date = total_flight_date.join(cancelled_flight_date, total_flight_date.DAY2 == cancelled_flight_date.DAY, "left").where("MONTH2 = MONTH").where("YEAR2 = YEAR")

DateOfYear_cancellations = total_flight_date_cancelled_flight_date.withColumn('rate',col("cancel_cnt")/col("total_cnt"))
highest_DateOfYear_cancellations = DateOfYear_cancellations.select("YEAR","MONTH","DAY","rate").orderBy("rate",ascending=False).limit(1)
highest_DateOfYear_cancellations.show()


+----+-----+---+-------------------+
|YEAR|MONTH|DAY|               rate|
+----+-----+---+-------------------+
|2015|    1| 27|0.19030023094688223|
+----+-----+---+-------------------+



In [16]:
# Question 12: Calculate the number of flights to each destination state for each carrier, for which
# state do they have the largest average delay? You will need the airline and airport data sets for this
# question.
destination_airline = flights.select("DESTINATION_AIRPORT","AIRLINE")
airport_state = airports.select("IATA_CODE","STATE")

destination_airline_state = destination_airline.join(airport_state, destination_airline.DESTINATION_AIRPORT == airport_state.IATA_CODE,"inner").select("AIRLINE","STATE")
airline_state = destination_airline_state.withColumn("cnt",lit(1)).groupBy("STATE","AIRLINE").agg(sum_("cnt").alias("total_cnt")).select("STATE","AIRLINE","total_cnt")
airline_state.show()

destination_cnt = flights.select("DESTINATION_AIRPORT").withColumn("cnt",lit(1))
destination_state_cnt = destination_cnt.join(airport_state, destination_cnt.DESTINATION_AIRPORT == airport_state.IATA_CODE,'inner')
state_cnt = destination_state_cnt.groupBy("STATE").agg(sum_("cnt").alias("state_cnt")).select("STATE","state_cnt")

delayed_destination_cnt = flights.where("DEPARTURE_DELAY = 1").select("DESTINATION_AIRPORT").withColumn("cnt",lit(1))
delayed_destination_state_cnt = delayed_destination_cnt.join(airport_state,delayed_destination_cnt.DESTINATION_AIRPORT==airport_state.IATA_CODE,"inner")
state_DelayCnt = delayed_destination_state_cnt.select(col("STATE").alias("STATE2")).withColumn("cnt",lit(1)).groupBy("STATE2").agg(sum_("cnt").alias("delay_cnt"))

state_delay = state_cnt.join(state_DelayCnt,state_cnt.STATE == state_DelayCnt.STATE2,"inner")
state_AvgDelay = state_delay.withColumn("average_delay",col("delay_cnt")/col("state_cnt")).select("STATE","average_delay")

largest_state_AvgDelay = state_AvgDelay.orderBy("average_delay",ascending=False).limit(1).select("STATE")
largest_state_AvgDelay.show()

+-----+-------+---------+
|STATE|AIRLINE|total_cnt|
+-----+-------+---------+
|   AZ|     US|    27781|
|   NJ|     DL|     5009|
|   MO|     AS|      795|
|   WA|     UA|     9428|
|   NM|     OO|     3532|
|   IN|     F9|      728|
|   UT|     MQ|       72|
|   MD|     B6|     1773|
|   CA|     VX|    29693|
|   NV|     DL|    11561|
|   CA|     US|    11469|
|   MO|     OO|     3822|
|   WY|     UA|     1016|
|   AK|     AS|    28708|
|   MD|     UA|     3013|
|   SC|     B6|     1799|
|   WA|     WN|    14605|
|   NJ|     US|     2071|
|   AR|     WN|     2337|
|   FL|     AS|     1319|
+-----+-------+---------+
only showing top 20 rows

+-----+
|STATE|
+-----+
|   GU|
+-----+



In [None]:
sc.stop()