# Airline delays 
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

~140GB

In [2]:
# imports
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# List the entries of the airlines dataset directory.
dbutils.fs.ls('/databricks-datasets/airlines')

In [4]:
# Total size of the dataset directory: add up the `size` attribute of every
# entry that dbutils.fs.ls reports for DATA_PATH.
DATA_PATH = 'dbfs:/databricks-datasets/airlines/'
# The result is named total_size (not `sum`) so the builtin sum() stays usable
# for the rest of the notebook session.
total_size = sum(item.size for item in dbutils.fs.ls(DATA_PATH))
total_size


In [5]:
# Print the README file that sits next to the dataset's part files.
with open("/dbfs/databricks-datasets/airlines/README.md") as readme:
    x = readme.read()

print(x)

In [6]:
# airlines_first = spark.read.option("header", "true").csv("dbfs:/databricks-datasets/airlines/part-00000")
# headers = airlines_first.columns
# airlines_first.count()
#airlines = spark.read.csv("dbfs:/databricks-datasets/airlines/part-*")
#airlines.printSchema()

In [7]:
from pyspark.sql.types import *
# Explicit schema for the airline CSV files. The (name, type) pairs are listed
# in column order; every field is declared nullable.
airline_schema = StructType([
    StructField(column, dtype(), True)
    for column, dtype in (
        ("Year", IntegerType),
        ("Month", IntegerType),
        ("DayofMonth", IntegerType),
        ("DayOfWeek", IntegerType),
        ("DepTime", IntegerType),
        ("CRSDepTime", IntegerType),
        ("ArrTime", IntegerType),
        ("CRSArrTime", IntegerType),
        ("UniqueCarrier", StringType),
        ("FlightNum", StringType),
        ("TailNum", StringType),
        ("ActualElapsedTime", IntegerType),
        ("CRSElapsedTime", IntegerType),
        ("AirTime", IntegerType),
        ("ArrDelay", IntegerType),
        ("DepDelay", IntegerType),
        ("Origin", StringType),
        ("Dest", StringType),
        ("Distance", IntegerType),
        ("TaxiIn", IntegerType),
        ("TaxiOut", IntegerType),
        ("Cancelled", IntegerType),
        ("CancellationCode", StringType),
        ("Diverted", IntegerType),
        ("CarrierDelay", IntegerType),
        ("WeatherDelay", IntegerType),
        ("NASDelay", IntegerType),
        ("SecurityDelay", IntegerType),
        ("LateAircraftDelay", IntegerType),
        ("IsArrDelayed", StringType),
        ("IsDepDelayed", StringType),
    )
])

In [8]:
# Load every part file as CSV with the explicit airline_schema. The header
# option is off, so column names and types come from the schema alone.
# NOTE(review): part-00000 is read with header=true elsewhere in this
# notebook; if it really starts with a header line, that line is loaded here
# as a data row -- confirm.
airlines = (
    spark.read
    .format("csv")
    .option("header", "false")
    .schema(airline_schema)
    .load("dbfs:/databricks-datasets/airlines/part-*")
)
airlines.printSchema()

In [9]:
# Count the rows loaded from the CSV part files.
airlines.count()

In [10]:
# Persist the parsed data as Parquet. mode="overwrite" keeps this cell
# re-runnable: the default save mode raises if the target path already exists.
airlines.write.parquet("dbfs:/tmp/parquet/airlines_data_in_parquet.parquet", mode="overwrite")

In [11]:
# List /tmp/parquet to inspect what the Parquet write produced.
dbutils.fs.ls('/tmp/parquet')

In [12]:
# Read the Parquet copy back into a DataFrame and render it in the notebook.
airlines_df = (
    spark.read
    .format("parquet")
    .load("dbfs:/tmp/parquet/airlines_data_in_parquet.parquet")
)
display(airlines_df)


Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed,IsDepDelayed
2006.0,2.0,12.0,7.0,625.0,622.0,744.0,746.0,OO,6612.0,N746SK,79.0,84.0,55.0,-2.0,3.0,SLC,DEN,391.0,8.0,16.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,YES
2006.0,2.0,12.0,7.0,1506.0,1457.0,1736.0,1737.0,OO,6614.0,N771SK,90.0,100.0,75.0,-1.0,9.0,DEN,MCI,533.0,4.0,11.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,YES
2006.0,2.0,12.0,7.0,1148.0,1151.0,1330.0,1347.0,OO,6615.0,N910SW,102.0,116.0,89.0,-17.0,-3.0,DEN,GTF,624.0,4.0,9.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,NO
2006.0,2.0,12.0,7.0,1445.0,1454.0,1623.0,1645.0,OO,6615.0,N910SW,98.0,111.0,84.0,-22.0,-9.0,GTF,DEN,624.0,7.0,7.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,NO
2006.0,2.0,12.0,7.0,1013.0,1003.0,1258.0,1255.0,OO,6616.0,N710SK,105.0,112.0,90.0,3.0,10.0,DEN,CID,692.0,4.0,11.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,YES,YES
2006.0,2.0,12.0,7.0,1017.0,1021.0,1251.0,1250.0,OO,6618.0,N969SW,94.0,89.0,77.0,1.0,-4.0,DEN,FSD,483.0,4.0,13.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,YES,NO
2006.0,2.0,12.0,7.0,847.0,845.0,947.0,954.0,OO,6619.0,N958SW,120.0,129.0,103.0,-7.0,2.0,DEN,PSP,776.0,4.0,13.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,YES
2006.0,2.0,12.0,7.0,1020.0,1024.0,1320.0,1325.0,OO,6619.0,N958SW,120.0,121.0,104.0,-5.0,-4.0,PSP,DEN,776.0,7.0,9.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,NO
2006.0,2.0,12.0,7.0,907.0,910.0,1131.0,1142.0,OO,6620.0,N752SK,84.0,92.0,65.0,-11.0,-3.0,DEN,TUL,541.0,5.0,14.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,NO
2006.0,2.0,12.0,7.0,1737.0,1735.0,1841.0,1843.0,OO,6621.0,N978SW,124.0,128.0,108.0,-2.0,2.0,DEN,PSP,776.0,4.0,12.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,NO,YES


In [13]:
# Show the schema of the DataFrame read back from Parquet.
airlines_df.printSchema()

In [14]:
# Count the rows in the Parquet-backed DataFrame.
airlines_df.count()

In [15]:
import numpy as np
import matplotlib.pyplot as mplt
import matplotlib.ticker as mtick

def plotHistogramData(data):
    """Draw a bar-chart histogram from pre-computed bins.

    Args:
        data: A ``(binSides, binCounts)`` pair, e.g. the result of
            ``RDD.histogram``. ``binCounts`` holds one value per bin and
            ``binSides`` is expected to hold the bin edges (one more entry
            than ``binCounts``), since one tick is drawn per edge.

    Returns:
        None. The figure is shown with ``mplt.show()``.
    """
    binSides, binCounts = data

    N = len(binCounts)
    ind = np.arange(N)
    width = 1

    _, ax = mplt.subplots()
    # With matplotlib's default centre alignment, a bar at ind + 0.5 with
    # width 1 spans the gap between tick i and tick i + 1.
    ax.bar(ind + 0.5, binCounts, width, color='b')

    ax.set_ylabel('Frequencies')
    ax.set_title('Histogram')
    ax.set_xticks(np.arange(N + 1))
    # Format the edge values ourselves. Installing a FormatStrFormatter on the
    # x axis afterwards would replace these fixed labels and print the tick
    # positions (0, 1, 2, ...) instead of the bin edges.
    ax.set_xticklabels(['%.2e' % side for side in binSides])
    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.2e'))

    mplt.show()
#plotHistogramData(airlines_first.select('Distance').rdd.flatMap(lambda x: x[0]).histogram(11))    

In [16]:
# Read only the first part file, taking column names from its header row,
# and print how many rows it contains.
airlines_first = (
    spark.read
    .option("header", "true")
    .csv("dbfs:/databricks-datasets/airlines/part-00000")
)
print(airlines_first.count())

In [17]:

# Load the first part file (with header) and expose it to SQL as the temp
# view "airlines_test".
airlines_test = sqlContext.read.option("header", "true").csv("dbfs:/databricks-datasets/airlines/part-00000")
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0.
airlines_test.createOrReplaceTempView("airlines_test")
airlines_test.cache()


# Single-row DataFrame holding the record count of airlines_test, also
# registered as a temp view so it can be queried by name.
airports_count = sqlContext.sql("select count(*) from airlines_test")
airports_count.createOrReplaceTempView("airports_count")
airports_count.cache()
sqlContext.sql("select count(*) as Total_Records from airlines_test").show()

In [18]:
# Render the count(*) result computed from the airlines_test view.
display(airports_count)

In [19]:
# Combine all partitions together. ONLY RUN IF YOU HAVE TIME ON YOUR HANDS!
#
# part-00000 is already loaded (with its header) as airlines_first, so it is
# skipped here. The remaining part files are read with one multi-path CSV
# read instead of one read + union per file, which keeps the query plan flat.
first_part_path = DATA_PATH + "part-00000"
other_part_paths = [
    item.path
    for item in dbutils.fs.ls(DATA_PATH)
    # \d+$ requires at least one digit and nothing after the digits.
    if re.search(r"part-\d+$", item.path) and item.path != first_part_path
]

airlines_all = airlines_first
if other_part_paths:  # nothing to union when only part-00000 exists
    airlines_rest = spark.read.option("header", "false").csv(other_part_paths)
    # union() matches columns by position, so the result keeps the column
    # names of airlines_first.
    airlines_all = airlines_all.union(airlines_rest)

In [20]:
# Expose the combined DataFrame to SQL as the temp view "airlines_all" and
# mark it for caching. createOrReplaceTempView replaces registerTempTable,
# which has been deprecated since Spark 2.0.
airlines_all.createOrReplaceTempView("airlines_all")
airlines_all.cache()

In [21]:
# Single-row DataFrame holding the record count of the airlines_all view,
# registered as its own temp view and marked for caching.
airports_all_count = sqlContext.sql("select count(*) from airlines_all")
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0.
airports_all_count.createOrReplaceTempView("airports_all_count")
airports_all_count.cache()

In [22]:
# Render the count(*) result computed from the airlines_all view.
display(airports_all_count)

In [23]:
# Run the same count(*) query directly and print the result with show().
sqlContext.sql("select count(*) from airlines_all").show()