# Airline delays 
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

~140GB

In [2]:
# imports
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [3]:
#dbutils.fs.ls('/databricks-datasets/airlines')

In [4]:
sum = 0
DATA_PATH = 'dbfs:/databricks-datasets/airlines/'
for item in dbutils.fs.ls(DATA_PATH):
  sum = sum+item.size
sum

In [5]:
# with open("/dbfs/databricks-datasets/airlines/README.md") as f:
#     x = ''.join(f.readlines())

# print(x)

In [6]:
from pyspark.sql.types import *
airline_schema = StructType([
            StructField("Year", IntegerType(), True),
            StructField("Month", IntegerType(), True),
            StructField("DayofMonth", IntegerType(), True),
            StructField("DayOfWeek", IntegerType(), True),
            StructField("DepTime", IntegerType(), True),
            StructField("CRSDepTime", IntegerType(), True),
            StructField("ArrTime", IntegerType(), True),
            StructField("CRSArrTime", IntegerType(), True),
            StructField("UniqueCarrier", StringType(), True),
            StructField("FlightNum", StringType(), True),
            StructField("TailNum", StringType(), True),
            StructField("ActualElapsedTime", IntegerType(), True),
            StructField("CRSElapsedTime", IntegerType(), True),
            StructField("AirTime", IntegerType(), True),
            StructField("ArrDelay", IntegerType(), True),
            StructField("DepDelay", IntegerType(), True),
            StructField("Origin", StringType(), True),
            StructField("Dest", StringType(), True),
            StructField("Distance", IntegerType(), True),
            StructField("TaxiIn", IntegerType(), True),			
			StructField("TaxiOut", IntegerType(), True),
            StructField("Cancelled", IntegerType(), True),
            StructField("CancellationCode", StringType(), True),
            StructField("Diverted", IntegerType(), True),
            StructField("CarrierDelay", IntegerType(), True),
            StructField("WeatherDelay", IntegerType(), True),
            StructField("NASDelay", IntegerType(), True),
            StructField("SecurityDelay", IntegerType(), True),
			StructField("LateAircraftDelay", IntegerType(), True),
            StructField("IsArrDelayed", StringType(), True),			
            StructField("IsDepDelayed", StringType(), True)])

In [7]:
airlines = spark.read.format("csv").option("header", "false").schema(airline_schema).load("dbfs:/databricks-datasets/airlines/part-*")
airlines.printSchema()

In [8]:
start_time = time.time()
count =  airlines.count()
total_time_csv = time.time() - start_time
print("Total records: {} \n Time taken (in seconds) to count total records: {}".format(count, np.round(total_time_csv, 2) )) 

In [9]:
airlines.write.parquet("dbfs:/tmp/parquet/airlines_data_in_parquet.parquet")

In [10]:
airlines_df = spark.read.parquet("dbfs:/tmp/parquet/airlines_data_in_parquet.parquet")


In [11]:
airlines_df.show(5)

In [12]:
airlines_df.printSchema()

In [13]:
start_time_parquet = time.time()
count = airlines_df.count()
total_time_parquet = time.time() - start_time_parquet
print("Total records: {} \n Time taken (in seconds) to count total records: {} \n Parquet read DataFrame is {} times faster than CSV read DataFrame".format(count, np.round(total_time_parquet, 2), np.round(total_time_csv/total_time_parquet, 2) )) 

In [14]:
def nullDataFrame(df):
  null_feature_list = []
  for column in df.columns:
    nulls = df.filter(df[column].isNull()).count()
    nulls_perct = np.round((nulls/count)*100, 2)
    null_feature_list.append([column, nulls, nulls_perct])
  nullCounts_df = pd.DataFrame(np.array(null_feature_list), columns=['Feature_Name', 'Null_Counts', 'Percentage_Null_Counts'])
  return nullCounts_df  

In [15]:
nullCounts_df = nullDataFrame(airlines_df)

In [16]:
nullCounts_df

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,Year,899947541,72.85
1,Month,899947541,72.85
2,DayofMonth,899947541,72.85
3,DayOfWeek,899947541,72.85
4,DepTime,899947541,72.85
5,CRSDepTime,899947541,72.85
6,ArrTime,899947541,72.85
7,CRSArrTime,899947541,72.85
8,UniqueCarrier,899947541,72.85
9,FlightNum,899947541,72.85


In [17]:
subset = [x for x in airlines_df.columns if x not in ["CancellationCode"]]
airlines_df2 = airlines_df.na.drop(subset=subset)

In [18]:
airlines_df2.count()

In [19]:
nullCounts_df2 = nullDataFrame(airlines_df2)
nullCounts_df2

Unnamed: 0,Feature_Name,Null_Counts,Percentage_Null_Counts
0,Year,0,0.0
1,Month,0,0.0
2,DayofMonth,0,0.0
3,DayOfWeek,0,0.0
4,DepTime,0,0.0
5,CRSDepTime,0,0.0
6,ArrTime,0,0.0
7,CRSArrTime,0,0.0
8,UniqueCarrier,0,0.0
9,FlightNum,0,0.0


In [20]:
Delay_List = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay' ]
airlines_df2[Delay_List].describe().show()

In [21]:
sample_airlines_df = airlines_df2.select(Delay_List).sample(False, 0.001, 2020)
pandas_df = sample_airlines_df.toPandas()

In [22]:
pandas_df.hist(figsize=(15,15), bins=50)
display(plt.show())