In [1]:
%matplotlib inline

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels
from maskfunctions import create_mask
from scipy.stats import lognorm
from scipy.stats import gamma

In [None]:
# Location and names of data files.
# The data directory can be overridden through the BTS_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original location
# remains the default when the variable is unset.
data_dir = os.environ.get("BTS_DATA_DIR",
                          "/Users/jeremysmith/Documents/BTS_Airline_Data/")
# Prefix shared by the monthly sub-directories and CSV files
file_pre = "On_Time_On_Time_Performance_2016"
path_airports = os.path.join(data_dir, "airports.dat")
path_emplanes = os.path.join(data_dir, "cy15-commercial-service-enplanements.csv")

In [None]:
# Delay Data
# Each month lives at <data_dir>/<file_pre>_MM/<file_pre>_MM.csv; only the
# columns at the positions listed in `usecols` are read, and the twelve
# monthly frames are stacked into one frame with a fresh integer index.
usecols = [2,3,4,5,8,10,14,23,31,33,36,39,42,44,47,52,54,56,57,58,59,60]
month_stems = ["{:s}_{:02d}".format(file_pre, number) for number in range(1, 13)]
data_list = [
    pd.read_csv(os.path.join(data_dir, stem, stem + ".csv"), usecols=usecols)
    for stem in month_stems
]
data_delays = pd.concat(data_list, ignore_index=True)

In [None]:
# Preview the first five rows of the combined delay data
data_delays.head(n=5)

In [None]:
# Airport Location Data
# The file is read without a header row: the eight columns selected by
# position receive the names listed in `head_airports`.
head_airports = [
    'Name', 'City', 'Country', 'IATA',
    'Latitude', 'Longitude', 'Altitude',
    'Timezone',
]
data_airports = pd.read_csv(
    path_airports,
    header=None,
    names=head_airports,
    usecols=[1, 2, 3, 4, 6, 7, 8, 9],
)

In [None]:
# Airport USA Enplanements Data
# Columns are selected by position and ',' is parsed as a thousands
# separator. 'Locid' is renamed to 'IATA' and the CY15 enplanement count
# gets a shorter column name.
emplanes_renames = {'Locid': 'IATA', 'CY 15 Enplanements': 'CY15enplane'}
data_emplanes = (
    pd.read_csv(path_emplanes, usecols=[2, 3, 6, 7, 8], thousands=',')
      .rename(columns=emplanes_renames)
)

In [None]:
# Merge Airport Data
# Left join on the IATA code: every enplanement row is kept, and the
# location columns are filled in where the airports table has a match.
data_airports_merged = data_emplanes.merge(
    data_airports,
    how='left',
    on='IATA',
    copy=False,
)

In [None]:
# Keep airports located in the United States with more than 100000
# CY15 enplanements, then preview the result.
in_united_states = data_airports_merged.Country == "United States"
above_enplane_cutoff = data_airports_merged.CY15enplane > 100000
data_airports_merged_us = data_airports_merged[in_united_states & above_enplane_cutoff]
data_airports_merged_us.head()

In [None]:
# Output Airport Data Without Delay Data Embedded
# to_csv does not create missing directories, so make sure the output
# folder exists before writing.
if not os.path.isdir("visualization"):
    os.makedirs("visualization")
data_airports_merged_us.to_csv("visualization/airports.csv")

In [None]:
# Create Filter and Mask
airports = data_airports_merged_us['IATA'].values   # IATA codes of the airports kept by the filter above
carrier = 'ALL'                                     # 2 letter carrier code or ALL
month = 0                                           # month (1 - 12, 0 = all)
dotw = 0                                            # day of the week (1 - 7, 0 = all)
# NOTE(review): create_mask comes from the local maskfunctions module; here its
# first result is used to index data_delays and its second is formatted as an
# integer count -- confirm against that module.
mask, matches = create_mask(data_delays, airports, carrier, month, dotw)
# Parenthesised single-argument print behaves the same on Python 2 and Python 3
print("There are {:d} matching flights for this filter".format(matches))

In [None]:
# Information grouped by destination airport
# Apply the flight filter once and reuse the filtered frame.
flights_by_dest = data_delays[mask]
groupDest = flights_by_dest.groupby('Dest', as_index=False)

# Number of arrivals per destination. size() on an as_index=False groupby
# returns a Series in some pandas versions and a DataFrame in others, so the
# count is taken from a plain groupby and named explicitly.
num_arr = flights_by_dest.groupby('Dest').size().reset_index(name='ArrNum')
# Mean of the ArrDel15 column per destination (scaled to a percentage below)
fraction_delayed = groupDest['ArrDel15'].mean()
# Several columns must be selected with a list; tuple-style selection
# (groupDest['TaxiIn','ArrDelay']) is not supported by current pandas.
med_in_time = groupDest[['TaxiIn', 'ArrDelay']].median()
mergeDest = pd.merge(fraction_delayed, med_in_time, on="Dest").merge(num_arr, on="Dest")
mergeDest['ArrDel15'] *= 100
mergeDest = mergeDest.rename(columns={'Dest':'IATA', 'ArrDel15':'PercentArrDel15'})

In [None]:
# Information grouped by origin airport
# Apply the flight filter once and reuse the filtered frame.
flights_by_orig = data_delays[mask]
groupOrig = flights_by_orig.groupby('Origin', as_index=False)

# Number of departures per origin. size() on an as_index=False groupby
# returns a Series in some pandas versions and a DataFrame in others, so the
# count is taken from a plain groupby and named explicitly.
num_dep = flights_by_orig.groupby('Origin').size().reset_index(name='DepNum')
# Mean of the Cancelled column per origin (scaled to a percentage below)
fraction_cancel = groupOrig['Cancelled'].mean()
# Several columns must be selected with a list; tuple-style selection
# (groupOrig['TaxiOut', 'DepDelay']) is not supported by current pandas.
med_out_time = groupOrig[['TaxiOut', 'DepDelay']].median()
mergeOrig = pd.merge(fraction_cancel, med_out_time, on="Origin").merge(num_dep, on="Origin")
mergeOrig['Cancelled'] *= 100
mergeOrig = mergeOrig.rename(columns={'Origin':'IATA', 'Cancelled':'PercentCancelled'})

In [None]:
# Merge Origin and Destination delay data
# Inner join on IATA: only airports present in both the destination and the
# origin summaries are kept.
mergeAll = mergeDest.merge(mergeOrig, how='inner', on='IATA')

In [None]:
# Merge delay data into the airport information df
# Inner join on IATA, so airports without delay statistics are dropped.
data_airports_merged_us_wdelay = data_airports_merged_us.merge(
    mergeAll,
    how='inner',
    on='IATA',
    copy=False,
)

In [None]:
# Preview the first five rows of the airport table with delay statistics
data_airports_merged_us_wdelay.head(n=5)

In [None]:
# Output Airport Data With Delay Data Embedded
# to_csv does not create missing directories, so make sure the output
# folder exists before writing. The carrier filter value is part of the
# file name.
if not os.path.isdir("visualization"):
    os.makedirs("visualization")
data_airports_merged_us_wdelay.to_csv("visualization/airports_wdelaydata_{:s}.csv".format(carrier))

In [None]:
# ArrDelay vs. PercentArrDel15 for destinations with more than 10 arrivals.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.regplot(x='ArrDelay', y='PercentArrDel15', data=mergeDest[mergeDest.ArrNum > 10])

In [None]:
# TaxiIn vs. PercentArrDel15 for destinations with more than 10 arrivals.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.regplot(x='TaxiIn', y='PercentArrDel15', data=mergeDest[mergeDest.ArrNum > 10])

In [None]:
# TaxiIn vs. ArrDelay for destinations with more than 10 arrivals.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.regplot(x='TaxiIn', y='ArrDelay', data=mergeDest[mergeDest.ArrNum > 10])

In [None]:
# TaxiOut vs. DepDelay for origins with more than 10 departures.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.regplot(x='TaxiOut', y='DepDelay', data=mergeOrig[mergeOrig.DepNum > 10], color='green')

In [None]:
# Fit a log-normal distribution to a random subsample of the ArrDelay values
# and draw it over the normalised histogram of all values.
a = data_delays[mask]['ArrDelay'].dropna()
# Cap the sample size: sampling without replacement raises if the active
# filter leaves fewer values than requested.
a_sampled = np.random.choice(a, size=min(2000, len(a)), replace=False)
# Only values strictly between -50 and 75 are used for the fit
a_sampled = a_sampled[(a_sampled < 75) & (a_sampled > -50)]
# 0.1, loc and scale are the optimiser's starting guesses
shape, loc, scale = lognorm.fit(a_sampled, 0.1, loc=-50, scale=40)
x = np.linspace(-50,350,201)                 # bin edges every 2 units
d = lognorm.pdf(x, shape, loc, scale)
sns.distplot(a, bins=x, kde=False, norm_hist=True)
plt.plot(x, d, color='black')
# Use the imported pyplot directly; the `sns.plt` alias is gone from seaborn
plt.xlim(-50,150)
# Single-argument print call works on both Python 2 and Python 3
print("{} {} {}".format(shape, loc, scale))

In [None]:
# Fit a log-normal distribution to a random subsample of the TaxiIn values
# and draw it over the normalised histogram of all values.
a = data_delays[mask]['TaxiIn'].dropna()
# Cap the sample size: sampling without replacement raises if the active
# filter leaves fewer values than requested.
a_sampled = np.random.choice(a, size=min(1000, len(a)), replace=False)
# Only values strictly between 0 and 30 are used for the fit
a_sampled = a_sampled[(a_sampled < 30) & (a_sampled > 0)]
# 0.5, loc and scale are the optimiser's starting guesses
shape, loc, scale = lognorm.fit(a_sampled, 0.5, loc=1, scale=5)
x = np.linspace(0,200,201)                   # bin edges every 1 unit
d = lognorm.pdf(x, shape, loc, scale)
sns.distplot(a, bins=x, kde=False, norm_hist=True)
plt.plot(x, d, color='black')
# Use the imported pyplot directly; the `sns.plt` alias is gone from seaborn
plt.xlim(0,50)
# Single-argument print call works on both Python 2 and Python 3
print("{} {} {}".format(shape, loc, scale))

In [None]:
# Fit a log-normal distribution to a random subsample of the DepDelay values
# and draw it over the normalised histogram of all values.
a = data_delays[mask]['DepDelay'].dropna()
# Cap the sample size: sampling without replacement raises if the active
# filter leaves fewer values than requested.
a_sampled = np.random.choice(a, size=min(2000, len(a)), replace=False)
# Only values strictly between -20 and 50 are used for the fit
a_sampled = a_sampled[(a_sampled < 50) & (a_sampled > -20)]
# 1, loc and scale are the optimiser's starting guesses
shape, loc, scale = lognorm.fit(a_sampled,1, loc=-20, scale=15)
x = np.linspace(-50,350,201)                 # bin edges every 2 units
d = lognorm.pdf(x, shape, loc, scale)
sns.distplot(a, bins=x, color='green', kde=False, norm_hist=True)
plt.plot(x, d, color='black')
# Use the imported pyplot directly; the `sns.plt` alias is gone from seaborn
plt.xlim(-50,150)
# Single-argument print call works on both Python 2 and Python 3
print("{} {} {}".format(shape, loc, scale))

In [None]:
# Fit a log-normal distribution to a random subsample of the TaxiOut values
# and draw it over the normalised histogram of all values.
a = data_delays[mask]['TaxiOut'].dropna()
# Cap the sample size: sampling without replacement raises if the active
# filter leaves fewer values than requested.
a_sampled = np.random.choice(a, size=min(1000, len(a)), replace=False)
# Only values strictly between 0 and 50 are used for the fit
a_sampled = a_sampled[(a_sampled < 50) & (a_sampled > 0)]
# 0.5, loc and scale are the optimiser's starting guesses
shape, loc, scale = lognorm.fit(a_sampled, 0.5, loc=4, scale=10)
x = np.linspace(0,200,201)                   # bin edges every 1 unit
d = lognorm.pdf(x, shape, loc, scale)
sns.distplot(a, bins=x, color='green', kde=False, norm_hist=True)
plt.plot(x, d, color='black')
# Use the imported pyplot directly; the `sns.plt` alias is gone from seaborn
plt.xlim(0,50)
# Single-argument print call works on both Python 2 and Python 3
print("{} {} {}".format(shape, loc, scale))