In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

In [77]:
weather_df = pd.read_csv('data/weather.csv') 
weather_df = weather_df[weather_df['wind_speed'].notnull()]
weather_df.head()

Unnamed: 0,airport_id,airport_name,latitude,longitude,elevation,datetime,temperature,visibility,wind_direction,wind_speed,snow_depth,cloud_status
0,BUF,"BUFFALO NIAGARA INTERNATIONAL, NY US",42.9408,-78.7358,218.2,2017-01-01T00:54:00,3.9,16093,220,7.2,,
1,BUF,"BUFFALO NIAGARA INTERNATIONAL, NY US",42.9408,-78.7358,218.2,2017-01-01T06:54:00,0.6,16093,260,6.2,,
2,BUF,"BUFFALO NIAGARA INTERNATIONAL, NY US",42.9408,-78.7358,218.2,2017-01-01T12:54:00,-0.6,16093,250,5.1,,9.0
3,BUF,"BUFFALO NIAGARA INTERNATIONAL, NY US",42.9408,-78.7358,218.2,2017-01-01T18:54:00,2.2,16093,240,5.7,,9.0
4,BUF,"BUFFALO NIAGARA INTERNATIONAL, NY US",42.9408,-78.7358,218.2,2017-01-02T00:54:00,-0.6,16093,170,1.5,,


In [78]:
flight_df = pd.read_csv('data/flight_traffic.csv') 
flight_df.head()

Unnamed: 0,year,month,day,airline_id,origin_airport,destination_airport,scheduled_departure,actual_departure,taxi_out,wheels_off,...,cancelled,diverted,scheduled_elapsed,actual_elapsed,distance,airline_delay,weather_delay,air_system_delay,security_delay,aircraft_delay
0,2017,1,17,AA,CLT,PHX,1619,1616.0,17.0,1633.0,...,0.0,0.0,277.0,266.0,1773.0,,,,,
1,2017,1,18,AA,CLT,PHX,1619,1614.0,13.0,1627.0,...,0.0,0.0,277.0,247.0,1773.0,,,,,
2,2017,1,19,AA,CLT,PHX,1619,1611.0,17.0,1628.0,...,0.0,0.0,277.0,255.0,1773.0,,,,,
3,2017,1,20,AA,CLT,PHX,1619,1656.0,18.0,1714.0,...,0.0,0.0,277.0,273.0,1773.0,33.0,0.0,0.0,0.0,0.0
4,2017,1,21,AA,CLT,PHX,1619,1632.0,17.0,1649.0,...,0.0,0.0,277.0,266.0,1773.0,,,,,


In [79]:
def convertDateToMinutes(t):
    t = int(t)
    hours = (t/100) % 100
    minutes = t % 100
    return 60*hours + minutes
def getTimeDifference(t1, t2):
    totalMin = 24*60
    t1 = convertDateToMinutes(t1)
    t2 = convertDateToMinutes(t2)
    if t1 > t2 and t1 - t2 < 60: # early departure
        return t2 - t1
    return (t2 - t1) % totalMin

In [81]:
def getDates(airport):
    cur_df = weather_df[weather_df['airport_id'].isin([airport])]
    avg_wind = cur_df['wind_speed'].mean()
    std_wind = cur_df['wind_speed'].std()
    
    ret = set()
    for index, row in cur_df.iterrows():
        if row['wind_speed'] > avg_wind + 2*std_wind:
            ret.add(row['datetime'][:10])
    return ret

In [90]:
all_airports = weather_df['airport_id'].unique()
weather_events = {}
for airport in all_airports:
    dates = getDates(airport)
    if len(dates) >= 20:
        weather_events[airport] = dates

In [93]:
big_five = ['AA', 'B6', 'DL', 'UA', 'WN']
flight_df = flight_df[flight_df['origin_airport'].isin(weather_events.keys())]
flight_df = flight_df[flight_df['airline_id'].isin(big_five)]
flight_df = flight_df[flight_df['scheduled_departure'].notnull()]
flight_df = flight_df[flight_df['actual_departure'].notnull()]
flight_df = flight_df[flight_df['scheduled_arrival'].notnull()]
flight_df = flight_df[flight_df['actual_arrival'].notnull()]
flight_df = flight_df[flight_df['scheduled_elapsed'].notnull()]
flight_df = flight_df[flight_df['actual_elapsed'].notnull()]
flight_df.loc[:,'dep_diff'] = flight_df.apply(lambda row: getTimeDifference(row['scheduled_departure'], row['actual_departure']), axis=1)
flight_df.loc[:,'arr_diff'] = flight_df.apply(lambda row: getTimeDifference(row['scheduled_arrival'], row['actual_arrival']), axis=1)
flight_df.loc[:,'elapsed_diff'] = flight_df['actual_elapsed'] - flight_df['scheduled_elapsed']
print flight_df.shape

(2400332, 27)


In [100]:
series = flight_df.groupby(['origin_airport', 'airline_id', 'year', 'month', 'day'])['dep_diff'].mean()
ret = {}
for key, val in series.iteritems():
    if key[1] not in big_five:
        continue
    if key[0] not in ret:
        ret[key[0]] = {}
    if key[1] not in ret[key[0]]:
        ret[key[0]][key[1]] = {}
    month = str(key[3])
    if len(month) == 1:
        month = "0" + month
    day = str(key[4])
    if len(day) == 1:
        day = "0" + day
    d = str(key[2]) + "-" + month + "-" + day
    ret[key[0]][key[1]][d] = val

In [118]:
good_airports = []
for airport in weather_events.keys():
    if airport in ret:
        good_airports.append(airport)

for airport in good_airports:
    for airline in big_five:
        if airline not in ret[airport]:
            continue
            
        t = []
        for k, v in ret[airport][airline].iteritems():
            t.append(v)
        t = np.array(t)
        mean = np.mean(t)
        std = np.std(t, ddof=1)
        ret[airport][airline]['mean'] = mean
        ret[airport][airline]['std'] = std

occ = defaultdict(int)
mp = defaultdict(float)   
for airport in good_airports:
    for airline in big_five:
        if airline not in ret[airport]:
            continue
            
        total = 0.0
        for date in weather_events[airport]:
            if date in ret[airport][airline]:
                total += ret[airport][airline][date] - ret[airport][airline]['mean']
        total /= len(weather_events[airport])
        occ[airline] += 1
        mp[airline] += total

for airline in big_five:
    print airline, mp[airline]/occ[airline]

AA 1.2006281049653647
B6 2.985900389855095
DL 4.171998193789186
UA 3.9871162834459097
WN 1.0961902848599228
