In [1]:
import pandas as pd
import datetime
import pickle
import hashlib

def pad(s, n=2, fill='0', orientation = 'left'):
    """Pad the string ``s`` with ``fill`` until it is at least ``n`` characters long.

    Parameters
    ----------
    s : str
        The string to pad.
    n : int, default 2
        Target minimum width of the result.
    fill : str, default '0'
        Padding text, repeated once per missing character (intended to be a
        single character).
    orientation : {'left', 'right'}, default 'left'
        Side of ``s`` on which the padding is placed.

    Returns
    -------
    str
        ``s`` unchanged when it already has ``n`` or more characters,
        otherwise ``s`` with the padding added on the requested side.

    Raises
    ------
    ValueError
        If ``orientation`` is neither 'left' nor 'right'.
    """
    # The amount of padding depends on the length of the *input*, not on the
    # length of the fill string; clamp at zero so long inputs are never cut.
    fill_n = max(n - len(s), 0)
    fillstr = fill * fill_n
    if orientation == 'left':
        retval = fillstr + s
    elif orientation == 'right':
        retval = s + fillstr
    else:
        raise ValueError('orientation must be either \'left\' or \'right\'')
    return retval

In [2]:
# Load only the columns used by the cells below; every column is read as
# text (dtype 'str') rather than being type-inferred.
flights = pd.read_csv(
    '../data/flights_head.csv',
    usecols=[
        'YEAR', 'MONTH', 'DAY',
        'DEPARTURE_TIME', 'ARRIVAL_TIME',
        'TAIL_NUMBER', 'FLIGHT_NUMBER',
        'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL',
    ],
    dtype='str',
)

In [3]:
# Assemble a 'YYYYMMDD HH:MM' text timestamp from the date parts and the
# DEPARTURE_TIME column (first two characters = hours, remainder = minutes).
# .str.zfill(2) left-pads MONTH/DAY to two digits and leaves values that are
# already two digits long untouched.
flights['departure_dt'] = (
    flights['YEAR']
    + flights['MONTH'].str.zfill(2)
    + flights['DAY'].str.zfill(2)
    + ' '
    + flights['DEPARTURE_TIME'].str[:2]
    + ':'
    + flights['DEPARTURE_TIME'].str[2:]
)

# Derive a row id as the MD5 hex digest of timestamp text + tail number +
# flight number. The hash is used as an identifier only, not for security.
flights['id'] = flights['departure_dt'] + flights['TAIL_NUMBER'] + flights['FLIGHT_NUMBER']
flights['id'] = flights['id'].apply(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest())

# errors='coerce' turns unparseable timestamps into NaT so that the dropna()
# below removes those rows; the deprecated errors='ignore' silently returned
# the whole column as unparsed strings as soon as a single value failed.
flights['departure_dt'] = pd.to_datetime(flights['departure_dt'], format='%Y%m%d %H:%M', errors='coerce')

# Keep only the fields exported for departures and drop incomplete rows.
flights_out = (flights[[ 'id', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT',
                       'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'departure_dt']]
               .dropna())

In [4]:
# Preview the first five departure records.
flights_out.head(n=5)

Unnamed: 0,id,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,departure_dt
0,8b398ee0ce30cb8bb45551ba0a221948,98,N407AS,ANC,SEA,5,430,2015-01-01 23:54:00
1,21dfc7e316a52f9b392b5495032f4cfa,2336,N3KUAA,LAX,PBI,10,750,2015-01-01 00:02:00
2,34880168b561e11bc7f2af09364f0241,840,N171US,SFO,CLT,20,806,2015-01-01 00:18:00
3,07629bfe57e362fb5f30cf7975e7ffe4,258,N3HYAA,LAX,MIA,20,805,2015-01-01 00:15:00
4,da272f1a5d7b875310f1063697c3e354,135,N527AS,SEA,ANC,25,320,2015-01-01 00:24:00


In [5]:
# Order the departure records by departure_dt, smallest value first.
flights_out = flights_out.sort_values('departure_dt')

In [6]:
# One dict per row ({column: value}). 'records' is the documented orient name;
# the previous 'row' spelling only worked as a deprecated abbreviation and is
# rejected by newer pandas releases.
flightdict = flights_out.to_dict(orient='records')

In [7]:
# Serialize the departure records. The with-block guarantees the file is
# closed even if pickle.dump raises part-way through.
with open('../data/departures.pickle', 'wb') as departures:
    pickle.dump(flightdict, departures)

In [8]:
# Assemble a 'YYYYMMDD HH:MM' text timestamp from the date parts and the
# ARRIVAL_TIME column (first two characters = hours, remainder = minutes).
# .str.zfill(2) left-pads MONTH/DAY to two digits and leaves values that are
# already two digits long untouched.
# NOTE(review): the calendar date comes from the same YEAR/MONTH/DAY columns
# used for the departure timestamp; flights that land after midnight may need
# a +1 day adjustment — confirm against the data definition.
flights['arrival_dt'] = (
    flights['YEAR']
    + flights['MONTH'].str.zfill(2)
    + flights['DAY'].str.zfill(2)
    + ' '
    + flights['ARRIVAL_TIME'].str[:2]
    + ':'
    + flights['ARRIVAL_TIME'].str[2:]
)

# Parse the arrival text built above (the previous code parsed 'departure_dt'
# here, which overwrote every arrival with the departure timestamp).
# errors='coerce' turns unparseable values into NaT so dropna() removes them.
flights['arrival_dt'] = pd.to_datetime(flights['arrival_dt'], format='%Y%m%d %H:%M', errors='coerce')

# Keep only the fields exported for arrivals, drop incomplete rows and order
# the result by arrival_dt, smallest value first.
flights_in = (
    flights[[ 'id', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT',
            'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'arrival_dt']]
    .dropna()
    .sort_values(by='arrival_dt', ascending=True)
)

In [9]:
# One dict per row ({column: value}). 'records' is the documented orient name;
# the previous 'row' spelling only worked as a deprecated abbreviation and is
# rejected by newer pandas releases.
flightdict = flights_in.to_dict(orient='records')

In [10]:
# Serialize the arrival records. The with-block guarantees the file is
# closed even if pickle.dump raises part-way through.
with open('../data/arrivals.pickle', 'wb') as arrivals:
    pickle.dump(flightdict, arrivals)
