In [85]:
air_filepath = 'data\\ontime.td.201912.asc'

In [86]:
import pandas as pd
import numpy as np

In [87]:
# Load file
air_df = pd.read_csv(air_filepath, sep='|', header=None, dtype=str)
# define titles for the known columns
column_titles = ['carrier_code', 'flight_number', 'unknown', 'unknown', 'unknown', 'unknown',
           'origin_airport', 'destination_airport', 'date', 'unknown',
           'scheduled_departure', 'crs_scheduled_departure', 'actual_departure',
           'scheduled_arrival', 'crs_scheduled_arrival', 'actual_arrival',
           'oag_crs_departure_difference', 'oag_crs_arrival_difference',
           'scheduled_elapsed_time', 'actual_elapsed_time', 'departure_delay', 'arrival_delay',
           'elapsed_time_difference', 'wheels_off_time', 'wheels_on_time', 'tail_number',
           'taxi_out_time', 'taxi_in_time', 'total_in_air_time', 'cancelled_code',
           'delay_carrier', 'delay_weather', 'delay_national_aviation_system', 'delay_security',
           'delay_late_aircarft_arrival', 'unknown', 'actual_elapsed_time_to_diverted_airport', 'unknown',
           'no_diverted_airports', 'diverted_airport_code', 'wheels_on_diverted_airport',
           'total_time_away_gate_diverted_airport', 'longest_period_away_gate_diverted_airport',
           'wheels_off_diverted_airport']
# set column titles
air_df.columns = column_titles + ['unknown' for _ in range(len(air_df.columns) - len(column_titles))]
# keep useful columns
air_df = air_df[['carrier_code', 'flight_number', 'origin_airport', 'destination_airport', 'date', 
                'scheduled_departure', 'actual_departure', 'scheduled_arrival', 'actual_arrival',
               'scheduled_elapsed_time', 'tail_number', 
               'departure_delay', 'arrival_delay', 'delay_carrier', 'delay_weather', 'delay_national_aviation_system', 
               'delay_security', 'delay_late_aircarft_arrival', 'cancelled_code']]

In [88]:
# split date column into 4 columns: year, month, day and weekday
import datetime

air_df['date'] = pd.to_datetime(air_df['date'])
air_df['year'] = air_df['date'].dt.strftime('%Y')
air_df['month'] = air_df['date'].dt.strftime('%m')
air_df['day'] = air_df['date'].dt.strftime('%d')
air_df['weekday'] = air_df['date'].dt.dayofweek
air_df['date'] = air_df['date'].dt.date
# air_df = air_df.drop(columns = ['date'])

In [15]:
# round departure and arrival times
def round_time(time):
    time = str(time)
    if len(time) < 4:
        time = time[::-1].ljust(4, '0')[::-1]
    return int(time[:2])
air_df['scheduled_departure'] = air_df.apply(lambda row: round_time(row['scheduled_departure']), axis=1)
air_df['actual_departure'] = air_df.apply(lambda row: round_time(row['actual_departure']), axis=1)
air_df['scheduled_arrival'] = air_df.apply(lambda row: round_time(row['scheduled_arrival']), axis=1)
air_df['actual_arrival'] = air_df.apply(lambda row: round_time(row['actual_arrival']), axis=1)

In [89]:
def split_time(air_df, column_name):
    air_df[column_name] = air_df[column_name].astype(str).replace('0', np.nan).str.zfill(4).str.replace('^24', '00', regex=True) 
    air_df[column_name] = pd.to_datetime(air_df[column_name], format='%H%M')
    air_df[column_name + '_dt'] = pd.to_datetime(air_df.date.astype(str) + ' ' + air_df[column_name].dt.time.astype(str), errors='coerce')
    air_df = air_df.drop(columns=[column_name])
    return air_df
air_df = split_time(air_df, 'scheduled_departure')
air_df = split_time(air_df, 'scheduled_arrival')
air_df = split_time(air_df, 'actual_departure')
air_df = split_time(air_df, 'actual_arrival')

In [90]:
# fill NaN values in cancelled_code column with 0
air_df.cancelled_code = air_df.cancelled_code.fillna(value='N')

In [40]:
air_df.to_csv('ontime_12_2019.csv', index=False)

In [63]:
air_df.head()

Unnamed: 0,carrier_code,flight_number,origin_airport,destination_airport,date,scheduled_elapsed_time,tail_number,departure_delay,arrival_delay,delay_carrier,...,delay_late_aircarft_arrival,cancelled_code,year,month,day,weekday,scheduled_departure_dt,scheduled_arrival_dt,actual_departure_dt,actual_arrival_dt
0,DL,3280,CVG,ORD,2019-12-01,102,N8896A,41,18,4,...,14,N,2019,12,1,6,2019-12-01 18:35:00,2019-12-01 19:17:00,2019-12-01 19:16:00,2019-12-01 19:35:00
1,DL,3280,CVG,ORD,2019-12-02,102,N8896A,-3,52,0,...,0,N,2019,12,2,0,2019-12-02 18:35:00,2019-12-02 19:17:00,2019-12-02 18:32:00,2019-12-02 20:09:00
2,DL,3281,JAX,RDU,2019-12-01,81,N186PQ,22,22,22,...,0,N,2019,12,1,6,2019-12-01 06:00:00,2019-12-01 07:21:00,2019-12-01 06:22:00,2019-12-01 07:43:00
3,DL,3281,JAX,RDU,2019-12-02,81,N316PQ,-3,16,0,...,0,N,2019,12,2,0,2019-12-02 06:00:00,2019-12-02 07:21:00,2019-12-02 05:57:00,2019-12-02 07:37:00
4,DL,3282,LGA,PIT,2019-12-01,105,N398CA,107,93,0,...,93,N,2019,12,1,6,2019-12-01 17:38:00,2019-12-01 19:23:00,2019-12-01 19:25:00,2019-12-01 20:56:00


In [91]:
import glob, os
import pandas as pd

In [92]:
wdf = pd.concat(map(pd.read_csv, glob.glob(os.path.join('data', 'weather-data', '2019', "*.csv"))))
wdf['DATE'] = pd.to_datetime(wdf['DATE'])

In [93]:
wdf.head()

Unnamed: 0,STATION,DATE,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyStationPressure,HourlyVisibility,HourlyWindSpeed,iata_code
0,72517014737,2019-01-01 00:51:00,43,0.01,29.32,3.0,0,ABE
1,72517014737,2019-01-01 01:51:00,45,0.0,29.23,2.0,5,ABE
2,72517014737,2019-01-01 02:51:00,44,0.0,29.26,1.5,3,ABE
3,72517014737,2019-01-01 03:51:00,46,0.0,29.22,1.75,6,ABE
4,72517014737,2019-01-01 04:51:00,51,0.0,29.23,8.0,8,ABE


In [101]:
def add_weather_data(air_df, wdf, scheduled_col, airport_col):
    air_df_s = air_df.rename(columns={scheduled_col: 'DATE'}).sort_values(by=['DATE'])
    wdf_s = wdf.rename(columns={'iata_code': airport_col}).sort_values(by=['DATE'])
    merged = pd.merge_asof(air_df_s, wdf_s, on='DATE', by=airport_col, direction='nearest')
    merged = merged.rename(columns={'DATE': scheduled_col})
    return merged

In [103]:
merged = add_weather_data(air_df, wdf, 'scheduled_departure_dt', 'origin_airport')
merged = add_weather_data(merged, wdf, 'scheduled_arrival_dt', 'destination_airport')
# merged = merged.drop(columns=['date', 'STATION'])
merged.to_csv('merged.csv', index=False)

In [104]:
# WORKS! But the list of airports is not complete :( 