In [None]:
import pandas as pd
import numpy as np

In [1]:
"""
Creates a train test split using 20% of the data from each month to train and 5% of the data for testing from
2018. This is the same ratio as an 80/20 split. We sampled from every month as we expect there to be a seasonal
component to our model. Full datasets are too large to store on git, so they are kept on my local machine; the
splits are created, stored as a pickle file, and pushed to git.
"""

airports = ['LAX','JFK', 'ATL', 'DFW', 'DEN', 'SFO','SEA','LAS','MCO']
files = ['april.csv', 'august.csv', 'dec.csv', 'feb.csv', 'jan.csv', 'july.csv', 'june.csv', 'march.csv',
         'may.csv',  'nov.csv',  'oct.csv', 'sept.csv']

# Location of the monthly CSV files (kept outside the repository).
DATA_DIR = "/home/jackson/data/"
# Fractions of each month's flights used for training / testing, and the sampling seed.
TRAIN_FRAC, TEST_FRAC, SEED = 0.20, 0.05, 7

train_parts, test_parts = [], []
for filename in files:
    df = pd.read_csv(DATA_DIR + filename)
    # Keep only flights whose origin AND destination are among the selected airports.
    df = df[(df['ORIGIN'].isin(airports)) & (df['DEST'].isin(airports))]

    train_part = df.sample(frac=TRAIN_FRAC, random_state=SEED)
    # Draw the test rows only from flights that were NOT picked for training.
    # Sampling the same frame twice with the same seed can select the same
    # flights for both sets, which leaks training rows into the test set.
    held_out = df.drop(train_part.index)
    test_part = held_out.sample(n=round(TEST_FRAC * len(df)), random_state=SEED)

    train_parts.append(train_part)
    test_parts.append(test_part)

# Concatenate once at the end instead of growing the frames inside the loop.
train_set = pd.concat(train_parts)
test_set = pd.concat(test_parts)

#train_set.to_pickle('train.csv')
#test_set.to_pickle('test.csv')

FileNotFoundError: [Errno 2] File b'/home/jackson/data/april.csv' does not exist: b'/home/jackson/data/april.csv'

In [None]:
"""
Joins the flight data sampled above with the weather data (Weather.pkl) on date and origin/destination.
Saves the combined data to the CSV file passed as `name`.
"""
def merge(df_air, name):
    """Attach origin and destination weather to every flight and save the result.

    Parameters
    ----------
    df_air : pandas.DataFrame
        Flight rows; must contain the columns 'ORIGIN', 'DEST' and 'FL_DATE'.
    name : str
        Path of the CSV file the merged frame is written to.

    Returns
    -------
    pandas.DataFrame
        One row per flight, in the same order as ``df_air``. Weather columns
        appear twice: the origin copy gets pandas' ``_x`` suffix and the
        destination copy gets ``_y``. Missing values are replaced by 0.
    """
    # NOTE(review): read_pickle can execute arbitrary code; only load a
    # Weather.pkl that was produced locally.
    weather = pd.read_pickle('Weather.pkl')
    weather = weather.rename(columns={"NAME": "ORIGIN", "DATE": "FL_DATE"})
    # Normalise the weather dates to 'YYYY-MM-DD' strings so they can be
    # compared with the flight dates.
    weather['FL_DATE'] = pd.to_datetime(weather['FL_DATE']).dt.strftime('%Y-%m-%d')

    # Left joins keep exactly the flight rows, in their original order and with
    # a fresh 0..n-1 index. The previous outer joins re-sorted the rows by key
    # and added weather-only rows that then had to be filtered out again.
    df = df_air.merge(weather, on=['ORIGIN', 'FL_DATE'], how='left')
    weather_dest = weather.rename(columns={"ORIGIN": "DEST"})
    df = df.merge(weather_dest, on=['DEST', 'FL_DATE'], how='left')

    df = df.fillna(0)
    # After fillna a 0 in ORIGIN/DEST means the airport code was missing.
    df = df[(df['ORIGIN'] != 0) & (df['DEST'] != 0)]
    df.to_csv(name)
    return df

# Join weather onto both splits; each call also writes the merged frame to disk.
X, X_test = merge(train_set, 'train.csv'), merge(test_set, 'test.csv')

In [None]:
"""
Create X vector, with only the data from df that is already real value or boolean encoded 
"""
def clean(df):
    """Return a copy of ``df`` without the columns that are not used as features.

    Dropped are the first column, the columns at positions 6 through 36, and a
    fixed list of named weather and flight columns.
    """
    # Columns selected purely by their position in the frame.
    by_position = [df.columns[0], *df.columns[6:37]]
    # Columns selected by name.
    by_name = [
        "WDF2_x", "WDF5_x", "WSF2_x", "WSF5_x",
        "WDF2_y", "WDF5_y", "WSF2_y", "WSF5_y",
        "STATION_y",
        "FL_DATE", "OP_UNIQUE_CARRIER", "ORIGIN", "DEST", "CRS_DEP_TIME",
    ]
    return df.drop(columns=by_position + by_name)

# Strip the non-feature columns from both merged splits.
X, X_test = clean(X), clean(X_test)

In [None]:
"""
Creates one-hot encodings for Airline, Origin, and Destination and concatenates the features to X
"""
def onehot(df, column, categories=None):
    """Build a 0/1 indicator frame for one categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to encode.
    categories : iterable, optional
        Values to create indicator columns for, in output order. Defaults to
        the distinct values of ``df[column]`` in order of first appearance.
        Pass the training categories here when encoding a test set so both
        frames get the same columns.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (with a fresh 0..n-1 index) and one column
        per category, named ``column`` followed by the category value.
    """
    if categories is None:
        categories = df[column].unique()
    categories = list(categories)

    # Formatting instead of `+` so non-string categories also yield a header.
    header = [f"{column}{category}" for category in categories]
    rows = [
        [1 if value == category else 0 for category in categories]
        for value in df[column]
    ]
    return pd.DataFrame(rows, columns=header)

onehot_columns = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]
train_onehot = [onehot(train_set, col) for col in onehot_columns]
# Encode the test set against the TRAINING header: onehot() derives its columns
# from the values it sees, so encoding the two splits independently can give
# X and X_test different indicator columns or a different column order.
# Categories absent from the test split become all-zero columns; categories
# never seen in training are dropped.
test_onehot = [
    onehot(test_set, col).reindex(columns=train_block.columns, fill_value=0)
    for col, train_block in zip(onehot_columns, train_onehot)
]

# NOTE(review): onehot() returns frames indexed 0..n-1 and concat(axis=1) pairs
# rows by index label - confirm X / X_test use the same row order and index as
# train_set / test_set.
X = pd.concat([X] + train_onehot, axis = 1)
X_test = pd.concat([X_test] + test_onehot, axis = 1)

In [None]:
def polynomial(df, feature, max_order):
    """Return the powers 2..max_order of one column as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature``.
    feature : str
        Name of the column to raise to each power.
    max_order : int
        Highest power to generate; values below 2 yield a frame without columns.

    Returns
    -------
    pandas.DataFrame
        Columns named ``"<feature>^<order>"``, carrying the same index as ``df``
        so the result lines up row-for-row in ``pd.concat(..., axis=1)``.
    """
    powers = {
        f"{feature}^{order}": np.power(df[feature], order).to_numpy()
        for order in range(2, max_order + 1)
    }
    # Re-use the input index: a fresh 0..n-1 index would misalign the rows when
    # the caller concatenates this frame with a df whose index is not 0..n-1.
    return pd.DataFrame(powers, index=df.index)

# Training matrix: prepend DEP_TIME^2 and DEP_TIME^3, add a constant column of
# ones (presumably the intercept term - confirm with the model code), export.
X = pd.concat([polynomial(X, 'DEP_TIME', 3), X], axis='columns')
X['OFFSET'] = [1] * X.shape[0]
X.to_csv('X.csv')

# Test matrix: same three steps.
X_test = pd.concat([polynomial(X_test, 'DEP_TIME', 3), X_test], axis='columns')
X_test['OFFSET'] = [1] * X_test.shape[0]
X_test.to_csv('X_test.csv')

In [None]:
"""
Build NAS delay vectors for clustering.
"""
def _positive_nas_delay(frame):
    """Return a one-column frame holding NAS_DELAY where it is > 0 and 0 elsewhere.

    Missing delays compare as not-greater-than-zero and therefore become 0.
    The result has a single column labelled 0 and a 0..n-1 index.
    """
    delay = frame["NAS_DELAY"]
    # Vectorised replacement for a row-by-row iterrows() loop.
    return pd.DataFrame(np.where(delay > 0, delay, 0))


train_set = pd.read_csv("train.csv")
test_set = pd.read_csv("test.csv")

Yclass_train = _positive_nas_delay(train_set)
Yclass_test = _positive_nas_delay(test_set)

Yclass_train.to_csv("NAS_train.csv")
Yclass_test.to_csv("NAS_test.csv")