In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

Load datasets

In [2]:
# Read the feature frames for the existing train/test split.
X_train_label, X_test_label = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/X_train_label.csv", "../data/X_test_label.csv")
)

# Read the matching target frames for the same split.
Y_train, Y_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/Y_train.csv", "../data/Y_test.csv")
)

Combine the train and test sets, sort the rows chronologically, then re-split them temporally without shuffling (first 75% of rows for training, last 25% for testing).

In [3]:
# Work on copies so the loaded feature frames are left untouched.
X_train_label_lstm = X_train_label.copy()
X_test_label_lstm = X_test_label.copy()

# Attach each split's targets to its features column-wise (aligned on index).
X_lstm1 = pd.concat([X_train_label_lstm, Y_train], axis=1)
X_lstm2 = pd.concat([X_test_label_lstm, Y_test], axis=1)

# Stack both splits and order the rows by date, then by CRS_DEP_TIME and
# CRS_ARR_TIME within each day.
chronological_keys = ["YEAR", "MONTH", "DAY", "CRS_DEP_TIME", "CRS_ARR_TIME"]
combined_df_lstm = pd.concat([X_lstm1, X_lstm2]).sort_values(chronological_keys)

# Features (X_LSTM)
feature_columns = [
    "YEAR", "MONTH", "DAY",
    'AIRLINE', 'ORIGIN', 'DEST',
    'CRS_DEP_TIME', 'CRS_ARR_TIME',
    'DISTANCE', 'TAXI_IN', 'TAXI_OUT',
]
X_LSTM = combined_df_lstm[feature_columns]

# Target variables (Y_LSTM)
target_columns = [
    'DELAY_DUE_CARRIER',
    'DELAY_DUE_WEATHER',
    'DELAY_DUE_SECURITY',
    'DELAY_DUE_NAS',
    'DELAY_DUE_LATE_AIRCRAFT',
]
Y_LSTM = combined_df_lstm[target_columns]

# shuffle=False keeps the sorted row order, so the split is positional:
# the leading 75% of rows go to train and the trailing 25% to test.
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_LSTM,
    Y_LSTM,
    test_size=0.25,
    shuffle=False,
)

Measure the longest and shortest lists of flights per day in the training split to determine whether it makes sense to pad sequences by day. We decided that it would not.

In [4]:
# Count the rows (flights) that fall on each (YEAR, MONTH, DAY) of the training
# split. groupby(...).size() tallies the rows per key directly, so no per-day
# DataFrame has to be built just to take its length.
flights_per_day = X_train_lstm.groupby(["YEAR", "MONTH", "DAY"]).size()

# The defaults keep the results well-defined for an empty training frame:
# 0 for the longest day and infinity for the shortest.
longest = max(flights_per_day, default=0)
shortest = min(flights_per_day, default=float("inf"))

print(f"longest: {longest} flights")
print(f"shortest: {shortest} flights")

longest: 738 flights
shortest: 10 flights


Write time-series datasets to CSV

In [6]:
# Persist the temporally split features and targets. index=False leaves the
# DataFrame index out of the CSV files.
for split_frame, csv_path in (
    (X_train_lstm, "../data/X_train_lstm.csv"),
    (X_test_lstm, "../data/X_test_lstm.csv"),
    (y_train_lstm, "../data/Y_train_lstm.csv"),
    (y_test_lstm, "../data/Y_test_lstm.csv"),
):
    split_frame.to_csv(csv_path, index=False)