# Load Libraries and Data

In [94]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader

In [95]:
df = pd.read_csv("data/airlines_filtered_selected_with_weather.csv")
df.rename(columns={"Unnamed: 0":"FlightDate"}, inplace=True)
df.set_index('FlightDate', inplace=True)
df.index = pd.to_datetime(df.index)
df = df[df.index.year < 2020]
df.head()

Unnamed: 0_level_0,Year,Month,DayofMonth,DayOfWeek,Reporting_Airline,Tail_Number,Origin,Dest,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepTimeBlk,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrTimeBlk,Cancelled,CancellationCode,AirTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
FlightDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
2010-01-01,2010,1,1,5,AA,N4XXAA,SFO,LAX,,,,,1100-1159,,,,,1300-1359,1.0,A,,,,,,,15.2,10.0,20.6,0.0,,,7.6,,1022.1,
2010-01-01,2010,1,1,5,AA,N4XXAA,SFO,LAX,,,,,0600-0659,,,,,0800-0859,1.0,A,,,,,,,15.2,10.0,20.6,0.0,,,7.6,,1022.1,
2010-01-01,2010,1,1,5,AA,N506AA,SFO,LAX,1519.0,54.0,54.0,1.0,1400-1459,1637.0,42.0,42.0,1.0,1500-1559,0.0,,54.0,0.0,0.0,0.0,0.0,42.0,15.2,10.0,20.6,0.0,,,7.6,,1022.1,
2010-01-01,2010,1,1,5,AA,N329AA,SFO,LAX,1953.0,58.0,58.0,1.0,1800-1859,2112.0,47.0,47.0,1.0,2000-2059,0.0,,51.0,0.0,0.0,0.0,0.0,47.0,15.2,10.0,20.6,0.0,,,7.6,,1022.1,
2010-01-01,2010,1,1,5,AA,N446AA,SFO,LAX,918.0,-7.0,0.0,0.0,0900-0959,1047.0,-13.0,0.0,0.0,1100-1159,0.0,,54.0,,,,,,15.2,10.0,20.6,0.0,,,7.6,,1022.1,


In [96]:
#we have daily data, check if missing any days... doesn't look like it
for yr in df.index.year.unique():
    dd = df[df.index.year == yr]
    print(f"Length of {yr}: {len(dd.index.unique())}")

Length of 2010: 365
Length of 2011: 365
Length of 2012: 366
Length of 2013: 365
Length of 2014: 365
Length of 2015: 365
Length of 2016: 366
Length of 2017: 365
Length of 2018: 365
Length of 2019: 365


In [97]:
#shift the weather variables back one day so the model can't use information it doesn't know
wx = df[['tavg','tmin','tmax','prcp','snow','wdir','wspd','wpgt','pres','tsun']]
wx = wx.groupby(wx.index.date).mean()
wx = wx.shift(1)
wx

Unnamed: 0,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
2010-01-01,,,,,,,,,,
2010-01-02,15.2,10.0,20.6,0.0,,,7.6,,1022.1,
2010-01-03,18.8,12.8,24.4,0.0,,,10.1,,1017.4,
2010-01-04,18.4,11.1,26.7,0.0,,,8.3,,1017.6,
2010-01-05,17.0,11.7,23.9,0.0,,,9.7,,1017.8,
...,...,...,...,...,...,...,...,...,...,...
2019-12-27,11.4,7.8,13.9,9.4,0.0,51.0,13.7,,1009.4,
2019-12-28,11.9,6.1,16.7,0.0,0.0,96.0,9.0,,1013.7,
2019-12-29,12.7,7.8,17.2,0.0,0.0,85.0,7.6,,1018.6,
2019-12-30,12.8,8.3,16.7,0.0,0.0,106.0,7.2,,1020.5,


In [98]:
#replace weather data in df
df = df.drop(columns=(['tavg','tmin','tmax','prcp','snow','wdir','wspd','wpgt','pres','tsun']))
df = df.merge(wx, left_index=True, right_index=True, how='left')

In [99]:
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Reporting_Airline,Tail_Number,Origin,Dest,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepTimeBlk,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrTimeBlk,Cancelled,CancellationCode,AirTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
2010-01-01,2010,1,1,5,AA,N4XXAA,SFO,LAX,,,,,1100-1159,,,,,1300-1359,1.0,A,,,,,,,,,,,,,,,,
2010-01-01,2010,1,1,5,AA,N4XXAA,SFO,LAX,,,,,0600-0659,,,,,0800-0859,1.0,A,,,,,,,,,,,,,,,,
2010-01-01,2010,1,1,5,AA,N506AA,SFO,LAX,1519.0,54.0,54.0,1.0,1400-1459,1637.0,42.0,42.0,1.0,1500-1559,0.0,,54.0,0.0,0.0,0.0,0.0,42.0,,,,,,,,,,
2010-01-01,2010,1,1,5,AA,N329AA,SFO,LAX,1953.0,58.0,58.0,1.0,1800-1859,2112.0,47.0,47.0,1.0,2000-2059,0.0,,51.0,0.0,0.0,0.0,0.0,47.0,,,,,,,,,,
2010-01-01,2010,1,1,5,AA,N446AA,SFO,LAX,918.0,-7.0,0.0,0.0,0900-0959,1047.0,-13.0,0.0,0.0,1100-1159,0.0,,54.0,,,,,,,,,,,,,,,


# Construct Feature Matrix and Labels

In [100]:
#Certain columns are already set to be added - weather data is data for yesterday so model can't cheat.
feature_data = df[['Year','Month','DayofMonth','DayOfWeek','tavg','tmin','tmax','prcp','snow','wdir','wspd','wpgt','pres','tsun','ArrDel15']]

In [101]:
feature_data.ArrDel15.value_counts()

0.0    108923
1.0     37909
Name: ArrDel15, dtype: int64

In [102]:
feature_data = feature_data[~pd.isna(feature_data.ArrDel15)]
feature_data = feature_data[feature_data.index > '2010-01-01']
feature_data = feature_data.fillna(0)
feature_data

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,ArrDel15
2010-01-02,2010,1,2,6,15.2,10.0,20.6,0.0,0.0,0.0,7.6,0.0,1022.1,0.0,0.0
2010-01-02,2010,1,2,6,15.2,10.0,20.6,0.0,0.0,0.0,7.6,0.0,1022.1,0.0,0.0
2010-01-02,2010,1,2,6,15.2,10.0,20.6,0.0,0.0,0.0,7.6,0.0,1022.1,0.0,0.0
2010-01-02,2010,1,2,6,15.2,10.0,20.6,0.0,0.0,0.0,7.6,0.0,1022.1,0.0,0.0
2010-01-02,2010,1,2,6,15.2,10.0,20.6,0.0,0.0,0.0,7.6,0.0,1022.1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31,2019,12,31,2,13.9,11.7,17.8,0.0,0.0,70.0,9.4,0.0,1015.2,0.0,0.0
2019-12-31,2019,12,31,2,13.9,11.7,17.8,0.0,0.0,70.0,9.4,0.0,1015.2,0.0,0.0
2019-12-31,2019,12,31,2,13.9,11.7,17.8,0.0,0.0,70.0,9.4,0.0,1015.2,0.0,0.0
2019-12-31,2019,12,31,2,13.9,11.7,17.8,0.0,0.0,70.0,9.4,0.0,1015.2,0.0,0.0


In [103]:
data = np.array(feature_data)
y = data[:,-1:]
y = y.astype(int)
y = y.reshape(y.shape[0],)
X = data[:,:-1]

In [104]:
## Can get 74% accuracy just by predicting no delays every single time. 
1 - sum(y)/len(y)

0.7418174882669082

# Train the model

In [105]:
data = np.array(feature_data)
y = data[:,-1:]
y = y.astype(int)
y = y.reshape(y.shape[0],)
X = data[:,:-1]

########################################
epochs = 1
learning_rate = .0001
batch_size = 2
########################################

class flight_data(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y)
        self.length = self.X.shape[0]
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    def __len__(self):
        return self.length

data = flight_data(X, y)

class Net(torch.nn.Module):
    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)
    def forward(self, X):
        pred = self.linear(X)
        return pred

model = Net(X.shape[1],2)

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()
train_loader = DataLoader(dataset=data, batch_size=batch_size)

Loss = []
for epoch in range(epochs):
    for x, y in train_loader:
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        Loss.append(loss)
        loss.backward()
        optimizer.step()
print('Done')

pred_model = model(data.X)
_, y_pred = pred_model.max(1)
print("model predictions on data:", y_pred)
print("actuals           on data:", data.y)

correct = (data.y == y_pred).sum().item()
acc = correct / len(data)
print("model accuracy:", acc)

Done
model predictions on data: tensor([0, 0, 0,  ..., 0, 0, 0])
actuals           on data: tensor([0, 0, 0,  ..., 0, 0, 0])
model accuracy: 0.7199384225547828
