## Summary

* Time is minutes after midnight
* No missing values
* Logistic Regression on original dataset is bad (58% accuracy ~ little better than random)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
df = pd.read_csv("../data/Airlines.csv")
df.head()

In [None]:
df.info()

In [None]:
df.isna().any()

In [None]:
df['Delay'].value_counts() / len(df['Delay'])

In [7]:
# AirportFrom and AirportTo represent different concepts so I've made multiple encoders
label_encoders = {}
categorical_cols = ['Airline', 'AirportFrom', 'AirportTo']
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

In [8]:
X = df.drop(['id', 'Delay'], axis=1)
y = df['Delay']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
logreg = LogisticRegression(random_state=0)
logreg.fit(X_train_scaled, y_train)

In [12]:
y_pred = logreg.predict(X_test_scaled)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
df['Time'].max()
# time is minutes after midnight 
# 1439 = 23 hours and 59 minutes

In [12]:
def load():
    return pd.read_csv("../data/Airlines.csv")

def get_split(df):
    X = df.drop(['id', 'Delay'], axis=1)
    y = df['Delay']
    return train_test_split(X, y, test_size=0.2, random_state=0)

def encode_inplace(df, cat_cols):
    label_encoders = {}
    for col in cat_cols:
        label_encoders[col] = LabelEncoder()
        df[col] = label_encoders[col].fit_transform(df[col])
    return label_encoders

def print_metrics(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [7]:
def add_new_inplace(df):
    df['DurationCategory'] = pd.cut(df['Length'],
                        bins=[0, 60, 120, 180, 240, float('inf')],
                        labels=['Very Short', 'Short', 'Medium', 'Long', 'Very Long'],
                        include_lowest=True)
    df['Route'] = df['AirportFrom'] + '-' + df['AirportTo']

In [8]:
df = load()
add_new_inplace(df)

categorical_cols = ['Airline', 'AirportFrom', 'AirportTo', 'DurationCategory', 'Route']
label_encoders = encode_inplace(df, cat_cols=categorical_cols)

df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,DurationCategory,Route
0,1,4,269,253,135,3,15,205,1,0,3740
1,2,14,1558,217,60,3,15,222,1,0,3265
2,3,1,2400,154,80,3,20,165,1,1,2159
3,4,1,2466,253,80,3,20,195,1,0,3731
4,5,2,108,14,252,3,30,202,0,0,83


In [11]:
X_train, X_test, y_train, y_test = get_split(df)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logreg = LogisticRegression(random_state=0)
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)
print_metrics(y_test, y_pred)

Accuracy: 0.5848512657934499

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.76      0.67     60014
           1       0.55      0.37      0.44     47863

    accuracy                           0.58    107877
   macro avg       0.57      0.56      0.55    107877
weighted avg       0.58      0.58      0.57    107877


Confusion Matrix:
[[45556 14458]
 [30327 17536]]
