# Setup

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Raise pandas' display limits so wide / long frames are shown without truncation.
pd.options.display.max_columns = 150
pd.options.display.max_rows = 150


# 1. Load Data & Split

In [27]:
from pathlib import Path

# Read the raw airline dataset from the project's data directory.
data_dir = Path("../data")
df = pd.read_csv(data_dir / "Airline Dataset Updated - v2.csv")

# Confirm data loaded. A bare `df.head()` in the middle of a cell is discarded
# (only a cell's final expression is rendered), so display it explicitly.
display(df.head())

# Separate target and predictors
y = df["Flight Status"]
X = df.drop(columns=["Flight Status"])

# Train / validation split: 80 % / 20 %, stratified on the target so both
# splits keep the same class proportions; fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
    stratify=y,
)


# 2. Preprocessing / Feature Engineering

In [28]:
# Preprocess X_train and X_valid with the same steps:
# - Drop unhelpful IDs for prediction
# - Drop the airport columns
# - Replace "Departure Date" with its month

# Identifier-like columns that are unhelpful for prediction.
ID_COLUMNS = ["Passenger ID", "First Name", "Last Name", "Pilot Name"]

# Dropped since most categories have fewer than 20 entries.
AIRPORT_COLUMNS = ["Airport Name", "Arrival Airport"]


def drop_unused_columns(features: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the ID and airport columns.

    Args:
        features: Predictor frame containing every column listed in
            ``ID_COLUMNS`` and ``AIRPORT_COLUMNS``.

    Returns:
        A new DataFrame with those columns removed; ``features`` itself is
        not modified.

    Raises:
        KeyError: If any of the listed columns is missing.
    """
    return features.drop(columns=ID_COLUMNS + AIRPORT_COLUMNS)


def extract_month(features: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with "Departure Date" replaced by a "Month" column.

    Dates are parsed with ``format="mixed"`` and ``errors="coerce"``, so a
    value that cannot be parsed becomes NaT and its month is missing (NaN)
    instead of raising.

    Args:
        features: Predictor frame containing a "Departure Date" column.

    Returns:
        A new DataFrame with "Month" appended as the last column and
        "Departure Date" removed; ``features`` itself is not modified.
    """
    departure = pd.to_datetime(
        features["Departure Date"], format="mixed", errors="coerce"
    )
    return features.assign(Month=departure.dt.month).drop(columns=["Departure Date"])


# Drop IDs and airport columns from both splits.
X_train = drop_unused_columns(X_train)
X_valid = drop_unused_columns(X_valid)
print(f"X_train cols: {X_train.columns}")
print(f"X_valid cols: {X_valid.columns}")

# Extract Month from both splits.
X_train = extract_month(X_train)
X_valid = extract_month(X_valid)

# Only a cell's final expression is rendered, so show both previews explicitly.
display(X_train.head(), X_valid.head())


X_train cols: Index(['Gender', 'Age', 'Nationality', 'Airport Country Code', 'Country Name',
       'Airport Continent', 'Continents', 'Departure Date'],
      dtype='object')
X_valid cols: Index(['Gender', 'Age', 'Nationality', 'Airport Country Code', 'Country Name',
       'Airport Continent', 'Continents', 'Departure Date'],
      dtype='object')


Unnamed: 0,Gender,Age,Nationality,Airport Country Code,Country Name,Airport Continent,Continents,Month
44542,Female,88,China,CL,Chile,SAM,South America,1
29873,Male,50,Russia,PG,Papua New Guinea,OC,Oceania,1
5693,Female,87,Japan,CA,Canada,NAM,North America,12
11885,Female,65,Russia,NZ,New Zealand,OC,Oceania,6
40407,Male,35,Norway,US,United States,NAM,North America,7


# 3. Baseline

# 4. Tree-based Comparison

# 5. Evaluation

# Conclusion