# Setup

In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Raise pandas' display limits so frames up to 150 columns / 150 rows
# are rendered without truncation.
DISPLAY_LIMIT = 150
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, DISPLAY_LIMIT)


# 1. Load Data & Split

In [36]:
from pathlib import Path

# Location of the raw CSV, relative to the notebook's working directory.
data_dir = Path("../data")
df = pd.read_csv(data_dir / "Airline Dataset Updated - v2.csv")

# Confirm the data loaded. A bare `df.head()` in the middle of a cell is
# evaluated and then discarded (only a cell's last expression is rendered),
# so the preview is requested explicitly. `display` is injected into the
# namespace by IPython/Jupyter.
display(df.head())

# Separate target and predictors
y = df["Flight Status"]
X = df.drop(columns=["Flight Status"])

# Train / validation split: 80/20, stratified on the target so both parts
# keep the same class proportions; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0, stratify=y
)


# 2. Data Preparation

In [37]:
# Preprocess X_train and X_valid identically:
# - Drop identifier columns that are unhelpful for prediction
# - Drop the sparse airport columns
# - Replace "Departure Date" with its month

# Identifier-like columns.
ID_COLS = ["Passenger ID", "First Name", "Last Name", "Pilot Name"]

# Airport Name and Arrival Airport are dropped since most of their categories
# have fewer than 20 entries.
# NOTE(review): "Airport Country Code" is dropped together with them, but the
# original comment gave no reason for it -- confirm this is intentional.
AIRPORT_COLS = ["Airport Name", "Arrival Airport", "Airport Country Code"]


def add_month(features):
    """Return a new frame with "Departure Date" replaced by a "Month" column.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame containing a "Departure Date" column of date-like values.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with "Month" appended as the
        last column and "Departure Date" removed. Dates that cannot be parsed
        are coerced to NaT, so their month is NaN.
    """
    parsed_dates = pd.to_datetime(
        features["Departure Date"], format="mixed", errors="coerce"
    )
    return features.assign(Month=parsed_dates.dt.month).drop(
        columns=["Departure Date"]
    )


# Drop IDs and airport columns
X_train = X_train.drop(columns=ID_COLS + AIRPORT_COLS)
X_valid = X_valid.drop(columns=ID_COLS + AIRPORT_COLS)
print(f"X_train cols: {X_train.columns}")
print(f"X_valid cols: {X_valid.columns}")

# Extract Month (same transformation for both splits)
X_train = add_month(X_train)
X_valid = add_month(X_valid)

# Preview the prepared frames. Bare `.head()` calls in the middle of a cell
# render nothing, so the previews are displayed explicitly (`display` is
# provided by IPython/Jupyter).
display(X_train.head(), X_valid.head())

# Column groups consumed by the modelling pipeline.
# NOTE(review): "Airport Continent" is still present in both frames but is in
# neither list, so a ColumnTransformer with the default remainder="drop" will
# discard it -- confirm that is intended.
numerical_cols = ["Age"]
categorical_cols = ["Gender", "Nationality", "Country Name",
                    "Continents", "Month"]


X_train cols: Index(['Gender', 'Age', 'Nationality', 'Country Name', 'Airport Continent',
       'Continents', 'Departure Date'],
      dtype='object')
X_valid cols: Index(['Gender', 'Age', 'Nationality', 'Country Name', 'Airport Continent',
       'Continents', 'Departure Date'],
      dtype='object')


# 3. Baseline Model Pipeline

In [38]:
# LogisticReg
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Build the shared preprocessing stage.

# Numeric columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy="median")

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising an error.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group to its transformer.
column_routes = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.base import clone

# Define the baseline model (fixed seed for reproducibility; a generous
# iteration cap is set for the solver).
base_model = LogisticRegression(max_iter=5000, random_state=0)

# Bundle preprocessing and model into a pipeline.
# The pipeline gets its own unfitted copy of the preprocessor: a Pipeline
# without `memory` fits its steps in place, so embedding the shared
# `preprocessor` object here would let any other pipeline that reuses it
# silently re-fit the transformer inside this already-fitted baseline.
# As a consequence, the module-level `preprocessor` is not fitted by this cell.
base_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                ('model', base_model)])

# Fit on the training split and predict the validation split.
base_pipeline.fit(X_train, y_train)
base_preds = base_pipeline.predict(X_valid)


# 4. RandomForest Model Pipeline

In [40]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import clone

# Random forest with 100 trees; the fixed seed keeps results reproducible.
forest_model = RandomForestClassifier(n_estimators=100, random_state=0)

# Use an independent, unfitted copy of the preprocessor. A Pipeline without
# `memory` fits its steps in place, so passing the shared `preprocessor`
# object directly would re-fit the very same transformer instance that any
# other pipeline built from it holds. As a consequence, the module-level
# `preprocessor` is not fitted by this cell.
forest_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                  ('model', forest_model)])

# Fit on the training split and predict the validation split.
forest_pipeline.fit(X_train, y_train)
forest_preds = forest_pipeline.predict(X_valid)

# 5. Evaluation

In [41]:
# Score both models on the held-out validation split.
base_acc = accuracy_score(y_valid, base_preds)
forest_acc = accuracy_score(y_valid, forest_preds)

# Report the accuracies in a uniform format, baseline first.
validation_scores = (
    ("Baseline (LogisticRegression)", base_acc),
    ("RandomForest", forest_acc),
)
for model_label, model_acc in validation_scores:
    print(f"{model_label} model validation accuracy: {model_acc}")

Baseline (LogisticRegression) model validation accuracy: 0.3338065301155952
RandomForest model validation accuracy: 0.32893936321233014


# Conclusion

Both the Logistic Regression (~33%) and Random Forest (~33%) models reached a validation accuracy comparable to random guessing among the three flight-status categories. This result lines up with observations made during EDA, where the proportions of each flight status remained about the same across individual features and feature interactions.

EDA had hinted that the features were not strong predictors of flight status, since the distribution of the target varied little across the values of each feature. The modeling results confirm that the given features carry limited predictive information.

