# Probabilistic Flight Delay Classification Models

In [None]:
import numpy as np
import pandas as pd
from category_encoders import BinaryEncoder
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load data functions
def load():
    """Load the raw flight-delay dataset.

    Placeholder: the data source is not defined in this notebook yet.

    Returns:
        Intended to return the dataset that is handed to ``get_split``
        (presumably a pandas DataFrame -- confirm once implemented).

    Raises:
        NotImplementedError: Always, until data loading is implemented.
    """
    # Fail loudly here instead of silently returning None, which would only
    # surface later as a confusing error when the split result is unpacked.
    raise NotImplementedError("load() is a stub: implement data loading")

def get_split(df):
    """Split the dataset into training and test partitions.

    Placeholder: the split strategy is not defined in this notebook yet.

    Args:
        df: The dataset produced by ``load``.

    Returns:
        Intended to return the 4-tuple ``(X_train, X_test, y_train, y_test)``
        that the training code unpacks.

    Raises:
        NotImplementedError: Always, until data splitting is implemented.
    """
    # Returning None here would break the caller's 4-way unpacking with a
    # misleading TypeError; raise an explicit error at the real cause instead.
    raise NotImplementedError("get_split() is a stub: implement data splitting")

In [None]:
# Preprocessing configuration
# Columns treated as categorical (binary-encoded) and numeric (standardised).
cat_cols = ['Airline', 'AirportFrom', 'AirportTo', 'Route', 'DayOfWeek']
num_cols = ['Flight', 'Time', 'Length', 'Airline_DelayRate', 'Route_AvgDelay']

# Any column not listed in either group is dropped (ColumnTransformer's
# default remainder='drop').
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    # Pass the column list explicitly: with cols=None BinaryEncoder only
    # encodes string-typed columns, so an integer-coded categorical
    # (presumably DayOfWeek -- confirm dtype) would pass through unencoded.
    ("cat", BinaryEncoder(cols=cat_cols), cat_cols)
])

In [None]:
# KNN Model
# Each pipeline receives its own clone of the preprocessor. Pipeline.fit fits
# its steps in place, so sharing a single instance would let fitting one
# pipeline overwrite the fitted preprocessing state used by the other.
knn_pipe = Pipeline([
    ("pre", clone(preprocessor)),
    ("knn", KNeighborsClassifier(
        n_neighbors=5,
        weights='distance',  # Closer neighbours weigh more in votes/predict_proba
        algorithm='auto'
    ))
])

# Decision Tree Model
dt_pipe = Pipeline([
    ("pre", clone(preprocessor)),
    ("dt", DecisionTreeClassifier(
        max_depth=10,
        min_samples_split=50,
        min_samples_leaf=20,
        class_weight='balanced',
        # Features are randomly permuted at each split; a fixed seed makes
        # the fitted tree (and its reported metrics) reproducible.
        random_state=42
    ))
])

In [None]:
def evaluate_model(pipe, X_test, y_test):
    """Print a classification report and ROC AUC for a fitted model.

    Args:
        pipe: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to score.
        y_test: True labels for ``X_test``.

    Returns:
        The same ``pipe`` object that was passed in.
    """
    predicted_labels = pipe.predict(X_test)
    # Second predict_proba column is used as the score for ROC AUC
    # (presumably the positive/"delayed" class -- confirm label order).
    class_probabilities = pipe.predict_proba(X_test)
    positive_scores = class_probabilities[:, 1]

    sections = (
        ("Classification Report:", classification_report, predicted_labels),
        ("\nROC AUC Score:", roc_auc_score, positive_scores),
    )
    for heading, metric, estimates in sections:
        print(heading)
        print(metric(y_test, estimates))

    return pipe

In [None]:
# Training and Evaluation
# Load the dataset and unpack the train/test partitions.
df = load()
X_train, X_test, y_train, y_test = get_split(df)

# KNN: fit the pipeline in place, then report metrics on the test split.
knn_pipe.fit(X_train, y_train)
knn_model = knn_pipe  # Pipeline.fit returns the pipeline itself
best_knn = evaluate_model(knn_model, X_test, y_test)

# Decision Tree: same procedure as for KNN.
dt_pipe.fit(X_train, y_train)
dt_model = dt_pipe  # Pipeline.fit returns the pipeline itself
best_dt = evaluate_model(dt_model, X_test, y_test)