In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from category_encoders import BinaryEncoder
from sklearn.model_selection import RandomizedSearchCV

In [2]:
from utils import *

In [15]:
df = load()
X_train, X_test, y_train, y_test = get_split(df)


cat_cols = ['Airline', 'AirportFrom', 'AirportTo', 'Route', 'DayOfWeek']
num_cols = [c for c in X_train.columns if c not in cat_cols]

In [16]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", BinaryEncoder(), cat_cols),
])

In [32]:
pipe = Pipeline([
    ("pre", preprocessor),
    ("mlp", MLPClassifier(
        hidden_layer_sizes=(1024,1024,512),
        activation="relu",
        solver="adam",
        alpha=1e-4,
        batch_size=8192 * 4,
        max_iter=50,
        random_state=0,
        verbose=True
    ))
])

In [33]:
pipe.fit(X_train, y_train)

Iteration 1, loss = 0.66895151
Iteration 2, loss = 0.62311033
Iteration 3, loss = 0.61892243
Iteration 4, loss = 0.61624045
Iteration 5, loss = 0.61423597
Iteration 6, loss = 0.61267224
Iteration 7, loss = 0.61135749
Iteration 8, loss = 0.61014730
Iteration 9, loss = 0.60901706
Iteration 10, loss = 0.60791115
Iteration 11, loss = 0.60703888
Iteration 12, loss = 0.60552732
Iteration 13, loss = 0.60453806
Iteration 14, loss = 0.60395205
Iteration 15, loss = 0.60224355
Iteration 16, loss = 0.60145420
Iteration 17, loss = 0.60063924
Iteration 18, loss = 0.59826736
Iteration 19, loss = 0.59676065
Iteration 20, loss = 0.59676179
Iteration 21, loss = 0.59444430
Iteration 22, loss = 0.59285310
Iteration 23, loss = 0.59242459
Iteration 24, loss = 0.59081435
Iteration 25, loss = 0.58914868
Iteration 26, loss = 0.58818811
Iteration 27, loss = 0.58668398
Iteration 28, loss = 0.58511864
Iteration 29, loss = 0.59139317
Iteration 30, loss = 0.59009163
Iteration 31, loss = 0.58450330
Iteration 32, los



In [34]:
y_pred = pipe.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy: 0.6571002159867256

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.81      0.72     59824
           1       0.66      0.47      0.55     48053

    accuracy                           0.66    107877
   macro avg       0.66      0.64      0.64    107877
weighted avg       0.66      0.66      0.65    107877


Confusion Matrix:
[[48361 11463]
 [25528 22525]]
