In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from category_encoders import BinaryEncoder
from sklearn.model_selection import RandomizedSearchCV

In [2]:
from utils import *

In [15]:
# Load the dataset and carve out the train/test partitions.
# `load` and `get_split` are not defined in this notebook; presumably they are
# provided by the `from utils import *` cell — verify against utils.
df = load()
X_train, X_test, y_train, y_test = get_split(df)

# Columns routed to the categorical encoder; every other column of X_train
# is treated as numeric, in its original column order.
cat_cols = ["Airline", "AirportFrom", "AirportTo", "Route", "DayOfWeek"]
num_cols = [name for name in X_train.columns if name not in cat_cols]

In [16]:
# Column-wise preprocessing: standardise the numeric features and
# binary-encode the categorical ones.
#
# `cols=cat_cols` is passed explicitly because BinaryEncoder, when `cols` is
# left as None, only encodes string-typed columns. A categorical stored with a
# numeric dtype (DayOfWeek may be one — TODO confirm its dtype) would otherwise
# pass through this branch unencoded and unscaled.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", BinaryEncoder(cols=cat_cols), cat_cols),
])

In [29]:
# End-to-end model: the preprocessing step followed by an MLP classifier with
# three hidden layers.
pipe = Pipeline(steps=[
    ("pre", preprocessor),
    (
        "mlp",
        MLPClassifier(
            hidden_layer_sizes=(512, 512, 256),
            activation="relu",
            solver="adam",
            alpha=1e-4,
            batch_size=32_768,  # 8192 * 4
            max_iter=50,
            random_state=0,     # fixed seed for the estimator
            verbose=True,       # print the loss after each iteration
        ),
    ),
])

In [30]:
# Fit the whole pipeline (preprocessing + MLP) on the training split.
# With verbose=True on the MLP, the per-iteration loss is printed below.
pipe.fit(X_train, y_train)

Iteration 1, loss = 0.64340423
Iteration 2, loss = 0.62116723
Iteration 3, loss = 0.61690835
Iteration 4, loss = 0.61423269
Iteration 5, loss = 0.61261103
Iteration 6, loss = 0.61122123
Iteration 7, loss = 0.61001049
Iteration 8, loss = 0.60933453
Iteration 9, loss = 0.60794376
Iteration 10, loss = 0.60681345
Iteration 11, loss = 0.60577238
Iteration 12, loss = 0.60545692
Iteration 13, loss = 0.60411015
Iteration 14, loss = 0.60406714
Iteration 15, loss = 0.60175870
Iteration 16, loss = 0.60127896
Iteration 17, loss = 0.60035255
Iteration 18, loss = 0.59899168
Iteration 19, loss = 0.59979603
Iteration 20, loss = 0.59681368
Iteration 21, loss = 0.59638449
Iteration 22, loss = 0.59500492
Iteration 23, loss = 0.59505746
Iteration 24, loss = 0.59496433
Iteration 25, loss = 0.59227551
Iteration 26, loss = 0.59117347
Iteration 27, loss = 0.59097495
Iteration 28, loss = 0.58944319
Iteration 29, loss = 0.58964625
Iteration 30, loss = 0.58685675
Iteration 31, loss = 0.58597148
Iteration 32, los



In [31]:
# Predict labels for the held-out split and report the evaluation metrics.
# `print_metrics` is not defined in this notebook — presumably it comes from
# the `from utils import *` cell; verify against utils.
y_pred = pipe.predict(X_test)
print_metrics(y_test, y_pred)

Accuracy: 0.6565532968102561

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.74      0.71     59824
           1       0.63      0.55      0.59     48053

    accuracy                           0.66    107877
   macro avg       0.65      0.65      0.65    107877
weighted avg       0.65      0.66      0.65    107877


Confusion Matrix:
[[44488 15336]
 [21714 26339]]
