### Imports

In [28]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from catboost import CatBoostClassifier
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import (
    RFE,
    RFECV,
    SelectKBest,
    VarianceThreshold,
    f_classif,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML
from xgboost import XGBClassifier

### Constants

In [29]:
SEED = 42
N_JOBS = -1
TRAIN_TIME_LIMIT_AUTOGLUON = 60 * 60 * 8
TRAIN_TIME_LIMIT_MLJAR = 60 * 60 * 1
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")
APPLY_REMOVE_LOW_VARIANCE_FEATURES = False
APPLY_REMOVE_CORRELATED_FEATURES = False
APPLY_REMOVE_RANDOM_FEATURES = False
APPLY_RECURSEIVE_FEATURE_ELIMINATION = True
APPLY_ANOVA = False
ANOVE_FEATURES = 25

### Make sure the output directories exist

In [30]:
for output_dir in [OUTPUT_DIR_MANUAL, OUTPUT_DIR_AUTOGLUON, OUTPUT_DIR_MLJAR]:
    if not path.exists(path.join(output_dir, UNIQUE_ID)):
        print(f"Creating output directory {path.join(output_dir, UNIQUE_ID)}")
        os.makedirs(path.join(output_dir, UNIQUE_ID))

Creating output directory output\manual\20240115_092046
Creating output directory output\autogluon\20240115_092046
Creating output directory output\mljar\20240115_092046


### Define utility functions

In [31]:
def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    corr_matrix = np.corrcoef(train_x, rowvar=False)
    upper = np.triu(corr_matrix, k=1)
    to_drop = [i for i in range(upper.shape[1]) if any(upper[:, i] > threshold)]

    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x

In [32]:
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    sel = VarianceThreshold(threshold=threshold)
    sel.fit(train_x)
    train_x = train_x[:, sel.get_support(indices=True)]
    valid_x = valid_x[:, sel.get_support(indices=True)]
    test_x = test_x[:, sel.get_support(indices=True)]
    return train_x, valid_x, test_x

In [33]:
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    tree: DecisionTreeClassifier = DecisionTreeClassifier(random_state=0)
    tree.fit(train_x, train_y)
    importances = tree.feature_importances_

    important_indices = [i for i, imp in enumerate(importances) if imp > importance]
    train_x = train_x[:, important_indices]
    valid_x = valid_x[:, important_indices]
    test_x = test_x[:, important_indices]
    return train_x, valid_x, test_x

In [34]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(train_x, train_y)

    train_x = selector.transform(train_x)
    valid_x = selector.transform(valid_x)
    test_x = selector.transform(test_x)
    return train_x, valid_x, test_x

In [35]:
def dump_proba(model, test_x, output_path_proba):
    proba = model.predict_proba(test_x)

    if isinstance(proba, pd.DataFrame):
        proba = proba.values

    np.savetxt(
        output_path_proba,
        proba[:, 1],
        delimiter="\n",
        header='"313201_313212"',
        comments="",
    )

In [36]:
def dump_model(model, output_path_model):
    joblib.dump(model, output_path_model)

In [46]:
def apply_recursive_feature_elimination(train_x, train_y, valid_x, test_x):
    estimator_et = ExtraTreesClassifier(random_state=0)
    rfe_et = RFE(estimator=estimator_et, n_features_to_select=250)
    rfe_et.fit(train_x, train_y)
    train_x = train_x[:, rfe_et.support_]
    valid_x = valid_x[:, rfe_et.support_]
    test_x = test_x[:, rfe_et.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    estimator_rf = RandomForestClassifier(random_state=0)
    rfe_rf = RFE(estimator=estimator_rf, n_features_to_select=125)
    rfe_rf.fit(train_x, train_y)
    train_x = train_x[:, rfe_rf.support_]
    valid_x = valid_x[:, rfe_rf.support_]
    test_x = test_x[:, rfe_rf.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    rfecv_et = RFECV(estimator=estimator_et, cv=3, min_features_to_select=25)
    rfecv_et.fit(train_x, train_y)
    train_x = train_x[:, rfecv_et.support_]
    valid_x = valid_x[:, rfecv_et.support_]
    test_x = test_x[:, rfecv_et.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    rfecv_rf = RFECV(estimator=estimator_rf, cv=3, min_features_to_select=15)
    rfecv_rf.fit(train_x, train_y)
    train_x = train_x[:, rfecv_rf.support_]
    valid_x = valid_x[:, rfecv_rf.support_]
    test_x = test_x[:, rfecv_rf.support_]
    print(train_x.shape, valid_x.shape, test_x.shape)

    selected_columns = np.where(rfecv_rf.support_)[0]

    os.makedirs(path.join("output", UNIQUE_ID), exist_ok=True)

    np.savetxt(
        path.join("output", UNIQUE_ID, "selected_features.txt"),
        selected_columns,
        fmt="%d",
    )

    return train_x, valid_x, test_x

### Load data

In [47]:
prefix = ""

_test_x = pd.read_table(prefix + "artificial_test.data", sep=" ", header=None)
_test_x.drop(_test_x.columns[500], axis=1, inplace=True)
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)
_train_x = pd.read_table(prefix + "artificial_train.data", sep=" ", header=None)
_train_x.drop(_train_x.columns[500], axis=1, inplace=True)

In [48]:
_test_x = np.array(_test_x, dtype=float, copy=True)
_train_x = np.array(_train_x, dtype=float, copy=True)
_train_y = np.array(_train_y, dtype=float, copy=True)

In [49]:
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=42)

In [50]:
def get_train_and_validation_data():
    split = 400
    train_x, valid_x = _train_x[split:].copy(), _train_x[:split].copy()
    train_y, valid_y = _train_y[split:].copy(), _train_y[:split].copy()
    return train_x, train_y, valid_x, valid_y

In [51]:
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)

(1600, 500) (1600, 1) (400, 500) (400, 1)


### Perform feature selection

In [52]:
if APPLY_RECURSEIVE_FEATURE_ELIMINATION:
    train_x, valid_x, test_x = apply_recursive_feature_elimination(
        train_x, train_y.copy().ravel(), valid_x, _test_x
    )
    print(train_x.shape, valid_x.shape, test_x.shape)

(1600, 250) (400, 250) (600, 250)
(1600, 125) (400, 125) (600, 125)
(1600, 25) (400, 25) (600, 25)
(1600, 20) (400, 20) (600, 20)
(1600, 20) (400, 20) (600, 20)


In [53]:
if APPLY_REMOVE_CORRELATED_FEATURES:
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, _test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [54]:
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    train_x, valid_x, test_x = remove_low_variance_features(train_x, valid_x, test_x)
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [55]:
if APPLY_REMOVE_RANDOM_FEATURES:
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x, train_y=train_y, valid_x=valid_x, test_x=test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [56]:
if APPLY_ANOVA:
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        train_y=train_y,
        valid_x=valid_x,
        test_x=test_x,
        k=ANOVE_FEATURES,
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [57]:
print("train_x.shape: ", train_x.shape)
print("train_y.shape: ", train_y.shape)
print("valid_x.shape: ", valid_x.shape)
print("valid_y.shape: ", valid_y.shape)
print("test_x.shape: ", test_x.shape)

train_x.shape:  (1600, 20)
train_y.shape:  (1600, 1)
valid_x.shape:  (400, 20)
valid_y.shape:  (400, 1)
test_x.shape:  (600, 20)


### Train manual model

In [59]:
base_classifiers_1 = [
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                early_stopping=True,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
            ),
        ),
    ),
    (
        "rf",
        make_pipeline(
            StandardScaler(),
            RandomForestClassifier(
                random_state=SEED,
                n_estimators=500,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            ),
        ),
    ),
]

base_classifiers_2 = [
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                early_stopping=True,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
            ),
        ),
    ),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=500,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
]

stacked_ensamble_1 = StackingClassifier(
    estimators=base_classifiers_1, final_estimator=LogisticRegression(), cv=5
)

stacked_ensamble_2 = StackingClassifier(
    estimators=base_classifiers_2, final_estimator=LogisticRegression(), cv=5
)

committee_models = [
    ("stacked_ensemble_1", stacked_ensamble_1),
    ("stacked_ensemble_2", stacked_ensamble_2),
    (
        "gbc",
        make_pipeline(
            StandardScaler(),
            GradientBoostingClassifier(
                random_state=SEED,
                max_features=None,
                n_estimators=500,
                max_depth=30,
                min_samples_leaf=2,
                min_samples_split=5,
            ),
        ),
    ),
    (
        "et",
        make_pipeline(
            StandardScaler(),
            ExtraTreesClassifier(
                random_state=SEED,
                n_estimators=500,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            ),
        ),
    ),
    (
        "rf",
        make_pipeline(
            StandardScaler(),
            RandomForestClassifier(
                random_state=SEED,
                n_estimators=500,
                max_depth=30,
                min_samples_leaf=4,
                min_samples_split=2,
            ),
        ),
    ),
    (
        "mlp",
        make_pipeline(
            StandardScaler(),
            MLPClassifier(
                random_state=SEED,
                max_iter=1000,
                early_stopping=True,
                tol=1e-3,
                solver="lbfgs",
                hidden_layer_sizes=(100, 300, 200, 100),
                alpha=0.001,
            ),
        ),
    ),
    (
        "cb",
        make_pipeline(
            StandardScaler(),
            CatBoostClassifier(
                iterations=500,
                learning_rate=0.03,
                depth=6,
                l2_leaf_reg=3,
                border_count=32,
                cat_features=None,
                loss_function="Logloss",
                eval_metric="Accuracy",
                random_seed=SEED,
                early_stopping_rounds=50,
                verbose=100,
            ),
        ),
    ),
    (
        "xgb",
        make_pipeline(
            StandardScaler(),
            XGBClassifier(
                random_state=SEED,
                use_label_encoder=False,
                eval_metric=balanced_accuracy_score,
                n_estimators=500,
                learning_rate=0.02,
                max_depth=6,
                min_child_weight=1,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0,
                reg_alpha=0.1,
                reg_lambda=1.0,
                scale_pos_weight=1,
            ),
        ),
    ),
]

committee_model = VotingClassifier(committee_models, voting="soft")
committee_model.fit(train_x.copy(), train_y.copy().ravel())

0:	learn: 0.7775000	total: 149ms	remaining: 1m 14s
100:	learn: 0.9162500	total: 496ms	remaining: 1.96s
200:	learn: 0.9418750	total: 849ms	remaining: 1.26s
300:	learn: 0.9631250	total: 1.2s	remaining: 792ms
400:	learn: 0.9775000	total: 1.55s	remaining: 384ms
499:	learn: 0.9862500	total: 1.93s	remaining: 0us


In [None]:
y_pred = committee_model.predict(train_x)
balanced_accuracy = balanced_accuracy_score(train_y, y_pred)
print("Committee Model Score:", balanced_accuracy) # Committee Model Score: 1.0

In [60]:
y_pred = committee_model.predict(valid_x)
balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

print(f"Model Balanced Accuracy: {balanced_accuracy}")

Model Balanced Accuracy: 0.8999774949363607


In [61]:
output_path_proba = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model_proba.txt")
output_path_model = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl")
dump_proba(committee_model, test_x, output_path_proba)
dump_model(committee_model, output_path_model)

### Train model with Autogloun

In [62]:
train_data = np.concatenate((train_x.copy(), train_y.copy()), axis=1)
train_data_pd = pd.DataFrame(train_data, copy=True)
train_data_pd.rename(columns={train_data_pd.columns[-1]: "class"}, inplace=True)

valid_data = np.concatenate((valid_x.copy(), valid_y.copy()), axis=1)
valid_data_pd = pd.DataFrame(data=valid_data, copy=True)
valid_data_pd.rename(columns={valid_data_pd.columns[-1]: "class"}, inplace=True)

print(train_data_pd.shape, valid_data_pd.shape)

(1600, 21) (400, 21)


In [63]:
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)
predictor = TabularPredictor(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
).fit(
    train_data=train_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    hyperparameters="default",
    fit_weighted_ensemble=True,
    fit_full_last_level_weighted_ensemble=True,
    full_weighted_ensemble_additionally=True,
    num_bag_folds=15,
    num_bag_sets=25,
    num_stack_levels=3,
    auto_stack=True,
    dynamic_stacking=True,
    feature_generator="auto",
    hyperparameter_tune_kwargs={
        "scheduler": "local",
        "searcher": "auto",
        "time_out": 1200,
        "num_trials": 30,
    },
)

No model was trained during hyperparameter tuning NeuralNetTorch_BAG_L4... Skipping this model.
Fitting model: LightGBMLarge_BAG_L4 ... Training model for up to 390.02s of the -144.95s of remaining time.
	Fitting 15 child models (S1F1 - S1F15) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=1.14%)
	0.9356	 = Validation score   (balanced_accuracy)
	82.28s	 = Training   runtime
	0.11s	 = Validation runtime
Completed 1/25 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_ALL_L5 ... Training model for up to 476.69s of the -236.6s of remaining time.
	Ensemble Weights: {'LightGBMXT_BAG_L4\T2': 1.0}
	0.9407	 = Validation score   (balanced_accuracy)
	2.78s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: WeightedEnsemble_L5 ... Training model for up to 476.69s of the -239.53s of remaining time.
	Ensemble Weights: {'LightGBMXT_BAG_L4\T2': 1.0}
	0.9407	 = Validation score   (balanced_accuracy)
	2.22s	 = Training   runtime
	0.0s	 = 

In [65]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L5,0.940655,balanced_accuracy,0.383666,11918.407147,0.000000,2.217469,5,True,421
1,LightGBMXT_BAG_L4\T2,0.940655,balanced_accuracy,0.383666,11916.189678,0.000000,27.240463,4,True,317
2,WeightedEnsemble_ALL_L5,0.940655,balanced_accuracy,0.386193,11918.969054,0.002527,2.779375,5,True,420
3,LightGBM_BAG_L4\T1,0.940027,balanced_accuracy,0.383666,11926.541159,0.000000,37.591944,4,True,346
4,LightGBM_BAG_L4\T19,0.940023,balanced_accuracy,0.383666,11927.900822,0.000000,38.951606,4,True,364
...,...,...,...,...,...,...,...,...,...,...
416,XGBoost_BAG_L1\T14,0.850625,balanced_accuracy,0.000000,25.468491,0.000000,25.468491,1,True,102
417,LightGBMXT_BAG_L1\T12,0.838790,balanced_accuracy,0.000000,25.248737,0.000000,25.248737,1,True,14
418,LightGBMXT_BAG_L1\T15,0.830681,balanced_accuracy,0.000000,24.891185,0.000000,24.891185,1,True,17
419,LightGBMXT_BAG_L1\T25,0.803766,balanced_accuracy,0.000000,26.097915,0.000000,26.097915,1,True,27


In [66]:
predictor.evaluate(valid_data_pd)

{'balanced_accuracy': 0.8898252106724013,
 'accuracy': 0.89,
 'mcc': 0.780040627030424,
 'roc_auc': 0.9381860918706708,
 'f1': 0.8871794871794872,
 'precision': 0.8963730569948186,
 'recall': 0.8781725888324873}

In [67]:
output_path_proba = path.join(
    OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "autogluon_model_proba.txt"
)
dump_proba(predictor, pd.DataFrame(test_x), output_path_proba)

### Train model with MLJAR

In [68]:
automl = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "tmp"),
)
train_y = train_y.copy().reshape(-1)
print(train_y)
automl.fit(train_x.copy(), train_y)

[ 1.  1.  1. ...  1. -1. -1.]
AutoML directory: output\mljar\20240115_092046\tmp
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.702703 trained in 3.74 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree f1 0.744366 trained in 5.3 seconds
2_DecisionTree f1 0.668969 trained in 7.27 second



6_Default_Xgboost f1 0.853735 trained in 17.68 seconds
7_Default_CatBoost f1 0.88543 trained in 13.91 seconds
8_Default_NeuralNetwork f1 0.856423 trained in 10.13 seconds




9_Default_RandomForest f1 0.809938 trained in 15.21 seconds
10_Default_ExtraTrees f1 0.765041 trained in 15.54 seconds
11_Default_NearestNeighbors f1 0.886534 trained in 6.29 seconds
* Step not_so_random will try to check up to 61 models
21_LightGBM f1 0.864966 trained in 14.06 seconds




12_Xgboost f1 0.844666 trained in 17.94 seconds
30_CatBoost f1 0.883634 trained in 14.98 seconds
39_RandomForest f1 0.842759 trained in 18.04 seconds
48_ExtraTrees f1 0.843887 trained in 13.62 seconds
57_NeuralNetwork f1 0.829826 trained in 10.92 seconds




66_NearestNeighbors f1 0.882611 trained in 6.19 seconds
22_LightGBM f1 0.866915 trained in 16.22 seconds




13_Xgboost f1 0.776589 trained in 22.91 seconds
31_CatBoost f1 0.879852 trained in 13.22 seconds
40_RandomForest f1 0.737231 trained in 13.71 seconds
49_ExtraTrees f1 0.732912 trained in 29.93 seconds
58_NeuralNetwork f1 0.851645 trained in 23.79 seconds




67_NearestNeighbors f1 0.889167 trained in 16.53 seconds
23_LightGBM f1 0.861748 trained in 18.0 seconds




14_Xgboost f1 0.845141 trained in 26.17 seconds
32_CatBoost f1 0.885164 trained in 30.52 seconds
41_RandomForest f1 0.738728 trained in 14.75 seconds
50_ExtraTrees f1 0.727273 trained in 14.01 seconds
59_NeuralNetwork f1 0.84472 trained in 12.12 seconds




68_NearestNeighbors f1 0.889167 trained in 7.96 seconds
24_LightGBM f1 0.869025 trained in 129.62 seconds




15_Xgboost f1 0.825871 trained in 25.63 seconds
33_CatBoost f1 0.864629 trained in 14.26 seconds
42_RandomForest f1 0.811275 trained in 17.55 seconds
51_ExtraTrees f1 0.756355 trained in 16.69 seconds
60_NeuralNetwork f1 0.848787 trained in 12.87 seconds




69_NearestNeighbors f1 0.889167 trained in 8.35 seconds
25_LightGBM f1 0.872908 trained in 17.38 seconds




16_Xgboost f1 0.636533 trained in 21.69 seconds
34_CatBoost f1 0.868699 trained in 14.46 seconds
43_RandomForest f1 0.742751 trained in 16.43 seconds
52_ExtraTrees f1 0.701995 trained in 16.54 seconds
61_NeuralNetwork f1 0.85254 trained in 13.21 seconds




70_NearestNeighbors f1 0.882611 trained in 9.45 seconds
26_LightGBM f1 0.877105 trained in 20.43 seconds




17_Xgboost f1 0.853234 trained in 25.53 seconds
35_CatBoost f1 0.88 trained in 16.29 seconds
44_RandomForest f1 0.835301 trained in 21.94 seconds
53_ExtraTrees f1 0.819083 trained in 21.53 seconds
62_NeuralNetwork f1 0.82045 trained in 16.31 seconds




71_NearestNeighbors f1 0.889167 trained in 10.31 seconds
27_LightGBM f1 0.86935 trained in 105.57 seconds




18_Xgboost f1 0.77707 trained in 23.57 seconds
36_CatBoost f1 0.882353 trained in 16.41 seconds
45_RandomForest f1 0.831169 trained in 17.49 seconds
54_ExtraTrees f1 0.836634 trained in 16.54 seconds
63_NeuralNetwork f1 0.857498 trained in 12.82 seconds




72_NearestNeighbors f1 0.889167 trained in 9.55 seconds
28_LightGBM f1 0.872976 trained in 19.6 seconds




19_Xgboost f1 0.861789 trained in 22.28 seconds
37_CatBoost f1 0.887647 trained in 21.86 seconds
46_RandomForest f1 0.743985 trained in 19.37 seconds
55_ExtraTrees f1 0.70098 trained in 18.5 seconds
64_NeuralNetwork f1 0.804462 trained in 15.07 seconds




29_LightGBM f1 0.881587 trained in 20.16 seconds




20_Xgboost f1 0.833539 trained in 29.1 seconds
38_CatBoost f1 0.873691 trained in 15.69 seconds
47_RandomForest f1 0.805337 trained in 17.43 seconds
56_ExtraTrees f1 0.738716 trained in 18.59 seconds
65_NeuralNetwork f1 0.835658 trained in 14.19 seconds




* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: feature_11_diff_feature_19
Add Golden Feature: feature_6_diff_feature_7
Add Golden Feature: feature_9_diff_feature_11
Add Golden Feature: feature_7_sum_feature_2
Add Golden Feature: feature_7_diff_feature_14
Add Golden Feature: feature_7_diff_feature_11
Add Golden Feature: feature_15_sum_feature_11
Add Golden Feature: feature_15_multiply_feature_11
Add Golden Feature: feature_20_sum_feature_3
Add Golden Feature: feature_8_ratio_feature_16
Created 10 Golden Features in 18.59 seconds.
67_NearestNeighbors_GoldenFeatures f1 0.883721 trained in 29.76 seconds
69_NearestNeighbors_GoldenFeatures f1 0.883721 trained in 10.99 seconds
68_NearestNeighbors_GoldenFeatures f1 0.883721 trained in 11.08 seconds
* Step kmeans_features will try to check up to 3 models




67_NearestNeighbors_KMeansFeatures f1 0.866667 trained in 11.79 seconds




69_NearestNeighbors_KMeansFeatures f1 0.871571 trained in 11.74 seconds




68_NearestNeighbors_KMeansFeatures f1 0.861268 trained in 11.98 seconds
* Step insert_random_feature will try to check up to 1 model
'KNeighborsAlgorithm' object has no attribute 'classes_'
Problem during computing permutation importance. Skipping ...
'KNeighborsAlgorithm' object has no attribute 'classes_'
Problem during computing permutation importance. Skipping ...
'KNeighborsAlgorithm' object has no attribute 'classes_'
Problem during computing permutation importance. Skipping ...
'KNeighborsAlgorithm' object has no attribute 'classes_'
Problem during computing permutation importance. Skipping ...
'KNeighborsAlgorithm' object has no attribute 'classes_'
Problem during computing permutation importance. Skipping ...
69_NearestNeighbors_RandomFeature f1 0.884013 trained in 11.48 seconds
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 39 models
73_NearestNeighbors f1 0.888889 trained in 11.46 seconds
74_NearestNeighbors f1 0.



87_Xgboost f1 0.871028 trained in 29.28 seconds




88_Xgboost f1 0.871189 trained in 24.59 seconds
89_NeuralNetwork f1 0.836634 trained in 16.22 seconds




90_NeuralNetwork f1 0.863118 trained in 17.03 seconds




91_NeuralNetwork f1 0.853613 trained in 16.77 seconds




92_NeuralNetwork f1 0.866911 trained in 16.93 seconds




93_Xgboost f1 0.851436 trained in 27.3 seconds




94_Xgboost f1 0.842759 trained in 25.42 seconds




95_Xgboost f1 0.842368 trained in 26.44 seconds




96_Xgboost f1 0.846058 trained in 25.18 seconds
97_NeuralNetwork f1 0.837736 trained in 17.82 seconds




98_NeuralNetwork f1 0.848297 trained in 17.65 seconds




99_ExtraTrees f1 0.835933 trained in 22.09 seconds
100_ExtraTrees f1 0.844774 trained in 20.9 seconds
101_RandomForest f1 0.840472 trained in 27.22 seconds
102_RandomForest f1 0.839752 trained in 26.24 seconds
103_ExtraTrees f1 0.832918 trained in 21.88 seconds
104_ExtraTrees f1 0.843223 trained in 23.82 seconds
105_RandomForest f1 0.834363 trained in 26.08 seconds
106_RandomForest f1 0.829177 trained in 26.65 seconds
107_RandomForest f1 0.832306 trained in 25.39 seconds
108_RandomForest f1 0.840273 trained in 31.63 seconds
109_ExtraTrees f1 0.822055 trained in 23.77 seconds
110_ExtraTrees f1 0.825416 trained in 22.56 seconds
* Step hill_climbing_2 will try to check up to 26 models
111_CatBoost f1 0.88543 trained in 38.98 seconds
112_CatBoost f1 0.887781 trained in 34.24 seconds
113_CatBoost f1 0.885307 trained in 27.62 seconds
114_CatBoost f1 0.888195 trained in 25.4 seconds
115_CatBoost f1 0.871128 trained in 19.22 seconds
116_CatBoost f1 0.888331 trained in 20.86 seconds
117_LightGB



88_Xgboost_Stacked f1 0.900062 trained in 28.4 seconds
92_NeuralNetwork_Stacked f1 0.889024 trained in 22.8 seconds




100_ExtraTrees_Stacked f1 0.907049 trained in 22.92 seconds
39_RandomForest_Stacked f1 0.908411 trained in 41.62 seconds
116_CatBoost_Stacked f1 0.906933 trained in 25.95 seconds
29_LightGBM_Stacked f1 0.901863 trained in 24.88 seconds




87_Xgboost_Stacked f1 0.901863 trained in 27.48 seconds
90_NeuralNetwork_Stacked f1 0.894147 trained in 21.82 seconds




48_ExtraTrees_Stacked f1 0.904229 trained in 25.58 seconds
101_RandomForest_Stacked f1 0.907731 trained in 39.68 seconds
114_CatBoost_Stacked f1 0.912609 trained in 42.89 seconds
117_LightGBM_Stacked f1 0.90491 trained in 26.76 seconds




19_Xgboost_Stacked f1 0.902545 trained in 27.07 seconds
63_NeuralNetwork_Stacked f1 0.893097 trained in 21.3 seconds




104_ExtraTrees_Stacked f1 0.907165 trained in 24.59 seconds
108_RandomForest_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 5.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
112_CatBoost_Stacked not trained. Stop training after the first fold. Time needed to train on the first fold 15.0 seconds. The time estimate for training on all folds is larger than total_time_limit.
* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked f1 0.914321 trained in 88.73 seconds
AutoML fit time: 3712.39 seconds
AutoML best model: Ensemble_Stacked


In [69]:
print(valid_x.shape, valid_y.shape)
print(train_x.shape, train_y.shape)
predictions = automl.predict_proba(valid_x.copy().reshape(-1))

score = balanced_accuracy_score(valid_y, predictions)

print(f"Model Balanced Accuracy: {score}")

(400, 20) (400, 1)
(1600, 20) (1600,)


CatBoostError: C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/libs/data/model_dataset_compatibility.cpp:81: At position 20 should be feature with name Ensemble_prediction_0_for_-1.0_1_for_1.0 (found Ensemble_prediction).

In [None]:
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "mljar_model_proba.txt")
dump_proba(automl, test_x, output_path)