In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
import numpy as np 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import pickle
from sklearn.metrics import balanced_accuracy_score,f1_score
from supervised.automl import AutoML
from sklearn.feature_selection import RFECV
# from autogluon.tabular import  TabularPredictor ##with Python 3.9.18

### Wczytanie danych

In [2]:
X = pd.read_csv('data/artificial_train.data', delim_whitespace=True, header=None)
y = pd.read_csv("data/artificial_train.labels", header=None)
Xtest = pd.read_csv('data/artificial_test.data', delim_whitespace=True, header=None)

In [3]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496
1,483,458,460,487,587,475,526,479,485,469,...,463,478,487,338,513,486,483,492,510,517
2,487,542,499,468,448,471,442,478,480,477,...,487,481,492,650,506,501,480,489,499,498
3,480,491,510,485,495,472,417,474,502,476,...,491,480,474,572,454,469,475,482,494,461
4,484,502,528,489,466,481,402,478,487,468,...,488,479,452,435,486,508,481,504,495,511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,490,505,503,474,463,461,519,476,518,467,...,467,479,449,588,499,506,475,463,507,501
1996,480,475,476,480,495,482,515,479,480,484,...,464,474,473,424,454,570,476,493,465,485
1997,480,517,631,470,485,474,535,476,493,466,...,501,483,479,687,488,488,483,500,523,481
1998,484,481,505,478,542,477,518,477,510,472,...,487,483,526,750,486,529,484,473,527,485


In [4]:
y

Unnamed: 0,0
0,-1
1,-1
2,-1
3,1
4,1
...,...
1995,1
1996,-1
1997,-1
1998,1


In [3]:
y.value_counts()

-1    1000
 1    1000
Name: count, dtype: int64

Policzności etykiet stwierdzamy, że zbiór treningowy jest zbalansowany

In [5]:
Xtest

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,483,454,513,495,523,469,453,477,506,479,...,455,480,543,259,413,520,485,498,523,510
1,485,508,493,487,478,472,504,476,479,475,...,486,480,535,534,514,452,484,495,548,477
2,483,521,507,475,493,486,421,475,496,483,...,491,476,498,495,508,528,486,465,508,503
3,474,504,576,480,553,483,524,478,483,483,...,521,475,470,463,509,525,479,467,552,517
4,495,474,523,479,495,488,485,476,497,478,...,510,471,522,343,509,520,475,493,506,491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,493,458,503,478,517,479,472,478,444,477,...,469,475,485,443,517,486,474,489,506,506
596,481,484,481,490,449,481,467,478,469,483,...,506,485,508,599,498,527,481,490,455,451
597,485,485,530,480,444,487,462,475,509,494,...,442,474,502,368,453,482,478,481,484,517
598,477,469,528,485,483,469,482,477,494,476,...,473,476,453,638,471,538,470,490,613,492


### Zbiór walidacyjny

## Modelowanie ręczne

In [6]:
def train(iter, clf, params, Xtrain, ytrain,cv, selector=False):
    random_search = RandomizedSearchCV(
    clf,
    param_distributions=params,
    n_iter=iter,
    scoring='balanced_accuracy',
    n_jobs=-1,
    cv=5,
    verbose=2,
    random_state=42
    )
    if selector:
        selector = RFECV(estimator=clf, scoring='balanced_accuracy',verbose=1, min_features_to_select =400,importance_getter="feature_importances_")
        Xtrain = selector.fit_transform(Xtrain, ytrain)

    random_search.fit(Xtrain, ytrain)
    best_clf = random_search.best_estimator_

    # Walidacja krzyżowa
    scores = cross_val_score(best_clf, Xtrain, ytrain, cv=cv, scoring='balanced_accuracy')
    
    # ypred = best_clf.predict(Xtest)
    # balanced_accuracy = balanced_accuracy_score(ytest, ypred)
    return (best_clf, scores, selector) if selector else (best_clf, scores)


In [3]:
# Strategia podziału dla walidacji krzyżowej, wspólna dla każdego modelu
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#### Decision Tree

In [41]:
clf = DecisionTreeClassifier()

In [42]:
params_DT = {
    'max_depth': np.arange(1, 31), 
    'max_features': [None, 'log2', 'sqrt'],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': np.arange(1, 61),  
    'min_samples_leaf': np.arange(1, 61),
}

Trening z walidacją krzyżową

In [43]:
best_clf, scores= train(1000, clf, params_DT, X,y,cv)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


Trening z rekukrencyjnym doborem cech i walidacją krzyżową

In [44]:
best_clf2, scores2, selector = train(1000, clf, params_DT, X,y,cv,selector=True)

Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
F

In [45]:
np.mean(scores)

0.7805000000000002

In [46]:
np.mean(scores2)

0.7875000000000001

Wybór klasyfikatora

In [47]:
if np.mean(scores2)>np.mean(scores):
    with open('models/DT.pkl','wb') as f:
        pickle.dump(best_clf2,f)
    with open('selectors/DT.pkl','wb') as f:
        pickle.dump(selector,f)
    best_clf = best_clf2
    X_test = selector.transform(Xtest)
    print("Feature selection improved the scores")
else:
    with open('models/DT.pkl','wb') as f:
        pickle.dump(best_clf,f)
    X_test = Xtest
    print("All features used")

Feature selection improved the scores


#### Random Forest

In [33]:
clf = RandomForestClassifier()

In [34]:
params_RF = {
    'n_estimators': np.arange(1, 2001),
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3, 11),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'bootstrap': [True, False],
    'max_samples': np.arange(0, 1.1, 0.1),   
}

Trening z walidacją krzyżową

In [35]:
best_clf, scores = train(200, clf, params_RF, X,y,cv)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


Trening z rekukrencyjnym doborem cech i walidacją krzyżową

In [36]:
best_clf2, scores2, selector = train(200, clf, params_RF, X,y,cv,selector=True)

Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
F

In [37]:
np.mean(scores)

0.6970000000000001

In [38]:
np.mean(scores2)

0.6845

Wybór klasyfikatora

In [39]:
if np.mean(scores2)>np.mean(scores):
    with open('models/RF.pkl','wb') as f:
        pickle.dump(best_clf2,f)
    with open('selectors/RF.pkl','wb') as f:
        pickle.dump(selector,f)
    best_clf = best_clf2
    X_test = selector.transform(Xtest)
    print("Feature selection improved the scores")
else:
    with open('models/RF.pkl','wb') as f:
        pickle.dump(best_clf,f)
    X_test = Xtest
    print("All features used")

All features used


### XGBoost

In [8]:
clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss')

In [9]:
params_XGB = {
    'n_estimators': np.arange(50, 300, 50),
    'max_depth': np.arange(3, 10),
    'learning_rate': np.linspace(0.01, 0.2, 10),
    'subsample': np.linspace(0.6, 1.0, 5),
    'colsample_bytree': np.linspace(0.6, 1.0, 5),
    'gamma': [0, 1, 5]
}

Trening z walidacją krzyżową

In [10]:
best_clf, scores = train(200, clf, params_XGB, X,y.replace(-1,0),cv)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


Trening z rekukrencyjnym doborem cech i walidacją krzyżową

In [11]:
best_clf2, scores2, selector = train(200, clf, params_XGB, X, y.replace(-1,0)  ,cv,selector=True)

Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
F

In [18]:
np.mean(scores)

0.8280000000000001

In [19]:
np.mean(scores2)

0.8330000000000002

Wybór klasyfikatora

In [20]:
if np.mean(scores2)>np.mean(scores):
    with open('models/XGB.pkl','wb') as f:
        pickle.dump(best_clf2,f)
    with open('selectors/XGB.pkl','wb') as f:
        pickle.dump(selector,f)
    X_test = selector.transform(Xtest)
    print("Feature selection improved the scores")
else:
    with open('models/XGB.pkl','wb') as f:
        pickle.dump(best_clf,f)
    X_test = selector.transform(Xtest)
    print("All features used")

Feature selection improved the scores


Predykcja prawodpodobieństw dla najlepszego klasyfikatora utworzonego ręcznie (bez doboru cech)

In [21]:
prob = best_clf.predict_proba(X_test)
np.savetxt('probs/313401_313392_artifical_model_prediction.txt', prob[:, 1])

In [4]:
with open('models/XGB.pkl', 'rb') as f:
    best_clf = pickle.load(f)

In [5]:
meanf1 = np.mean(cross_val_score(best_clf, X, y.replace(-1,0), cv=cv, scoring='f1'))

In [6]:
meanf1

0.8293715510767171

## Podejście AutoML 

Aby uniknąć problemów przy zautomatyzowanym tworzeniu klasyfikatora binarnego zamieniamy etykiety -1 na 0

In [7]:
y =y.replace(-1,0)

In [8]:
y = y.rename(columns={0: 'class'})

In [9]:
from sklearn.model_selection import train_test_split

Xtrain, X_test, ytrain, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

full_data = pd.concat([X, y], axis=1)
train_data = pd.concat([Xtrain, ytrain], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

### MLJAR

W celu zorientowania się w czasie treningu modelu i jakich conajmniej wyników można się spodziewać użyto `mode='Explain'`.

Explain

In [30]:
automl = AutoML(
    ml_task="binary_classification",
    mode="Explain",
    eval_metric = "f1", 
    random_state=42
)
automl.fit(X, y)

AutoML directory: AutoML_3
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
1_Baseline f1 0.0 trained in 0.57 seconds
2_DecisionTree f1 0.742616 trained in 41.58 seconds
3_Linear f1 0.513347 trained in 19.08 seconds
* Step default_algorithms will try to check up to 3 models
4_Default_Xgboost f1 0.807229 trained in 38.0 seconds
5_Default_NeuralNetwork f1 0.564202 trained in 12.22 seconds
6_Default_RandomForest f1 0.792969 trained in 33.75 seconds
* Step ensemble will try to check up to 1 model
Ensemble f1 0.812877 trained in 1.84 seconds
AutoML fit time: 158.07 seconds
AutoML best model: Ensemble


Compete

In [13]:
automl = AutoML(
    ml_task="binary_classification",
    mode="Compete",
    eval_metric = "f1", 
    random_state=42
)
automl.fit(X_test, y_test)

AutoML directory: AutoML_2
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.628571 trained in 2.15 seconds
Adjust validation. Remove: 1_DecisionTree
*** Disable stacking for small dataset (nrows < 500)
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree f1 0.524934 trained in 18.03 seconds
2_DecisionTree f1 0.447368 trained in 16.5 seconds
3_DecisionTree 

In [14]:
y_pred = automl.predict(X_test)

In [15]:
print(f"F1: {f1_score(y_test, y_pred)}")
print(f"BA: {balanced_accuracy_score(y_test,y_pred)}")

F1: 0.9198966408268734
BA: 0.9222199994998874


Modified Compete

In [10]:
automl2 = AutoML(
    algorithms=["CatBoost", "Xgboost", "LightGBM"],
    ml_task="binary_classification",
    start_random_models=15, 
    hill_climbing_steps= 3,
    top_models_to_improve =4,
    eval_metric = "f1",
    mode="Compete",
    random_state=42
)
automl2.fit(X_test, y_test)

AutoML directory: AutoML_1
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'LightGBM']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'hill_climbing_3', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.628571 trained in 1.07 seconds
Adjust validation. Remove: 1_DecisionTree
*** Disable stacking for small dataset (nrows < 500)
Validation strategy: 10-fold CV Shuffle,Stratify
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_LightGBM f1 0.647668 trained in 11.35 seconds
2_Default_Xgboost f1 0.535809 trained in 18.1 seconds
3_Default_CatBoost f1 0.6

In [20]:
y_pred = automl2.predict(X_test)

In [21]:
print(f"F1: {f1_score(y_test, y_pred)}")
print(f"BA: {balanced_accuracy_score(y_test,y_pred)}")

F1: 0.9924433249370278
BA: 0.9926108374384237


Wytrenujemy jeszcze modele na pełnych danych.

Compete

In [None]:
automl = AutoML(
    ml_task="binary_classification",
    mode="Compete",
    eval_metric = "f1", 
    random_state=42
)
automl.fit(X, y)

AutoML directory: AutoML_2
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.736842 trained in 4.19 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree f1 0.748707 trained in 6.66 seconds
2_DecisionTree f1 0.610116 trained in 4.99 seconds
3_DecisionTree f1 0.609795 trained in 4.76 seconds
4_Linear f1 0.54024

Modified Compete

In [None]:
automl2 = AutoML(
    algorithms=["CatBoost", "Xgboost", "LightGBM"],
    ml_task="binary_classification",
    start_random_models=15, 
    hill_climbing_steps= 3,
    top_models_to_improve =4,
    eval_metric = "f1",
    mode="Compete",
    random_state=42
)
automl2.fit(X, y)

AutoML directory: AutoML_2
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['CatBoost', 'Xgboost', 'LightGBM']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'hill_climbing_3', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.736842 trained in 4.83 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle,Stratify
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_LightGBM f1 0.83045 trained in 29.23 seconds
2_Default_Xgboost f1 0.817635 trained in 31.16 seconds
3_Default_CatBoost f1 0.858416 trained in 24.65 seconds
* Step not_so_random wi

In [4]:
proba =automl2.predict_proba(Xtest)

Predykcja prawodpodobieństw dla najlepszego klasyfikatora utworzonego automatycznie

In [5]:
import numpy as np
np.savetxt('probs/313401_313392_artifical_automl_prediction.txt', proba[:, 1])

### Autogluon

Porównanie modelu AutoGluona z modelem MLJar wymaga wspólnej metryki - f1. Docelową metryką, na której testowany będzie najlepszy model jest balanced accuracy dostępny przy trenowaniu modelu AutoGluon.

W celu zorientowania się w czasie treningu modelu i jakich conajmniej wyników można się spodziewać użyto `presets='medium_quality'`. Przyjmuje się, aby dla `presets='best_quality'`limit czasowy był co najmniej 16 razy dłuższym od czasu wykonania dla `presets='medium_quality'`.

Medium

In [23]:
startpredictor = TabularPredictor('class', eval_metric='balanced_accuracy').fit(train_data, time_limit=None, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240114_205615"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240114_205615"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
CPU Count:          16
Memory Avail:       4.79 GB / 13.87 GB (34.5%)
Disk Space Avail:   799.34 GB / 951.65 GB (84.0%)
Train Data Rows:    1600
Train Data Columns: 500
Label Column:       class
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Us

In [24]:
startpredictor.evaluate(test_data)

{'balanced_accuracy': 0.8098822235002876,
 'accuracy': 0.81,
 'mcc': 0.619888465000666,
 'roc_auc': 0.8891250531369559,
 'f1': 0.8061224489795918,
 'precision': 0.8102564102564103,
 'recall': 0.8020304568527918}

Zadanie zakończyło się w przeciągu 44 sekund. Ustalamy `time_limit = 60*60`

Best

In [25]:
predictor = TabularPredictor('class', eval_metric='balanced_accuracy').fit(train_data, time_limit=60*60, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240114_210105"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240114_210105/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 913 seconds for the sub-fit(s) during dynamic stacking.
Time left for full fit of AutoGluon: 2687 seconds.
Starting full fit now with num_stack_le

In [26]:
predictor.evaluate(test_data)

{'balanced_accuracy': 0.835262934160186,
 'accuracy': 0.835,
 'mcc': 0.6706600439375775,
 'roc_auc': 0.9059538396139131,
 'f1': 0.835820895522388,
 'precision': 0.8195121951219512,
 'recall': 0.8527918781725888}

Warto zauważyć, że otrzymane wyniki balanced_accuracy i f1 score zarówno przy `medium_quality` jak i `best_quality` są zbliżone. Wytrenujemy jeszcze model na pełnych danych.

Medium

In [27]:
startpredictor = TabularPredictor('class', eval_metric='balanced_accuracy').fit(full_data, time_limit=None, presets='medium_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240114_220432"
Presets specified: ['medium_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240114_220432"
AutoGluon Version:  1.0.0
Python Version:     3.9.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
CPU Count:          16
Memory Avail:       4.38 GB / 13.87 GB (31.6%)
Disk Space Avail:   798.57 GB / 951.65 GB (83.9%)
Train Data Rows:    2000
Train Data Columns: 500
Label Column:       class
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Us

Best

In [29]:
predictor = TabularPredictor('class', eval_metric='balanced_accuracy').fit(full_data, time_limit=32*60, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels\ag-20240114_220649"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 1920 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels\ag-20240114_220649/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 595 seconds for the sub-fit(s) during dynamic stacking.
Time left for full fit of AutoGluon: 1325 seconds.
Starting full fit now with num_stack_le