In [30]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgbm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.ensemble import StackingClassifier
from supervised.automl import AutoML
from autogluon.tabular import TabularDataset, TabularPredictor

Wczytanie zbioru

In [2]:
X = pd.read_csv('artificial_train.data', header = None, delim_whitespace=True)
X_test = pd.read_csv('artificial_test.data', header = None, delim_whitespace=True)
y = pd.read_csv('artificial_train.labels', header = None)
y = np.where(y == -1, 0, y)

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.9, random_state=1700)

# Modele ręczne

Zbiór zawiera bardzo dużo kolumn, z którego część prawdopodobnie jest nieistotna. Spróbujmy ocenić istotność każdej kolumny budując duży model i odczytując z niego feature importances. Bardzo dobrze nada się do tego algorytm random forest, który do tworzenia podziałów używa losowych podzbiorów predyktorów, więc przy odpowiednio dużym lesie każda zmienna powinna być odpowiednio wiele razy uwzględniania, co powinno skutkować poprawną oceną jej istotności.

In [3]:
#duży model RF do selekcji zmiennych
rf = RandomForestClassifier(n_estimators=10000, max_depth = 10, min_samples_split=5, n_jobs=8, random_state=2024)
rf.fit(X_train, y_train.reshape(-1))

In [15]:
imp = rf.feature_importances_

threshold = 0.004
imp_cols = np.where(imp > threshold)[0]
print(f"Wybrano {len(imp_cols)} kolumn")

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(imp)
plt.axhline(threshold, linestyle = "--", c = "red")
plt.title("Feature importance")

plt.subplot(1, 2, 2)
plt.plot(imp[imp<threshold])
plt.title("Feature importance - odrzucone zmienne")

plt.show()

Wybrano 19 kolumn




Po uwzględnieniu progu importance = 0.004 pozostało 19 zmiennych

In [16]:
X_train_imp = X_train[imp_cols]
X_val_imp = X_val[imp_cols]
X_test_imp = X_test[imp_cols]

Możemy teraz zbudować modele na okrojonym zbiorze i przeprowadzić lekką optymalizację najważniejszych hiperparametrów.

In [27]:
grid_knn = {
    "n_neighbors": np.arange(1, 20)
}

grid_xgb = {
    "n_estimators": [50, 100, 250, 500, 750, 1000, 1250, 1500, 2000, 3000],
    "max_depth": [5, 10, 15, 20]
}

grid_lgbm = {
    "n_estimators": [50, 100, 250, 500, 750, 1000, 1250, 1500, 2000, 3000],
    "max_depth": [5, 10, 15, 20],
    "extra_trees": [True, False]
}

grid_rf = {
    "n_estimators": [50, 100, 250, 500, 750, 1000, 1250, 1500, 2000, 3000],
    "max_depth": [5, 10, 15, 20],
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(balanced_accuracy_score)

clf_kn = KNeighborsClassifier()
clf_lgbm = lgbm.LGBMClassifier(learning_rate=0.05)
clf_xgb = xgb.XGBClassifier(learning_rate=0.025)
clf_rf = RandomForestClassifier(n_jobs = 4)

search_knn = GridSearchCV(
    clf_kn, param_grid=grid_knn, 
    scoring=scorer, cv=cv, n_jobs=4
)

search_rf = GridSearchCV(
    clf_rf, param_grid=grid_rf, 
    scoring=scorer, cv=cv, n_jobs=4
)

search_xgb = GridSearchCV(
    clf_xgb, param_grid=grid_xgb, 
    scoring=scorer, cv=cv, n_jobs=4
)

search_lgbm = GridSearchCV(
    clf_lgbm, param_grid=grid_lgbm,
    scoring=scorer, cv=cv, n_jobs=4
)

search_knn.fit(X_train_imp, y_train)
search_rf.fit(X_train_imp, y_train)
search_xgb.fit(X_train_imp, y_train)
search_lgbm.fit(X_train_imp, y_train)

clf_kn = KNeighborsClassifier(**search_knn.best_params_)
clf_lgbm = lgbm.LGBMClassifier(learning_rate=0.05, **search_lgbm.best_params_)
clf_xgb = xgb.XGBClassifier(learning_rate=0.025, **search_xgb.best_params_)
clf_rf = RandomForestClassifier(n_jobs = 4, **search_rf.best_params_)

In [8]:
results_1 = [
    balanced_accuracy_score(y_true = y_val,
                            y_pred = clf.fit(X_train_imp, y_train).predict(X_val_imp)) for clf in [clf_kn, clf_xgb, clf_lgbm, clf_rf]
]
print(results_1)

  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 904, number of negative: 896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3905
[LightGBM] [Info] Number of data points in the train set: 1800, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502222 -> initscore=0.008889
[LightGBM] [Info] Start training from score 0.008889


  return fit_method(estimator, *args, **kwargs)


[0.9154647435897436, 0.905849358974359, 0.9110576923076923, 0.9006410256410255]


Następnie jako finalny klasyfikator przyjmujemy stacking powyższych modeli.

In [15]:
clf_stack = StackingClassifier(
    estimators=[
        ('random_forest', clf_rf),
        ('lightgbm', clf_lgbm),
        ('xgboost', clf_xgb),
        ('kn', clf_kn)
    ],
    final_estimator=RandomForestClassifier(n_estimators=250, random_state=77),
    stack_method='auto',
    cv=5 
)

clf_stack.fit(X_train_imp, y_train)
y_pred_val = clf_stack.predict(X_val_imp)

accuracy = balanced_accuracy_score(y_val, y_pred_val)
print(f"Stacking Classifier validation Accuracy: {accuracy:.3f}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 904, number of negative: 896
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3905
[LightGBM] [Info] Number of data points in the train set: 1800, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502222 -> initscore=0.008889
[LightGBM] [Info] Start training from score 0.008889
[LightGBM] [Info] Number of positive: 724, number of negative: 716
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3817
[LightGBM] [Info] Number of data points in the train set: 1440, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502778 -> initscore=0.011111
[LightGBM] [Info] Start training from score 0.011111
[LightGBM] [Info] Number o

In [83]:
y_pred_test_manual = clf_stack.predict_proba(X_test_imp)[:,0]

np.savetxt("268906_artifical_model_prediction.txt", y_pred_test_manual, fmt='%.6f')



# AutoML

Teraz użyjemy kilku metod automatycznych.

MLJAR

In [None]:
clf_mljar = AutoML(mode = "Compete", eval_metric="accuracy", 
                validation_strategy = {
                        "validation_type": "kfold",
                        "k_folds": 5,
                        "shuffle": True,
                        "stratify": True,
                        "random_seed": 123
                    }
)
clf_mljar.fit(X_train, y_train.reshape(-1))

In [54]:
y_pred_val_mljar = clf_mljar.predict(X_val)
print(f"MLJAR balanced accuracy: {balanced_accuracy_score(y_val, y_pred_val_mljar):.4f}")

AutoGluon

In [None]:
data_ag_train = TabularDataset(np.concatenate([X_train, y_train], axis = 1))
data_ag_val = TabularDataset(np.concatenate([X_val, y_val], axis = 1))
data_ag_train = data_ag_train.rename(columns = {500: "y"})
data_ag_val = data_ag_val.rename(columns = {500: "y"})

clf_ag = TabularPredictor(label='y', eval_metric='balanced_accuracy')
clf_ag.fit(data_ag_train, presets = "good_quality")

In [9]:
y_pred_val_ag = clf_ag.predict(data_ag_val)
print(f"AutoGluon validation accuracy: {balanced_accuracy_score(y_val, y_pred_val_ag):.4f}")

AutoGluon validation accuracy: 0.8898


In [13]:
#predykcja ostateczna - wybieramy mljar, bo lepszy wynik
y_pred_test_auto = clf_mljar.predict_proba(X_test)[:,0]
#y_pred_test_auto = clf_ag.predict_proba(data_ag_val)[1]

np.savetxt("268906_artifical_automl_ag_prediction.txt", y_pred_test_auto, fmt='%.6f')