In [1]:
import orchest
## EVALML
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings("ignore")




In [2]:
data = orchest.get_inputs()  # data = [(df_data, df_target)]
bcell, covid, sars, bcell_sars = data["data"]

In [3]:
X = bcell_sars.drop(
    ["target", "parent_protein_id", "protein_seq", "peptide_seq"], axis=1
)
y = bcell_sars["target"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

### Advantages
- For classification and regression tasks, find quality models with lower computational resources.
- Users can choose their desired customizability: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), full customization (arbitrary training and evaluation code).
- Allow human guidance in hyperparameter tuning to respect prior on certain subspaces but also able to explore other subspaces. Read more about the hyperparameter optimization methods in FLAML here. They can be used beyond the AutoML context. And they can be used in distributed HPO frameworks such as ray tune or nni.
- Support online AutoML: automatic hyperparameter tuning for online learning algorithms. Read more about the online AutoML method in FLAML [here](https://github.com/microsoft/FLAML/tree/main/flaml/onlineml).

In [5]:
%%time
# Initialize an AutoML instance
automl = AutoML()

# Specify automl goal and constraint
automl_settings = {
    "time_budget": 300,  # in seconds
    "metric": "roc_auc",
    "task": "classification",
}

CPU times: user 15 µs, sys: 11 µs, total: 26 µs
Wall time: 34.8 µs


In [6]:
# Train with labeled input data
automl.fit(
    X_train=X_train, 
    y_train=y_train,
    **automl_settings
)

[flaml.automl: 08-19 14:16:44] {1121} INFO - Evaluation method: cv


[flaml.automl: 08-19 14:16:44] {618} INFO - Using StratifiedKFold


[flaml.automl: 08-19 14:16:44] {1142} INFO - Minimizing error metric: 1-roc_auc


[flaml.automl: 08-19 14:16:44] {1163} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']


[flaml.automl: 08-19 14:16:44] {1253} INFO - iteration 0, current learner lgbm


[flaml.automl: 08-19 14:17:47] {1411} INFO -  at 64.0s,	best lgbm's error=0.2890,	best lgbm's error=0.2890


[flaml.automl: 08-19 14:17:47] {1253} INFO - iteration 1, current learner lgbm


[flaml.automl: 08-19 14:18:37] {1411} INFO -  at 114.7s,	best lgbm's error=0.2890,	best lgbm's error=0.2890


[flaml.automl: 08-19 14:18:38] {1253} INFO - iteration 2, current learner lgbm


[flaml.automl: 08-19 14:19:27] {1411} INFO -  at 164.3s,	best lgbm's error=0.2767,	best lgbm's error=0.2767


[flaml.automl: 08-19 14:19:27] {1253} INFO - iteration 3, current learner lgbm


[flaml.automl: 08-19 14:20:11] {1411} INFO -  at 207.8s,	best lgbm's error=0.2598,	best lgbm's error=0.2598


[flaml.automl: 08-19 14:20:11] {1253} INFO - iteration 4, current learner lgbm


[flaml.automl: 08-19 14:21:08] {1411} INFO -  at 265.1s,	best lgbm's error=0.2379,	best lgbm's error=0.2379


[flaml.automl: 08-19 14:21:08] {1253} INFO - iteration 5, current learner xgboost


[flaml.automl: 08-19 14:21:31] {1411} INFO -  at 288.0s,	best xgboost's error=0.2943,	best lgbm's error=0.2379


[flaml.automl: 08-19 14:21:31] {1253} INFO - iteration 6, current learner extra_tree


[flaml.automl: 08-19 14:21:34] {1411} INFO -  at 290.8s,	best extra_tree's error=0.3563,	best lgbm's error=0.2379


[flaml.automl: 08-19 14:21:34] {1253} INFO - iteration 7, current learner extra_tree


[flaml.automl: 08-19 14:21:37] {1411} INFO -  at 294.3s,	best extra_tree's error=0.2886,	best lgbm's error=0.2379


[flaml.automl: 08-19 14:21:37] {1253} INFO - iteration 8, current learner extra_tree


[flaml.automl: 08-19 14:21:41] {1411} INFO -  at 297.9s,	best extra_tree's error=0.2886,	best lgbm's error=0.2379


[flaml.automl: 08-19 14:21:41] {1253} INFO - iteration 9, current learner rf


[flaml.automl: 08-19 14:21:43] {1411} INFO -  at 300.2s,	best rf's error=0.3240,	best lgbm's error=0.2379


[flaml.automl: 08-19 14:21:43] {1461} INFO - selected model: LGBMClassifier(learning_rate=0.25775724472262795, max_bin=256,
               min_child_samples=12, n_estimators=4, num_leaves=7,
               objective='binary', reg_alpha=0.0013933617380144255,
               reg_lambda=0.18096917948292968, subsample=0.9266743941610592,
               verbose=-1)


[flaml.automl: 08-19 14:21:43] {1184} INFO - fit succeeded


[flaml.automl: 08-19 14:21:43] {1185} INFO - Time taken to find the best model: 265.0558395385742




In [7]:
# Train with labeled input data
automl.best_estimator

'lgbm'

In [8]:
print("AUC Score:",roc_auc_score(y_test,automl.predict(X_test)))
orchest.output(automl,name='flaml')

AUC Score: 0.5351840526606486
