In [77]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb

In [78]:
# Load the catalogue CSV; the path is relative to the notebook's working directory.
catalogue_csv = "../../../Data/Best&Heckman/BestHeckman+SDSS+wise+LOFAR.csv"
Best_Heckman_data = pd.read_csv(catalogue_csv)

In [79]:
# Remove every row whose label is exactly 'Radio-loud AGN'
# (presumably sources without a finer sub-class — TODO confirm).
# Rows with a missing Classification (NaN) are NOT removed by this comparison.
is_radio_loud_agn = Best_Heckman_data['Classification'].eq('Radio-loud AGN')
Best_Heckman_data = Best_Heckman_data[~is_radio_loud_agn]

In [89]:
# Target: the class label column. Features: every other column of the frame.
# NOTE(review): nothing besides the target is excluded, so any identifier or
# label-derived columns present in the CSV are used as features — confirm
# this is intended.
target_column = 'Classification'
y = Best_Heckman_data[target_column]
X = Best_Heckman_data.drop(columns=[target_column])

In [90]:
# Number of samples per class, most frequent first (shows the class imbalance).
class_counts = pd.Series(y).value_counts()
class_counts

jet-mode radio AGN/low-excitation radio galaxy          9771
star-forming galaxy                                     2913
quasar-like radio AGN / high-excitation radio galaxy     478
Name: Classification, dtype: int64

In [81]:
# Encode the string class labels as integers 0..n_classes-1.
# The encoder is fitted on the DataFrame column rather than on `y` itself so
# that the cell is idempotent: re-running it no longer re-encodes the already
# encoded integers (which would turn the class names into '0', '1', '2').
le = LabelEncoder()
y = le.fit_transform(Best_Heckman_data['Classification'])
# Class names in encoder order: labels[i] is the name of encoded class i.
# Taken from the fitted encoder so names and integer codes cannot drift apart.
labels = le.classes_.astype(str)

In [82]:
# Hold out 20% of the samples for the final evaluation. The split is
# stratified on the encoded labels and seeded so it is repeatable.
split_options = dict(train_size=0.8, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [83]:
# FLAML AutoML searcher, created without any settings here; task, metric and
# time budget are supplied in the later call to automl.fit().
from flaml import AutoML
automl = AutoML()

In [85]:
import warnings

# Run the FLAML model/hyper-parameter search on the training split only; the
# held-out test split is not passed to the search.
# NOTE(review): the search optimises plain accuracy although the classes are
# heavily imbalanced (see the class counts above the label-encoding cell) —
# consider a macro-averaged metric if minority-class recall matters.
with warnings.catch_warnings():
    # Silence every warning raised during the search
    # (presumably to keep the cell output readable).
    warnings.simplefilter("ignore")

    automl.fit(
        X_train,
        y_train,
        task="classification",
        metric='accuracy',
        eval_method='cv',
        time_budget=60,  # wall-clock search budget, in seconds
        n_jobs=8,
        # Seed the hyper-parameter search, matching the seed used for the
        # train/test split. The search is still bounded by wall-clock time,
        # so the number of trials completed can vary between runs.
        seed=42,
    )

[flaml.automl: 09-19 13:06:10] {2390} INFO - task = classification
[flaml.automl: 09-19 13:06:10] {2392} INFO - Data split method: stratified
[flaml.automl: 09-19 13:06:10] {2396} INFO - Evaluation method: cv
[flaml.automl: 09-19 13:06:10] {2465} INFO - Minimizing error metric: 1-accuracy
[flaml.automl: 09-19 13:06:10] {2605} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl: 09-19 13:06:10] {2897} INFO - iteration 0, current learner lgbm
[flaml.automl: 09-19 13:06:10] {3025} INFO - Estimated sufficient time budget=2730s. Estimated necessary time budget=67s.
[flaml.automl: 09-19 13:06:10] {3072} INFO -  at 0.4s,	estimator lgbm's best error=0.2577,	best estimator lgbm's best error=0.2577
[flaml.automl: 09-19 13:06:10] {2897} INFO - iteration 1, current learner lgbm
[flaml.automl: 09-19 13:06:11] {3072} INFO -  at 0.6s,	estimator lgbm's best error=0.2577,	best estimator lgbm's best error=0.2577
[flaml.aut

[flaml.automl: 09-19 13:06:27] {2897} INFO - iteration 36, current learner catboost
[flaml.automl: 09-19 13:06:28] {3072} INFO -  at 17.7s,	estimator catboost's best error=0.1336,	best estimator rf's best error=0.0792
[flaml.automl: 09-19 13:06:28] {2897} INFO - iteration 37, current learner xgboost
[flaml.automl: 09-19 13:06:28] {3072} INFO -  at 18.0s,	estimator xgboost's best error=0.1204,	best estimator rf's best error=0.0792
[flaml.automl: 09-19 13:06:28] {2897} INFO - iteration 38, current learner catboost
[flaml.automl: 09-19 13:06:29] {3072} INFO -  at 18.8s,	estimator catboost's best error=0.1336,	best estimator rf's best error=0.0792
[flaml.automl: 09-19 13:06:29] {2897} INFO - iteration 39, current learner catboost
[flaml.automl: 09-19 13:06:30] {3072} INFO -  at 19.6s,	estimator catboost's best error=0.1336,	best estimator rf's best error=0.0792
[flaml.automl: 09-19 13:06:30] {2897} INFO - iteration 40, current learner rf
[flaml.automl: 09-19 13:06:31] {3072} INFO -  at 21.

[flaml.automl: 09-19 13:06:56] {2897} INFO - iteration 74, current learner xgboost
[flaml.automl: 09-19 13:06:56] {3072} INFO -  at 46.0s,	estimator xgboost's best error=0.1184,	best estimator rf's best error=0.0778
[flaml.automl: 09-19 13:06:56] {2897} INFO - iteration 75, current learner extra_tree
[flaml.automl: 09-19 13:06:56] {3072} INFO -  at 46.3s,	estimator extra_tree's best error=0.0848,	best estimator rf's best error=0.0778
[flaml.automl: 09-19 13:06:56] {2897} INFO - iteration 76, current learner rf
[flaml.automl: 09-19 13:06:59] {3072} INFO -  at 49.3s,	estimator rf's best error=0.0778,	best estimator rf's best error=0.0778
[flaml.automl: 09-19 13:06:59] {2897} INFO - iteration 77, current learner rf
[flaml.automl: 09-19 13:07:00] {3072} INFO -  at 50.3s,	estimator rf's best error=0.0778,	best estimator rf's best error=0.0778
[flaml.automl: 09-19 13:07:00] {2897} INFO - iteration 78, current learner extra_tree
[flaml.automl: 09-19 13:07:01] {3072} INFO -  at 50.6s,	estimato

In [86]:
# Score the best model found by the search on the held-out test split and
# print per-class precision / recall / F1 with the original class names.
y_pred = automl.predict(X_test)
report_text = classification_report(
    y_test,
    y_pred,
    target_names=labels,
    digits=4,
)
print(report_text)

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.9402    0.9724    0.9560      1955
quasar-like radio AGN / high-excitation radio galaxy     0.5556    0.1053    0.1770        95
                                 star-forming galaxy     0.8904    0.9057    0.8980       583

                                            accuracy                         0.9263      2633
                                           macro avg     0.7954    0.6611    0.6770      2633
                                        weighted avg     0.9153    0.9263    0.9150      2633

