In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [2]:
# Read the classification input table from disk (path is relative to this notebook).
catalogue_path = "../../../Data/MIGHTEE/Classification/final_gaussian_radio.csv"
mightee_data = pd.read_csv(catalogue_path)

In [3]:
# Keep only the rows that carry a class label; unlabeled rows cannot be used
# for supervised training or evaluation.
mightee_data = mightee_data.dropna(subset=['Classification'])
n_rows = len(mightee_data)
print("Amount of rows:", n_rows)

Amount of rows: 4370


In [4]:
# Split the table into the feature frame (every column except the label)
# and the label column itself.
target_column = 'Classification'
X = mightee_data.drop(columns=[target_column])
y = mightee_data[target_column]

In [5]:
# Display the column names available before narrowing down to the model features.
X.columns

Index(['Unnamed: 0', 'S_INT14', 'Z_BEST', 'S_PEAK14', 'ch1_flux_corr',
       'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr', 'F_MIPS_24',
       'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500',
       'EBV', 'Ks_flux_corr', 'H_flux_corr', 'J_flux_corr', 'i_flux_corr',
       'r_flux_corr', 'u_flux_corr', 'z_flux_corr', 'y_flux_corr',
       'NUV_flux_corr', 'FUV_flux_corr', 'Total_flux', 'Peak_flux'],
      dtype='object')

In [6]:
# Explicit list of the columns used as model inputs, in the order they are fed
# to the classifiers. Columns not listed here (e.g. 'Unnamed: 0', 'EBV') are dropped.
feature_columns = [
    'Z_BEST',
    'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
    'F_MIPS_24',
    'F_PACS_100', 'F_PACS_160',
    'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500',
    'Ks_flux_corr', 'H_flux_corr', 'J_flux_corr',
    'i_flux_corr', 'r_flux_corr', 'u_flux_corr', 'z_flux_corr', 'y_flux_corr',
    'NUV_flux_corr', 'FUV_flux_corr',
    'Total_flux', 'Peak_flux',
    'S_INT14', 'S_PEAK14',
]
X = X[feature_columns]

In [7]:
# Encode the class names as consecutive integers.
# The encoder is fitted on the untouched DataFrame column rather than on `y`
# itself, so re-running this cell never tries to encode already-encoded
# integers (which would silently replace the class names with "0", "1", ...).
le = LabelEncoder()
y = le.fit_transform(mightee_data['Classification'])

# Take the display names from the fitted encoder: labels[i] is then guaranteed
# to be the class that was encoded as integer i, which is what
# classification_report(..., target_names=labels) expects.
labels = le.classes_.astype(str)

In [8]:
# Hold out 20% of the rows for testing; stratify on the encoded label so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)

In [9]:
# Display the feature columns carried by the training split.
X_train.columns

Index(['Z_BEST', 'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr',
       'ch4_flux_corr', 'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250',
       'F_SPIRE_350', 'F_SPIRE_500', 'Ks_flux_corr', 'H_flux_corr',
       'J_flux_corr', 'i_flux_corr', 'r_flux_corr', 'u_flux_corr',
       'z_flux_corr', 'y_flux_corr', 'NUV_flux_corr', 'FUV_flux_corr',
       'Total_flux', 'Peak_flux', 'S_INT14', 'S_PEAK14'],
      dtype='object')

In [249]:
from flaml import AutoML
# Create the FLAML AutoML object that the following cells fit and predict with.
automl = AutoML()

In [251]:
import warnings

# Learners the AutoML search is allowed to use. This list is what is actually
# passed to `fit` below, so changing it here changes the search.
estimators = ['extra_tree']
#estimators = ['xgb_limitdepth']

# Silence warnings for the duration of the search only; the filter is restored
# when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    automl.fit(
        X_train, y_train,
        task="classification",
        metric='accuracy',          # logged by FLAML as the error 1-accuracy
        estimator_list=estimators,
        time_budget=60,             # search budget in seconds
        n_jobs=8,
        eval_method='cv',           # cross-validate on the training split; the test split stays untouched
    )

[flaml.automl: 10-18 13:41:16] {2390} INFO - task = classification
[flaml.automl: 10-18 13:41:16] {2392} INFO - Data split method: stratified
[flaml.automl: 10-18 13:41:16] {2396} INFO - Evaluation method: cv
[flaml.automl: 10-18 13:41:16] {2465} INFO - Minimizing error metric: 1-accuracy
[flaml.automl: 10-18 13:41:16] {2605} INFO - List of ML learners in AutoML Run: ['extra_tree']
[flaml.automl: 10-18 13:41:16] {2897} INFO - iteration 0, current learner extra_tree
[flaml.automl: 10-18 13:41:16] {3025} INFO - Estimated sufficient time budget=961s. Estimated necessary time budget=1s.
[flaml.automl: 10-18 13:41:16] {3072} INFO -  at 0.1s,	estimator extra_tree's best error=0.4701,	best estimator extra_tree's best error=0.4701
[flaml.automl: 10-18 13:41:16] {2897} INFO - iteration 1, current learner extra_tree
[flaml.automl: 10-18 13:41:16] {3072} INFO -  at 0.3s,	estimator extra_tree's best error=0.3919,	best estimator extra_tree's best error=0.3919
[flaml.automl: 10-18 13:41:16] {2897} I

[flaml.automl: 10-18 13:41:24] {2897} INFO - iteration 34, current learner extra_tree
[flaml.automl: 10-18 13:41:24] {3072} INFO -  at 8.9s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:24] {2897} INFO - iteration 35, current learner extra_tree
[flaml.automl: 10-18 13:41:25] {3072} INFO -  at 9.9s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:25] {2897} INFO - iteration 36, current learner extra_tree
[flaml.automl: 10-18 13:41:26] {3072} INFO -  at 10.1s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:26] {2897} INFO - iteration 37, current learner extra_tree
[flaml.automl: 10-18 13:41:26] {3072} INFO -  at 10.4s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:26] {2897} INFO - iteration 38, current learner extra_t

[flaml.automl: 10-18 13:41:41] {2897} INFO - iteration 70, current learner extra_tree
[flaml.automl: 10-18 13:41:41] {3072} INFO -  at 25.8s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:41] {2897} INFO - iteration 71, current learner extra_tree
[flaml.automl: 10-18 13:41:42] {3072} INFO -  at 26.1s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:42] {2897} INFO - iteration 72, current learner extra_tree
[flaml.automl: 10-18 13:41:42] {3072} INFO -  at 26.6s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:42] {2897} INFO - iteration 73, current learner extra_tree
[flaml.automl: 10-18 13:41:43] {3072} INFO -  at 27.0s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:43] {2897} INFO - iteration 74, current learner extra

[flaml.automl: 10-18 13:41:54] {2897} INFO - iteration 106, current learner extra_tree
[flaml.automl: 10-18 13:41:54] {3072} INFO -  at 38.9s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:54] {2897} INFO - iteration 107, current learner extra_tree
[flaml.automl: 10-18 13:41:55] {3072} INFO -  at 39.3s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:55] {2897} INFO - iteration 108, current learner extra_tree
[flaml.automl: 10-18 13:41:55] {3072} INFO -  at 39.6s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:55] {2897} INFO - iteration 109, current learner extra_tree
[flaml.automl: 10-18 13:41:56] {3072} INFO -  at 40.1s,	estimator extra_tree's best error=0.1995,	best estimator extra_tree's best error=0.1995
[flaml.automl: 10-18 13:41:56] {2897} INFO - iteration 110, current learner 

[flaml.automl: 10-18 13:42:06] {2897} INFO - iteration 142, current learner extra_tree
[flaml.automl: 10-18 13:42:07] {3072} INFO -  at 51.3s,	estimator extra_tree's best error=0.1971,	best estimator extra_tree's best error=0.1971
[flaml.automl: 10-18 13:42:07] {2897} INFO - iteration 143, current learner extra_tree
[flaml.automl: 10-18 13:42:07] {3072} INFO -  at 51.6s,	estimator extra_tree's best error=0.1971,	best estimator extra_tree's best error=0.1971
[flaml.automl: 10-18 13:42:07] {2897} INFO - iteration 144, current learner extra_tree
[flaml.automl: 10-18 13:42:07] {3072} INFO -  at 51.9s,	estimator extra_tree's best error=0.1963,	best estimator extra_tree's best error=0.1963
[flaml.automl: 10-18 13:42:07] {2897} INFO - iteration 145, current learner extra_tree
[flaml.automl: 10-18 13:42:08] {3072} INFO -  at 52.2s,	estimator extra_tree's best error=0.1963,	best estimator extra_tree's best error=0.1963
[flaml.automl: 10-18 13:42:08] {2897} INFO - iteration 146, current learner 

In [252]:
# Predict on the held-out split with the fitted AutoML object and print
# per-class precision / recall / F1 to four decimals.
y_pred = automl.predict(X_test)
automl_report = classification_report(y_test, y_pred, target_names=labels, digits=4)
print(automl_report)

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.7059    0.4000    0.5106        30
quasar-like radio AGN / high-excitation radio galaxy     0.5882    0.4082    0.4819        49
                                     radio-quiet AGN     0.7500    0.7317    0.7407        82
                                 star-forming galaxy     0.8087    0.9673    0.8810       153

                                            accuracy                         0.7643       314
                                           macro avg     0.7132    0.6268    0.6536       314
                                        weighted avg     0.7492    0.7643    0.7467       314



In [238]:
from sklearn.ensemble import ExtraTreesClassifier

# Build the mean-imputed feature matrices in this cell so that it does not
# depend on a cell further down the notebook (Restart & Run All would
# otherwise fail with a NameError on X_train_imp).
# The imputer learns the column means from the training split only and
# applies those same means to the test split.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_imp = imp_mean.fit_transform(X_train)
X_test_imp = imp_mean.transform(X_test)

# random_state is fixed so that repeated runs give the same forest.
clf = ExtraTreesClassifier(
    max_leaf_nodes=1000,
    n_estimators=100,
    random_state=0,
).fit(X_train_imp, y_train)

In [239]:
# Evaluate the fitted classifier on the imputed test features.
y_pred = clf.predict(X_test_imp)
extra_trees_report = classification_report(y_test, y_pred, target_names=labels, digits=4)
print(extra_trees_report)

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.6985    0.5135    0.5919       185
quasar-like radio AGN / high-excitation radio galaxy     0.5000    0.0612    0.1091        49
                                     radio-quiet AGN     0.8889    0.2927    0.4404        82
                                 star-forming galaxy     0.7702    0.9731    0.8599       558

                                            accuracy                         0.7609       874
                                           macro avg     0.7144    0.4601    0.5003       874
                                        weighted avg     0.7510    0.7609    0.7217       874



## Random Forest

In [202]:
from sklearn.impute import SimpleImputer

# Replace NaNs with per-column means. The means are learned from the training
# split only and then applied unchanged to both splits.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_train)
X_train_imp = imp_mean.transform(X_train)
X_test_imp = imp_mean.transform(X_test)

In [203]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 50 trees, each capped at 100 leaves, fixed seed.
clf = RandomForestClassifier(
    n_estimators=50,
    max_leaf_nodes=100,
    random_state=0,
)
clf.fit(X_train_imp, y_train)

In [204]:
# Evaluate the fitted random forest on the imputed test features.
y_pred = clf.predict(X_test_imp)
random_forest_report = classification_report(y_test, y_pred, target_names=labels, digits=4)
print(random_forest_report)

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.6781    0.5351    0.5982       185
quasar-like radio AGN / high-excitation radio galaxy     0.8000    0.0816    0.1481        49
                                     radio-quiet AGN     1.0000    0.1707    0.2917        82
                                 star-forming galaxy     0.7616    0.9677    0.8524       558

                                            accuracy                         0.7517       874
                                           macro avg     0.8099    0.4388    0.4726       874
                                        weighted avg     0.7685    0.7517    0.7065       874



In [9]:
from sklearn.svm import SVC
def optimise_rf(n_estimators, max_leaf_nodes):
    """Objective function: validation accuracy of a random forest.

    Both arguments are cast with ``int()`` (truncation) before being handed to
    the classifier, so float values are accepted.

    Reads ``X_train_imp``, ``y_train``, ``X_val_imp`` and ``y_val`` from the
    notebook's global namespace at call time.

    Parameters
    ----------
    n_estimators : number
        Number of trees (truncated to an integer).
    max_leaf_nodes : number
        Maximum number of leaves per tree (truncated to an integer).

    Returns
    -------
    float
        Accuracy of the fitted forest on the validation split.
    """
    forest = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_leaf_nodes=int(max_leaf_nodes),
        random_state=0,
        n_jobs=8,
    )
    forest.fit(X_train_imp, y_train)

    val_predictions = forest.predict(X_val_imp)
    return accuracy_score(y_val, val_predictions)

def optimise_scv(C):
    """Objective function: validation accuracy of a polynomial-kernel SVC.

    Reads ``X_train_imp``, ``y_train``, ``X_val_imp`` and ``y_val`` from the
    notebook's global namespace at call time.

    Parameters
    ----------
    C : float
        Regularisation parameter passed straight through to ``SVC``.

    Returns
    -------
    float
        Accuracy of the fitted classifier on the validation split.
    """
    svc = SVC(C=C, kernel='poly', random_state=0)
    svc.fit(X_train_imp, y_train)

    val_predictions = svc.predict(X_val_imp)
    return accuracy_score(y_val, val_predictions)

In [32]:
from bayes_opt import BayesianOptimization

In [33]:
# Bounded regions of parameter space, one per objective function.
# Each maps a parameter name to its (lower, upper) search bounds.

# Search space for optimise_rf. Kept under its own name so that it is not
# silently overwritten by the SVC search space below.
pbounds_rf = {
    'n_estimators': (1, 500),
    'max_leaf_nodes': (2, 200),
}

# Search space for optimise_scv. `pbounds` is the name the optimizer cell reads.
pbounds = {
    'C': (0.1, 50),
}

In [11]:
# Carve a validation set out of the current test split: 60% stays as test,
# 40% becomes validation (stratified, fixed seed).
# NOTE: this cell overwrites its own inputs (X_test_imp, y_test), so running it
# a second time without re-creating them shrinks the test split again.
X_test_imp, X_val_imp, y_test, y_val = train_test_split(
    X_test_imp,
    y_test,
    train_size=0.6,
    stratify=y_test,
    random_state=42,
)

In [34]:
# Bayesian optimiser over the SVC objective and its search space.
# verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
optimizer = BayesianOptimization(f=optimise_scv, pbounds=pbounds, random_state=42, verbose=2)

In [None]:
import warnings

# Run the search: 5 initial points followed by 1000 further iterations.
# NOTE(review): every evaluation calls the objective function once, so this
# cell may run for a long time — confirm the iteration budget is intended.
with warnings.catch_warnings():
    # Warnings are suppressed only inside this block.
    warnings.simplefilter("ignore")
    optimizer.maximize(init_points=5, n_iter=1000)

In [None]:
# Fit a linear-kernel SVC with C=1 on the imputed training features.
# NOTE(review): optimise_scv tunes C for kernel='poly', whereas this model uses
# kernel='linear' with a fixed C — confirm this is the intended final model.
clf = SVC(C=1, kernel='linear', random_state=0)
clf.fit(X_train_imp, y_train)

In [None]:
# Evaluate the fitted SVC on the imputed validation features.
y_pred = clf.predict(X_val_imp)
svc_report = classification_report(y_val, y_pred, target_names=labels, digits=4)
print(svc_report)

## XGBoost

In [253]:
# Rebuild the splits from the full (un-imputed) feature frame:
#   1) 80% train / 20% hold-out
#   2) hold-out -> 80% test / 20% validation
# i.e. roughly 80% train, 16% test and 4% validation of all rows.
# Both splits are stratified on the label and use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test,
    train_size=0.8,
    stratify=y_test,
    random_state=42,
)

In [265]:
# Hyper-parameters for the boosted-tree classifier, collected in one place.
xgb_params = dict(
    use_label_encoder=False,      # y is already integer-encoded by the LabelEncoder cell
    max_depth=3,
    #reg_alpha=1,
    #min_child_weight=0.5,
    reg_lambda=5,
    #subsample=0.5,
    eta=0.1,                      # learning rate
    tree_method='exact',          # exact greedy split search
    gpu_id=1,                     # NOTE(review): presumably unused with tree_method='exact' — confirm
    objective='multi:softprob',
    eval_metric=['merror'],       # multiclass error rate, reported on the eval_set during fit
    nthread=8,
    n_estimators=500,             # upper bound on boosting rounds
)
model = XGBClassifier(**xgb_params)

In [266]:
# Train with early stopping: the eval metric is computed on the validation
# split after every boosting round, and training stops once it has not
# improved for 20 consecutive rounds.
validation_sets = [(X_val, y_val)]
bst = model.fit(
    X_train,
    y_train,
    eval_set=validation_sets,
    early_stopping_rounds=20,
    verbose=True,
    #sample_weight=classes_weights,
)

[0]	validation_0-merror:0.22222
[1]	validation_0-merror:0.25397
[2]	validation_0-merror:0.23810
[3]	validation_0-merror:0.25397
[4]	validation_0-merror:0.23810
[5]	validation_0-merror:0.23810
[6]	validation_0-merror:0.23810
[7]	validation_0-merror:0.23810
[8]	validation_0-merror:0.23810
[9]	validation_0-merror:0.23810
[10]	validation_0-merror:0.23810
[11]	validation_0-merror:0.23810
[12]	validation_0-merror:0.23810
[13]	validation_0-merror:0.23810


  from pandas import MultiIndex, Int64Index


[14]	validation_0-merror:0.23810
[15]	validation_0-merror:0.23810
[16]	validation_0-merror:0.22222
[17]	validation_0-merror:0.22222
[18]	validation_0-merror:0.22222
[19]	validation_0-merror:0.22222


In [267]:
# Evaluate the fitted XGBoost model on the test split.
y_pred = model.predict(X_test)
xgb_report = classification_report(y_test, y_pred, target_names=labels, digits=4)
print(xgb_report)

                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.7143    0.4167    0.5263        24
quasar-like radio AGN / high-excitation radio galaxy     0.5714    0.6154    0.5926        39
                                     radio-quiet AGN     0.7885    0.6212    0.6949        66
                                 star-forming galaxy     0.8392    0.9836    0.9057       122

                                            accuracy                         0.7769       251
                                           macro avg     0.7283    0.6592    0.6799       251
                                        weighted avg     0.7723    0.7769    0.7653       251

