In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.utils.class_weight import compute_class_weight


from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, XGBRegressor
from bayes_opt import BayesianOptimization

In [2]:
features_num = [
    'Total_flux', 'Peak_flux', 
       'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr', 'R_flux_corr',
       'I_flux_corr', 'z_flux_corr', 'y_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'K_flux_corr', 'Ks_flux_corr',
       'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
       'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
       'F_SPIRE_500', 'Z_BEST',
       'g_flux_corr', 'nb921_hsc_flux_corr'
    ]
y_column = "Classification"

classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']

In [3]:
data = pd.read_csv("../../../Data/Fangyou_data/Cleaned/combined_using_similar_columns.csv")

In [4]:
X = data[features_num]
y = data[[y_column, 'Source']]

# Adding 2 class columns

In [5]:
def AGN(row):
    
    if row['Classification'] == 'jet-mode radio AGN/low-excitation radio galaxy':
        return 1
    elif row['Classification'] == 'quasar-like radio AGN / high-excitation radio galaxy':
        return 1
    elif row['Classification'] == 'radio-quiet AGN':
        return 1
    elif row['Classification'] == 'star-forming galaxy':
        return 0

In [6]:
# Temporarily relabelling it for function
y['AGN'] =  y.apply(AGN, axis=1, result_type='expand')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['AGN'] =  y.apply(AGN, axis=1, result_type='expand')


## Making subsets

In [7]:
X_elais = X[y['Source']=='Elais-N1']
y_elais = y[y['Source']=='Elais-N1']

X_lockman = X[y['Source']=='Lockman']
y_lockman = y[y['Source']=='Lockman']

X_bootes = X[y['Source']=='Bootes']
y_bootes = y[y['Source']=='Bootes']

In [8]:
X_combined = pd.concat([X_lockman, X_elais])
y_combined = pd.concat([y_lockman, y_elais])

## Adding noisy samples

## Converting PACS and SPIRE quartiles to stds

In [9]:
non_sigma_columns_u = ['FErr_MIPS_24_u', 'FErr_PACS_100_u', 'FErr_PACS_160_u', 
                     'FErr_SPIRE_250_u', 'FErr_SPIRE_350_u', 'FErr_SPIRE_500_u']
non_sigma_columns_l = ['FErr_MIPS_24_l', 'FErr_PACS_100_l', 'FErr_PACS_160_l', 
                     'FErr_SPIRE_250_l', 'FErr_SPIRE_350_l', 'FErr_SPIRE_500_l']
non_sigma_columns_corresponding = ['F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 
                                 'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500']

In [10]:
# Converting from 84th Percentile to 1 sigma and 16th percentile to 1 sigma and then taking the average
for column1, column2, column3 in zip(non_sigma_columns_u, non_sigma_columns_l, non_sigma_columns_corresponding):
    std1 = np.abs(data[column3] - data[column1])/0.9945
    std2 = np.abs(X[column3] - data[column2])/0.9945
    data[column3 + '_error'] = (std1+std2)/2
    data = data.drop(columns=[column1, column2])

## Making the noisy samples

In [11]:
signal = ['Total_flux', 'Peak_flux', 'NUV_flux_corr', 'u_flux_corr',
       'Bw_flux_corr', 'R_flux_corr', 'I_flux_corr', 'z_flux_corr',
       'y_flux_corr', 'J_flux_corr', 'H_flux_corr', 'K_flux_corr',
       'Ks_flux_corr', 'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr',
       'ch4_flux_corr', 'g_flux_corr',
       'nb921_hsc_flux_corr', 'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250',
       'F_SPIRE_350', 'F_SPIRE_500']
noise=[
        'E_Total_flux', 'E_Peak_flux',
       'NUV_fluxerr_corr', 'u_fluxerr_corr', 'Bw_fluxerr_corr',
       'R_fluxerr_corr', 'I_fluxerr_corr', 'z_fluxerr_corr', 'y_fluxerr_corr',
       'J_fluxerr_corr', 'H_fluxerr_corr', 'K_fluxerr_corr', 'Ks_fluxerr_corr',
       'ch1_fluxerr_corr', 'ch2_fluxerr_corr', 'ch3_fluxerr_corr',
       'ch4_fluxerr_corr', 'g_fluxerr_corr', 'nb921_hsc_fluxerr_corr',
       'F_MIPS_24_error', 'F_PACS_100_error', 'F_PACS_160_error',
       'F_SPIRE_250_error', 'F_SPIRE_350_error', 'F_SPIRE_500_error',
]

In [12]:
def resample(X, flux_column, error_column, sn_mult=1):
    noise =  np.random.normal(0, sn_mult*X[error_column])
    
    # adding the noise
    return X[flux_column] + noise

In [13]:
data_resampled = data.copy()
for c1, c2 in zip(signal, noise):
        data_resampled[c1] = resample(data_resampled, c1, c2, 1)

In [14]:
X_lockman_resampled = data_resampled[data_resampled['Source']=='Lockman'][features_num]
y_lockman_resampled = data_resampled[data_resampled['Source']=='Lockman'][[y_column, 'Source']]

X_elais_resampled = data_resampled[data_resampled['Source']=='Elais-N1'][features_num]
y_elais_resampled = data_resampled[data_resampled['Source']=='Elais-N1'][[y_column, 'Source']]

In [15]:
X_combined_resampled = pd.concat([X_elais_resampled, X_lockman_resampled, X_combined])
y_combined_resampled = pd.concat([y_elais_resampled, y_lockman_resampled, y_combined])

In [16]:
# Temporarily relabelling it for function
y_combined_resampled['AGN'] =  y_combined_resampled.apply(AGN, axis=1, result_type='expand')

# BH

In [17]:
Best_Heckman_data = pd.read_csv("../../../Data/Best&Heckman/BestHeckman+SDSS+wise+LOFAR_better_fixed_fluxes.csv")

In [18]:
# Only selecting data with a classification
Best_Heckman_data = Best_Heckman_data[Best_Heckman_data['Classification'] != 'Radio-loud AGN'] 

In [19]:
Best_Heckman_X = Best_Heckman_data[[c for c in Best_Heckman_data.columns if c != 'Classification']]
Best_Heckman_y = Best_Heckman_data[['Classification']]

In [20]:
Best_Heckman_X = Best_Heckman_X[['Z_BEST', 'u_flux_corr',
       'g_flux_corr', 'R_flux_corr', 'I_flux_corr', 'z_flux_corr', 'ch1_flux_corr', 'ch2_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'Ks_flux_corr', 'Peak_flux', 'Total_flux']]

# Adding nans to missing columns
Best_Heckman_X[['NUV_flux_corr', 'Bw_flux_corr', 'y_flux_corr', 'K_flux_corr', 
                'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
                'F_SPIRE_500', 'nb921_hsc_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr']] = np.nan

In [21]:
# Changing column order
Best_Heckman_X = Best_Heckman_X[features_num]

In [22]:
Best_Heckman_y['AGN'] =  Best_Heckman_y.apply(AGN, axis=1, result_type='expand')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Best_Heckman_y['AGN'] =  Best_Heckman_y.apply(AGN, axis=1, result_type='expand')


# Hyperparameter search

In [38]:
print(classification_report(y_bootes['AGN'], automl.predict(X_bootes), digits=4))

              precision    recall  f1-score   support

           0     0.9072    0.8680    0.8872     12213
           1     0.7332    0.8033    0.7667      5516

    accuracy                         0.8479     17729
   macro avg     0.8202    0.8357    0.8269     17729
weighted avg     0.8530    0.8479    0.8497     17729



In [49]:
from flaml import AutoML
automl = AutoML()

In [231]:
y_combined

Unnamed: 0,Classification,Source,AGN
47977,star-forming galaxy,Lockman,0
47978,star-forming galaxy,Lockman,0
47979,radio-quiet AGN,Lockman,1
47980,jet-mode radio AGN/low-excitation radio galaxy,Lockman,1
47981,star-forming galaxy,Lockman,0
...,...,...,...
47972,star-forming galaxy,Elais-N1,0
47973,star-forming galaxy,Elais-N1,0
47974,star-forming galaxy,Elais-N1,0
47975,jet-mode radio AGN/low-excitation radio galaxy,Elais-N1,1


In [236]:
import warnings

estimators = ['xgb_limitdepth']
#estimators = ['extra_tree']

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    automl.fit(X, y['Classification'], task="classification", metric='macro_f1',
                                #X_val=X_bootes , y_val=y_bootes['Classification'],
                                #ensemble=True,
                                #estimator_list=['xgboost'], 
                                time_budget=600, n_jobs=8,
                                #eval_method='cv',
                                log_file_name='general.log',
                                #starting_points=automl.best_config_per_estimator
              )

[flaml.automl: 02-27 15:23:10] {2390} INFO - task = classification
INFO:flaml.automl:task = classification
[flaml.automl: 02-27 15:23:10] {2392} INFO - Data split method: stratified
INFO:flaml.automl:Data split method: stratified
[flaml.automl: 02-27 15:23:10] {2396} INFO - Evaluation method: cv
INFO:flaml.automl:Evaluation method: cv
[flaml.automl: 02-27 15:23:10] {2465} INFO - Minimizing error metric: 1-macro_f1
INFO:flaml.automl:Minimizing error metric: 1-macro_f1
[flaml.automl: 02-27 15:23:10] {2605} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
INFO:flaml.automl:List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl: 02-27 15:23:10] {2897} INFO - iteration 0, current learner lgbm
INFO:flaml.automl:iteration 0, current learner lgbm
[flaml.automl: 02-27 15:23:14] {3025} INFO - Estimated sufficient time budget=39420s. Estimated necessary time budget=909s.
INFO:fla

INFO:flaml.automl: at 115.2s,	estimator rf's best error=0.6397,	best estimator lgbm's best error=0.2845
[flaml.automl: 02-27 15:25:05] {2897} INFO - iteration 20, current learner lgbm
INFO:flaml.automl:iteration 20, current learner lgbm
[flaml.automl: 02-27 15:26:05] {3072} INFO -  at 175.8s,	estimator lgbm's best error=0.2845,	best estimator lgbm's best error=0.2845
INFO:flaml.automl: at 175.8s,	estimator lgbm's best error=0.2845,	best estimator lgbm's best error=0.2845
[flaml.automl: 02-27 15:26:05] {2897} INFO - iteration 21, current learner rf
INFO:flaml.automl:iteration 21, current learner rf
[flaml.automl: 02-27 15:26:13] {3072} INFO -  at 184.2s,	estimator rf's best error=0.6397,	best estimator lgbm's best error=0.2845
INFO:flaml.automl: at 184.2s,	estimator rf's best error=0.6397,	best estimator lgbm's best error=0.2845
[flaml.automl: 02-27 15:26:13] {2897} INFO - iteration 22, current learner lgbm
INFO:flaml.automl:iteration 22, current learner lgbm
[flaml.automl: 02-27 15:26:

KeyboardInterrupt: 

In [235]:
print(classification_report(Best_Heckman_y['Classification'], automl.predict(Best_Heckman_X), digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


                                                      precision    recall  f1-score   support

      jet-mode radio AGN/low-excitation radio galaxy     0.7792    0.8576    0.8166      9771
quasar-like radio AGN / high-excitation radio galaxy     0.0000    0.0000    0.0000       478
                                 star-forming galaxy     0.3875    0.3203    0.3507      2913

                                            accuracy                         0.7076     13162
                                           macro avg     0.3889    0.3926    0.3891     13162
                                        weighted avg     0.6642    0.7076    0.6838     13162



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [286]:
import lightgbm as lgb

In [287]:
param = {
    #'n_estimators': 189,
    'n_estimators': 10**5,
  'num_leaves': 20,
  'min_child_samples': 6,
  'learning_rate': 0.06500463168967072,
  'colsample_bytree': 0.6649148062238498,
  'reg_alpha': 0.0009765625,
  'reg_lambda': 0.004681547467007761
}

In [288]:
train_data = lgb.Dataset(X_combined, label=y_combined['AGN'])
validation_data = lgb.Dataset(X_bootes, label=y_bootes['AGN'])

In [289]:
bst = lgb.train(param, train_data, valid_sets=[train_data, validation_data], early_stopping_rounds=20)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points in the train set: 59880, number of used features: 23
[LightGBM] [Info] Start training from score 0.258066
[1]	training's l2: 0.182898	valid_1's l2: 0.206177
Training until validation scores don't improve for 20 rounds
[2]	training's l2: 0.174888	valid_1's l2: 0.196376
[3]	training's l2: 0.167218	valid_1's l2: 0.187074
[4]	training's l2: 0.1618	valid_1's l2: 0.18071
[5]	training's l2: 0.156236	valid_1's l2: 0.174203
[6]	training's l2: 0.150885	valid_1's l2: 0.167528
[7]	training's l2: 0.146085	valid_1's l2: 0.162764
[8]	training's l2: 0.142183	valid_1's l2: 0.158223
[9]	training's l2: 0.138345	valid_1's l2: 0.153657
[10]	training's l2: 0.133933	valid_1's l2: 0.148977
[11]	training's l2: 0.131114	valid_1's l2: 0.146476
[12]	training's l2: 0.127548	valid_1's l2: 0.14297
[13]	training's l2: 0.124526	valid_1's l2: 0.140383
[14]	training's l2: 0.121772	valid_1's

In [290]:
pred = bst.predict(Best_Heckman_X)
pred[pred>0.5]=1
pred[pred<0.5]=0
print(classification_report(Best_Heckman_y['AGN'], pred, digits=4))

              precision    recall  f1-score   support

           0     0.8443    0.5043    0.6314      2913
           1     0.8736    0.9736    0.9209     10249

    accuracy                         0.8697     13162
   macro avg     0.8589    0.7389    0.7761     13162
weighted avg     0.8671    0.8697    0.8568     13162



In [291]:
filt = ~Best_Heckman_X['Total_flux'].isna()

In [None]:
X_bootes_noradio = X_bootes.copy()

In [295]:
pred = bst.predict(X_bootes)
pred[pred>0.5]=1
pred[pred<0.5]=0
print(classification_report(y_bootes['AGN'], pred, digits=4))

              precision    recall  f1-score   support

           0     0.9089    0.8549    0.8811     12213
           1     0.7161    0.8104    0.7603      5516

    accuracy                         0.8411     17729
   macro avg     0.8125    0.8326    0.8207     17729
weighted avg     0.8489    0.8411    0.8435     17729



## Bayesian opt

In [225]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import f1_score

params = {'verbose': -1}
train_data = lgb.Dataset(X_combined, label=y_combined['AGN'], params=params, free_raw_data=False)
validation_data = lgb.Dataset(X_bootes, label=y_bootes['AGN'], params=params, free_raw_data=False)

In [226]:
def optimise_xgboost(num_leaves,
                     learning_rate,
                     colsample_bytree
                    ):
    num_leaves = int(num_leaves)
    
    param = {
        'n_estimators': 10**5,
      'num_leaves': num_leaves,
      'learning_rate': learning_rate,
      'colsample_bytree': colsample_bytree,
      'verbose':-1
    }
    
    # Training the model
    bst = lgb.train(param, train_data, valid_sets=[validation_data], early_stopping_rounds=20, verbose_eval=False)

    # Getting the accuracy and appending
    pred = bst.predict(Best_Heckman_X)
    
    pred[pred>0.5]=1
    pred[pred<0.5]=0

    return f1_score(Best_Heckman_y['AGN'], pred, average='macro')

In [227]:
# Bounded region of parameter space
pbounds = {
    'num_leaves': (10, 50),
    'learning_rate': (0.001, 0.8),
    'colsample_bytree': (0.1,1)
}

In [228]:
optimizer = BayesianOptimization(
    f=optimise_xgboost,
    pbounds=pbounds,
    verbose=2, # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=42,
)

In [230]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    optimizer.maximize(
        init_points=10,
        n_iter=1000,
    )

|   iter    |  target   | colsam... | learni... | num_le... |
-------------------------------------------------------------
| [0m24       [0m | [0m0.3781   [0m | [0m0.6963   [0m | [0m0.2501   [0m | [0m30.8     [0m |
| [0m25       [0m | [0m0.3104   [0m | [0m0.592    [0m | [0m0.1487   [0m | [0m48.78    [0m |
| [0m26       [0m | [0m0.2415   [0m | [0m0.7976   [0m | [0m0.7517   [0m | [0m45.79    [0m |
| [0m27       [0m | [0m0.2254   [0m | [0m0.6381   [0m | [0m0.7376   [0m | [0m13.54    [0m |
| [0m28       [0m | [0m0.3826   [0m | [0m0.2764   [0m | [0m0.03714  [0m | [0m23.01    [0m |
| [0m29       [0m | [0m0.4378   [0m | [0m0.4498   [0m | [0m0.2178   [0m | [0m43.15    [0m |
| [0m30       [0m | [0m0.7098   [0m | [0m0.4211   [0m | [0m0.2255   [0m | [0m31.71    [0m |
| [0m31       [0m | [0m0.189    [0m | [0m0.2268   [0m | [0m0.642    [0m | [0m12.98    [0m |
| [0m32       [0m | [0m0.3214   [0m | [0m0.9882   [0m 

| [0m103      [0m | [0m0.2619   [0m | [0m0.9404   [0m | [0m0.09989  [0m | [0m49.22    [0m |
| [0m104      [0m | [0m0.2997   [0m | [0m0.6859   [0m | [0m0.3914   [0m | [0m19.8     [0m |
| [0m105      [0m | [0m0.2681   [0m | [0m0.8943   [0m | [0m0.41     [0m | [0m11.96    [0m |
| [0m106      [0m | [0m0.4411   [0m | [0m0.8904   [0m | [0m0.4617   [0m | [0m21.96    [0m |
| [0m107      [0m | [0m0.2246   [0m | [0m0.756    [0m | [0m0.1781   [0m | [0m21.95    [0m |
| [0m108      [0m | [0m0.3118   [0m | [0m0.6112   [0m | [0m0.297    [0m | [0m34.23    [0m |
| [0m109      [0m | [0m0.4373   [0m | [0m0.2088   [0m | [0m0.7548   [0m | [0m47.02    [0m |
| [0m110      [0m | [0m0.746    [0m | [0m0.924    [0m | [0m0.1048   [0m | [0m30.73    [0m |
| [0m111      [0m | [0m0.2797   [0m | [0m0.4381   [0m | [0m0.7017   [0m | [0m48.68    [0m |
| [0m112      [0m | [0m0.4378   [0m | [0m0.404    [0m | [0m0.6838   [0m | 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/users/karsten/.local/lib/python3.9/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (0.13787494994592095, 0.008485200576467114, 41.67448229179727)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Software/users/modules/7/software/anaconda3/2021.11/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/lib/condor/execute/dir_1357265/ipykernel_1357405/1298557552.py", line 4, in <module>
    optimizer.maximize(
  File "/Users/users/karsten/.local/lib/python3.9/site-packages/bayes_opt/bayesian_optimization.py", line 305, in maximize
    self.probe(x_probe, lazy=False)
  File "/Users/users/karsten/.local/lib/python3.9/site-packages/bayes_opt/bayesian_optimization.py", line 200, in probe
    self._space.probe(params)
  Fil

TypeError: object of type 'NoneType' has no len()

## MIGHTEE

In [243]:
mightee_data = pd.read_csv("../../../Data/MIGHTEE/Classification/final_gaussian_radio_more.csv")

In [244]:
# Dropping nan classifications
mightee_data = mightee_data[mightee_data['Classification'].notna()]

# Dropping quasar-like radio AGN / high-excitation radio galaxy
#mightee_data = mightee_data[mightee_data['Classification'] != 'star-forming galaxy']
print("Amount of rows:", len(mightee_data))

Amount of rows: 4370


In [245]:
mightee_X = mightee_data[[c for c in mightee_data.columns if c != 'Classification']]
mightee_y = mightee_data[['Classification']]

In [246]:
mightee_X = mightee_X[['Z_BEST', 'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr',
       'ch4_flux_corr', 'F_MIPS_24', 'F_PACS_100', 'F_PACS_160',
       'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500', 'Ks_flux_corr',
       'H_flux_corr', 'J_flux_corr', 'i_flux_corr', 'r_flux_corr',
       'u_flux_corr', 'z_flux_corr', 'y_flux_corr', 'NUV_flux_corr',
       #'FUV_flux_corr', 
                       'Total_flux', 'Peak_flux']]

mightee_X[['I_flux_corr', 'R_flux_corr']] = mightee_X[['i_flux_corr', 'r_flux_corr']]
mightee_X = mightee_X.drop(columns=['i_flux_corr', 'r_flux_corr'])

# Adding nans to missing columns
mightee_X[['Bw_flux_corr', 'K_flux_corr', 'g_flux_corr', 'nb921_hsc_flux_corr']] = np.nan

# Changing order
mightee_X = mightee_X[['Total_flux', 'Peak_flux', 
                       'NUV_flux_corr', 'u_flux_corr', 
                       'Bw_flux_corr', 'R_flux_corr', 
                       'I_flux_corr', 'z_flux_corr', 
                       'y_flux_corr', 'J_flux_corr', 
                       'H_flux_corr', 'K_flux_corr', 
                       'Ks_flux_corr', 'ch1_flux_corr', 
                       'ch2_flux_corr', 'ch3_flux_corr', 
                       'ch4_flux_corr', 'F_MIPS_24', 'F_PACS_100', 
                       'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350', 
                       'F_SPIRE_500', 'Z_BEST', 
                       'g_flux_corr', 'nb921_hsc_flux_corr']]

In [247]:
mightee_y['AGN'] =  mightee_y.apply(AGN, axis=1, result_type='expand')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mightee_y['AGN'] =  mightee_y.apply(AGN, axis=1, result_type='expand')


In [248]:
pred = bst.predict(mightee_X)
#pred = (pred-pred.min())
#pred = pred/pred.max()
pred = np.rint(pred)
print(classification_report(mightee_y['AGN'], pred, digits=4))

              precision    recall  f1-score   support

           0     0.7843    0.7900    0.7871      2790
           1     0.6244    0.6165    0.6204      1580

    accuracy                         0.7272      4370
   macro avg     0.7044    0.7032    0.7038      4370
weighted avg     0.7265    0.7272    0.7268      4370

