In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_class_weight


from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, XGBRegressor
import xgboost as xgb
from tpot import TPOTClassifier



In [2]:
features_num = [
    'Total_flux', 'Peak_flux', 
       'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr', 'R_flux_corr',
       'I_flux_corr', 'z_flux_corr', 'y_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'K_flux_corr', 'Ks_flux_corr',
       'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
       'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
       'F_SPIRE_500', 'Z_BEST',
       'g_flux_corr', 'nb921_hsc_flux_corr'
    ]
y_column = "Classification"

classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']

In [3]:
data = pd.read_csv("../../../Data/Fangyou_data/Cleaned/combined_using_similar_columns.csv")
data['Source'] = 'Original'

In [4]:
X = data[features_num]
y = data[[y_column, 'Source', 'Xray', 'Opt_spec', 'IRAGN', 'Donley']]

In [5]:
le = LabelEncoder()
labels = np.unique(y["Classification"].astype(str))
y["Classification"] = le.fit_transform(y["Classification"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["Classification"] = le.fit_transform(y["Classification"])


In [6]:
# -1 in the LOFAR deep fields should become nans since they are factually nans
for c in ['Xray', 'Opt_spec', 'IRAGN', 'Donley']:
    y[c][y[c]==-1]=np.nan
    y[c][y[c] == 1] = True
    y[c][y[c] == 0] = False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[c][y[c]==-1]=np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[c][y[c] == 1] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[c][y[c] == 0] = False


## BH data

In [7]:
Best_Heckman_data = pd.read_csv("../../../Data/Best&Heckman/BestHeckman+SDSS+wise+LOFAR_better.csv")
Best_Heckman_data['Source'] = 'BH'

In [8]:
# Only selecting data with a classification
Best_Heckman_data = Best_Heckman_data[Best_Heckman_data['Classification'] != 'Radio-loud AGN'] 

In [9]:
Best_Heckman_X = Best_Heckman_data[[c for c in Best_Heckman_data.columns if c != 'Classification']]
Best_Heckman_y = Best_Heckman_data[['Classification', 'Source']]

In [10]:
Best_Heckman_y['Classification'] = le.transform(Best_Heckman_y['Classification'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Best_Heckman_y['Classification'] = le.transform(Best_Heckman_y['Classification'])


In [11]:
y = pd.concat([y, Best_Heckman_y]).reset_index(drop=True)

In [12]:
Best_Heckman_X = Best_Heckman_X[['Z_BEST', 'u_flux_corr',
       'g_flux_corr', 'R_flux_corr', 'I_flux_corr', 'z_flux_corr', 'ch1_flux_corr', 'ch2_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'Ks_flux_corr', 'Peak_flux', 'Total_flux']]

# Adding nans to missing columns
Best_Heckman_X[['NUV_flux_corr', 'Bw_flux_corr', 'y_flux_corr', 'K_flux_corr', 
                'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
                'F_SPIRE_500', 'nb921_hsc_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr']] = np.nan

In [13]:
X = pd.concat([X, Best_Heckman_X]).reset_index(drop=True)

### MIGHTEE data

In [14]:
mightee_data = pd.read_csv("../../../Data/MIGHTEE/Classification/final_gaussian_radio_more.csv")
mightee_data['Source'] = 'MIGHTEE'

In [15]:
# Dropping nan classifications
mightee_data = mightee_data[mightee_data['Classification'].notna()]

# Dropping quasar-like radio AGN / high-excitation radio galaxy
#mightee_data = mightee_data[mightee_data['Classification'] != 'star-forming galaxy']
print("Amount of rows:", len(mightee_data))

Amount of rows: 4370


In [16]:
mightee_X = mightee_data[[c for c in mightee_data.columns if c != 'Classification']]
mightee_y = mightee_data[['Classification', 'Source', 'XAGN', 'midIRAGN', 'VLBAAGN', 'RLAGN', 'optAGN']]

In [17]:
mightee_y['Classification'] = le.transform(mightee_y['Classification'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mightee_y['Classification'] = le.transform(mightee_y['Classification'])


In [18]:
mightee_X = mightee_X[['Z_BEST', 'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr',
       'ch4_flux_corr', 'F_MIPS_24', 'F_PACS_100', 'F_PACS_160',
       'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500', 'Ks_flux_corr',
       'H_flux_corr', 'J_flux_corr', 'i_flux_corr', 'r_flux_corr',
       'u_flux_corr', 'z_flux_corr', 'y_flux_corr', 'NUV_flux_corr',
       #'FUV_flux_corr', 
                       'Total_flux', 'Peak_flux']]

mightee_X[['I_flux_corr', 'R_flux_corr']] = mightee_X[['i_flux_corr', 'r_flux_corr']]
mightee_X = mightee_X.drop(columns=['i_flux_corr', 'r_flux_corr'])

# Adding nans to missing columns
mightee_X[['Bw_flux_corr', 'K_flux_corr', 'g_flux_corr', 'nb921_hsc_flux_corr']] = np.nan

# Changing order
mightee_X = mightee_X[['Total_flux', 'Peak_flux', 
                       'NUV_flux_corr', 'u_flux_corr', 
                       'Bw_flux_corr', 'R_flux_corr', 
                       'I_flux_corr', 'z_flux_corr', 
                       'y_flux_corr', 'J_flux_corr', 
                       'H_flux_corr', 'K_flux_corr', 
                       'Ks_flux_corr', 'ch1_flux_corr', 
                       'ch2_flux_corr', 'ch3_flux_corr', 
                       'ch4_flux_corr', 'F_MIPS_24', 'F_PACS_100', 
                       'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350', 
                       'F_SPIRE_500', 'Z_BEST', 
                       'g_flux_corr', 'nb921_hsc_flux_corr']]

### Bayesian optimisation

In [40]:
from bayes_opt import BayesianOptimization

In [41]:
def optimise_xgboost(max_depth, 
                     reg_alpha, 
                     reg_lambda, 
                     min_child_weight, 
                     eta,
                     max_delta_step,
                     n_estimators):
    # The algorithm gives floats, so we round them
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    
    model = XGBClassifier(use_label_encoder=False, 
                      max_depth=max_depth, 
                      reg_alpha=reg_alpha,   
                      min_child_weight=min_child_weight, 
                      reg_lambda=reg_lambda,
                      max_delta_step=max_delta_step,
                      eta=eta,
                      tree_method='hist',
                      #gpu_id=0,
                      objective='multi:softprob',
                      eval_metric =['merror'],
                      nthread=8,
                      n_estimators=n_estimators
                      )

    # Training the model, no early stopping
    model.fit(X_train, y_train['Classification'], 
                verbose=False, 
               )

    # Getting the accuracy and appending
    y_pred = model.predict(mightee_X)
    return accuracy_score(mightee_y['Classification'], y_pred)

In [42]:
# Bounded region of parameter space
pbounds = {
    'max_depth': (5, 20),
    'reg_alpha': (0, 5),
    'reg_lambda': (0, 10),
    'min_child_weight': (0, 5),
    'eta': (0.05, 0.6),
    'max_delta_step': (0,10),
    'n_estimators': (10,1000)
}

In [43]:
from sklearn.model_selection import StratifiedKFold

# Getting k folds for cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y[['Source', 'Classification']], random_state=42)

In [44]:
optimizer = BayesianOptimization(
    f=optimise_xgboost,
    pbounds=pbounds,
    verbose=2, # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=42,
)

In [45]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    optimizer.maximize(
        init_points=5,
        n_iter=100,
    )

|   iter    |  target   |    eta    | max_de... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6666   [0m | [0m0.256    [0m | [0m9.507    [0m | [0m15.98    [0m | [0m2.993    [0m | [0m164.5    [0m | [0m0.78     [0m | [0m0.5808   [0m |
| [0m2        [0m | [0m0.6597   [0m | [0m0.5264   [0m | [0m6.011    [0m | [0m15.62    [0m | [0m0.1029   [0m | [0m970.2    [0m | [0m4.162    [0m | [0m2.123    [0m |
| [0m3        [0m | [0m0.6561   [0m | [0m0.15     [0m | [0m1.834    [0m | [0m9.564    [0m | [0m2.624    [0m | [0m437.6    [0m | [0m1.456    [0m | [0m6.119    [0m |
| [0m4        [0m | [0m0.6547   [0m | [0m0.1267   [0m | [0m2.921    [0m | [0m10.5     [0m | [0m2.28     [0m | [0m787.3    [0m | [0m0.9984   [0m | [0m5.142    [0m |
| [95m5        [0m | [95m0.6677   [0m | [95m0.3

| [0m45       [0m | [0m0.6744   [0m | [0m0.2236   [0m | [0m1.286    [0m | [0m10.09    [0m | [0m3.55     [0m | [0m10.47    [0m | [0m0.02673  [0m | [0m4.132    [0m |
| [0m46       [0m | [0m0.686    [0m | [0m0.2024   [0m | [0m1.973    [0m | [0m6.101    [0m | [0m4.414    [0m | [0m12.22    [0m | [0m0.9135   [0m | [0m5.105    [0m |
| [0m47       [0m | [0m0.6833   [0m | [0m0.3087   [0m | [0m9.208    [0m | [0m5.712    [0m | [0m0.4454   [0m | [0m12.67    [0m | [0m1.965    [0m | [0m8.173    [0m |
| [95m48       [0m | [95m0.6998   [0m | [95m0.6      [0m | [95m3.375    [0m | [95m5.0      [0m | [95m4.664    [0m | [95m10.02    [0m | [95m1.985    [0m | [95m0.08594  [0m |
| [0m49       [0m | [0m0.6643   [0m | [0m0.3202   [0m | [0m9.377    [0m | [0m16.15    [0m | [0m3.701    [0m | [0m301.6    [0m | [0m1.103    [0m | [0m0.7895   [0m |
| [0m50       [0m | [0m0.6954   [0m | [0m0.2258   [0m | [0m7.977    [0m |

| [0m90       [0m | [0m0.6492   [0m | [0m0.2386   [0m | [0m0.1665   [0m | [0m18.38    [0m | [0m2.365    [0m | [0m17.23    [0m | [0m4.209    [0m | [0m7.705    [0m |
| [0m91       [0m | [0m0.6686   [0m | [0m0.4069   [0m | [0m4.545    [0m | [0m19.49    [0m | [0m0.2749   [0m | [0m330.9    [0m | [0m4.258    [0m | [0m8.79     [0m |
| [0m92       [0m | [0m0.6879   [0m | [0m0.05     [0m | [0m2.005    [0m | [0m7.695    [0m | [0m5.0      [0m | [0m18.5     [0m | [0m5.0      [0m | [0m10.0     [0m |
| [0m93       [0m | [0m0.6522   [0m | [0m0.08914  [0m | [0m0.7145   [0m | [0m5.292    [0m | [0m0.3936   [0m | [0m694.9    [0m | [0m3.917    [0m | [0m7.659    [0m |
| [0m94       [0m | [0m0.6661   [0m | [0m0.1804   [0m | [0m9.776    [0m | [0m19.29    [0m | [0m2.488    [0m | [0m382.4    [0m | [0m4.179    [0m | [0m0.06438  [0m |
| [0m95       [0m | [0m0.6723   [0m | [0m0.5776   [0m | [0m3.425    [0m | [0m6.53