In [13]:
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Importing data

In [14]:
features_num = [
    'Total_flux', 'Peak_flux', 
       'NUV_flux_corr', 'u_flux_corr', 'Bw_flux_corr', 'R_flux_corr',
       'I_flux_corr', 'z_flux_corr', 'y_flux_corr',
       'J_flux_corr', 'H_flux_corr', 'K_flux_corr', 'Ks_flux_corr',
       'ch1_flux_corr', 'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr',
       'F_MIPS_24', 'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350',
       'F_SPIRE_500', 'Z_BEST',
       'g_flux_corr', 'nb921_hsc_flux_corr'
    ]
y_column = "Classification"

classes = ['jet-mode radio AGN/low-excitation radio galaxy', 'quasar-like radio AGN / high-excitation radio galaxy', 
           'radio-quiet AGN', 'star-forming galaxy']

In [15]:
data = pd.read_csv("../../Data/MIGHTEE/Classification/final_gaussian_radio_more.csv")

In [16]:
# Dropping nan classifications
data = data[data['Classification'].notna()]

# Dropping quasar-like radio AGN / high-excitation radio galaxy
#mightee_data = mightee_data[mightee_data['Classification'] != 'star-forming galaxy']
print("Amount of rows:", len(data))

Amount of rows: 4370


In [17]:
X = data.drop(columns=['Classification', 'XAGN', 'midIRAGN', 'VLBAAGN', 'RLAGN', 'optAGN'])
y = data[['Classification', 'XAGN', 'midIRAGN', 'VLBAAGN', 'RLAGN', 'optAGN']]

# Filter by redshift slice

In [18]:
z = 0.6
#z = 3
delta_z = 0.5

redshift_filt = (X.Z_BEST >z) & (X.Z_BEST <(z+delta_z))
X_sliced = X[redshift_filt]

In [20]:
X_sliced.columns

Index(['Unnamed: 0', 'S_INT14', 'Z_BEST', 'S_PEAK14', 'ch1_flux_corr',
       'ch2_flux_corr', 'ch3_flux_corr', 'ch4_flux_corr', 'F_MIPS_24',
       'F_PACS_100', 'F_PACS_160', 'F_SPIRE_250', 'F_SPIRE_350', 'F_SPIRE_500',
       'EBV', 'Ks_flux_corr', 'H_flux_corr', 'J_flux_corr', 'i_flux_corr',
       'r_flux_corr', 'u_flux_corr', 'z_flux_corr', 'y_flux_corr',
       'NUV_flux_corr', 'FUV_flux_corr', 'Total_flux', 'Peak_flux'],
      dtype='object')

# Splitting into train and testing set

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X_sliced, y[redshift_filt], test_size=0.25, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2, random_state=42)

## Filling nan's

In [139]:
imp = IterativeImputer(max_iter=100, min_value=0)
X_train = imp.fit_transform(X_train)

X_test = imp.transform(X_test)
X_val = imp.transform(X_val)

# Scaling

In [140]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Running UMAP

In [141]:
reducer = umap.UMAP(n_neighbors=5, min_dist=0.1)
X_train = reducer.fit_transform(X_train)

X_test = reducer.transform(X_test)
X_val = reducer.transform(X_val)

# Hyperparameter tuning

In [168]:
from bayes_opt import BayesianOptimization

In [169]:
from sklearn.model_selection import StratifiedKFold

# 5 folds
skf = StratifiedKFold(n_splits=5)

# Creating new sets
X_train, X_test, y_train, y_test = train_test_split(X_sliced, y[redshift_filt]['Classification'], test_size=0.25, random_state=42)

In [174]:
imp = IterativeImputer(max_iter=100, min_value=0)
scaler = MaxAbsScaler()

def optimise(max_depth, 
                     reg_alpha, 
                     reg_lambda, 
                     min_child_weight, 
                     eta,
                     n_neighbors,
                     min_dist
                    ):
    # The algorithm gives floats, so we round them
    max_depth = int(max_depth)
    n_neighbors = int(n_neighbors)
    
    # UMAP
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
    
    model = XGBClassifier(use_label_encoder=True, 
                      max_depth=max_depth, 
                      reg_alpha=reg_alpha,   
                      min_child_weight=min_child_weight, 
                      reg_lambda=reg_lambda,
                      eta=eta,
                      tree_method='hist',
                      #gpu_id=0,
                      objective='multi:softprob',
                      eval_metric =['merror'],
                      nthread=8,
                      n_estimators=10**5
                      )
    
    y_preds = []
    for train_index, val_index in skf.split(X_train, y_train):    
        # Splitting up the new train and test set (from the original train set)
        X_train_cv, X_test_cv = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_cv, y_test_cv = y_train.iloc[train_index], y_train.iloc[val_index]
        
        X_test_cv, X_val_cv, y_test_cv, y_val_cv = train_test_split(X_test_cv, y_test_cv, train_size=0.8, 
                                                                    stratify=y_test_cv, random_state=42)
        
        # Filling nan's
        X_train_cv = imp.fit_transform(X_train_cv)
        X_test_cv = imp.transform(X_test_cv)
        X_val_cv = imp.transform(X_val_cv)
        
        # Scaling
        X_train_cv = scaler.fit_transform(X_train_cv)
        X_test_cv = scaler.transform(X_test_cv)
        X_val_cv = scaler.transform(X_val_cv)
        
        # UMAP reducing
        X_train_cv = reducer.fit_transform(X_train_cv)
        X_test_cv = reducer.transform(X_test_cv)
        X_val_cv = reducer.transform(X_val_cv)

        # Training the model
        model.fit(X_train_cv, y_train_cv, 
                    verbose=False, 
                    eval_set=[
                        (X_val_cv, y_val_cv)
                    ], 
                    early_stopping_rounds=25
                   )
        
        

        # Getting the accuracy and appending
        y_pred = model.predict(X_test_cv)
        y_preds.append(accuracy_score(y_test_cv, y_pred))
    return np.mean(y_preds)

In [175]:
# Bounded region of parameter space
pbounds = {
    'max_depth': (3, 10),
    'reg_alpha': (0, 10),
    'reg_lambda': (0, 30),
    'min_child_weight': (0, 10),
    'eta': (0.01, 0.5),
    'n_neighbors': (2,50),
    'min_dist': (0.001, 1)
}

In [176]:
optimizer = BayesianOptimization(
    f=optimise,
    pbounds=pbounds,
    verbose=2, # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=42,
)

In [None]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    optimizer.maximize(
        init_points=5,
        n_iter=100,
    )

|   iter    |  target   |    eta    | max_depth | min_ch... | min_dist  | n_neig... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9735   [0m | [0m0.1935   [0m | [0m9.655    [0m | [0m7.32     [0m | [0m0.5991   [0m | [0m9.489    [0m | [0m1.56     [0m | [0m1.743    [0m |
| [0m2        [0m | [0m0.9676   [0m | [0m0.4344   [0m | [0m7.208    [0m | [0m7.081    [0m | [0m0.02156  [0m | [0m48.56    [0m | [0m8.324    [0m | [0m6.37     [0m |
| [0m3        [0m | [0m0.9706   [0m | [0m0.09909  [0m | [0m4.284    [0m | [0m3.042    [0m | [0m0.5252   [0m | [0m22.73    [0m | [0m2.912    [0m | [0m18.36    [0m |
| [0m4        [0m | [0m0.9706   [0m | [0m0.07835  [0m | [0m5.045    [0m | [0m3.664    [0m | [0m0.4566   [0m | [0m39.69    [0m | [0m1.997    [0m | [0m15.43    [0m |
| [0m5        [0m | [0m0.9657   [0m | [0m0.3003