In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## Importing data (just original for now)

In [None]:
# Numeric columns read from the catalogue and used as clustering features.
features_num = [
    'Total_flux',
    'Peak_flux',
    'NUV_flux_corr',
    'u_flux_corr',
    'Bw_flux_corr',
    'R_flux_corr',
    'I_flux_corr',
    'z_flux_corr',
    'y_flux_corr',
    'J_flux_corr',
    'H_flux_corr',
    'K_flux_corr',
    'Ks_flux_corr',
    'ch1_flux_corr',
    'ch2_flux_corr',
    'ch3_flux_corr',
    'ch4_flux_corr',
    'F_MIPS_24',
    'F_PACS_100',
    'F_PACS_160',
    'F_SPIRE_250',
    'F_SPIRE_350',
    'F_SPIRE_500',
    'Z_BEST',
    'g_flux_corr',
    'nb921_hsc_flux_corr',
]

# Name of the column holding the class label.
y_column = "Classification"

# Class names, kept verbatim (including the inconsistent spacing around "/").
# NOTE(review): presumably these must match the values stored in the
# `Classification` column exactly - confirm before normalising the spelling.
classes = [
    'jet-mode radio AGN/low-excitation radio galaxy',
    'quasar-like radio AGN / high-excitation radio galaxy',
    'radio-quiet AGN',
    'star-forming galaxy',
]

In [None]:
# Load the combined catalogue; the path is relative to the notebook's
# working directory.
data = pd.read_csv(
    "../../Data/Fangyou_data/Cleaned/combined_using_similar_columns.csv",
)

In [None]:
# Feature matrix: only the numeric columns listed in `features_num`.
X = data.loc[:, features_num]

# Class label plus auxiliary columns, kept separate from the features.
y = data.loc[:, [y_column, 'Source', 'Xray', 'Opt_spec', 'IRAGN', 'Donley']]

# Dropping columns with little data (optional)

In [None]:
# Rebuild X from `data` rather than dropping from X itself: the original
# `X = X.drop(...)` raised a KeyError when the cell was run a second time,
# because the columns were already gone. Selecting from `data` first makes
# the cell idempotent while producing the same frame on a fresh run.
X = data[features_num].drop(
    columns=[
        'NUV_flux_corr', 'Bw_flux_corr', 'y_flux_corr', 'H_flux_corr',
        'Ks_flux_corr', 'K_flux_corr', 'g_flux_corr', 'nb921_hsc_flux_corr',
    ]
)

## Filling NaNs

In [None]:
# Model-based imputation of the missing entries in X.
imp = IterativeImputer(
    max_iter=100,  # upper bound on imputation rounds
    min_value=0,   # imputed values are not allowed to go below zero
)
X_filled = imp.fit_transform(X)

In [None]:
# Log-scale every column; the 0.1 offset keeps zero-valued entries finite.
# This overwrites X_filled in place, so running the cell again applies the
# logarithm a second time.
X_filled = np.log10(X_filled + 0.1)

## Normalisations

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardise each column (zero mean, unit variance). The scaler is fitted on
# the same array it transforms, and X_filled is overwritten with the result.
scaler = StandardScaler().fit(X_filled)
X_filled = scaler.transform(X_filled)

## Running DBSCAN

In [61]:
# Single DBSCAN run with a fixed radius as a first look.
clustering = DBSCAN(eps=0.5, min_samples=1000, n_jobs=64)
clustering.fit(X_filled)

# Label values and how many points carry each one (-1 is DBSCAN's noise label).
np.unique(clustering.labels_, return_counts=True)

(array([-1]), array([77609]))

In [42]:
from bayes_opt import BayesianOptimization


In [62]:
def optimise(eps, min_samples=1000, n_jobs=8):
    """Objective function: number of clusters DBSCAN finds on ``X_filled``.

    Parameters
    ----------
    eps : float
        Neighbourhood radius passed to ``DBSCAN``.
    min_samples : int, default 1000
        Minimum neighbourhood size passed to ``DBSCAN``. The default is the
        value that was previously hard-coded here.
    n_jobs : int, default 8
        Number of parallel jobs passed to ``DBSCAN``. The default is the
        value that was previously hard-coded here.

    Returns
    -------
    int
        Count of distinct cluster labels, excluding the noise label ``-1``.
    """
    fitted = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=n_jobs).fit(X_filled)

    # Only the distinct label values matter; per-label counts are not needed.
    distinct_labels = np.unique(fitted.labels_)

    # -1 marks noise points and is not a cluster.
    return int(np.count_nonzero(distinct_labels != -1))

In [66]:
# Search interval (lower, upper) for each parameter handed to the optimiser.
pbounds = dict(eps=(0.3, 0.5))

In [67]:
# Bayesian optimiser that searches `pbounds` for the `eps` value maximising
# `optimise`. The seed is fixed so that the randomly chosen initial probes,
# and therefore the search as a whole, are repeatable after a kernel restart.
optimizer = BayesianOptimization(
    f=optimise,
    pbounds=pbounds,
    verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
    random_state=42,
)

In [68]:
import warnings
# Run the search with every warning silenced for its duration:
# 5 random initial probes followed by up to 100 guided iterations,
# each of which evaluates the objective once.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    optimizer.maximize(init_points=5, n_iter=100)

|   iter    |  target   |    eps    |
-------------------------------------
| [0m1        [0m | [0m1.0      [0m | [0m0.393    [0m |
| [0m2        [0m | [0m0.0      [0m | [0m0.3382   [0m |
| [0m3        [0m | [0m1.0      [0m | [0m0.3545   [0m |
| [0m4        [0m | [0m0.0      [0m | [0m0.3023   [0m |
| [0m5        [0m | [0m1.0      [0m | [0m0.3828   [0m |
| [95m6        [0m | [95m2.0      [0m | [95m0.3673   [0m |
| [0m7        [0m | [0m1.0      [0m | [0m0.4179   [0m |
| [0m8        [0m | [0m2.0      [0m | [0m0.3672   [0m |
| [0m9        [0m | [0m1.0      [0m | [0m0.5      [0m |
| [0m10       [0m | [0m1.0      [0m | [0m0.4599   [0m |
| [0m11       [0m | [0m1.0      [0m | [0m0.4391   [0m |
| [0m12       [0m | [0m1.0      [0m | [0m0.4801   [0m |
| [0m13       [0m | [0m2.0      [0m | [0m0.371    [0m |
| [0m14       [0m | [0m2.0      [0m | [0m0.3692   [0m |
| [0m15       [0m | [0m1.0      [0m | [0m0.4054 

KeyboardInterrupt: 