In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## Importing data (original dataset only, for now)

In [2]:
# Candidate numeric feature columns. Not every dataset variant has all of them,
# and this list is not referenced by the cells visible in this notebook section.
features_num = [
    'Total_flux',
    'Peak_flux',
    'NUV_flux_corr',
    'u_flux_corr',
    'Bw_flux_corr',
    'R_flux_corr',
    'I_flux_corr',
    'z_flux_corr',
    'y_flux_corr',
    'J_flux_corr',
    'H_flux_corr',
    'K_flux_corr',
    'Ks_flux_corr',
    'ch1_flux_corr',
    'ch2_flux_corr',
    'ch3_flux_corr',
    'ch4_flux_corr',
    'F_MIPS_24',
    'F_PACS_100',
    'F_PACS_160',
    'F_SPIRE_250',
    'F_SPIRE_350',
    'F_SPIRE_500',
    'Z_BEST',
    'g_flux_corr',
    'nb921_hsc_flux_corr',
]

# Name of the label column in the loaded table.
y_column = "Classification"

# Class label strings (presumably the values found in the label column -- confirm
# against the data, since the spacing around '/' differs between entries).
classes = [
    'jet-mode radio AGN/low-excitation radio galaxy',
    'quasar-like radio AGN / high-excitation radio galaxy',
    'radio-quiet AGN',
    'star-forming galaxy',
]

In [3]:
# Read the source CSV into a DataFrame. The path is relative, so it resolves
# against the notebook's current working directory.
data = pd.read_csv(
    filepath_or_buffer="../../Data/Best&Heckman/BestHeckman+SDSS+wise+LOFAR_better_fixed_fluxes.csv"
)

In [4]:
# Keep only rows that carry a usable class label by discarding everything
# labelled 'Radio-loud AGN'.
# NOTE(review): rows whose Classification is missing (NaN) compare as "not equal"
# and are therefore kept by this filter -- confirm that is intended.
data = data.loc[data['Classification'].ne('Radio-loud AGN')]

In [5]:
# Display the column index of the filtered frame to see which fields are available.
data.columns

Index(['RAJ2000', 'DEJ2000', 'Z_BEST', 'SimbadName', 'u_flux_corr',
       'g_flux_corr', 'R_flux_corr', 'I_flux_corr', 'z_flux_corr',
       'Classification', 'wise_ra', 'wise_dec', 'ch1_flux_corr',
       'ch2_flux_corr', 'tmass_key', 'J_flux_corr', 'H_flux_corr',
       'Ks_flux_corr', 'E_Total_flux', 'E_Peak_flux', 'Total_flux',
       'Peak_flux'],
      dtype='object')

In [6]:
# Feature matrix: every column of `data` except the ones listed below
# (presumably coordinates, identifiers, the label and flux uncertainties --
# confirm against the catalogue description).
X = data.drop(
    labels=[
        'RAJ2000',
        'DEJ2000',
        'SimbadName',
        'Classification',
        'wise_ra',
        'wise_dec',
        'tmass_key',
        'E_Total_flux',
        'E_Peak_flux',
    ],
    axis="columns",
)

# Target, kept as a single-column DataFrame (list selector) rather than a Series.
y = data.loc[:, ['Classification']]

# Dropping columns with little data (optional)

In [9]:
# Optional step: remove columns that the section heading describes as having
# little data, before imputation.
#
# errors="ignore" makes drop() skip any listed column that is not present in X.
# Several names in this list do not appear in the column listing shown for the
# loaded file, and with the default errors="raise" a single missing label makes
# drop() raise KeyError and remove nothing. It also makes the cell safe to
# re-run after the columns have already been removed.
X = X.drop(
    columns=[
        'NUV_flux_corr',
        'Bw_flux_corr',
        'y_flux_corr',
        'H_flux_corr',
        'Ks_flux_corr',
        'K_flux_corr',
        'g_flux_corr',
        'nb921_hsc_flux_corr',
    ],
    errors="ignore",
)

## Filling NaNs

In [10]:
# Fill missing values by iterative imputation.
#   min_value=0  -> lower bound on imputed values, so nothing is filled in below zero
#   max_iter=100 -> upper limit on the number of imputation rounds
imp = IterativeImputer(
    min_value=0,
    max_iter=100,
)

# Fit on X and return the completed matrix in one step; the fitted imputer
# stays available as `imp`.
X_filled = imp.fit_transform(X)

## Normalisations

In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardise each imputed feature. The scaler learns its statistics from the
# imputed matrix and is then applied to that same matrix, overwriting X_filled
# with the scaled values.
scaler = StandardScaler().fit(X_filled)
X_filled = scaler.transform(X_filled)

## Running DBSCAN

In [13]:
# Density-based clustering of the scaled features.
#   eps=0.5          -> neighbourhood radius
#   min_samples=1000 -> neighbours required for a core point
#   n_jobs=8         -> parallel workers for the neighbour search
clustering = DBSCAN(
    eps=0.5,
    min_samples=1000,
    n_jobs=8,
)
clustering.fit(X_filled)

# Cluster labels and their sizes; DBSCAN uses the label -1 for noise points.
np.unique(clustering.labels_, return_counts=True)

(array([-1,  0]), array([8022, 5140]))