In [1]:
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor, Pool

import warnings
warnings.filterwarnings('ignore')

path = ''



In [2]:
df = dd.read_parquet(path+'Mock_cat_physparam_JADES_Deep_1_po-1_2p.pq', engine='pyarrow')

# replace missing values with magic value -99
df = df.fillna(-99).compute()

# take a look at what's in the data
cols = df.columns.values.tolist()
print(cols)

['ID', 'ID0', 'RA', 'DEC', 'z', 'LIR', 'SED_type', 'SED_number', 'M', 'LIR_SF', 'SFR_IR', 'SFR_UV', 'LIR_AGN', 'L_2_10keV', 'L_2_10keV_obs', 'NH', 'LACC_AGN', 'LBOL_AGN', 'log10N', 'L_150MHz', 'L_1_4GHz', 'logM_DM', '12logOH_W', 'Av', 'mu', 'JWST_NIRCam_F090W', 'errJWST_NIRCam_F090W', 'JWST_NIRCam_F115W', 'errJWST_NIRCam_F115W', 'JWST_NIRCam_F150W', 'errJWST_NIRCam_F150W', 'JWST_NIRCam_F200W', 'errJWST_NIRCam_F200W', 'JWST_NIRCam_F277W', 'errJWST_NIRCam_F277W', 'JWST_NIRCam_F335M', 'errJWST_NIRCam_F335M', 'JWST_NIRCam_F356W', 'errJWST_NIRCam_F356W', 'JWST_NIRCam_F410M', 'errJWST_NIRCam_F410M', 'JWST_NIRCam_F444W', 'errJWST_NIRCam_F444W']


In [3]:
# select columns to use for model training
cols_ml = [x for x in cols if x[:4]=='JWST']
targets = ['z','M','SFR_IR','SFR_UV']

# convert flux to a magnitude-like scale
for col in cols_ml:
    df[col] = -2.5 * np.log10(df[col].astype('float32'))

df[cols_ml] = df[cols_ml].replace([np.inf, -np.inf], -99)

# generate colours
cols_ml_with_colours = cols_ml.copy()
cols_ml_some_colours = cols_ml.copy()
for i,c0 in enumerate(cols_ml[:-1]):
    for j,c1 in enumerate(cols_ml[1:]):
        if j>=i:
            colour_name = c0+'-'+c1
            df[colour_name] = df[c0]-df[c1]
            
            cols_ml_with_colours.append(colour_name)
            if j==i:
                cols_ml_some_colours.append(colour_name)

df[cols_ml_some_colours].head()

Unnamed: 0,JWST_NIRCam_F090W,JWST_NIRCam_F115W,JWST_NIRCam_F150W,JWST_NIRCam_F200W,JWST_NIRCam_F277W,JWST_NIRCam_F335M,JWST_NIRCam_F356W,JWST_NIRCam_F410M,JWST_NIRCam_F444W,JWST_NIRCam_F090W-JWST_NIRCam_F115W,JWST_NIRCam_F115W-JWST_NIRCam_F150W,JWST_NIRCam_F150W-JWST_NIRCam_F200W,JWST_NIRCam_F200W-JWST_NIRCam_F277W,JWST_NIRCam_F277W-JWST_NIRCam_F335M,JWST_NIRCam_F335M-JWST_NIRCam_F356W,JWST_NIRCam_F356W-JWST_NIRCam_F410M,JWST_NIRCam_F410M-JWST_NIRCam_F444W
0,1.847682,1.349508,1.096546,0.895745,1.211952,1.688168,1.723691,1.79653,1.889233,0.498174,0.252962,0.200801,-0.316207,-0.476216,-0.035524,-0.072839,-0.092703
1,1.98243,1.482879,1.186536,0.990724,1.220856,1.730843,1.765809,1.859914,1.931054,0.499551,0.296343,0.195812,-0.230132,-0.509986,-0.034967,-0.094105,-0.07114
2,-99.0,8.012038,7.819609,8.125659,6.327655,6.932856,6.318841,6.967535,7.151089,-107.012039,0.192429,-0.30605,1.798004,-0.6052,0.614015,-0.648694,-0.183555
3,9.731976,7.557979,7.029085,7.052557,7.029789,6.964879,6.807697,7.241346,7.397506,2.173996,0.528894,-0.023472,0.022768,0.06491,0.157181,-0.433649,-0.156159
4,7.766983,7.029789,6.836186,6.441885,6.082728,5.745864,6.794011,7.25339,8.588335,0.737194,0.193603,0.394301,0.359157,0.336864,-1.048147,-0.459379,-1.334945


In [4]:
# define learning algorithm
clf = CatBoostRegressor(logging_level='Silent',
                        thread_count=3,
                        max_depth=10,
                        n_estimators=2000)

# try with or without colours
case_descriptions = ['mags only','mags and adjacent colours','mags and colours']

for k,cols_use in enumerate([cols_ml,cols_ml_some_colours,cols_ml_with_colours]):
    
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(df[cols_use], 
                                                        df[targets],
                                                        test_size=0.33)
    
    print('Training the models... ('+case_descriptions[k]+')\n')
    
    for target in targets:

        print('Target label: '+target)

        clf.fit(X_train,y_train[target])
        preds = clf.predict(X_test)

        if target == 'z':
            nmad = 1.48 * np.median(np.abs(y_test[target] - preds) / y_test[target])
            bias = np.median((preds-y_test[target])/(y_test[target]))
            f_outl = len(y_test[np.abs(y_test[target]-preds)/y_test[target] > 0.15])/len(y_test[target])
            print("Photo-z NMAD:",np.round(nmad,6))
            print("Photo-z bias:",np.round(bias,6))
            print("Photo-z outlier fraction:",np.round(f_outl,6),'\n')

        else:
            nmad = 1.48 * np.median(np.abs(y_test[target] - preds))
            bias = np.median(preds-y_test[target])
            f_outl = len(y_test[np.abs(y_test[target]-preds) > 0.3])/len(y_test[target])
            print(target+" NMAD:",np.round(nmad,6))
            print(target+" bias:",np.round(bias,6))
            print(target+" outlier fraction:",np.round(f_outl,6),'\n')




Training the models... (mags only)

Target label: z
Photo-z NMAD: 0.147004
Photo-z bias: 0.003847
Photo-z outlier fraction: 0.35726 

Target label: M
M NMAD: 0.371285
M bias: -0.002638
M outlier fraction: 0.420289 

Target label: SFR_IR
SFR_IR NMAD: 0.368127
SFR_IR bias: 0.009943
SFR_IR outlier fraction: 0.418293 

Target label: SFR_UV
SFR_UV NMAD: 0.143645
SFR_UV bias: -0.016572
SFR_UV outlier fraction: 0.125434 

Training the models... (mags and adjacent colours)

Target label: z
Photo-z NMAD: 0.134835
Photo-z bias: 0.003094
Photo-z outlier fraction: 0.34117 

Target label: M
M NMAD: 0.365589
M bias: -0.001539
M outlier fraction: 0.41399 

Target label: SFR_IR
SFR_IR NMAD: 0.364053
SFR_IR bias: 0.005148
SFR_IR outlier fraction: 0.411704 

Target label: SFR_UV
SFR_UV NMAD: 0.130548
SFR_UV bias: -0.006391
SFR_UV outlier fraction: 0.122212 

Training the models... (mags and colours)

Target label: z
Photo-z NMAD: 0.121559
Photo-z bias: 0.004352
Photo-z outlier fraction: 0.315955 

Targe