In [1]:
from astropy.table import Table, join
import numpy as np

import time
from numpy.random import default_rng

import healpy as hp
import pandas as pd

import astropy.cosmology
from astropy.coordinates import SkyCoord
from astropy import units as u
from astropy.table import Table

import sys
sys.path.insert(0,'../code')
%load_ext autoreload
%autoreload 2
import utils
import generate_random
import correlations
import masks
import maps

# Parameter setting

Saving

In [89]:
# Accumulator for the named quantities computed below; its items are printed
# (and optionally written to fn_quants) by the last cell of the notebook.
quants = {}

In [41]:
# Output text file that receives one "key = value" line per entry of `quants`.
fn_quants = '../data/quantities.txt'
# Gate for the f.write calls in the final cell.
save = True

Names

In [3]:
# LaTeX-formatted display names.  Raw strings are used so that the backslash in
# "\emph" is never parsed as a (invalid) Python escape sequence; the resulting
# string values are unchanged.
# NOTE(review): the doubled braces are kept literally in these plain strings; they
# would only collapse to single braces if the text is later passed through
# str.format -- confirm that is what the consumer expects.
name_catalog = r'\emph{{Gaia}} Cosmological Quasar Catalog'
abbrv_catalog = 'GaiaQ'

name_gpurer = r'\emph{{Gaia}} Purer Sample'
abbrv_gpurer = r'\emph{{Gaia}} Purer'

Functions

In [4]:
def get_ndens_map(ras, decs, NSIDE=64):
    """Build a masked HEALPix map of object number density per square degree.

    Parameters
    ----------
    ras, decs : array-like
        Sky positions, passed straight through to ``maps.get_map``
        (presumably in degrees -- confirm against ``maps.get_map``).
    NSIDE : int, optional
        HEALPix resolution of the output map.

    Returns
    -------
    healpy masked array
        Counts per pixel divided by the pixel area in square degrees, with
        every zero-density pixel masked.
    """
    map_nqso, _ = maps.get_map(NSIDE, ras, decs, null_val=0)
    # Use the pixel area that belongs to *this* NSIDE; relying on the notebook-level
    # `area_per_pixel` silently gave wrong densities for any NSIDE other than the
    # global one.
    pixel_area_deg2 = hp.nside2pixarea(NSIDE, degrees=True)
    map_ndens = map_nqso/pixel_area_deg2
    map_ndens_masked = hp.ma(map_ndens)
    # Empty pixels are masked rather than reported as zero density.
    map_ndens_masked.mask = map_ndens==0
    return map_ndens_masked

Map settings

In [5]:
# Notebook-wide HEALPix resolution and the corresponding pixel area.
NSIDE = 64
# degrees=True asks healpy for the area in square degrees.
area_per_pixel = hp.nside2pixarea(NSIDE, degrees=True)
# The value is an area, so report it in deg^2 (the previous label said "deg").
print(f"Area per pixel: {area_per_pixel:.3f} deg^2")

Area per pixel: 0.839 deg


Other prelims

In [6]:
# G values that are interpolated into the catalog file names loaded below
# (gaiaQ_G{G_lo}.fits and gaiaQ_G{G_hi}.fits).
G_hi = 20.5
G_lo = 20.0

In [7]:
# Seeded random generator so that any random draws are reproducible.
rng = np.random.default_rng(seed=42)

# Load datasets

### Gaia

In [8]:
# Load the Gaia candidates table and report its number of rows.
fn_gall = '../data/gaia_candidates.fits.gz'
tab_gall = utils.load_table(fn_gall)
print(len(tab_gall))

6649162


In [9]:
# Build the "purer" sample: keep only the rows of tab_gall whose source_id also
# appears in the purer-sample id file (inner join on source_id).
fn_gpurer_ids = '../data/gaia_purer_sourceids.fits'
tab_gpurer_ids = utils.load_table(fn_gpurer_ids)
tab_gpurer = join(tab_gall, tab_gpurer_ids, keys='source_id', join_type='inner', metadata_conflicts='silent')
print(len(tab_gpurer))

1942825


In [10]:
# Load the candidate superset table and report its number of rows.
fn_gsup = '../data/gaia_candidates_superset.fits'
tab_gsup = utils.load_table(fn_gsup)
print(len(tab_gsup))

1518782


In [11]:
# Load the cleaned candidate table and report its number of rows.
fn_gclean = '../data/gaia_candidates_clean.fits'
tab_gclean = utils.load_table(fn_gclean)
print(len(tab_gclean))

1442077


In [101]:
# Load the catalog for the G_lo magnitude setting.
# The load/print lines previously referenced the undefined names fn_gqlo / tab_gqlo
# (left over from a rename), so this cell failed on a fresh kernel and tab_gcatlo,
# which later cells rely on, was never created.
fn_gcatlo = f'../data/gaiaQ_G{G_lo}.fits'
tab_gcatlo = utils.load_table(fn_gcatlo)
print(len(tab_gcatlo))

767866


In [102]:
# Load the catalog for the G_hi magnitude setting.
# The load/print lines previously referenced the undefined names fn_gqhi / tab_gqhi
# (left over from a rename), so this cell failed on a fresh kernel and tab_gcathi,
# which later cells rely on, was never created.
fn_gcathi = f'../data/gaiaQ_G{G_hi}.fits'
tab_gcathi = utils.load_table(fn_gcathi)
print(len(tab_gcathi))

1318566


### SDSS

These are SDSS objects that have (any) Gaia source match. unWISE data is included where available, but the catalogs below are not limited to objects with unWISE matches.

In [14]:
# SDSS quasars cross-matched to Gaia (and unWISE where available).
tab_squasars = utils.load_table('../data/quasars_sdss_xgaia_xunwise_good_nodup.fits')
# A row counts as having unWISE info when its W1 magnitude is finite and not masked.
i_squasars_w1 = np.isfinite(tab_squasars['mag_w1_vg']) & ~tab_squasars.mask['mag_w1_vg']
print(f"Number of SDSS quasars: {len(tab_squasars)}")
print("Number with unWISE info:", np.sum(i_squasars_w1))

Number of SDSS quasars: 379698
Number with unWISE info: 350070


In [15]:
# SDSS stars cross-matched to Gaia (and unWISE where available).
tab_sstars = utils.load_table('../data/stars_sdss_xgaia_xunwise_good_nodup.fits')
# A row counts as having unWISE info when its W1 magnitude is finite and not masked.
i_sstars_w1 = np.isfinite(tab_sstars['mag_w1_vg']) & ~tab_sstars.mask['mag_w1_vg']
print(f"Number of SDSS stars: {len(tab_sstars)}")
print("Number with unWISE info:", np.sum(i_sstars_w1))

Number of SDSS stars: 683221
Number with unWISE info: 482080


In [16]:
# SDSS galaxies cross-matched to Gaia (and unWISE where available).
tab_sgals = utils.load_table('../data/galaxies_sdss_xgaia_xunwise_good_nodup.fits')
# A row counts as having unWISE info when its W1 magnitude is finite and not masked.
i_sgals_w1 = np.isfinite(tab_sgals['mag_w1_vg']) & ~tab_sgals.mask['mag_w1_vg']
print(f"Number of SDSS galaxies: {len(tab_sgals)}")
print("Number with unWISE info:", np.sum(i_sgals_w1))

Number of SDSS galaxies: 717059
Number with unWISE info: 600897


Only objects in the superset (Gaia quasar candidates with unWISE data and QSOC redshifts, within the G magnitude limit)

In [17]:
# Labeled objects within the superset; the 'class' column of this table is used
# further down to pick out quasars ('q') and contaminants ('s', 'g').
fn_labeled_sup = '../data/labeled_superset.fits'
tab_labeled_sup = utils.load_table(fn_labeled_sup)
print(f"Number of labeled Gaia quasar candidates for decontamination: {len(tab_labeled_sup)}")

Number of labeled Gaia quasar candidates for training/validation: 249714


In [162]:
# Labeled objects within the clean sample, reported as the set used for
# redshift estimation.
fn_labeled_clean = '../data/labeled_clean.fits'
tab_labeled_clean = utils.load_table(fn_labeled_clean)
print(f"Number of labeled Gaia quasar candidates for redshift estimation: {len(tab_labeled_clean)}")

Number of labeled Gaia quasar candidates for redshift estimation: 246588


Redshifts

In [53]:
# Redshift estimates for the labeled sample; the file name encodes the estimator
# name and the model tag.
redshift_estimator_name = 'kNN'
save_tag_model = '_K27_std'
fn_spz_labeled = (
    '../data/redshift_estimates/'
    f'redshifts_spz_labeled_{redshift_estimator_name}{save_tag_model}.fits'
)
tab_spz_labeled = utils.load_table(fn_spz_labeled)
print(f"N = {len(tab_spz_labeled)}")

N = 245308


# Quantities

## Numbers of objects

In [90]:
# Number of rows in tab_gall, stored with thousands separators.
n_gall = len(tab_gall)
quants['N_gall'] = format(n_gall, ',')
print(quants['N_gall'])

6,649,162


In [91]:
# Number of tab_gall rows with a finite QSOC redshift.
i_gall_wqsoc = np.isfinite(tab_gall['redshift_qsoc'])
n_gall_wqsoc = np.sum(i_gall_wqsoc)
quants['N_gall_wqsoc'] = format(n_gall_wqsoc, ',')
print(quants['N_gall_wqsoc'])

6,375,063


In [92]:
# Number of rows in the purer sample, stored with thousands separators.
n_gpurer = len(tab_gpurer)
quants['N_gpurer'] = format(n_gpurer, ',')
print(quants['N_gpurer'])

1,942,825


In [93]:
# Number of purer-sample rows with a finite QSOC redshift.
i_gpurer_wqsoc = np.isfinite(tab_gpurer['redshift_qsoc'])
n_gpurer_wqsoc = np.sum(i_gpurer_wqsoc)
quants['N_gpurer_wqsoc'] = format(n_gpurer_wqsoc, ',')
print(quants['N_gpurer_wqsoc'])

1,729,625


In [169]:
# Number of rows in the superset, stored with thousands separators.
n_gsup = len(tab_gsup)
quants['N_gsup'] = format(n_gsup, ',')
print(quants['N_gsup'])

1,518,782


In [168]:
# Number of rows in the clean sample, stored with thousands separators.
n_gclean = len(tab_gclean)
quants['N_gclean'] = format(n_gclean, ',')
print(quants['N_gclean'])

1,442,077


In [172]:
# Percentage of superset rows that are absent from the clean sample,
# rounded to a whole percent.
n_rows_gsup, n_rows_gclean = len(tab_gsup), len(tab_gclean)
p_cut = 100*(n_rows_gsup - n_rows_gclean)/n_rows_gsup
quants['p_cut_gsup_gclean'] = format(p_cut, '.0f')
print(quants['p_cut_gsup_gclean'])

5


In [137]:
# Number of rows in the G_lo catalog, stored with thousands separators.
n_gcatlo = len(tab_gcatlo)
quants['N_gcatlo'] = format(n_gcatlo, ',')
print(quants['N_gcatlo'])

767,866


In [138]:
# Number of rows in the G_hi catalog, stored with thousands separators.
n_gcathi = len(tab_gcathi)
quants['N_gcathi'] = format(n_gcathi, ',')
print(quants['N_gcathi'])

1,318,566


In [161]:
# SDSS quasars with usable unWISE photometry: both the W1 and the W2 magnitude
# must be finite and unmasked (stricter than the W1-only count printed at load time).
i_squasars_unwise = np.isfinite(tab_squasars['mag_w1_vg']) & ~tab_squasars.mask['mag_w1_vg'] & \
                    np.isfinite(tab_squasars['mag_w2_vg']) & ~tab_squasars.mask['mag_w2_vg']
quants['N_squasars_unwise'] = f"{np.sum(i_squasars_unwise):,}"
print(quants['N_squasars_unwise'])

343,074


In [167]:
# Number of labeled superset objects whose class is 'q'.
i_sq_sup = tab_labeled_sup['class']=='q'
n_sq_sup = np.sum(i_sq_sup)
quants['N_squasars_sup'] = format(n_sq_sup, ',')
print(quants['N_squasars_sup'])

246,122


## Redshift info

### general

In [94]:
# Median QSOC redshift over the tab_gall rows that have a finite value.
z_qsoc_gall = tab_gall['redshift_qsoc']
i_zfinite = np.isfinite(z_qsoc_gall)
z_med_gall = np.median(z_qsoc_gall[i_zfinite])
quants['z_med_gall'] = format(z_med_gall, '.2f')
print(quants['z_med_gall'])

1.67


In [177]:
# Median SPZ redshift of the G_lo catalog, to two decimals.
z_med_gcatlo = np.median(tab_gcatlo['redshift_spz'])
quants['z_med_gcatlo'] = format(z_med_gcatlo, '.2f')
print(quants['z_med_gcatlo'])

1.45


In [181]:
# Count and fraction of G_lo catalog objects above an intermediate redshift.
# NOTE(review): unlike most other entries, these values are stored as raw numbers
# (not formatted strings), and this 'p_' entry is a fraction whereas the other
# 'p_' entries in this notebook are percentages -- confirm this is intended.
zintermediate = 2.5
quants['zintermediate'] = zintermediate
i_above_zintermediate_gcatlo = tab_gcatlo['redshift_spz'] > zintermediate
n_above_zintermediate = np.sum(i_above_zintermediate_gcatlo)
n_total_gcatlo = len(i_above_zintermediate_gcatlo)
quants['N_above_zintermediate_gcatlo'] = n_above_zintermediate
quants['p_above_zintermediate_gcatlo'] = n_above_zintermediate/n_total_gcatlo
print(quants['N_above_zintermediate_gcatlo'], quants['p_above_zintermediate_gcatlo'])

79441 0.1034568531488567


### dz checks vs sdss

In [95]:
# Split the labeled sample into train / validation / test selections (70/15/15)
# from the per-object 'rand_ints' column stored in the table.
# NOTE(review): the fractions presumably need to match those used when the
# redshift estimates were produced -- confirm.
rand_ints_labeled = tab_spz_labeled['rand_ints']
i_train, i_valid, i_test = utils.split_train_val_test(rand_ints_labeled,
                                 frac_train=0.7, frac_val=0.15, frac_test=0.15)

In [96]:
# Test-set redshifts: SDSS values plus the raw SPZ, final SPZ and Gaia QSOC
# estimates for the same objects.
z_sdss_test = tab_spz_labeled['z_sdss'][i_test]
z_spzraw_test = tab_spz_labeled['redshift_spz_raw'][i_test]
z_spz_test = tab_spz_labeled['redshift_spz'][i_test]
z_gaia_test = tab_spz_labeled['redshift_qsoc'][i_test]

In [97]:
# Redshift error of each estimate relative to SDSS, normalised by (1 + z_sdss).
one_plus_z_sdss_test = 1+z_sdss_test
dz_spzraw_test = (z_spzraw_test - z_sdss_test)/one_plus_z_sdss_test
dz_spz_test = (z_spz_test - z_sdss_test)/one_plus_z_sdss_test
dz_gaia_test = (z_gaia_test - z_sdss_test)/one_plus_z_sdss_test

In [183]:
# Grids over which outlier/accuracy percentages are tabulated; the dict keys
# become part of the quantity names written to the output file.
dz_threshs = {'dzlo': 0.01, 'dzmid': 0.1, 'dzhi': 0.2}  # |dz| thresholds
G_maxs = {'Gbright': 19.0, 'Glo': 20.0, 'Ghi': 20.5}  # upper G magnitude cuts
dz_arrs = {'zspz': dz_spz_test, 'zgaia': dz_gaia_test}  # test-set dz arrays per estimator

In [184]:
# For every (G cut, dz threshold, redshift estimate) combination on the test set,
# record the percentage of objects with |dz| above the threshold ("outliers") and
# at or below it ("acc"), each rounded to a whole percent.
# NOTE(review): entries with NaN dz fall in neither group, so the two percentages
# need not sum to 100 if any dz is NaN.
G_mag_test = tab_spz_labeled['phot_g_mean_mag'][i_test]
for G_name, G_max in G_maxs.items():
    quants[G_name] = G_max
    # The magnitude selection depends only on the G cut, so compute it once per cut
    # rather than once per (threshold, estimator) pair.
    i_G = G_mag_test < G_max
    for dz_thresh_name, dz_thresh in dz_threshs.items():
        quants[dz_thresh_name] = dz_thresh
        for z_name, dz_arr in dz_arrs.items():
            abs_dz = np.abs(dz_arr[i_G])
            i_outliers = abs_dz > dz_thresh
            i_acc = abs_dz <= dz_thresh
            # Same key/value formatting for both statistics; outliers are reported first.
            for stat_name, i_sel in (('outliers', i_outliers), ('acc', i_acc)):
                key = f'p_{stat_name}_{z_name}_{dz_thresh_name}_{G_name}'
                val = f"{100*np.sum(i_sel)/len(i_sel):.0f}"
                print(key, val)
                quants[key] = val

p_outliers_zspz_dzlo_Gbright 14
p_acc_zspz_dzlo_Gbright 86
p_outliers_zgaia_dzlo_Gbright 9
p_acc_zgaia_dzlo_Gbright 91
p_outliers_zspz_dzmid_Gbright 5
p_acc_zspz_dzmid_Gbright 95
p_outliers_zgaia_dzmid_Gbright 7
p_acc_zgaia_dzmid_Gbright 93
p_outliers_zspz_dzhi_Gbright 3
p_acc_zspz_dzhi_Gbright 97
p_outliers_zgaia_dzhi_Gbright 6
p_acc_zgaia_dzhi_Gbright 94
p_outliers_zspz_dzlo_Glo 26
p_acc_zspz_dzlo_Glo 74
p_outliers_zgaia_dzlo_Glo 25
p_acc_zgaia_dzlo_Glo 75
p_outliers_zspz_dzmid_Glo 10
p_acc_zspz_dzmid_Glo 90
p_outliers_zgaia_dzmid_Glo 19
p_acc_zgaia_dzmid_Glo 81
p_outliers_zspz_dzhi_Glo 5
p_acc_zspz_dzhi_Glo 95
p_outliers_zgaia_dzhi_Glo 18
p_acc_zgaia_dzhi_Glo 82
p_outliers_zspz_dzlo_Ghi 39
p_acc_zspz_dzlo_Ghi 61
p_outliers_zgaia_dzlo_Ghi 38
p_acc_zgaia_dzlo_Ghi 62
p_outliers_zspz_dzmid_Ghi 17
p_acc_zspz_dzmid_Ghi 83
p_outliers_zgaia_dzmid_Ghi 30
p_acc_zgaia_dzmid_Ghi 70
p_outliers_zspz_dzhi_Ghi 9
p_acc_zspz_dzhi_Ghi 91
p_outliers_zgaia_dzhi_Ghi 28
p_acc_zgaia_dzhi_Ghi 72


Get numbers for all Gaia candidates with SDSS redshifts:

In [107]:
# Inspect the available columns before choosing which to keep for the cross-match below.
tab_squasars.columns

<TableColumns names=('ra','dec','source_id','ra_sdss','dec_sdss','objid','z_sdss','phot_g_mean_mag','phot_bp_mean_mag','phot_rp_mean_mag','phot_bp_n_obs','phot_rp_n_obs','dec_unwise','mag_w1_vg','mag_w2_vg','ra_unwise','unwise_objid','dist_arcsec')>

In [115]:
# Work on trimmed copies so the original tables stay untouched: keep only the join
# key plus the columns needed for the redshift comparison.
tab_squasars_lite = tab_squasars.copy()
tab_squasars_lite.keep_columns(['source_id', 'z_sdss'])
tab_gall_lite = tab_gall.copy()
tab_gall_lite.keep_columns(['source_id', 'redshift_qsoc', 'phot_g_mean_mag'])

In [116]:
# Gaia candidates that are also SDSS quasars (inner join on source_id).
tab_gall_xsquasars = join(tab_gall_lite, tab_squasars_lite, keys='source_id', join_type='inner')
print(len(tab_gall_xsquasars))

326067


In [118]:
# Drop rows without a finite QSOC redshift.  The table is overwritten in place;
# re-running this cell is harmless because the filter is idempotent.
tab_gall_xsquasars = tab_gall_xsquasars[np.isfinite(tab_gall_xsquasars['redshift_qsoc'])]
print(len(tab_gall_xsquasars))

320149


In [119]:
# Redshift error of the Gaia QSOC estimate relative to SDSS for the cross-matched
# sample, normalised by (1 + z_sdss).
z_sdss_gall = tab_gall_xsquasars['z_sdss']
z_gaia_gall = tab_gall_xsquasars['redshift_qsoc']

dz_gaia_gall = (z_gaia_gall - z_sdss_gall)/(1+z_sdss_gall)

In [127]:
# Outlier percentages of the Gaia QSOC redshifts for all Gaia candidates with SDSS
# redshifts, for every (G cut, dz threshold) combination, rounded to a whole percent.
z_name = 'zgaia'
G_mag_gall_xsquasars = tab_gall_xsquasars['phot_g_mean_mag']
for G_name, G_max in G_maxs.items():
    # The magnitude selection and |dz| depend only on the G cut, so compute them once
    # per cut instead of once per threshold.
    i_G = G_mag_gall_xsquasars < G_max
    abs_dz_gaia_gall = np.abs(dz_gaia_gall[i_G])
    for dz_thresh_name, dz_thresh in dz_threshs.items():
        i_outliers = abs_dz_gaia_gall > dz_thresh
        key = f'p_outliers_gall_{z_name}_{dz_thresh_name}_{G_name}'
        val = f"{100*np.sum(i_outliers)/len(i_outliers):.0f}"
        print(key, val)
        quants[key] = val

p_outliers_gall_zgaia_dzlo_Glo 25
p_outliers_gall_zgaia_dzmid_Glo 19
p_outliers_gall_zgaia_dzhi_Glo 18
p_outliers_gall_zgaia_dzlo_Ghi 38
p_outliers_gall_zgaia_dzmid_Ghi 30
p_outliers_gall_zgaia_dzhi_Ghi 28


These end up being the same as for our cleaned sample (when rounded to the nearest percent).

In [142]:
# Factor by which the SPZ redshifts reduce the outlier rate relative to the Gaia QSOC
# redshifts, at the Glo cut, rounded to the nearest 0.5 and wrapped as LaTeX.
# NOTE(review): the ratio is formed from the percentages already rounded to integers
# and stored as strings in `quants`, so it carries that rounding error -- confirm
# this precision is acceptable.
G_name = 'Glo'
for dz_thresh_name, dz_thresh in dz_threshs.items():
    p_outliers_gaia = float(quants[f'p_outliers_gall_zgaia_{dz_thresh_name}_{G_name}'])
    p_outliers_spz = float(quants[f'p_outliers_zspz_{dz_thresh_name}_{G_name}'])
    fac = p_outliers_gaia/p_outliers_spz
    # Doubling, rounding and halving snaps the factor to the nearest half-integer.
    fac_rounded = round(fac*2)/2
    key = f'factor_reduction_outliers_{dz_thresh_name}_{G_name}'
    val = rf'${{\sim}}{fac_rounded:g}\times$'
    print(fac)
    print(key,':', val)
    quants[key] = val

0.9615384615384616
factor_reduction_outliers_dzlo_Glo : ${\sim}1\times$
1.9
factor_reduction_outliers_dzmid_Glo : ${\sim}2\times$
3.6
factor_reduction_outliers_dzhi_Glo : ${\sim}3.5\times$


### Decontamination

In [154]:
# Labeled superset objects of class 's' or 'g' are treated as contaminants.
labeled_class = tab_labeled_sup['class']
i_contam_labeled = (labeled_class=='s') | (labeled_class=='g')
print(np.sum(i_contam_labeled))

# Of those contaminants, flag the ones whose source_id is still in the clean sample.
contam_source_ids = tab_labeled_sup['source_id'][i_contam_labeled]
i_contam_gclean = np.isin(contam_source_ids, tab_gclean['source_id'])
print(np.sum(i_contam_gclean))

3592
1138


In [155]:
# Ratio of labeled contaminants in the superset to those remaining in the clean
# sample, rounded to the nearest 0.5 and wrapped as LaTeX.
n_contam_labeled = np.sum(i_contam_labeled)
n_contam_gclean = np.sum(i_contam_gclean)
fac = n_contam_labeled/n_contam_gclean
# Doubling, rounding and halving snaps the factor to the nearest half-integer.
fac_rounded = round(fac*2)/2
key = 'factor_reduction_contaminants'
val = rf'${{\sim}}{fac_rounded:g}\times$'
print(fac)
print(key, val)
quants[key] = val

3.156414762741652
factor_reduction_contaminants ${\sim}3\times$


# Write dict to file

In [182]:
# Print every collected quantity, and write them to fn_quants only when `save` is set.
for key, val in quants.items():
    print(key, val)
if save:
    # Open the file only when actually saving: opening in 'w' mode unconditionally
    # truncated an existing quantities file even with save=False.
    with open(fn_quants, 'w') as f:
        for key, val in quants.items():
            f.write(f'{key} = {val}\n')
    print()
    print(f"Saved to {fn_quants}!")

N_gall 6,649,162
N_gall_wqsoc 6,375,063
N_gpurer 1,942,825
N_gpurer_wqsoc 1,729,625
z_med_gall 1.67
Glo 20.0
dzlo 0.01
p_outliers_dzlo_gall_Glo 74
dzmid 0.1
p_outliers_dzmid_gall_Glo 90
dzhi 0.2
p_outliers_dzhi_gall_Glo 95
Ghi 20.5
p_outliers_dzlo_gall_Ghi 61
p_outliers_dzmid_gall_Ghi 83
p_outliers_dzhi_gall_Ghi 91
p_outliers_zspz_dzhi_Glo 5
p_outliers_zgaia_dzhi_Glo 18
p_outliers_zspz_dzhi_Ghi 9
p_outliers_zgaia_dzhi_Ghi 28
p_outliers_zspz_dzlo_Glo 26
p_outliers_zgaia_dzlo_Glo 25
p_outliers_zspz_dzmid_Glo 10
p_outliers_zgaia_dzmid_Glo 19
p_outliers_zspz_dzlo_Ghi 39
p_outliers_zgaia_dzlo_Ghi 38
p_outliers_zspz_dzmid_Ghi 17
p_outliers_zgaia_dzmid_Ghi 30
p_outliers_gall_zgaia_dzlo_Glo 25
p_outliers_gall_zgaia_dzmid_Glo 19
p_outliers_gall_zgaia_dzhi_Glo 18
p_outliers_gall_zgaia_dzlo_Ghi 38
p_outliers_gall_zgaia_dzmid_Ghi 30
p_outliers_gall_zgaia_dzhi_Ghi 28
factor_reduction_outliers_dzlo_Glo ${\sim}1\times$
factor_reduction_outliers_dzmid_Glo ${\sim}2\times$
factor_reduction_outliers_dzhi