In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import astropy.coordinates as coord
import astropy.units as u
import emcee
import sys
from astropy.table import Table

if './SelfCalGroupFinder/py/' not in sys.path:
    sys.path.append('./SelfCalGroupFinder/py/')
from pyutils import *
import plotting as pp
from dataloc import *
from bgs_helpers import *
import catalog_definitions as cat
from groupcatalog import *

%load_ext autoreload
%autoreload 2
DPI = 100
pp.DPI = DPI

After the group finder is run, this notebook is used to postprocess the results, generating plots and such for analysis.


## Loading existing datasets 

In [48]:
# MXXL catalogs. Only the "simple 4" variant is loaded by default.
# NOTE(review): mxxl_all is referenced by later cells (the Lgal_bin counts and the
# pp.L_func_plot call); those cells raise NameError unless the next line is uncommented.
#mxxl_all=deserialize(cat.mxxl_all)
#mxxl_fiberonly=deserialize(cat.mxxl_fiberonly)
#mxxl_nn=deserialize(cat.mxxl_nn)
mxxl_simple_4=deserialize(cat.mxxl_simple_4)

In [2]:
sdss_vanilla = deserialize(cat.sdss_vanilla_v2)
sdss_colors = deserialize(cat.sdss_colors_v2)
sdss_colors_chi = deserialize(cat.sdss_colors_chi_v2)
cat.sdss_published.postprocess()
sdss_published = cat.sdss_published # It really is ~exactly sdss_colors_chi, which is great news for reproducibility
sdss_vanilla_old = deserialize(cat.sdss_vanilla)
sdss_bgscut = deserialize(cat.sdss_bgscut)

In [4]:
fiberonly_BGS=deserialize(cat.bgs_fiberonly)
fiberonly_1pass_BGS=deserialize(cat.bgs_fiberonly_1pass)
nn_BGS=deserialize(cat.bgs_nn)
nn_BGS_sdsslike=deserialize(cat.bgs_nn_sdsslike)
simple4_BGS=deserialize(cat.bgs_simple_4)
#simple4_BGS_old=deserialize(cat.bgs_simple_4_old)
#simple4_BGS_c=deserialize(cat.bgs_simple_4_c)
#bgs_simple_4_no_sdss=deserialize(cat.bgs_simple_4_no_sdss)
bgs_simple_4_4p = deserialize(cat.bgs_simple_4_4p)
bgs_simple_4_1pass = deserialize(cat.bgs_simple_4_1pass)
bgs_simple_5 = deserialize(cat.bgs_simple_5)


In [5]:
bgs_y1_pz_2_4 = deserialize(cat.bgs_y1_pz_2_4)

In [51]:
bgs_y3_simple_4 = deserialize(cat.bgs_y3_simple_4)
bgs_y3_simple_5 = deserialize(cat.bgs_y3_simple_5)
bgs_y3_simple_4_4p = deserialize(cat.bgs_y3_simple_4_4p)

In [2]:
bgs_sv3_simple_4_10p = deserialize(cat.bgs_sv3_simple_4_10p)
bgs_sv3_simple_4_9p = deserialize(cat.bgs_sv3_simple_4_9p)
bgs_sv3_simple_4_8p = deserialize(cat.bgs_sv3_simple_4_8p)
bgs_sv3_simple_4_7p = deserialize(cat.bgs_sv3_simple_4_7p)
bgs_sv3_simple_4_6p = deserialize(cat.bgs_sv3_simple_4_6p)
bgs_sv3_simple_4_5p = deserialize(cat.bgs_sv3_simple_4_5p)
bgs_sv3_simple_4_4p = deserialize(cat.bgs_sv3_simple_4_4p)
bgs_sv3_simple_4_3p = deserialize(cat.bgs_sv3_simple_4_3p)
bgs_sv3_simple_4_2p = deserialize(cat.bgs_sv3_simple_4_2p)
bgs_sv3_simple_4_1p = deserialize(cat.bgs_sv3_simple_4_1p)

bgs_sv3_fiberonly_10p = deserialize(cat.bgs_sv3_fiberonly_10p)

bgs_sv3_nn_10p = deserialize(cat.bgs_sv3_nn_10p)
bgs_sv3_nn_7p = deserialize(cat.bgs_sv3_nn_7p)
bgs_sv3_nn_6p = deserialize(cat.bgs_sv3_nn_6p)

bgs_sv3_simple_5_7p = deserialize(cat.bgs_sv3_simple_5_7p)

bgs_sv3_pz_1_10p = deserialize(cat.bgs_sv3_pz_1_10p)
bgs_sv3_pz_2_4_10p = deserialize(cat.bgs_sv3_pz_2_4_10p) # Our best source of BGS Truth
bgs_sv3_pz_1_0_7p = deserialize(cat.bgs_sv3_pz_1_0_7p)
bgs_sv3_pz_2_0_7p = deserialize(cat.bgs_sv3_pz_2_0_7p)
bgs_sv3_pz_2_4_7p = deserialize(cat.bgs_sv3_pz_2_4_7p)
bgs_sv3_pz_2_4_6p = deserialize(cat.bgs_sv3_pz_2_4_6p)
bgs_sv3_pz_2_5_7p = deserialize(cat.bgs_sv3_pz_2_5_7p)
bgs_sv3_pz_3_1_7p = deserialize(cat.bgs_sv3_pz_3_1_7p)

bgs_y3_like_sv3_fiberonly = deserialize(cat.bgs_y3_like_sv3_fiberonly)
bgs_y3_like_sv3_pz_2_4 = deserialize(cat.bgs_y3_like_sv3_pz_2_4)

In [None]:
all_u = deserialize(cat.uchuu_all)

### Get catalogs from MCMC chains

In [None]:
# Get GF MCMC results for SDSS
# Opens the emcee HDF backend for the chosen run (read-only), builds an SDSS catalog
# from it via from_MCMC, then runs the group finder, corrfunc and postprocess steps
# in that order before serializing the result.
# NOTE(review): the chain path is an absolute, machine-specific location.
run = 1
path = f'/mount/sirocco1/imw2293/GROUP_CAT/MCMC/mcmc_{run}/mcmc_{run}.dat'
reader = emcee.backends.HDFBackend(path, read_only=True)
sdss_colors_mine = SDSSGroupCatalog.from_MCMC(reader, "SDSS Colors Mine", SDSS_v2_DAT_FILE, SDSS_v2_GALPROPS_FILE)
sdss_colors_mine.run_group_finder(popmock=True)
sdss_colors_mine.run_corrfunc()
sdss_colors_mine.postprocess()
serialize(sdss_colors_mine)

In [None]:
# Get best preprocess photo-z-plus v2 result for SV3
path = f'/mount/sirocco1/imw2293/GROUP_CAT/mcmc13_m4_2_4.h5'
reader = emcee.backends.HDFBackend(path, read_only=True)
bgs_sv3_pz2_mcmcbest = BGSGroupCatalog.from_MCMC(reader, Mode.PHOTOZ_PLUS_v2)
bgs_sv3_pz2_mcmcbest.run_group_finder(popmock=False)
bgs_sv3_pz2_mcmcbest.postprocess()
serialize(bgs_sv3_pz2_mcmcbest)

# 64.94% neighbor
#[6.32102907 1.59813654 1.49851461 3.18966963 0.83098626 2.83411711 3.26649451 1.75386219 1.95862571 2.56928697 0.91387857 1.70360255 3.6181996 ]

In [None]:
# Get best preprocess photo-z-plus v3 result for SV3
path = f'/mount/sirocco1/imw2293/GROUP_CAT/mcmc13_m4_3_1.h5'
reader = emcee.backends.HDFBackend(path, read_only=True)
bgs_sv3_pz3_mcmcbest = BGSGroupCatalog.from_MCMC(reader, Mode.PHOTOZ_PLUS_v3)
bgs_sv3_pz3_mcmcbest.run_group_finder(popmock=False)
bgs_sv3_pz3_mcmcbest.postprocess()
serialize(bgs_sv3_pz3_mcmcbest)

# 37% Neighbor for this, not bad
#[3.68776334 1.03045493 1.00947751 2.79354858 0.88263756 1.15014321 2.71197235 0.63517952 1.44684275 2.63171751 1.16820625 0.96790557 3.02351026]

# This one is great at 63.13% Neighbor.
# [8.26010114 1.29383299 1.54671643 3.01349293 1.2229046  0.86286149 2.58828658 0.87067123 0.61260216 2.44470607 1.11635435 1.29386183 3.16506802]

In [None]:
# In case MCMC has been dumb, check similar parameter values
# Builds four variant catalogs from the best PZP v3 catalog. Each variant copies the
# constructor arguments of bgs_sv3_pz3_mcmcbest and then replaces extra_params so that
# one of the four 3-parameter sets (bb, rb, br, rr) is used in all four slots.
# NOTE(review): `copy` is imported here but not used in this cell.
import copy
# Elements 1..12 of extra_params are viewed as four rows of three parameters each.
bb, rb, br, rr = bgs_sv3_pz3_mcmcbest.extra_params[1:13].reshape(4, 3)

params = [bb, rb, br, rr]
colors = [[0, 0, 1.0], [1.0, 0, 0.4], [0.2, 0.7, 0.2], [1.0, 0.0, 0.0]]
variants = []

for i, (param, color) in enumerate(zip(params, colors), start=1):
    variant = BGSGroupCatalog(
        f"PZP 3 Variant {i}",
        bgs_sv3_pz3_mcmcbest.mode,
        bgs_sv3_pz3_mcmcbest.mag_cut,
        bgs_sv3_pz3_mcmcbest.catalog_mag_cut,
        bgs_sv3_pz3_mcmcbest.sdss_fill,
        bgs_sv3_pz3_mcmcbest.num_passes,
        bgs_sv3_pz3_mcmcbest.drop_passes,
        bgs_sv3_pz3_mcmcbest.data_cut,
        bgs_sv3_pz3_mcmcbest.extra_params
    )
    # NOTE(review): this is a nested list [scalar, array, array, array, array], whereas the
    # reshape above indexes extra_params as a flat array with at least 13 entries --
    # confirm preprocess()/run_group_finder() accept this layout.
    variant.extra_params = [bgs_sv3_pz3_mcmcbest.extra_params[0], param, param, param, param]
    variant.color = color
    variant.preprocess()
    variant.run_group_finder(popmock=False)
    variant.postprocess()
    variants.append(variant)

bgs_sv3_pz3_mcmcbest_var1, bgs_sv3_pz3_mcmcbest_var2, bgs_sv3_pz3_mcmcbest_var3, bgs_sv3_pz3_mcmcbest_var4 = variants


## Publishing / Sharing

In [None]:
# Make a call like this to write a csv for sharing on NERSC
simple4_BGS.write_sharable_output_file()
bgs_simple_4_1pass.write_sharable_output_file()

In [None]:
bgs_sv3_pz_2_4_10p.wp_err

# Special handling for SV3 'Truth' 

In [None]:
#bgs_sv3_pz_2_4_10p.calculate_projected_clustering(with_extra_randoms=True)
#bgs_sv3_pz_2_4_10p.calculate_projected_clustering_in_magbins(with_extra_randoms=True)

# TODO run this again
bgs_sv3_pz_2_4_10p.add_jackknife_err_to_proj_clustering(with_extra_randoms=True, for_mag_bins=False)
#bgs_sv3_pz_2_4_10p.add_jackknife_err_to_proj_clustering(with_extra_randoms=False, for_mag_bins=True) # BUG Broken
serialize(bgs_sv3_pz_2_4_10p)

In [None]:
with np.printoptions(precision=2, suppress=True, linewidth=200):
    print(bgs_sv3_pz_2_4_10p.wp_all_extra[1])
    print(bgs_sv3_pz_2_4_10p.wp_err)

    std_devs = np.sqrt(np.diag(bgs_sv3_pz_2_4_10p.wp_cov))
    print(std_devs)

    print(bgs_sv3_pz_2_4_10p.wp_cov)

In [None]:
# Question - does using the small set of randoms vs the full set of randoms make a difference?
f = bgs_sv3_pz_2_4_10p

# Absolute percent difference between the two w_p estimates, for rows 1, 2 and 3 of the
# wp arrays (labelled All / Red / Blue in the plot below).
percent_diff, red_p_diff, blue_p_diff = [
    np.abs(f.wp_all[row] - f.wp_all_extra[row]) / f.wp_all[row] * 100
    for row in (1, 2, 3)
]
print(percent_diff)
print(red_p_diff)
print(blue_p_diff)
# Answer - Less than 1% generally

colors = ['k', 'r', 'b']
# (row index into the wp arrays, legend label, line colour) for each sample
series = list(zip((1, 2, 3), ('All', 'Red', 'Blue'), colors))

plt.figure(figsize=(5, 5))
rp = f.wp_all[0][:-1]
if f.wp_err is not None:
    # Error bars are only drawn when they have been attached to the catalog
    errors = (f.wp_err, f.wp_r_err, f.wp_b_err)
    for (row, label, color), err in zip(series, errors):
        plt.errorbar(rp, f.wp_all[row], yerr=err, marker='o', linestyle='-', label=label, color=color, alpha=0.5)
else:
    for row, label, color in series:
        plt.plot(rp, f.wp_all[row], marker='o', linestyle='-', label=label, color=color)

# Overlay the measurement made with the extra randoms
rp_extra = f.wp_all_extra[0][:-1]
for row, label, color in series:
    plt.plot(rp_extra, f.wp_all_extra[row], marker='x', linestyle='--', label=f'{label} Extra Rands', color=color)

plt.xscale('log')
plt.ylim(8, 2000)
plt.yscale('log')
plt.xlabel(r'$r_p$ [Mpc/h]')
plt.ylabel(r'$w_p(r_p)$')
plt.legend()
plt.title('Full Sample $w_p(r_p)$ ')
plt.grid(True)
plt.show()

# Current Comparison

In [None]:
# Catalogs compared throughout this section; the cells that follow iterate over `sets`.
sets = [bgs_y3_like_sv3_pz_2_4, bgs_sv3_pz_2_4_7p]

# Dump the stored clustering results of each catalog for a quick look
for s in sets:
    print(f"--- {s.name} ---")
    print(s.wp_all_extra)
    print(s.wp_slices_extra)


#bgs_sv3_pz_2_5_7p.marker = '--'

#sets = [bgs_sv3_pz_2_4_7p, bgs_sv3_nn_7p]
#bgs_sv3_pz_2_4_7p.name = "New Technique"
#bgs_sv3_nn_7p.name = "Old Technique"
#bgs_sv3_pz_2_4_10p.name = "~Truth"


In [None]:
pp.plots(*sets, bgs_sv3_pz_2_4_10p, show_err=bgs_sv3_pz_2_4_10p)
pp.completeness_stats(sets)

In [None]:
truth_to_use = bgs_sv3_pz_2_4_10p.all_data
for s in sets:
    print(s.name)
    s.get_true_z_from(truth_to_use)
    s.refresh_df_views()

In [None]:
pp.luminosity_function_plots(*sets)
pp.correct_redshifts_assigned_plot(*sets)

In [None]:
for s in sets:
    pp.single_plots(s)

pp.single_plots(bgs_sv3_pz_2_4_10p)

In [None]:
# Histogram of 'z' - 'z_T' for the rows of each catalog whose z_assigned_flag is not a
# spectroscopic redshift (z_T is presumably the truth redshift attached by
# get_true_z_from in an earlier cell -- confirm).
for s in sets:
    data = s.all_data.loc[z_flag_is_not_spectro_z(s.all_data['z_assigned_flag'])]
    # Despite the name, delta_red covers every selected row, not only red galaxies
    delta_red = data['z'] - data['z_T'] # I used to do z_obs for SV3 dropping passes... TODO
    plt.hist(delta_red, bins=100, range=(-0.05, 0.05), histtype='step', label=s.name, color=s.color)
    # yscale/legend are re-applied on every iteration; redundant but harmless
    plt.yscale('log')
    plt.legend()

In [None]:
pp.test_purity_and_completeness(*sets, lost_only=True)
pp.purity_complete_plots(*sets)

In [None]:
# Make a bar plot of the z_assigned_flag values for each set
for s in sets:
    j=plt.hist(s.all_data['z_assigned_flag'], bins=[-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12], histtype='step', label=s.name)

#plt.yscale('log')
plt.ylim(0, 10000)
plt.legend()

In [None]:
# Compare Clustering
for s in sets:
    pp.compare_wp_rp(s, bgs_sv3_pz_2_4_10p)

In [None]:
for s in sets:
    fig=pp.make_map(s.all_data['RA'].to_numpy(), s.all_data['Dec'].to_numpy())

In [None]:
pp.plot_positions(*sets, DEG_LONG=4, split=True, ra_min=148, dec_min=0)

# Other Plots

In [None]:
# Targets in SV3 region observed in main survey got new redshift measurements
# Q: How different are those z's compared to SV3 z's? 
# A: They are similar, but not identical. The difference is less than 0.001 for 99.7% so it's OK for us I think.
#    (Subdominant to v_peculiar)

from astropy.coordinates import SkyCoord
from astropy import units as u
from astropy.coordinates import match_coordinates_sky

def find_unique_and_matched_objects(cat1, cat2, max_sep_arcsec=1.0):
    """Cross-match the spectroscopic rows of two catalogs by sky position.

    Every row of ``cat1.all_data`` flagged as a spectroscopic redshift is paired with
    its nearest (on-sky) spectroscopic row of ``cat2.all_data``. Pairs separated by
    more than ``max_sep_arcsec`` are treated as unmatched.

    Parameters
    ----------
    cat1, cat2 : objects exposing an ``all_data`` DataFrame with at least the
        columns 'RA', 'Dec' (used here as degrees) and 'z_assigned_flag'.
    max_sep_arcsec : float, optional
        Largest separation, in arcseconds, still counted as a match. Defaults to 1.0,
        the value that used to be hard-coded.

    Returns
    -------
    (unique, matched) : tuple of DataFrames
        ``unique`` holds the cat1 rows with no cat2 counterpart within the threshold.
        ``matched`` holds the remaining cat1 rows joined to their cat2 counterpart;
        cat2 columns whose names clash with cat1 carry the suffix ``_2``.
    """
    # reset_index() gives both frames a positional 0..N-1 index, which is what the
    # indices returned by match_coordinates_sky refer to.
    spec1 = cat1.all_data.loc[z_flag_is_spectro_z(cat1.all_data['z_assigned_flag'])].reset_index()
    spec2 = cat2.all_data.loc[z_flag_is_spectro_z(cat2.all_data['z_assigned_flag'])].reset_index()

    coords1 = SkyCoord(ra=spec1['RA'].to_numpy()*u.degree, dec=spec1['Dec'].to_numpy()*u.degree)
    coords2 = SkyCoord(ra=spec2['RA'].to_numpy()*u.degree, dec=spec2['Dec'].to_numpy()*u.degree)

    # For each cat1 object: position (in spec2) of its nearest neighbour, and the separation
    idx, d2d, _ = match_coordinates_sky(coords1, coords2)

    # FID is the join key: the spec2 row position of the nearest neighbour
    spec1['FID'] = idx
    spec2['FID'] = spec2.index

    # True where the nearest neighbour is too far away to be the same object
    unique_mask = d2d > max_sep_arcsec*u.arcsec

    # Attach the neighbour's columns to every cat1 row, then keep only the real matches
    matched_objects = spec1.join(spec2.set_index('FID'), on='FID', rsuffix='_2')
    matched_objects = matched_objects[~unique_mask]

    print(f"Total spectroscopic galaxies in cat1: {len(spec1)}, cat2: {len(spec2)}")
    print(f'Unique objects in cat1: {unique_mask.sum()}, Matched objects in cat1: {len(matched_objects)}')

    return spec1[unique_mask], matched_objects

# Example usage
unique_objects, matched_objects = find_unique_and_matched_objects(bgs_sv3_pz_2_4_10p, bgs_y3_like_sv3_pz_2_4)

# Fraction of matched pairs whose two redshifts agree to within 0.001 (absolute tolerance only)
print(np.isclose(matched_objects['z'], matched_objects['z_2'], atol=0.001, rtol=0).sum() / len(matched_objects))
#fig=pp.make_map(unique_objects.RA.to_numpy(), unique_objects.Dec.to_numpy())

plt.hist(matched_objects['z'] - matched_objects['z_2'], bins=np.linspace(-0.005, 0.005, 100))
plt.yscale('log')

# Draw vertical lines at +/-0.005 (these coincide with the outermost histogram bin edges)
plt.axvline(x=0.005, color='r', linestyle='--')
plt.axvline(x=-0.005, color='r', linestyle='--')

In [None]:
# SV3 10p and SDSS BGS-cut are very similar!
bgs_sv3_nn_10p.color = 'k'
pp.plots(bgs_sv3_nn_10p, sdss_bgscut)

In [None]:
# SV3 PLOTS (FULL 10 pass FOOTPRINT)

# The ten "simple 4" SV3 catalogs, from the 10p variant down to the 1p variant,
# passed in the same order to both plotting calls below.
sv3_simple_4_all_passes = [
    bgs_sv3_simple_4_10p,
    bgs_sv3_simple_4_9p,
    bgs_sv3_simple_4_8p,
    bgs_sv3_simple_4_7p,
    bgs_sv3_simple_4_6p,
    bgs_sv3_simple_4_5p,
    bgs_sv3_simple_4_4p,
    bgs_sv3_simple_4_3p,
    bgs_sv3_simple_4_2p,
    bgs_sv3_simple_4_1p,
]

# Legends are switched off only for the combined plots, then restored
pp.LEGENDS_ON = False
pp.plots(*sv3_simple_4_all_passes)
pp.LEGENDS_ON = True

pp.completeness_comparison(*sv3_simple_4_all_passes)


In [None]:
# SV3 function of # passes plots (USING CENTERS ONLY)
bgs_sv3_simple_4_7p.color = 'red'
bgs_sv3_simple_4_7p.centered.color = 'red'

# Same ten catalogs as the full-footprint plots, 10p down to 1p; only their
# `.centered` views are plotted here.
sv3_simple_4_catalogs = (
    bgs_sv3_simple_4_10p,
    bgs_sv3_simple_4_9p,
    bgs_sv3_simple_4_8p,
    bgs_sv3_simple_4_7p,
    bgs_sv3_simple_4_6p,
    bgs_sv3_simple_4_5p,
    bgs_sv3_simple_4_4p,
    bgs_sv3_simple_4_3p,
    bgs_sv3_simple_4_2p,
    bgs_sv3_simple_4_1p,
)

pp.LEGENDS_ON = False
pp.plots(*[c.centered for c in sv3_simple_4_catalogs])
pp.LEGENDS_ON = True

In [None]:
# Cutting SDSS to remove regions with poor BGS overlap barely improves the completeness
pp.plots(sdss_bgscut, sdss_vanilla)
print(f"{spectroscopic_complete_percent(sdss_bgscut.all_data['z_assigned_flag']):.2f}% spectroscopic complete for BGS cut")
print(f"{spectroscopic_complete_percent(sdss_vanilla.all_data['z_assigned_flag']):.2f}% spectroscopic complete for Vanilla")

In [None]:
pp.hod_plots(bgs_sv3_simple_4_10p)

In [None]:
# View plots for my SDSS results from my MCMC chains
# NOTE(review): the catalog actually used below is sdss_vanilla, not sdss_colors_mine
# (the one built from the MCMC chain) -- confirm which is intended.
# `to_compare` is just another name for sdss_vanilla, so the three calls below act on
# the same object that is plotted afterwards.
to_compare = sdss_vanilla
to_compare.run_group_finder(popmock=True)
to_compare.run_corrfunc()
to_compare.postprocess()

pp.proj_clustering_plot(sdss_vanilla)
pp.lsat_data_compare_plot(sdss_vanilla)


#sdss_colors_chi.color = 'r'
#pp.plots(sdss_colors_mine, sdss_colors, sdss_colors_chi)

In [None]:
#simple4_BGS.name = 'DESI BGS Y1'
pp.plots(simple4_BGS, sdss_vanilla, sdss_colors, sdss_colors_chi)
#pp.plots(cat.sdss_published, sdss_colors_chi)

In [None]:
# Why doesn't mstar missing % exactly match z_assigned_flag? 
# Probably redshift failures. Still have a spectra so still have mstar
print(np.sum(np.isnan(simple4_BGS.all_data.mstar)) / len(simple4_BGS.all_data.mstar))
print(np.sum(simple4_BGS.all_data.z_assigned_flag != 0) / len(simple4_BGS.all_data.z_assigned_flag))

In [None]:
pp.compare_fsat_color_split(sdss_vanilla_old, sdss_vanilla, project_percent=0.52)


In [None]:
bgs_simple_4_1pass.color = 'r'
pp.plots(simple4_BGS, bgs_simple_4_1pass)

In [None]:
pp.compare_fsat_color_split(bgs_sv3_simple_4_10p, sdss_vanilla)


In [None]:
#pp.compare_fsat_color_split(all, simple_2)
#pp.compare_fsat_color_split(all, simple_4)
#pp.compare_fsat_color_split(sdss_vanilla, simple4_BGS)
pp.compare_fsat_color_split(bgs_sv3_simple_4_10p, simple4_BGS)
pp.compare_fsat_color_split(bgs_sv3_simple_4_10p, bgs_sv3_fiberonly_10p)
#pp.compare_fsat_color_split(bgs_sv3_10p, bgs_sv3_10p_all)
#pp.compare_fsat_color_split(sdss_vanilla, bgs_simple_4_1pass)


In [None]:
pp.qf.centered_plot(simple4_BGS)
pp.qf.centered_plot(sdss_published)

In [None]:
pp.fsat_by_z_bins(simple4_BGS, z_bins=np.array([0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 1.0]), show_plots=True)
#pp.fsat_by_z_bins(mxxl_simple_4, z_bins=np.array([0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 1.0]), show_plots=False, aggregation=pp.fsat_truth_vmax_weighted)


In [None]:
plt.hist(bgs_y1_pz_2_4.all_data['mstar'].dropna(), np.logspace(6, 13, 100))
plt.xlabel('Stellar Mass')
plt.ylabel('Frequency')
plt.title('Distribution of Stellar Masses for bgs_y1_pz_2_4.all_data')
plt.yscale('log')
plt.xscale('log')
plt.show()

In [None]:
bgs_y1_pz_2_4.all_data['Mstar_bin'].value_counts()

In [None]:
#bgs_y1_pz_2_4.postprocess()
#bgs_y1_pz_2_4.all_data['Mstar_bin'] = pd.cut(x = bgs_y1_pz_2_4.all_data['mstar'], bins = mstar_bins, labels = mstar_labels, include_lowest = True)
pp.qf_cen_plot(bgs_y1_pz_2_4, mstar=True)

In [None]:
pp.fsat_by_z_bins(simple4_BGS, z_bins=np.array([0.0, 0.2, 1.0]))


In [None]:
pp.plots(fiberonly_BGS, nn_BGS, simple4_BGS)


In [None]:
pp.plots(bgs_sv3_simple_4_10p, bgs_sv3_fiberonly_10p)


In [None]:
# Print out biggest group size
for dataset in [simple4_BGS, fiberonly_BGS, fiberonly_1pass_BGS, sdss_vanilla]:
    print(dataset.name)
    print(dataset.all_data.groupby('igrp').size().max())


In [None]:
# SDSS Examine Bimodality

z=sdss_vanilla.all_data['z']
gmr=sdss_vanilla.all_data['Mag_g'] - sdss_vanilla.all_data['Mag_r']
junk=plt.hist(gmr, bins=np.linspace(-1,3,300), alpha=0.4)
#junk=plt.hist(k_correct(sdss_vanilla.all_data['Mag_g'], z, gmr, band='g')  - k_correct(sdss_vanilla.all_data['Mag_r'], z, gmr, band='r'), bins=500, alpha=0.4)
junk=plt.hist(sdss_vanilla.all_data['Dn4000'], bins=np.linspace(0,4,300), alpha=0.4)
plt.xlim(-1, 3)

In [None]:
# Investigate changes in halo mass function from wcen
# Compares binned log10(M_halo) counts of the vanilla SDSS catalog against the
# colors and colors+chi catalogs.
m1=np.log10(sdss_vanilla.all_data['M_halo'])
m2=np.log10(sdss_colors.all_data['M_halo'])
m3=np.log10(sdss_colors_chi.all_data['M_halo'])

# bin m1,m2,m3 the same way
n_bins = 20
bins = np.linspace(10.8, 15.0, n_bins)
d1 = np.digitize(m1, bins)
d2 = np.digitize(m2, bins)
d3 = np.digitize(m3, bins)

# count the number of galaxies in each bin
# digitize index i means bins[i-1] <= value < bins[i]; index n_bins collects everything
# at or above bins[-1]. Values below bins[0] (index 0) are not counted.
n1 = np.array([np.sum(d1==i) for i in range(1, n_bins+1)])
n2 = np.array([np.sum(d2==i) for i in range(1, n_bins+1)])
n3 = np.array([np.sum(d3==i) for i in range(1, n_bins+1)])

# Do the same but for log10(counts)
# Empty bins become -inf here (numpy emits a warning) and show up as gaps in the plot.
n1 = np.log10(n1)
n2 = np.log10(n2)
n3 = np.log10(n3)
# Fixed: the third argument used to be the literal 3 instead of n3
print(n1,n2,n3)

# Log difference
p1 = np.abs(n1-n2)
p2 = np.abs(n1-n3)

# Each count is plotted at the lower edge of its bin
plt.plot(bins, p1, label='SDSS Colors vs Vanilla')
plt.plot(bins, p2, label='SDSS Colors+Chi vs Vanilla')

plt.xlabel('log10(M_halo)')
plt.ylabel('Log10 Difference in Counts')
plt.legend()

## Make single group CSV for legacysurvey.org/viewer visualization

In [None]:
df = pd.read_csv(OUTPUT_FOLDER + 'NERSC_BGS_1pass_v1.out')
centrals_of_big_groups = df['N_sat'] > 0
group_ids = df.loc[centrals_of_big_groups].igrp.unique()

In [None]:
df[df.igrp == 1644058]

In [None]:
print(group_ids[0:10])

In [None]:
#for i in group_ids[0:10]:
for i in [1644058, 1644051]:
    #df.loc[df.igrp == i, ['RA', 'Dec']].to_csv(OUTPUT_FOLDER + f'group{i}.csv', index=False)
    print(df.loc[df.igrp == i, ['RA', 'Dec', 'z', 'z_assigned_flag']])

## Study z_phot vs z_spectra

In [None]:
# Compare photometric and spectroscopic redshifts in the SV3 10-pass catalog
df = bgs_sv3_pz_2_4_10p.all_data

low_cut = 0.0001 # not a dramatic shift when moving from here to 0.1
# Rows kept: have a photo-z, have a spectroscopic redshift flag, z_obs above low_cut, and L_gal < 1E9
quality = (df['z_phot'] != NO_PHOTO_Z) & z_flag_is_spectro_z(df['z_assigned_flag']) & (df['z_obs'] > low_cut) & (df['L_gal'] < 1E9)
#quality = (df['z_phot'] != NO_PHOTO_Z) & z_flag_is_spectro_z(df['z_assigned_flag']) & (df['z_obs'] > low_cut)

# Investigate the photo-z error distribution for red and blue galaxies
# Blue exhibits an offset and a less peaked distribution than red
# Red does have some skew
# Note: `data` holds the quiescent (red) subset here; `blue` holds the rest
data = df.loc[np.logical_and(df.quiescent, quality)]
blue = df.loc[np.logical_and(~df.quiescent, quality)]
delta_red = data['z_phot'] - data['z_obs']
delta_blue = blue['z_phot'] - blue['z_obs']
delta_all = df.loc[quality, 'z_phot'] - df.loc[quality, 'z_obs']

plt.hist(delta_red, bins=100, range=(-0.06, 0.06), histtype='step', color='red', label='Red')
plt.hist(delta_blue, bins=100, range=(-0.06, 0.06), histtype='step', color='blue', label='Blue')
plt.hist(delta_all, bins=100, range=(-0.06, 0.06), histtype='step', color='k', label='All')
#plt.yscale('log')
plt.legend()

# draw a vertical line at 0, and green lines at +/-SIM_Z_THRESH
plt.axvline(0, color='black', lw=1)
plt.axvline(-SIM_Z_THRESH, color='green')
plt.axvline(SIM_Z_THRESH, color='green')

percentiles = np.percentile(delta_all, [16, 50, 84])
print(f"Median delta z: {percentiles[1]:.4f}, 16th percentile: {percentiles[0]:.4f}, 84th percentile: {percentiles[2]:.4f}")
# add bars for the percentiles
#plt.axvline(percentiles[0], color='green')
#plt.axvline(percentiles[2], color='green')

# What % fall within SIM_Z_THRESH of the spectroscopic redshift?
within_5_milli = np.abs(delta_all) < SIM_Z_THRESH
print(f"{np.sum(within_5_milli) / len(delta_all) * 100:.2f}% of galaxies have a photometric redshift within {SIM_Z_THRESH} of the spectroscopic redshift.")
print(f"For red: {np.sum(np.abs(delta_red) < SIM_Z_THRESH) / len(delta_red) * 100:.2f}%")
print(f"For blue: {np.sum(np.abs(delta_blue) < SIM_Z_THRESH) / len(delta_blue) * 100:.2f}%")


## BGS and SDSS Target Overlap Analysis

TODO: need to use a version of SDSS data that doesn't have nearest-neighbor assigned redshifts in it!

In [None]:
# Copy-on-write is enabled for the whole session, so adding a column to the
# `lost_bgs` selection below does not write through to the source frame.
pd.options.mode.copy_on_write = True

# For this comparison, use pure NN BGS 
# NOTE(review): the catalog selected below is simple4_BGS, not nn_BGS -- confirm which is intended.
bgs_to_use = simple4_BGS.all_data
# "Lost" here means any row whose z_assigned_flag is non-zero
lost_bgs = bgs_to_use.loc[bgs_to_use['z_assigned_flag'] != 0]
sdss_cat = sdss_vanilla.all_data

catalog = coord.SkyCoord(ra=sdss_cat.RA.to_numpy()*u.degree, dec=sdss_cat.Dec.to_numpy()*u.degree, frame='icrs')
to_match = coord.SkyCoord(ra=lost_bgs.RA.to_numpy()*u.degree, dec=lost_bgs.Dec.to_numpy()*u.degree, frame='icrs')

# Nearest SDSS neighbour (index and on-sky separation) for every lost BGS galaxy
idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=1, storekdtree=False)

# if angular distance is < 3", then we consider it a match to SDSS catalog
lost_bgs['sdss_z'] = np.where(d2d < 3*u.arcsec, sdss_cat.iloc[idx]['z'], np.nan)
lost_bgs_matched = lost_bgs.loc[~np.isnan(lost_bgs['sdss_z'])]
print(f"Matched {len(lost_bgs_matched)} out of {len(lost_bgs)} lost BGS galaxies to SDSS catalog, ({len(lost_bgs_matched)/len(lost_bgs)*100:.2f}%)")

# A "good" match agrees with the SDSS redshift to within 0.001 (plus np.isclose's default relative tolerance)
good_match = np.isclose(lost_bgs_matched.z, lost_bgs_matched.sdss_z, atol=0.001).sum()
print(f"Good match: {good_match} out of {len(lost_bgs_matched)}, or {good_match/len(lost_bgs_matched)*100:.2f}%")

## Info for clustering

In [None]:
# Figure out abs mag bins redshift maxes to use
df = bgs_sv3_pz_2_4_10p.all_data
# NOTE(review): this adds a 'MAG' column to the frame returned by all_data; if that is
# the catalog's own frame (not a copy), the column persists for later cells.
df['MAG'] = log_solar_L_to_abs_mag_r(np.log10(df['L_gal']))

# Restrict to rows flagged as spectroscopic redshifts
df = df.loc[z_flag_is_spectro_z(df['z_assigned_flag'])]

mags = np.array([-14, -15, -16, -17, -18, -19, -20, -21, -22, -23])

# TODO why are some observed redshifts higher than my theoretical max?
print("DIM")
# For each threshold m: the z limit from get_max_observable_z(m, 19.5) next to the
# largest 'z' among rows with MAG > m
for m in mags:
    print(f"Mag-5log(h) > {m}:  zmax theory={get_max_observable_z(m, 19.5).value:.5f}  zmax obs={df.loc[df['MAG'] > m, 'z'].max()}")
# NOTE(review): nothing is printed after this header; the "bright" half appears unfinished
print("BRIGHT")

In [None]:
bgs_sv3_pz_2_4_10p.calculate_projected_clustering()
# why is n=0? # BUG
pp.wp_rp(bgs_sv3_pz_2_4_10p.wp_all[0], bgs_sv3_pz_2_4_10p.wp_all[1])#, bgs_sv3_pz_2_4_10p.wp_all[2], bgs_sv3_pz_2_4_10p.wp_all[3])

In [None]:
# Print distances from get_cosmology() at z = 0.01, 0.1, 0.2, 0.4.
print(f"Co-moving Dist:  {get_cosmology().comoving_distance([0.01, 0.1, 0.2, 0.4]).value}") # / Mpc/h
# Second "Co-moving" line: luminosity distance divided by (1 + z), presumably printed as a cross-check of the line above
print(f"Co-moving Dist:  {get_cosmology().luminosity_distance([0.01, 0.1, 0.2, 0.4]).value / np.array([1.01, 1.1, 1.2, 1.4])}") # / Mpc/h
print(f"Luminosity Dist: {get_cosmology().luminosity_distance([0.01, 0.1, 0.2, 0.4]).value}") # / Mpc/h

### SDSS Tutorial of Corrfunc

In [None]:
# NOTE(review): pjoin, dirname, abspath, Corrfunc, read_catalog and calculate_wp are not
# imported in any visible cell; presumably they arrive through the star imports at the
# top of the notebook -- confirm before relying on a fresh-kernel run.

# Mock catalog (SDSS-North) supplied with Corrfunc
mock_catalog = pjoin(dirname(abspath(Corrfunc.__file__)), "../mocks/tests/data/", "Mr19_mock_northonly.rdcz.ff")
RA, DEC, CZ = read_catalog(mock_catalog)

# Randoms catalog (SDSS-North) supplied with Corrfunc
randoms_catalog = pjoin(dirname(abspath(Corrfunc.__file__)), "../mocks/tests/data/", "Mr19_randoms_northonly.rdcz.ff")
RAND_RA, RAND_DEC, RAND_CZ = read_catalog(randoms_catalog)

# Projected correlation function of the mock against its randoms, then plot it
rbins, wp = calculate_wp(RA, DEC, CZ, RAND_RA, RAND_DEC, RAND_CZ)

pp.wp_rp(rbins, wp)

In [None]:
j=plt.hist(RAND_CZ, bins=100, histtype='step', density=True)
j=plt.hist(CZ, bins=100, histtype='step', density=True)

## Viraj Targets

In [52]:
# Viraj Compare
# Read the target catalog from FITS and convert it to a DataFrame indexed by TARGETID
path = DATA_FOLDER + 'VIRAJ/jura_bgs_bright_catalog_for_ian.fits'
table = Table.read(path)
viraj_df = table.to_pandas().set_index('TARGETID')

In [53]:
def merge_viraj_ian(viraj_df: pd.DataFrame, gc : GroupCatalog):
    """Attach group-catalog columns to the targets of ``viraj_df``.

    Parameters
    ----------
    viraj_df : pd.DataFrame
        Target table whose index holds the target ids (the caller indexes it by TARGETID).
    gc : GroupCatalog
        Catalog whose ``all_data`` frame has a 'target_id' column plus the columns
        selected below; ``gc.name`` is only used in the printed summary.

    Returns
    -------
    pd.DataFrame
        Inner join of ``viraj_df`` with the selected catalog columns on the target id,
        so only targets present in both tables are returned.

    Raises
    ------
    pandas.errors.MergeError
        If target ids are duplicated on either side (``validate='one_to_one'``).
    """
    catalog_columns = ['z', 'L_gal', 'V_max', 'P_sat', 'M_halo', 'N_sat', 'L_tot',
       'igrp', 'weight', 'app_mag', 'z_assigned_flag', 'g_r', 'z_phot', 'is_sat', 'quiescent']
    ian_df = gc.all_data.set_index('target_id').loc[:, catalog_columns]
    # dtype of igrp is printed before and after the join so a silent cast is noticed
    print(ian_df.igrp.dtype)
    #ian_df['quiescent'] = ian_df['quiescent'].astype(float)
    ian_df['N_sat'] = ian_df['N_sat'].astype(int)
    together = viraj_df.join(ian_df, how='inner', validate='one_to_one')
    print(together.igrp.dtype)
    # notna() works for bool, float and object columns alike; np.isnan raises
    # TypeError on object-dtype flag columns.
    n_found = together.is_sat.notna().sum()
    print(f"Viraj targets: {len(viraj_df):,}, Ian {gc.name} Catalog: {len(ian_df):,}, # of Viraj Targets found in Ian's: {n_found:,}")
    return together

In [None]:
together1 = merge_viraj_ian(viraj_df, bgs_simple_4_1pass)
together2 = merge_viraj_ian(viraj_df, simple4_BGS)
together3 = merge_viraj_ian(viraj_df, bgs_y3_simple_5)

In [None]:
missing=together3.loc[np.isnan(together3.is_sat)]
missing

In [None]:
print(bgs_y3_simple_5.all_data.igrp.dtype)
print(together3.igrp.dtype)

In [57]:
to_write = Table.from_pandas(together3, index=True)
to_write.write(DATA_FOLDER + 'VIRAJ/jura_bgs_bright_catalog_for_ian_matched.fits', overwrite=True)

In [58]:
combined = Table.read(DATA_FOLDER + 'VIRAJ/jura_bgs_bright_catalog_for_ian_matched.fits', format='fits')
df = combined.to_pandas()

In [None]:
df

## Mock and SV3 Analysis

### UCHUU Issues

In [None]:
plt.hist(all_u.all_data['M_halo'], bins=pp.Mhalo_bins, alpha=0.4)
plt.hist(all_u.all_data['uchuu_halo_mass']*10**10, bins=pp.Mhalo_bins, alpha=0.4)
plt.loglog()

# TODO do we expect the mass distribution of halos to be so different from the UCHUU SHAM catalog and our assigned halo?

In [None]:
# TODO 1 / VMax corrections do odd thing to UCHUU Truth. Why?
pp.hod_plots(all_u)

### What effect does Fiber Assignment have on the luminosity function?

In [None]:
#pp.group_finder_centrals_halo_masses_plots(mxxl_all, [mxxl_fiberonly, mxxl_simple_4])
# Fixed: bgs_sv3_pz_1_7p is never defined in this notebook (NameError); the 7-pass
# pz_1 catalog loaded in the SV3 loading cell is bgs_sv3_pz_1_0_7p.
pp.group_finder_centrals_halo_masses_plots(bgs_sv3_pz_2_4_10p, [bgs_sv3_pz_1_0_7p, bgs_sv3_simple_5_7p])

### Compare halos to truth

In [None]:
pp.assigned_halo_analysis(mxxl_simple_4)

### Compare assigned implied abs mags to truth from MXXL

In [None]:
# Per-Lgal_bin row counts for the two MXXL catalogs.
# NOTE(review): mxxl_all is commented out in the loading cell, so this raises NameError
# unless that load is re-enabled.
# NOTE(review): only the mxxl_all counts are restricted to z_assigned_flag != 0; the
# "simple_4" counts below include every row despite the variable name -- confirm intent.
all_unobs_counts = mxxl_all.all_data[mxxl_all.all_data.z_assigned_flag != 0].groupby('Lgal_bin').RA.count()
simple_4_ubobs_counts = mxxl_simple_4.all_data.groupby('Lgal_bin').RA.count()


In [None]:
pp.L_func_plot([mxxl_all, mxxl_simple_4], [all_unobs_counts, simple_4_ubobs_counts])



#pp.L_func_plot([all, simple_4], [all.all_data.L_gal[all.all_data.z_assigned_flag == 0], simple_4.all_data.L_gal[simple_4.all_data.z_assigned_flag == 0]])


## SV3 Edge Effects Quantification

In [None]:
inner_galaxies = filter_SV3_to_avoid_edges(bgs_sv3_simple_4_10p)
inner_galaxies.color = 'k'
inner_galaxies.name = 'SV3 Inner Galaxies'
pp.plots(inner_galaxies, bgs_sv3_simple_4_10p)

In [None]:
fig = pp.make_map(bgs_sv3_simple_4_10p.all_data.RA.to_numpy(), bgs_sv3_simple_4_10p.all_data.Dec.to_numpy())
fig = pp.make_map(inner_galaxies.all_data.RA.to_numpy(), inner_galaxies.all_data.Dec.to_numpy(), fig=fig)

In [None]:
bgs_sv3_simple_4_10p.all_data.groupby('Lgal_bin')['z'].median()

In [None]:
# Build copies of the SV3 10-pass catalog restricted by filter_SV3_to_avoid_edges with
# its second argument stepped from 1.5 down to 0.9, and cache the list on disk so the
# plotting cell further down can reload it.
centering_versions = [
    filter_SV3_to_avoid_edges(bgs_sv3_simple_4_10p, cut)
    for cut in (1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9)
]
# A context manager guarantees the file is flushed and closed even if pickling fails;
# the previous bare open(...) left the handle to the garbage collector.
with open('centering_versions.pkl', 'wb') as pkl_file:
    pickle.dump(centering_versions, pkl_file)

In [None]:
import copy
# Rows of the SV3 10-pass catalog with z < 0.03
lowz = bgs_sv3_simple_4_10p.all_data.loc[bgs_sv3_simple_4_10p.all_data.z < 0.03]
# Deep-copy the catalog object, swap in the low-z rows, and refresh its derived views
lowz_gc = copy.deepcopy(bgs_sv3_simple_4_10p)
lowz_gc.all_data = lowz
lowz_gc.refresh_df_views()
# Same seven cut values as the full-sample centering list, 1.5 down to 0.9
centering_versions_lowz = [
    filter_SV3_to_avoid_edges(lowz_gc, cut)
    for cut in (1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9)
]

In [None]:
# Reload the cached centre-cut catalogs. The loaded list is now actually bound to
# centering_versions (the result used to be discarded), so this cell works on a fresh
# kernel without re-running the expensive build cell.
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
with open('centering_versions.pkl', 'rb') as pkl_file:
    centering_versions = pickle.load(pkl_file)

# Shade the catalogs from black to green and label each with its cut value
for i, d in enumerate(centering_versions):
    d.color = [0, i/len(centering_versions), 0]
    d.name = f'SV3 10p, {1.5-i*0.1:.1f} deg center cut'

pp.LEGENDS_ON = False
bgs_sv3_simple_4_10p.color = 'blue'
pp.fsat_by_zbins_sv3_centers(bgs_sv3_simple_4_10p, *centering_versions, z_bins=np.array([0.0, 0.03, 1.0]))
pp.single_plots(bgs_sv3_simple_4_10p)
pp.single_plots(centering_versions[2])
pp.single_plots(centering_versions[4])
pp.single_plots(centering_versions[6])
pp.LEGENDS_ON = True

#pp.fsat_by_z_bins(bgs_sv3_simple_4_10p, z_bins=np.array([0.0, 0.03, 1.0]))
#for d in centering_versions:
#    pp.fsat_by_z_bins(d, z_bins=np.array([0.0, 0.03, 1.0]))


In [None]:
lowz_gc.color = 'blue'
pp.single_plots(lowz_gc)
pp.single_plots(centering_versions_lowz[2])
pp.single_plots(centering_versions_lowz[4])
pp.single_plots(centering_versions_lowz[6])


In [None]:
#fig = pp.make_map(bgs_sv3_simple_4_10p.all_data.RA.to_numpy(), bgs_sv3_simple_4_10p.all_data.Dec.to_numpy())

#for i, gc in enumerate(centering_versions):
#    fig = pp.make_map(gc.all_data.RA.to_numpy(), gc.all_data.Dec.to_numpy(), fig=fig)

#plot_positions(bgs_sv3_simple_4_10p, *centering_versions, tiles_df=None, split=False, DEG_LONG=7, ra_min = 186.5, dec_min = 60)
# BUG pass in all_data, not the GroupCatalog object
# NOTE(review): the first argument below is a DataFrame (all_data) while the unpacked
# centering_versions are catalog objects, and the function is called unqualified here
# but as pp.plot_positions in the "Current Comparison" section -- confirm the expected
# argument type and where plot_positions is defined.
plot_positions(bgs_sv3_simple_4_10p.all_data, *centering_versions, tiles_df=None, split=False, DEG_LONG=6, ra_min = 147, dec_min = -1)

## Lost Galaxy Luminosity Function

Take a cut of SV3 whose completeness is similar to Y1 BGS.

Question: Is the luminosity function of lost galaxies (that were later observed) different from the luminosity function of observed galaxies?

They seem similar; perhaps a mild slant. Overall it seems that trying to match the observed luminosity function with the lost ones is ok.

Now consider the lost galaxies in the 6-pass catalog for which we later obtained redshifts.

Question: What did our processing do to the luminosity function for lost galaxies?

Our processing squeezes the luminosity function. We move galaxies from the wings towards the middle.

## Galaxy Neighborhood Examiner

In [None]:

add_halo_columns(bgs_sv3_pz_2_4_7p)
data = bgs_sv3_pz_2_4_7p.all_data
lost_galaxies = data.loc[z_flag_is_not_spectro_z(data['z_assigned_flag'])]
obs_galaxies = data.loc[z_flag_is_spectro_z(data['z_assigned_flag'])]
print("Lost galaxies: ", len(lost_galaxies), "Observed Galaxies: ", len(obs_galaxies))

In [None]:
obs_galaxies[np.logical_and(np.isclose(obs_galaxies['Mh_bin'], Mhalo_bins[16]), close_enough(0.03, obs_galaxies['z']))]

In [None]:
# Draw a few galaxies at random from the pool and plot the neighbourhood of each
PLOTS_TO_MAKE = 5
GALAXY_POOL = lost_galaxies

#START_INDEX = 777
#for i in range(START_INDEX, START_INDEX + PLOTS_TO_MAKE):
#    index = lost_galaxies.index[i]
#    examine_around(index)
print("Number of galaxies to choose from: ", len(GALAXY_POOL))
# np.random.randint excludes `high`, so pass len(GALAXY_POOL) itself: the previous
# len(...)-1 could never select the last galaxy and raised for a pool of size one.
# Positions are drawn with replacement, so the same galaxy may appear twice.
indexes = np.random.randint(0, len(GALAXY_POOL), size=PLOTS_TO_MAKE)
for i in indexes:
    target = GALAXY_POOL.iloc[i]
    pp.examine_around(target, data)

# Tests

In [None]:
# How many halos were assigned below a certain cutoff?
df = simple4_BGS.all_data
M_HALL_CUT = 10**11
small_halo_df = df[df.M_halo < M_HALL_CUT]

print(len(small_halo_df), len(df))

junk=plt.hist(small_halo_df.z, bins=100)
