In [None]:
import copy
import pickle
import sys

import astropy.coordinates as coord
import astropy.units as u
import emcee
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from astropy.table import Table

if './SelfCalGroupFinder/py/' not in sys.path:
    sys.path.append('./SelfCalGroupFinder/py/')
from pyutils import *
import plotting as pp
from dataloc import *
import catalog_definitions as cat
from groupcatalog import deserialize, serialize, GroupCatalog, SDSSGroupCatalog, mstar_vmax_weighted, add_halo_columns
import groupcatalog as gc

%load_ext autoreload
%autoreload 2

In [50]:
DPI = 150
pp.DPI = DPI

After the group finder is run, this notebook is used to post-process the results, generating plots and other outputs for analysis.


## Loading existing datasets 

In [60]:
# NOTE(review): mxxl_all, mxxl_fiberonly and mxxl_nn are referenced by later cells
# (the MXXL pp.plots call and the purity/completeness section, among others). While
# these three loads stay commented out, those cells raise NameError on a fresh kernel.
#mxxl_all=deserialize(cat.mxxl_all)
#mxxl_fiberonly=deserialize(cat.mxxl_fiberonly)
#mxxl_nn=deserialize(cat.mxxl_nn)
mxxl_simple_4=deserialize(cat.mxxl_simple_4)

In [None]:
sdss_vanilla = deserialize(cat.sdss_vanilla_v2)
sdss_colors = deserialize(cat.sdss_colors_v2)
sdss_colors_chi = deserialize(cat.sdss_colors_chi_v2)
cat.sdss_published.postprocess()
sdss_published = cat.sdss_published # It really is ~exactly sdss_colors_chi, which is great news for reproducibility
sdss_vanilla_old = deserialize(cat.sdss_vanilla)

In [3]:
fiberonly_BGS=deserialize(cat.bgs_fiberonly)
fiberonly_1pass_BGS=deserialize(cat.bgs_fiberonly_1pass)
nn_BGS=deserialize(cat.bgs_nn)
nn_BGS_sdsslike=deserialize(cat.bgs_nn_sdsslike)
#simple2_BGS=deserialize(cat.bgs_simple_2)
#simple2_BGS_c=deserialize(cat.bgs_simple_2_c)
simple4_BGS=deserialize(cat.bgs_simple_4)
#simple4_BGS_old=deserialize(cat.bgs_simple_4_old)
#simple4_BGS_c=deserialize(cat.bgs_simple_4_c)
#bgs_simple_4_no_sdss=deserialize(cat.bgs_simple_4_no_sdss)
bgs_simple_4_4p = deserialize(cat.bgs_simple_4_4p)
bgs_simple_4_1pass = deserialize(cat.bgs_simple_4_1pass)
bgs_simple_5 = deserialize(cat.bgs_simple_5)


In [51]:
bgs_y3_simple_4 = deserialize(cat.bgs_y3_simple_4)
bgs_y3_simple_5 = deserialize(cat.bgs_y3_simple_5)
#bgs_y3_simple_4_4p = deserialize(cat.bgs_y3_simple_4_4p)

In [61]:
# LOAD ALL INCLUSIVE SV3 VERSIONS
bgs_sv3_simple_4_10p = deserialize(cat.bgs_sv3_simple_4_10p)
bgs_sv3_simple_4_9p = deserialize(cat.bgs_sv3_simple_4_9p)
bgs_sv3_simple_4_8p = deserialize(cat.bgs_sv3_simple_4_8p)
bgs_sv3_simple_4_7p = deserialize(cat.bgs_sv3_simple_4_7p)
bgs_sv3_simple_4_6p = deserialize(cat.bgs_sv3_simple_4_6p)
bgs_sv3_simple_4_5p = deserialize(cat.bgs_sv3_simple_4_5p)
bgs_sv3_simple_4_4p = deserialize(cat.bgs_sv3_simple_4_4p)
bgs_sv3_simple_4_3p = deserialize(cat.bgs_sv3_simple_4_3p)
bgs_sv3_simple_4_2p = deserialize(cat.bgs_sv3_simple_4_2p)
bgs_sv3_simple_4_1p = deserialize(cat.bgs_sv3_simple_4_1p)

bgs_sv3_fiberonly_10p = deserialize(cat.bgs_sv3_fiberonly_10p)

bgs_sv3_nn_7p = deserialize(cat.bgs_sv3_nn_7p)
bgs_sv3_nn_6p = deserialize(cat.bgs_sv3_nn_6p)

In [62]:
bgs_sv3_simple_5_10p = deserialize(cat.bgs_sv3_simple_5_10p)
bgs_sv3_simple_5_9p = deserialize(cat.bgs_sv3_simple_5_9p)
bgs_sv3_simple_5_8p = deserialize(cat.bgs_sv3_simple_5_8p)
bgs_sv3_simple_5_7p = deserialize(cat.bgs_sv3_simple_5_7p)
bgs_sv3_simple_5_6p = deserialize(cat.bgs_sv3_simple_5_6p)
bgs_sv3_simple_5_5p = deserialize(cat.bgs_sv3_simple_5_5p)
bgs_sv3_simple_5_4p = deserialize(cat.bgs_sv3_simple_5_4p)
bgs_sv3_simple_5_3p = deserialize(cat.bgs_sv3_simple_5_3p)
bgs_sv3_simple_5_2p = deserialize(cat.bgs_sv3_simple_5_2p)
bgs_sv3_simple_5_1p = deserialize(cat.bgs_sv3_simple_5_1p)

bgs_sv3_pz_1_10p = deserialize(cat.bgs_sv3_pz_1_10p)
bgs_sv3_pz_1_7p = deserialize(cat.bgs_sv3_pz_1_7p)

In [None]:
all_u = deserialize(cat.uchuu_all)

In [None]:
run = 1
path = f'/mount/sirocco1/imw2293/GROUP_CAT/MCMC/mcmc_{run}/mcmc_{run}.dat'
reader = emcee.backends.HDFBackend(path, read_only=True)
sdss_colors_mine = SDSSGroupCatalog.from_MCMC(reader, "SDSS Colors Mine", SDSS_v2_DAT_FILE, SDSS_v2_GALPROPS_FILE)
sdss_colors_mine.run_group_finder(popmock=True)
sdss_colors_mine.run_corrfunc()
sdss_colors_mine.postprocess()
serialize(sdss_colors_mine)

## Publishing / Sharing

In [None]:
# Make a call like this to write a csv for sharing on NERSC
simple4_BGS.write_sharable_output_file()
bgs_simple_4_1pass.write_sharable_output_file()

## View Plots

In [None]:
# Primary comparison v5
bgs_y3_simple_5.color = 'darkorange'
bgs_sv3_simple_5_7p.centered.color = 'red'
bgs_simple_5.color = 'k'
pp.plots(bgs_simple_5, bgs_y3_simple_5,bgs_sv3_simple_5_10p.centered, bgs_sv3_simple_5_7p.centered, sdss_vanilla)

In [None]:
# Primary Comparison v4
bgs_y3_simple_4.color = 'darkorange'
bgs_sv3_simple_4_7p.centered.color = 'red'
pp.plots(simple4_BGS, bgs_y3_simple_4,bgs_sv3_simple_4_10p.centered, bgs_sv3_simple_4_7p.centered, sdss_vanilla)


In [None]:
# Photo-z-plus
pp.plots(bgs_sv3_pz_1_10p, bgs_sv3_simple_4_10p, bgs_sv3_pz_1_7p, bgs_sv3_simple_4_7p, sdss_vanilla, mxxl_simple_4)

In [None]:
# Simple v4 vs v5
bgs_y3_simple_5.marker = '--'
bgs_y3_simple_5.color = 'darkorange'
bgs_y3_simple_4.color = 'darkorange'
bgs_sv3_simple_5_7p.centered.marker = '--'
bgs_sv3_simple_4_7p.centered.color = 'red'
bgs_sv3_simple_5_7p.centered.color = 'red'
bgs_simple_5.marker = '--'
bgs_simple_5.color = 'k'
pp.plots(simple4_BGS, bgs_simple_5, bgs_y3_simple_4, bgs_y3_simple_5, bgs_sv3_simple_4_7p.centered, bgs_sv3_simple_5_7p.centered)

bgs_y3_simple_5.marker = '-'
bgs_sv3_simple_5_7p.centered.marker = '-'
bgs_simple_5.marker = '-'


In [None]:
pp.single_plots(bgs_y3_simple_5)

In [None]:
# BGS Completeness and NN/Other z assignment stats
def completeness_stats(d: GroupCatalog):
    """Print the completeness and the neighbor-assignment share of one catalog.

    Reads `d.name` and the `z_assigned_flag` column of `d.all_data`. The fraction
    of rows whose flag is 0 is printed as the completeness; among rows whose flag
    is not 0, the fraction with flag 1 is printed as the "NN/Other ratio".

    Args:
        d: catalog exposing `name` (str) and `all_data` (a DataFrame with a
            `z_assigned_flag` column).

    Returns:
        None. Two lines are written to stdout.
    """
    flags = d.all_data.z_assigned_flag
    label = d.name.replace("BGS sv3", "SV3")  # shorter label for SV3 catalogs

    n_total = len(flags)
    n_flag_zero = np.sum(flags == 0)
    n_flag_nonzero = np.sum(flags != 0)
    n_flag_one = np.sum(flags == 1)

    print(f"{label} completeness: {n_flag_zero / n_total:.1%}")
    print(f"  NN/Other ratio: {n_flag_one / n_flag_nonzero:.1%}")

#for d in [simple4_BGS, simple4_BGS_old, bgs_y3_simple_4, bgs_simple_5, bgs_y3_simple_5]:
#    completeness_stats(d)

#for d in [bgs_sv3_simple_4_10p, bgs_sv3_simple_4_9p, bgs_sv3_simple_4_8p, bgs_sv3_simple_4_7p, bgs_sv3_simple_4_6p, bgs_sv3_simple_4_5p, bgs_sv3_simple_4_4p, bgs_sv3_simple_4_3p, bgs_sv3_simple_4_2p, bgs_sv3_simple_4_1p]:
#    completeness_stats(d)

#for d in [bgs_sv3_simple_4_10p.centered, bgs_sv3_simple_4_9p.centered, bgs_sv3_simple_4_8p.centered, bgs_sv3_simple_4_7p.centered, bgs_sv3_simple_4_6p.centered, bgs_sv3_simple_4_5p.centered, bgs_sv3_simple_4_4p.centered, bgs_sv3_simple_4_3p.centered, bgs_sv3_simple_4_2p.centered, bgs_sv3_simple_4_1p.centered]:
#    completeness_stats(d)

#for d in [bgs_sv3_simple_5_10p, bgs_sv3_simple_5_9p, bgs_sv3_simple_5_8p, bgs_sv3_simple_5_7p, bgs_sv3_simple_5_6p, bgs_sv3_simple_5_5p, bgs_sv3_simple_5_4p, bgs_sv3_simple_5_3p, bgs_sv3_simple_5_2p, bgs_sv3_simple_5_1p]:
#    completeness_stats(d)

#for d in [bgs_sv3_simple_5_10p.centered, bgs_sv3_simple_5_9p.centered, bgs_sv3_simple_5_8p.centered, bgs_sv3_simple_5_7p.centered, bgs_sv3_simple_5_6p.centered, bgs_sv3_simple_5_5p.centered, bgs_sv3_simple_5_4p.centered, bgs_sv3_simple_5_3p.centered, bgs_sv3_simple_5_2p.centered, bgs_sv3_simple_5_1p.centered]:
#    completeness_stats(d)

for d in [bgs_sv3_simple_5_7p, bgs_simple_5, simple4_BGS]:
    completeness_stats(d)


In [None]:
# SV3 PLOTS (FULL 10 pass FOOTPRINT)
bgs_sv3_simple_4_7p.color = 'red'

pp.LEGENDS_ON = False
pp.plots(bgs_sv3_simple_4_10p, bgs_sv3_simple_4_9p, bgs_sv3_simple_4_8p, bgs_sv3_simple_4_7p, bgs_sv3_simple_4_6p, bgs_sv3_simple_4_5p, bgs_sv3_simple_4_4p, bgs_sv3_simple_4_3p, bgs_sv3_simple_4_2p, bgs_sv3_simple_4_1p)
pp.LEGENDS_ON = True

pp.completeness_comparison(bgs_sv3_simple_4_10p, bgs_sv3_simple_4_9p, bgs_sv3_simple_4_8p, bgs_sv3_simple_4_7p, bgs_sv3_simple_4_6p, bgs_sv3_simple_4_5p, bgs_sv3_simple_4_4p, bgs_sv3_simple_4_3p, bgs_sv3_simple_4_2p, bgs_sv3_simple_4_1p)


In [None]:
pp.z_assigned_fraction(bgs_sv3_simple_4_10p, bgs_sv3_simple_4_9p, bgs_sv3_simple_4_8p, bgs_sv3_simple_4_7p, bgs_sv3_simple_4_6p, bgs_sv3_simple_4_5p, bgs_sv3_simple_4_4p, bgs_sv3_simple_4_3p, bgs_sv3_simple_4_2p, bgs_sv3_simple_4_1p)

In [None]:
# SV3 function of # passes plots (USING CENTERS ONLY)
bgs_sv3_simple_4_7p.color = 'red'
bgs_sv3_simple_4_7p.centered.color = 'red'

pp.LEGENDS_ON = False
pp.plots(
    bgs_sv3_simple_4_10p.centered,
    bgs_sv3_simple_4_9p.centered,
    bgs_sv3_simple_4_8p.centered,
    bgs_sv3_simple_4_7p.centered,
    bgs_sv3_simple_4_6p.centered,
    bgs_sv3_simple_4_5p.centered,
    bgs_sv3_simple_4_4p.centered,
    bgs_sv3_simple_4_3p.centered,
    bgs_sv3_simple_4_2p.centered,
    bgs_sv3_simple_4_1p.centered
)    
pp.LEGENDS_ON = True

In [None]:
# SV3 function of # passes plots (USING CENTERS ONLY)
bgs_sv3_simple_5_7p.color = 'red'
bgs_sv3_simple_5_7p.centered.color = 'red'

pp.LEGENDS_ON = False
pp.plots(
    bgs_sv3_simple_5_10p.centered,
    bgs_sv3_simple_5_9p.centered,
    bgs_sv3_simple_5_8p.centered,
    bgs_sv3_simple_5_7p.centered,
    bgs_sv3_simple_5_6p.centered,
    bgs_sv3_simple_5_5p.centered,
    bgs_sv3_simple_5_4p.centered,
    bgs_sv3_simple_5_3p.centered,
    bgs_sv3_simple_5_2p.centered,
    bgs_sv3_simple_5_1p.centered
)    
pp.LEGENDS_ON = True

In [None]:
# SV3 with pure NN
bgs_sv3_nn_6p.color = [0.6, 0.6, 0.0]
pp.plots(bgs_sv3_simple_4_10p, bgs_sv3_simple_4_6p, bgs_sv3_nn_6p)

In [None]:
# View plots for my SDSS results from my MCMC chains
# NOTE(review): the header mentions MCMC chains, but the catalog used below is sdss_vanilla — confirm.
# `to_compare` is an alias, not a copy: the three calls below act on the very same
# object as sdss_vanilla, so every later cell sees the re-run catalog.
to_compare = sdss_vanilla
to_compare.run_group_finder(popmock=True)
to_compare.run_corrfunc()
to_compare.postprocess()

# These plots read sdss_vanilla directly, i.e. the object that was just re-run above.
pp.proj_clustering_plot(sdss_vanilla)
pp.lsat_data_compare_plot(sdss_vanilla)


#sdss_colors_chi.color = 'r'
#pp.plots(sdss_colors_mine, sdss_colors, sdss_colors_chi)

In [None]:
#simple4_BGS.name = 'DESI BGS Y1'
pp.plots(simple4_BGS, sdss_vanilla, sdss_colors, sdss_colors_chi)
#pp.plots(cat.sdss_published, sdss_colors_chi)

In [None]:
# Why doesn't mstar missing % exactly match z_assigned_flag? 
# Probably redshift failures. Still have a spectra so still have mstar
print(np.sum(np.isnan(simple4_BGS.all_data.mstar)) / len(simple4_BGS.all_data.mstar))
print(np.sum(simple4_BGS.all_data.z_assigned_flag != 0) / len(simple4_BGS.all_data.z_assigned_flag))

In [None]:
pp.compare_fsat_color_split(sdss_vanilla_old, sdss_vanilla, project_percent=0.52)


In [None]:
bgs_simple_4_1pass.color = 'r'
pp.plots(simple4_BGS, bgs_simple_4_1pass)

In [None]:
pp.compare_fsat_color_split(bgs_sv3_simple_4_10p, sdss_vanilla)


In [None]:
#pp.compare_fsat_color_split(all, simple_2)
#pp.compare_fsat_color_split(all, simple_4)
#pp.compare_fsat_color_split(sdss_vanilla, simple4_BGS)
pp.compare_fsat_color_split(bgs_sv3_simple_4_10p, simple4_BGS)
pp.compare_fsat_color_split(bgs_sv3_simple_4_10p, bgs_sv3_fiberonly_10p)
#pp.compare_fsat_color_split(bgs_sv3_10p, bgs_sv3_10p_all)
#pp.compare_fsat_color_split(sdss_vanilla, bgs_simple_4_1pass)


In [None]:
pp.qf.centered_plot(simple4_BGS)
pp.qf.centered_plot(sdss_published)

In [None]:
pp.fsat_by_z_bins(simple4_BGS, z_bins=np.array([0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 1.0]), show_plots=True)
#pp.fsat_by_z_bins(mxxl_simple_4, z_bins=np.array([0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 1.0]), show_plots=False, aggregation=pp.fsat_truth_vmax_weighted)


In [None]:
pp.fsat_by_z_bins(simple4_BGS, z_bins=np.array([0.0, 0.2, 1.0]))


In [None]:
pp.plots(mxxl_all, mxxl_fiberonly, mxxl_nn, mxxl_simple_4)


In [None]:
pp.plots(fiberonly_BGS, nn_BGS, simple4_BGS)


In [None]:
simple4_BGS.name = 'Our Algorithm BGS Y1'

In [None]:
pp.plots(bgs_sv3_simple_4_10p, bgs_sv3_fiberonly_10p)


In [None]:
# Print out biggest group size
# For each catalog: print its name, then the row count of its most populated group ('igrp').
# simple2_BGS is not in the list: its deserialize() call in the BGS loading cell is
# commented out, so referencing it here raises NameError on a fresh kernel.
for dataset in [simple4_BGS, fiberonly_BGS, fiberonly_1pass_BGS, sdss_vanilla]:
    print(dataset.name)
    print(dataset.all_data.groupby('igrp').size().max())


In [None]:
# SDSS Examine Bimodality

z=sdss_vanilla.all_data['z']
gmr=sdss_vanilla.all_data['Mag_g'] - sdss_vanilla.all_data['Mag_r']
junk=plt.hist(gmr, bins=np.linspace(-1,3,300), alpha=0.4)
#junk=plt.hist(k_correct(sdss_vanilla.all_data['Mag_g'], z, gmr, band='g')  - k_correct(sdss_vanilla.all_data['Mag_r'], z, gmr, band='r'), bins=500, alpha=0.4)
junk=plt.hist(sdss_vanilla.all_data['Dn4000'], bins=np.linspace(0,4,300), alpha=0.4)
plt.xlim(-1, 3)

In [None]:
# Investigate changes in halo mass function from wcen
m1=np.log10(sdss_vanilla.all_data['M_halo'])
m2=np.log10(sdss_colors.all_data['M_halo'])
m3=np.log10(sdss_colors_chi.all_data['M_halo'])

# bin m1,m2,m3 the same way
n_bins = 20
bins = np.linspace(10.8, 15.0, n_bins)
d1 = np.digitize(m1, bins)
d2 = np.digitize(m2, bins)
d3 = np.digitize(m3, bins)

# count the number of galaxies in each bin
# np.digitize returns i when bins[i-1] <= value < bins[i]. Index 0 (values below bins[0])
# is not counted, and index n_bins collects everything at or above bins[-1].
n1 = np.array([np.sum(d1==i) for i in range(1, n_bins+1)])
n2 = np.array([np.sum(d2==i) for i in range(1, n_bins+1)])
n3 = np.array([np.sum(d3==i) for i in range(1, n_bins+1)])

# Do the same but for log10(counts)
# (an empty bin gives log10(0) = -inf together with a NumPy divide-by-zero warning)
n1 = np.log10(n1)
n2 = np.log10(n2)
n3 = np.log10(n3)
# Show all three log-count arrays (the third argument used to be the literal 3).
print(n1,n2,n3)

# Log difference
p1 = np.abs(n1-n2)
p2 = np.abs(n1-n3)

# Each count with digitize index i is drawn at bins[i-1], the left edge of its bin.
plt.plot(bins, p1, label='SDSS Colors vs Vanilla')
plt.plot(bins, p2, label='SDSS Colors+Chi vs Vanilla')

plt.xlabel('log10(M_halo)')
plt.ylabel('Log10 Difference in Counts')
plt.legend()

## Make single group CSV for legacysurvey.org/viewer visualization

In [None]:
df = pd.read_csv(OUTPUT_FOLDER + 'NERSC_BGS_1pass_v1.out')
centrals_of_big_groups = df['N_sat'] > 0
group_ids = df.loc[centrals_of_big_groups].igrp.unique()

In [None]:
df[df.igrp == 1644058]

In [None]:
print(group_ids[0:10])

In [None]:
#for i in group_ids[0:10]:
for i in [1644058, 1644051]:
    #df.loc[df.igrp == i, ['RA', 'Dec']].to_csv(OUTPUT_FOLDER + f'group{i}.csv', index=False)
    print(df.loc[df.igrp == i, ['RA', 'Dec', 'z', 'z_assigned_flag']])

## Study z_phot vs z_spectra

In [None]:
df = bgs_y3_simple_5.all_data
# Keep rows that have a photo-z (z_phot != -99.0) AND whose z_assigned_flag is 0.
# Each comparison needs its own parentheses: `&` binds tighter than `==`, so
# `a & b == 0` is evaluated as `(a & b) == 0`, which is a different selection.
good_idx = np.flatnonzero((df['z_phot'] != -99.0) & (df['z_assigned_flag'] == 0))

# good_idx holds row positions (np.flatnonzero output), so select with .iloc;
# plain [] on a Series looks the values up as index labels instead.
delta_z = df['z_phot'].iloc[good_idx] - df['z'].iloc[good_idx]
plt.hist(delta_z, bins=500, range=(-0.1, 0.1))
plt.yscale("log")
plt.title("Photo-z Quality")
plt.ylabel("Count")
plt.xlabel("z_phot - z_spec")

# add bars for my z_thresh
plt.axvline(-SIM_Z_THRESH, color='red')
plt.axvline(SIM_Z_THRESH, color='red')

percentiles = np.percentile(delta_z, [16, 50, 84])
print(f"Median delta z: {percentiles[1]:.4f}, 16th percentile: {percentiles[0]:.4f}, 84th percentile: {percentiles[2]:.4f}")
# add bars for the percentiles
#plt.axvline(percentiles[0], color='green')
#plt.axvline(percentiles[2], color='green')



# What % fall within SIM_Z_THRESH of the spectroscopic redshift?
within_5_milli = np.abs(delta_z) < SIM_Z_THRESH
print(f"{np.sum(within_5_milli) / len(delta_z) * 100:.2f}% of galaxies have a photometric redshift within {SIM_Z_THRESH} of the spectroscopic redshift.")

# Now look only at quiescent galaxies less than 10^9 solar luminosities
# TODO 
#luminosity = abs_mag_r_to_log_solar_L(app_mag_to_abs_mag_k(app_mag_r, z_obs, g_r_apparent))


## BGS and SDSS Target Overlap Analysis

TODO: need to use a version of SDSS data that doesn't have nearest-neighbor assigned redshifts in it!

In [None]:
# Turn on pandas copy-on-write — presumably so the 'sdss_z' assignment on the
# `lost_bgs` slice below does not warn or write through to the parent frame.
# NOTE(review): this is a process-wide option and stays set for every later cell.
pd.options.mode.copy_on_write = True

# For this comparison, use pure NN BGS
# NOTE(review): the line above says "pure NN BGS", but the catalog actually used is simple4_BGS — confirm which is intended.
bgs_to_use = simple4_BGS.all_data
# "Lost" rows here are those whose z_assigned_flag is non-zero.
lost_bgs = bgs_to_use.loc[bgs_to_use['z_assigned_flag'] != 0]
sdss_cat = sdss_vanilla.all_data

# Sky positions: SDSS rows form the reference catalog, lost BGS rows are matched against it.
catalog = coord.SkyCoord(ra=sdss_cat.RA.to_numpy()*u.degree, dec=sdss_cat.Dec.to_numpy()*u.degree, frame='icrs')
to_match = coord.SkyCoord(ra=lost_bgs.RA.to_numpy()*u.degree, dec=lost_bgs.Dec.to_numpy()*u.degree, frame='icrs')

# Nearest SDSS neighbor on the sky for each lost BGS row; d3d is returned but not used below.
idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=1, storekdtree=False)

# if angular distance is < 3", then we consider it a match to SDSS catalog
lost_bgs['sdss_z'] = np.where(d2d < 3*u.arcsec, sdss_cat.iloc[idx]['z'], np.nan)
lost_bgs_matched = lost_bgs.loc[~np.isnan(lost_bgs['sdss_z'])]
print(f"Matched {len(lost_bgs_matched)} out of {len(lost_bgs)} lost BGS galaxies to SDSS catalog, ({len(lost_bgs_matched)/len(lost_bgs)*100:.2f}%)")

# A "good" match: the z in the BGS catalog agrees with the SDSS z to within 0.001 (absolute).
good_match = np.isclose(lost_bgs_matched.z, lost_bgs_matched.sdss_z, atol=0.001).sum()
print(f"Good match: {good_match} out of {len(lost_bgs_matched)}, or {good_match/len(lost_bgs_matched)*100:.2f}%")

## Viraj Targets

In [52]:
# Viraj Compare
path = DATA_FOLDER + 'VIRAJ/jura_bgs_bright_catalog_for_ian.fits'
table = Table.read(path)
viraj_df = table.to_pandas()
viraj_df.set_index('TARGETID', inplace=True)

In [53]:
def merge_viraj_ian(viraj_df: pd.DataFrame, gc : GroupCatalog):
    """Inner-join a target table with selected columns of a group catalog.

    The catalog's `all_data` is re-indexed by 'target_id', cut down to a fixed
    column list, and joined onto `viraj_df` by index with a one-to-one check.
    Along the way the dtype of 'igrp' is printed before and after the join,
    followed by a one-line summary of the row counts.

    Note: inside this function the parameter `gc` hides the notebook-level
    `gc` module alias.

    Args:
        viraj_df: target table whose index holds the target ids.
        gc: catalog exposing `all_data` (DataFrame with a 'target_id' column and
            the columns listed below) and `name`.

    Returns:
        The joined DataFrame: only targets present in both inputs, with the
        columns of `viraj_df` followed by the selected catalog columns.
    """
    catalog_columns = [
        'z', 'L_gal', 'V_max', 'P_sat', 'M_halo', 'N_sat', 'L_tot',
        'igrp', 'weight', 'app_mag', 'z_assigned_flag', 'g_r', 'z_phot', 'is_sat', 'quiescent',
    ]
    catalog_df = gc.all_data.set_index('target_id').loc[:, catalog_columns]
    print(catalog_df.igrp.dtype)
    #catalog_df['quiescent'] = catalog_df['quiescent'].astype(float)

    # Same column, same position; only its dtype becomes int.
    catalog_df = catalog_df.assign(N_sat=catalog_df['N_sat'].astype(int))

    merged = viraj_df.join(catalog_df, how='inner', validate='one_to_one')
    print(merged.igrp.dtype)

    n_found = (~np.isnan(merged.is_sat)).sum()
    print(f"Viraj targets: {len(viraj_df):,}, Ian {gc.name} Catalog: {len(catalog_df):,}, # of Viraj Targets found in Ian's: {n_found:,}")
    return merged

In [None]:
together1 = merge_viraj_ian(viraj_df, bgs_simple_4_1pass)
together2 = merge_viraj_ian(viraj_df, simple4_BGS)
together3 = merge_viraj_ian(viraj_df, bgs_y3_simple_5)

In [None]:
missing=together3.loc[np.isnan(together3.is_sat)]
missing

In [None]:
print(bgs_y3_simple_5.all_data.igrp.dtype)
print(together3.igrp.dtype)

In [57]:
to_write = Table.from_pandas(together3, index=True)
to_write.write(DATA_FOLDER + 'VIRAJ/jura_bgs_bright_catalog_for_ian_matched.fits', overwrite=True)

In [58]:
combined = Table.read(DATA_FOLDER + 'VIRAJ/jura_bgs_bright_catalog_for_ian_matched.fits', format='fits')
df = combined.to_pandas()

In [None]:
df

## Mock and SV3 Analysis

### UCHUU Issues

In [None]:
plt.hist(all_u.all_data['M_halo'], bins=pp.Mhalo_bins, alpha=0.4)
plt.hist(all_u.all_data['uchuu_halo_mass']*10**10, bins=pp.Mhalo_bins, alpha=0.4)
plt.loglog()

# TODO do we expect the mass distribution of halos to be so different from the UCHUU SHAM catalog and our assigned halo?

In [None]:
# TODO 1 / VMax corrections do odd thing to UCHUU Truth. Why?
pp.hod_plots(all_u)

### What effect does Fiber Assignment have on the luminosity function?

In [None]:
pp.group_finder_centrals_halo_masses_plots(mxxl_all, [mxxl_fiberonly, mxxl_simple_4])

### Purity and Completeness

In [None]:
pp.test_purity_and_completeness(mxxl_all, mxxl_fiberonly, mxxl_nn, mxxl_simple_4)

In [None]:
pp.purity_complete_plots(mxxl_all, mxxl_fiberonly, mxxl_nn, mxxl_simple_4)

### Compare halos to truth

In [None]:
pp.assigned_halo_analysis(mxxl_simple_4)

### Compare assigned implied abs mags to truth from MXXL

In [None]:
# Row counts (non-null RA) per 'Lgal_bin'; both Series are handed to pp.L_func_plot in the next cell.
# NOTE(review): all_unobs_counts keeps only rows with z_assigned_flag != 0, whereas
# simple_4_ubobs_counts counts every row of mxxl_simple_4 despite its "unobs" name —
# confirm whether the same filter was intended.
all_unobs_counts = mxxl_all.all_data[mxxl_all.all_data.z_assigned_flag != 0].groupby('Lgal_bin').RA.count()
simple_4_ubobs_counts = mxxl_simple_4.all_data.groupby('Lgal_bin').RA.count()


In [None]:
pp.L_func_plot([mxxl_all, mxxl_simple_4], [all_unobs_counts, simple_4_ubobs_counts])



#pp.L_func_plot([all, simple_4], [all.all_data.L_gal[all.all_data.z_assigned_flag == 0], simple_4.all_data.L_gal[simple_4.all_data.z_assigned_flag == 0]])


### Correct Redshifts

In [None]:
def merit_score(simz_score):
    """Linearly rescale a score so that 0.5 maps to 0 and 1 maps to 1 (0 maps to -1)."""
    offset_from_half = simz_score - 0.5
    return 2*offset_from_half

# For all the z_assigned_flag galaxies, compare the z we assigned to the z_obs
# Need to ignore ones where z_obs is nan or -99 or whatever
# For every catalog in `sets`, two mean scores are collected:
#   scores_all_lost - over rows with z_assigned_flag != 0
#   scores_n_only   - over rows with z_assigned_flag == 1
# Both are restricted to rows with a usable z_obs.
# NOTE(review): sim_z_score is defined outside this notebook; it is assumed to return
# one score per (assigned z, observed z) pair — confirm.
sets = [bgs_sv3_pz_1_7p, bgs_sv3_simple_5_7p, bgs_sv3_simple_4_7p, bgs_sv3_nn_7p, mxxl_simple_4]
scores_all_lost = []
scores_n_only = []

for s in sets:
    print(f"*** Summarizing results for {s.name} ***")

    print(f"All lost galaxies (with 'truth'):")
    # Usable truth: z_obs is neither the -99 sentinel nor NaN.
    valid_idx = (s.all_data['z_obs'] != -99) & (~np.isnan(s.all_data['z_obs']))
    assigned_z = s.all_data.loc[valid_idx & (s.all_data['z_assigned_flag'] != 0), 'z']
    observed_z = s.all_data.loc[valid_idx & (s.all_data['z_assigned_flag'] != 0), 'z_obs']
    score = sim_z_score(assigned_z, observed_z)
    scores_all_lost.append(score.mean())
    print(f" Galaxies to compare: {len(assigned_z)} ({len(assigned_z) / len(s.all_data):.1%})")
    # This ratio is taken over ALL rows with a non-zero flag, not only those with usable z_obs.
    print(f" Neighbor z used {np.sum(s.all_data.z_assigned_flag == 1) / np.sum(s.all_data.z_assigned_flag != 0):.1%}")
    print(f" Score Mean: {score.mean():.4f}")

    print("Neighbor-assigned Only:")
    # Same comparison, restricted to rows whose flag is exactly 1.
    assigned_z2 = s.all_data.loc[valid_idx & (s.all_data['z_assigned_flag'] == 1), 'z']
    observed_z2 = s.all_data.loc[valid_idx & (s.all_data['z_assigned_flag'] == 1), 'z_obs']
    score2 = sim_z_score(assigned_z2, observed_z2)
    scores_n_only.append(score2.mean())
    print(f" Galaxies to compare: {len(assigned_z2)} ({len(assigned_z2) / len(s.all_data):.1%})")
    print(f" Score Mean: {score2.mean():.4f}")

# Plotting the results
labels = [s.name for s in sets]
x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, scores_all_lost, width, label='All Lost')
rects2 = ax.bar(x + width/2, scores_n_only, width, label='Neighbor-assigned Only')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_xlabel('Catalogs')
ax.set_ylabel('Mean Scores')
ax.set_title('Mean Scores by Catalog and Assignment Type')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.legend()

fig.tight_layout()

plt.show()


In [None]:
def merit_score(simz_score):
    """Map a score linearly so that 0 -> -1, 0.5 -> 0 and 1 -> 1.

    An identical definition appears in the preceding cell of this notebook;
    this one rebinds the same name.
    """
    return (simz_score - 0.5)*2

# do like the above, but use 

## SV3 Edge Effects Quantification

In [None]:
inner_galaxies = gc.filter_SV3_to_avoid_edges(bgs_sv3_simple_4_10p)
inner_galaxies.color = 'k'
inner_galaxies.name = 'SV3 Inner Galaxies'
pp.plots(inner_galaxies, bgs_sv3_simple_4_10p)

In [None]:
fig = pp.make_map(bgs_sv3_simple_4_10p.all_data.RA.to_numpy(), bgs_sv3_simple_4_10p.all_data.Dec.to_numpy())
fig = pp.make_map(inner_galaxies.all_data.RA.to_numpy(), inner_galaxies.all_data.Dec.to_numpy(), fig=fig)

In [None]:
bgs_sv3_simple_4_10p.all_data.groupby('Lgal_bin')['z'].median()

In [None]:
# One edge-filtered version of the SV3 10-pass catalog per cut value, from 1.5 down to 0.9.
# The cut is passed straight through as the second argument of gc.filter_SV3_to_avoid_edges.
center_cuts = [1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9]
centering_versions = [gc.filter_SV3_to_avoid_edges(bgs_sv3_simple_4_10p, cut) for cut in center_cuts]

# Cache the list on disk. The context manager closes the file as soon as the dump
# finishes, so the pickle is fully flushed before any later cell reads it back.
with open('centering_versions.pkl', 'wb') as f:
    pickle.dump(centering_versions, f)

In [None]:
# Same edge cuts as above, applied to the z < 0.03 subset of the SV3 10-pass catalog.
# `copy` comes from the notebook's import cell; it is not re-imported here.
lowz = bgs_sv3_simple_4_10p.all_data.loc[bgs_sv3_simple_4_10p.all_data.z < 0.03]
# Work on a deep copy so the full catalog object keeps its own all_data.
lowz_gc = copy.deepcopy(bgs_sv3_simple_4_10p)
lowz_gc.all_data = lowz
lowz_gc.refresh_df_views()

lowz_center_cuts = [1.5, 1.4, 1.3, 1.2, 1.1, 1.0, 0.9]
centering_versions_lowz = [gc.filter_SV3_to_avoid_edges(lowz_gc, cut) for cut in lowz_center_cuts]

In [None]:
# Reload the cached list from disk. The result has to be bound to a name — a bare
# pickle.load(...) call throws the unpickled objects away — and the context manager
# closes the file handle.
# Only load pickle files written by this notebook: unpickling can execute arbitrary code.
with open('centering_versions.pkl', 'rb') as f:
    centering_versions = pickle.load(f)

# Shade the versions from black to green and label each with its cut (1.5, 1.4, ... in list order).
for i, d in enumerate(centering_versions):
    d.color = [0, i/len(centering_versions), 0]
    d.name = f'SV3 10p, {1.5-i*0.1:.1f} deg center cut'

pp.LEGENDS_ON = False
bgs_sv3_simple_4_10p.color = 'blue'
pp.fsat_by_zbins_sv3_centers(bgs_sv3_simple_4_10p, *centering_versions, z_bins=np.array([0.0, 0.03, 1.0]))
pp.single_plots(bgs_sv3_simple_4_10p)
pp.single_plots(centering_versions[2])
pp.single_plots(centering_versions[4])
pp.single_plots(centering_versions[6])
pp.LEGENDS_ON = True

#pp.fsat_by_z_bins(bgs_sv3_simple_4_10p, z_bins=np.array([0.0, 0.03, 1.0]))
#for d in centering_versions:
#    pp.fsat_by_z_bins(d, z_bins=np.array([0.0, 0.03, 1.0]))


In [None]:
lowz_gc.color = 'blue'
pp.single_plots(lowz_gc)
pp.single_plots(centering_versions_lowz[2])
pp.single_plots(centering_versions_lowz[4])
pp.single_plots(centering_versions_lowz[6])


In [None]:
#fig = pp.make_map(bgs_sv3_simple_4_10p.all_data.RA.to_numpy(), bgs_sv3_simple_4_10p.all_data.Dec.to_numpy())

#for i, gc in enumerate(centering_versions):
#    fig = pp.make_map(gc.all_data.RA.to_numpy(), gc.all_data.Dec.to_numpy(), fig=fig)

#plot_positions(bgs_sv3_simple_4_10p, *centering_versions, tiles_df=None, split=False, DEG_LONG=7, ra_min = 186.5, dec_min = 60)
# plot_positions is given DataFrames, so pass each catalog's all_data rather than the catalog object itself
# (this applies to the centering versions as well as to the full 10-pass catalog).
# NOTE(review): assumes every positional argument of plot_positions is a DataFrame — confirm against pyutils.
plot_positions(bgs_sv3_simple_4_10p.all_data, *[v.all_data for v in centering_versions], tiles_df=None, split=False, DEG_LONG=6, ra_min = 147, dec_min = -1)

## Lost Galaxy Luminosity Function

In [None]:
bgs_sv3_simple_4_9p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_8p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_7p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_6p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_5p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_4p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_3p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_2p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_simple_4_1p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)
bgs_sv3_nn_6p.get_true_z_from(bgs_sv3_simple_4_10p.all_data)

In [None]:
def compare_L_funcs(one: pd.DataFrame, two: pd.DataFrame):
    """Plot two data sets side by side with pp.L_func_plot.

    For each frame, the number of non-null 'RA' entries per 'Lgal_bin' is
    computed; the frames and their per-bin counts are then passed to
    pp.L_func_plot as two parallel lists.

    Args:
        one: first frame; needs 'Lgal_bin' and 'RA' columns.
        two: second frame; same requirements.

    Returns:
        None.
    """
    frames = [one, two]
    per_bin_counts = [frame.groupby('Lgal_bin').RA.count() for frame in frames]
    pp.L_func_plot(frames, per_bin_counts)

In [None]:
data = bgs_sv3_nn_6p.all_data
lostrows = data.z_assigned_flag != 0
lost_and_havetruth_rows = np.logical_and(data.z_assigned_flag != 0, data.z_T > 0)
lost_galaxies = data.loc[lostrows]
lost_withT_galaxies = data.loc[lost_and_havetruth_rows]
obs_galaxies = data.loc[~lostrows]
#.loc[obs_galaxies.z_truth > 0]

assert np.isclose(obs_galaxies.z, obs_galaxies.z_T).all()
assert np.isclose(obs_galaxies.L_gal, obs_galaxies.L_gal_T).all()

bins = np.log10(gc.L_gal_bins)

obs_counts = obs_galaxies.groupby("Lgal_bin").RA.count()
lost_assumed_counts_with_all = lost_galaxies.groupby("Lgal_bin").RA.count()
lost_assumed_counts = lost_withT_galaxies.groupby("Lgal_bin").RA.count()
lost_truth_counts = lost_withT_galaxies.groupby("Lgal_bin_T").RA.count()

print("Before Correction")
print(obs_counts.sum())
print(lost_assumed_counts_with_all.sum())
print(lost_assumed_counts.sum())
print(lost_truth_counts.sum())

# Boost them all up to the counts of obs_counts for more even comparison
lost_assumed_counts_with_all = (lost_assumed_counts_with_all * len(obs_galaxies) / len(lost_galaxies)).astype(int)
lost_assumed_counts = (lost_assumed_counts * len(obs_galaxies) / len(lost_withT_galaxies)).astype(int)
lost_truth_counts = (lost_truth_counts * len(obs_galaxies) / len(lost_withT_galaxies)).astype(int)

print("After Correction")
print(obs_counts.sum())
print(lost_assumed_counts_with_all.sum())
print(lost_assumed_counts.sum())
print(lost_truth_counts.sum())

percent_diff_with_obs = ((obs_counts - lost_assumed_counts_with_all) / lost_assumed_counts_with_all) * 100

percent_diff_with_all = ((lost_assumed_counts- lost_assumed_counts_with_all) / lost_assumed_counts_with_all) * 100

percent_diff = ((lost_assumed_counts - lost_truth_counts) / lost_truth_counts) * 100



Take a cut of SV3 whose completeness is similar to Y1 BGS.

Question: is the luminosity function of lost galaxies (that were later observed) different from the luminosity function of observed galaxies?


In [None]:
plt.figure()
j=plt.hist(np.log10(obs_galaxies['L_gal']), bins=bins, density=True, histtype='step', color='b', label='Obs galaxies')
#j=plt.hist(np.log10(obs_galaxies['L_gal_T']), bins=bins, density=True, histtype='step', label='Obs galaxies (True)')
#j=plt.hist(np.log10(lost_galaxies['L_gal']), bins=bins, density=True, histtype='step', label='Lost galaxies')
#j=plt.hist(np.log10(lost_withT_galaxies['L_gal']), bins=bins, density=True, histtype='step', label='Lost gals w/ Truth')
j=plt.hist(np.log10(lost_withT_galaxies['L_gal_T']), bins=bins, density=True, histtype='step', color='g', label='Lost gals w/ Truth (True)')
plt.legend()
plt.title("Do Obs and Lost Gals Have Different Luminosity Funcs?")
plt.xlabel('$log(L_{gal})$')
plt.draw()

plt.figure()
obs_vs_losttruth = ((obs_counts - lost_truth_counts) / lost_truth_counts) * 100
plt.plot(obs_vs_losttruth.index, obs_vs_losttruth.values, label="Observed => Lost Truth")
plt.title("Do Obs and Lost Gals Have Different Luminosity Funcs?")
plt.xlabel("$L_{gal}$ bin")
plt.ylabel("% Change in counts")
plt.xscale('log')
plt.xlim(1E8, 1E11)
plt.ylim(-25, 25)
plt.legend()
plt.axhline(0, color='black', lw=1)
plt.draw()

They seem similar; perhaps a mild slant. Overall it seems that trying to match the observed luminosity function with the lost ones is ok.


In [None]:
# This compares lost galaxies vs lost galaxies with truth, but using the assumed redshifts for all them. 
"""
plt.figure()
plt.plot(percent_diff_with_all.index, percent_diff_with_all.values, color='orange', label="All Lost => Lost w/ Truth")
plt.title("But...")
plt.xlabel("$L_{gal}$ bin")
plt.ylabel("% Change in counts")
plt.xscale('log')
plt.xlim(1E8, 1E11)
plt.ylim(-25, 25)
plt.legend()
plt.axhline(0, color='black', lw=1)
plt.draw()
"""

Now consider the lost galaxies in the 6-pass catalog for which we later obtained redshifts.

Question: What did our processing do to the luminosity function for lost galaxies?

In [None]:
plt.figure()
j=plt.hist(np.log10(lost_withT_galaxies['L_gal']), bins=bins, histtype='step', color='orange', label='Lost gals w/ Truth (Assumed)')
j=plt.hist(np.log10(lost_withT_galaxies['L_gal_T']), bins=bins, histtype='step', color='g', label='Lost gals w/ Truth (True)')
plt.legend()
plt.title("Luminosity function")
plt.xlabel('$log(L_{gal})$')
plt.draw()

plt.figure()
plt.plot(percent_diff.index, percent_diff.values, color='orange', label="Truth => Assumed")
plt.title("Effect on Luminosity Function of Lost Gals")
plt.xlabel("$L_{gal}$ bin")
plt.ylabel("% Change in counts")
plt.xscale('log')
plt.xlim(1E8, 1E11)
plt.ylim(-60, 60)
plt.legend()
plt.axhline(0, color='black', lw=1)
plt.draw()

Our processing squeezes the luminosity function. We move galaxies from the wings towards the middle.

## Galaxy Neighborhood Examiner

In [None]:
add_halo_columns(bgs_sv3_simple_4_7p)
data = bgs_sv3_simple_4_7p.all_data

In [None]:
lost_galaxies = data.loc[data['z_assigned_flag'] != 0]
#lost_galaxies_2 = nn.all_data.loc[nn.all_data['fiber_assigned_0'] == 0]
obs_galaxies = data.loc[data['z_assigned_flag'] == 0]
#obs_galaxies_2 = nn.all_data.loc[nn.all_data['fiber_assigned_0'] == 1]
print("Lost galaxies: ", len(lost_galaxies), "Observed Galaxies: ", len(obs_galaxies))

In [None]:
#%matplotlib qt 

In [None]:
np.log10(gc.Mhalo_bins[16])

In [None]:
obs_galaxies[np.logical_and(np.isclose(obs_galaxies['Mh_bin'], gc.Mhalo_bins[16]), close_enough(0.03, obs_galaxies['z']))]

In [None]:
PLOTS_TO_MAKE = 10
GALAXY_POOL = obs_galaxies

#START_INDEX = 777
#for i in range(START_INDEX, START_INDEX + PLOTS_TO_MAKE):
#    index = lost_galaxies.index[i]
#    examine_around(index)
print("Number of galaxies to choose from: ", len(GALAXY_POOL))
# Draw PLOTS_TO_MAKE row positions (with replacement). np.random.randint excludes its
# upper bound, so pass len(GALAXY_POOL) itself; `len(...) - 1` would make the last row
# unreachable and would raise for a pool of exactly one row.
indexes = np.random.randint(0, len(GALAXY_POOL), size=PLOTS_TO_MAKE)
for i in indexes:
    target = GALAXY_POOL.iloc[i]
    pp.examine_around(target, data)

### Idea: analyze entire neighborhood and look for groups of similar z galaxies, choose a z from the biggest group

In [None]:
def neighbor_exam(target):
    """Count observed galaxies near `target` on the sky and how many lie at a similar distance.

    Neighbors are rows of the notebook-level `obs_galaxies` frame whose RA and Dec
    both fall strictly inside a box of half-width `coord.Angle('5m')` centered on
    the target. The same offset is applied to RA and Dec (no cos(Dec) scaling and
    no RA wrap-around handling). A neighbor counts as "close" when its `ldist_true`
    is within 20 (absolute, in the units returned by z_to_ldist) of
    z_to_ldist(target.z_obs).

    Args:
        target: row-like object with `RA`, `Dec` (used as degrees) and `z_obs`.

    Returns:
        Tuple (n_close, n_nearby, n_close / n_nearby). When the box contains no
        observed galaxy the result is (0, 0, nan).
    """
    nearby_angle = coord.Angle('5m')
    target_dist_true = z_to_ldist(target.z_obs)

    ra_center = coord.Angle(target.RA*u.degree)
    dec_center = coord.Angle(target.Dec*u.degree)
    ra_max = (ra_center + nearby_angle).value
    ra_min = (ra_center - nearby_angle).value
    dec_max = (dec_center + nearby_angle).value
    dec_min = (dec_center - nearby_angle).value

    # The @names are resolved by DataFrame.query from the local variables above.
    nearby = obs_galaxies.query('RA < @ra_max and RA > @ra_min and Dec < @dec_max and Dec > @dec_min')

    # Empty neighborhood: report it explicitly instead of falling through to a 0/0 division.
    if len(nearby) == 0:
        return (0, 0, np.nan)

    close_neighbors = np.isclose(nearby.ldist_true.to_numpy(), target_dist_true, rtol=0.0, atol=20)
    n_close = np.sum(close_neighbors)
    return (n_close, len(nearby), n_close/len(nearby))

MAX = 300
close = np.empty(MAX)
total = np.empty(MAX)
frac = np.empty(MAX)
for i in range(0,MAX):
    target = lost_galaxies.iloc[i]
    close[i], total[i], frac[i] = neighbor_exam(target)

In [None]:
# Save the per-galaxy neighbor counts so they can be reloaded without re-running the loop.
# NOTE(review): `close` and `total` are allocated with MAX (300) entries in the previous cell,
# so slicing up to 81408 simply returns the whole array; the bound looks like a leftover
# from a longer run — confirm.
max_finished = 81408
finished_close = close[0:max_finished]
finished_total = total[0:max_finished]
# NOTE(review): the file names say "mxxl", but `lost_galaxies` above is taken from
# bgs_sv3_simple_4_7p — confirm the intended data set before overwriting these files.
with open(OUTPUT_FOLDER + 'mxxl_lostgal_neighborhood_close.npy', 'wb') as f:
    np.save(f, finished_close)
with open(OUTPUT_FOLDER + 'mxxl_lostgal_neighborhood_total.npy', 'wb') as f:
    np.save(f, finished_total)

In [None]:
with open(OUTPUT_FOLDER + 'mxxl_lostgal_neighborhood_close.npy', 'rb') as f:
    close = np.load(f)

with open(OUTPUT_FOLDER + 'mxxl_lostgal_neighborhood_total.npy', 'rb') as f:
    total = np.load(f)

frac = close / total

In [None]:
bins = np.linspace(0,30,31)
trash = plt.hist(close, bins=bins)
plt.title("Lost Galaxies Neighbors at ~Correct z")
plt.xlabel("Count of Similar z Neighbors")
plt.ylabel("Count of Lost Galaxies")
print("Hopeless Fraction: ", np.sum(close==0) / len(close))
print("Essentially Hopeless Fraction: ", (np.sum(close==0) + np.sum(close==1)) / len(close))

In [None]:
viable = close > 1
frac[viable]
trash=plt.hist(frac[viable], bins=30)
plt.title("Viable Lost Galaxies: Correct z Neighbor Fraction")
plt.xlabel("Fraction with Similar z")
plt.ylabel("Count of Viable Lost Galaxies")

# Tests

In [None]:
# How many halos were assigned below a certain cutoff?
df = simple4_BGS.all_data
M_HALL_CUT = 10**11
small_halo_df = df[df.M_halo < M_HALL_CUT]

print(len(small_halo_df), len(df))

junk=plt.hist(small_halo_df.z, bins=100)
