In [2]:
import numpy as np
import astropy.coordinates as coord
import sys
import matplotlib.pyplot as plt
from astropy.table import Table,join

if './SelfCalGroupFinder/py/' not in sys.path:
    sys.path.append('./SelfCalGroupFinder/py/')
from groupcatalog import GroupCatalog, BGSGroupCatalog, TestGroupCatalog, serialize, deserialize, SDSSGroupCatalog
import catalog_definitions as cat
from dataloc import *
import pyutils as pyutils
import plotting as pp
%load_ext autoreload
%autoreload 2

## Running Group Finder

This notebook calls the functions that preprocess the input data, run the group finder, and postprocess its output, building up a GroupCatalog object. A GroupCatalog mostly wraps a pandas DataFrame containing the resulting group catalog data.

After running this on a given GroupCatalog definition, a serialized (via pickle) version of the GroupCatalog object will exist which can be deserialized elsewhere for analysis. See post_plots.ipynb for that.

In [3]:
# Catalog definitions from catalog_definitions (imported as `cat`), grouped
# into families so that a whole family can be queued for a run at once.

# SDSS-based catalog definitions.
sdss_list : list[GroupCatalog] = [
    cat.sdss_vanilla,
    cat.sdss_colors,
    cat.sdss_colors_chi,
    cat.sdss_vanilla_v2,
    cat.sdss_colors_v2,
    cat.sdss_colors_chi_v2,
]
# Uchuu catalog definitions.
uchuu_list : list[GroupCatalog] = [
    cat.uchuu_all,
]
# MXXL catalog definitions.
# TODO: the MXXL input file needs to be moved somewhere accessible before these can be re-run.
mxxl_list : list[GroupCatalog] = [
    cat.mxxl_all,
    cat.mxxl_all_c,
    cat.mxxl_fiberonly,
    cat.mxxl_fiberonly_c,
    cat.mxxl_nn,
    cat.mxxl_nn_c,
    cat.mxxl_simple_2,
    cat.mxxl_simple_2_c,
    cat.mxxl_simple_4,
    cat.mxxl_simple_4_c,
]
# BGS SV3 catalog definitions (nn, fiber-only, and simple v4 / v5 variants
# with the suffixes 10p down to 1p).
bgs_sv3_list : list[GroupCatalog] = [
    cat.bgs_sv3_nn_6p,
    cat.bgs_sv3_fiberonly_10p,
    cat.bgs_sv3_simple_4_10p,
    cat.bgs_sv3_simple_4_9p,
    cat.bgs_sv3_simple_4_8p,
    cat.bgs_sv3_simple_4_7p,
    cat.bgs_sv3_simple_4_6p,
    cat.bgs_sv3_simple_4_5p,
    cat.bgs_sv3_simple_4_4p,
    cat.bgs_sv3_simple_4_3p,
    cat.bgs_sv3_simple_4_2p,
    cat.bgs_sv3_simple_4_1p,
    cat.bgs_sv3_simple_5_10p,
    cat.bgs_sv3_simple_5_9p,
    cat.bgs_sv3_simple_5_8p,
    cat.bgs_sv3_simple_5_7p,
    cat.bgs_sv3_simple_5_6p,
    cat.bgs_sv3_simple_5_5p,
    cat.bgs_sv3_simple_5_4p,
    cat.bgs_sv3_simple_5_3p,
    cat.bgs_sv3_simple_5_2p,
    cat.bgs_sv3_simple_5_1p,
]
# BGS Y1 catalog definitions.
bgs_y1_list : list[GroupCatalog] = [
    cat.bgs_simple_4_old,
    cat.bgs_simple_4,
    cat.bgs_simple_4_1pass,
    cat.bgs_simple_4_no_sdss,
    cat.bgs_simple_4_4p,
    cat.bgs_simple_4_c,
    cat.bgs_fiberonly,
    cat.bgs_fiberonly_1pass,
    cat.bgs_nn,
    cat.bgs_nn_sdsslike,
    cat.bgs_simple_2,
    cat.bgs_simple_2_c,
    cat.bgs_simple_5,
]
# BGS Y3 catalog definitions; the commented-out entries are currently
# excluded from this list.
bgs_y3_list : list[GroupCatalog] = [
    cat.bgs_y3_simple_4,
    #cat.bgs_y3_simple_4_4p,
    #cat.bgs_y3_fiberonly_1pass,
    #cat.bgs_y3_fiberonly,
    cat.bgs_y3_simple_5,
]

# Select the catalog families to (re)build in this run: uncomment a line to
# queue that whole family. Families are processed in the order listed.
datasets_to_run: list[GroupCatalog] = [
    #*sdss_list,
    #*uchuu_list,
    #*mxxl_list,
    #*bgs_sv3_list,
    *bgs_y1_list,
    *bgs_y3_list,
]

# Run the group finder on each selected catalog, postprocess the result, and
# pickle the finished object via serialize().
# To just run postprocessing on GF output, comment out run_group_finder()
for d in datasets_to_run:
    #d = deserialize(d)
    d.run_group_finder(popmock=False)
    d.postprocess()
    #d.run_corrfunc()
    serialize(d)
    #del(d)


Pre-processing...
Reading FITS data from  /mount/sirocco1/imw2293/GROUP_CAT/DATA/BGS_IRON/ian_BGS_merged.fits

Mode SIMPLE v4
10805993 objects in FITS file
5760804 galaxies in the NN catalog.
5842048 galaxies left after filters.
2212814 remaining galaxies that need redshifts
37.9% of remaining galaxies need redshifts
Matching 2212814 lost galaxies to 542516 SDSS galaxies
111466 of 2212814 redshifts taken from SDSS.
2101348 remaining galaxies need redshifts.
Initializing v4.0 of SimpleRedshiftGuesser
Assigning missing redshifts... 
Assigning missing redshifts complete.
Quick NN uses: 831075.
Random draw uses: 1270273.
Quick NN bailed: 8754. Affected: 0.0041658973192445995
3089220 quiescent galaxies, 2752828 star-forming galaxies
Galprops pickling took 2.9300 seconds
Output file will be /mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS <19.5 1pass.dat
Building output file string... done
Writing output files... done
Time for file writing: 31.675775289535522


input> FLUXLIM: 1, COLOR: 1, STELLAR_MASS: 0 
input> z: 0.001011-0.799862, frac_area: 0.187600
input> wcen OFF
input> Bsat OFF
input> SECOND_PARAMETER= 0
Allocating space for [5842047] galaxies
Done reading in from [/mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS <19.5 1pass.dat]
sorting galaxies...
done sorting galaxies.
Starting inverse-sham...
Done inverse-sham.
Building KD-tree...
Done building KD-tree. 5842047
iter 1 ngroups=4760869 fsat=0.183232 (kdtime=187.64 196.36)
iter 2 ngroups=4719915 fsat=0.191215 (kdtime=111.77 120.97)
iter 3 ngroups=4704889 fsat=0.194370 (kdtime=111.88 120.55)
iter 4 ngroups=4699118 fsat=0.195492 (kdtime=104.84 113.43)
iter 5 ngroups=4696760 fsat=0.195985 (kdtime=108.79 117.31)


Post-processing...
Getting fastspecfit data... done
Post-processing done.
Pre-processing...
Reading FITS data from  /mount/sirocco1/imw2293/GROUP_CAT/DATA/BGS_IRON/ian_BGS_merged.fits

Mode SIMPLE v4
10805993 objects in FITS file
5760804 galaxies in the NN catalog.
2039867 galaxies left after filters.
431599 remaining galaxies that need redshifts
21.2% of remaining galaxies need redshifts
Initializing v4.0 of SimpleRedshiftGuesser
Assigning missing redshifts... 
Assigning missing redshifts complete.
Quick NN uses: 189145.
Random draw uses: 242454.
Quick NN bailed: 3233. Affected: 0.007490749515174966
1072945 quiescent galaxies, 966922 star-forming galaxies
Galprops pickling took 1.0311 seconds
Output file will be /mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS <19.5 no-sdss.dat
Building output file string... done
Writing output files... done
Time for file writing: 10.838062763214111


input> FLUXLIM: 1, COLOR: 1, STELLAR_MASS: 0 
input> z: 0.001026-0.799862, frac_area: 0.064968
input> wcen OFF
input> Bsat OFF
input> SECOND_PARAMETER= 0
Allocating space for [2039866] galaxies
Done reading in from [/mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS <19.5 no-sdss.dat]
sorting galaxies...
done sorting galaxies.
Starting inverse-sham...
Done inverse-sham.
Building KD-tree...
Done building KD-tree. 2039866
iter 1 ngroups=1652688 fsat=0.187691 (kdtime=37.82 40.86)
iter 2 ngroups=1634868 fsat=0.197636 (kdtime=21.92 25.07)
iter 3 ngroups=1628517 fsat=0.201342 (kdtime=22.30 25.32)
iter 4 ngroups=1626477 fsat=0.202550 (kdtime=21.61 24.65)
iter 5 ngroups=1625713 fsat=0.202980 (kdtime=21.68 24.65)


Post-processing...
Post-processing done.
Pre-processing...
Reading FITS data from  /mount/sirocco1/imw2293/GROUP_CAT/DATA/BGS_IRON/ian_BGS_merged.fits

Mode SIMPLE v4
10805993 objects in FITS file
5760804 galaxies in the NN catalog.
2039867 galaxies left after filters.
431599 remaining galaxies that need redshifts
21.2% of remaining galaxies need redshifts
Matching 431599 lost galaxies to 542516 SDSS galaxies
24084 of 431599 redshifts taken from SDSS.
407515 remaining galaxies need redshifts.
Initializing v4.0 of SimpleRedshiftGuesser
Assigning missing redshifts... 
Assigning missing redshifts complete.
Quick NN uses: 178179.
Random draw uses: 229336.
Quick NN bailed: 2492. Affected: 0.006115112327153602
1073610 quiescent galaxies, 966257 star-forming galaxies
Galprops pickling took 1.0217 seconds
Output file will be /mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS <19.5 c.dat
Building output file string... done
Writing output files... done
Time for file writing: 10.8885219097137

input> FLUXLIM: 1, COLOR: 1, STELLAR_MASS: 0 
input> z: 0.001026-0.799862, frac_area: 0.064968
input> wcen ON: 13.100000 2.420000 12.900000 4.840000 17.400000 2.670000
input> Bsat ON: -0.920000 10.250000 12.993000 -8.040000
input> SECOND_PARAMETER= 0
Allocating space for [2039866] galaxies
Done reading in from [/mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS <19.5 c.dat]
sorting galaxies...
done sorting galaxies.
Starting inverse-sham...
Done inverse-sham.
Building KD-tree...
Done building KD-tree. 2039866
iter 1 ngroups=1668828 fsat=0.178231 (kdtime=33.65 36.91)
iter 2 ngroups=1695806 fsat=0.166668 (kdtime=17.42 21.54)
iter 3 ngroups=1690161 fsat=0.170445 (kdtime=17.79 21.19)
iter 4 ngroups=1685483 fsat=0.172997 (kdtime=18.76 22.06)
iter 5 ngroups=1683398 fsat=0.174330 (kdtime=17.68 21.06)


Post-processing...
Post-processing done.
Pre-processing...
Reading FITS data from  /mount/sirocco1/imw2293/GROUP_CAT/DATA/BGS_IRON/ian_BGS_merged.fits

Mode FIBER ASSIGNED ONLY 1+ PASSES
10805993 objects in FITS file
3629234 galaxies left after filters.
0 remaining galaxies that need redshifts
0.0% of remaining galaxies need redshifts
1873684 quiescent galaxies, 1755550 star-forming galaxies
Galprops pickling took 1.8168 seconds
Output file will be /mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Observed 1pass+ BGS <19.5.dat
Building output file string... done
Writing output files... done
Time for file writing: 19.41002583503723


input> FLUXLIM: 1, COLOR: 1, STELLAR_MASS: 0 
input> z: 0.001011-0.799862, frac_area: 0.187600
input> wcen OFF
input> Bsat OFF
input> SECOND_PARAMETER= 0
Allocating space for [3629233] galaxies
Done reading in from [/mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Observed 1pass+ BGS <19.5.dat]
sorting galaxies...
done sorting galaxies.
Starting inverse-sham...
Done inverse-sham.
Building KD-tree...
Done building KD-tree. 3629233
iter 1 ngroups=3213353 fsat=0.113292 (kdtime=77.21 83.18)
iter 2 ngroups=3198863 fsat=0.118119 (kdtime=49.61 55.96)
iter 3 ngroups=3190616 fsat=0.120688 (kdtime=50.07 55.99)
iter 4 ngroups=3187255 fsat=0.121697 (kdtime=48.65 54.63)
iter 5 ngroups=3185800 fsat=0.122132 (kdtime=49.84 55.67)


Post-processing...
Post-processing done.
Pre-processing...
Reading FITS data from  /mount/sirocco1/imw2293/GROUP_CAT/DATA/BGS_IRON/ian_BGS_merged.fits

Mode SIMPLE v2
10805993 objects in FITS file
5760804 galaxies in the NN catalog.
2039867 galaxies left after filters.
431599 remaining galaxies that need redshifts
21.2% of remaining galaxies need redshifts
Matching 431599 lost galaxies to 542516 SDSS galaxies
24084 of 431599 redshifts taken from SDSS.
407515 remaining galaxies need redshifts.
Initializing v2.0 of SimpleRedshiftGuesser
Assigning missing redshifts... 
Assigning missing redshifts complete.
Quick NN uses: 178195.
Random draw uses: 229320.
Quick NN bailed: 2431. Affected: 0.0059654245855980765
1069559 quiescent galaxies, 970308 star-forming galaxies
Galprops pickling took 1.0243 seconds
Output file will be /mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v2 BGS <19.5 c.dat
Building output file string... done
Writing output files... done
Time for file writing: 11.055818796157

input> FLUXLIM: 1, COLOR: 1, STELLAR_MASS: 0 
input> z: 0.001026-0.799862, frac_area: 0.064968
input> wcen ON: 13.100000 2.420000 12.900000 4.840000 17.400000 2.670000
input> Bsat ON: -0.920000 10.250000 12.993000 -8.040000
input> SECOND_PARAMETER= 0
Allocating space for [2039866] galaxies
Done reading in from [/mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v2 BGS <19.5 c.dat]
sorting galaxies...
done sorting galaxies.
Starting inverse-sham...
Done inverse-sham.
Building KD-tree...
Done building KD-tree. 2039866
iter 1 ngroups=1671206 fsat=0.177119 (kdtime=43.98 47.31)
iter 2 ngroups=1700005 fsat=0.164696 (kdtime=23.13 27.05)
iter 3 ngroups=1694363 fsat=0.168381 (kdtime=23.72 27.19)
iter 4 ngroups=1689901 fsat=0.170869 (kdtime=24.36 27.77)
iter 5 ngroups=1687636 fsat=0.172263 (kdtime=24.20 27.76)


Post-processing...
Post-processing done.
Pre-processing...
Reading FITS data from  /mount/sirocco1/imw2293/GROUP_CAT/DATA/BGS_JURA/ian_BGS_Y3_merged.fits

Mode SIMPLE v4


  return 22.5 - 2.5*np.log10(FLUX_R)
  return 22.5 - 2.5*np.log10(FLUX_R)


18966006 objects in FITS file
12428858 galaxies in the NN catalog.
7705987 galaxies left after filters.
1552859 remaining galaxies that need redshifts
20.2% of remaining galaxies need redshifts
Matching 1552859 lost galaxies to 542516 SDSS galaxies
90956 of 1552859 redshifts taken from SDSS.
1461903 remaining galaxies need redshifts.
Initializing v4.0 of SimpleRedshiftGuesser
Assigning missing redshifts... 
Assigning missing redshifts complete.
Quick NN uses: 632070.
Random draw uses: 829833.
Quick NN bailed: 7491. Affected: 0.005124142983494801
4044758 quiescent galaxies, 3661229 star-forming galaxies
Galprops pickling took 4.1181 seconds
Output file will be /mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS Y3 <19.5.dat
Building output file string... done
Writing output files... done
Time for file writing: 41.260539293289185


input> FLUXLIM: 1, COLOR: 1, STELLAR_MASS: 0 
input> z: 0.001000-0.799994, frac_area: 0.233920
input> wcen OFF
input> Bsat OFF
input> SECOND_PARAMETER= 0
Allocating space for [7705986] galaxies
Done reading in from [/mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v4 BGS Y3 <19.5.dat]
sorting galaxies...
done sorting galaxies.
Starting inverse-sham...
Done inverse-sham.
Building KD-tree...
Done building KD-tree. 7705986
iter 1 ngroups=6316189 fsat=0.178208 (kdtime=319.82 331.40)
iter 2 ngroups=6332062 fsat=0.177311 (kdtime=173.82 190.02)
iter 3 ngroups=6473763 fsat=0.156487 (kdtime=250.40 267.34)
iter 4 ngroups=6326530 fsat=0.174349 (kdtime=178.11 192.87)
iter 5 ngroups=6422917 fsat=0.164218 (kdtime=240.35 257.01)


Post-processing...
Post-processing done.
Pre-processing...
Reading FITS data from  /mount/sirocco1/imw2293/GROUP_CAT/DATA/BGS_JURA/ian_BGS_Y3_merged.fits

Mode SIMPLE v5


  return 22.5 - 2.5*np.log10(FLUX_R)
  return 22.5 - 2.5*np.log10(FLUX_R)


18966006 objects in FITS file
12428858 galaxies in the NN catalog.
7705987 galaxies left after filters.
1552859 remaining galaxies that need redshifts
20.2% of remaining galaxies need redshifts
Matching 1552859 lost galaxies to 542516 SDSS galaxies
90956 of 1552859 redshifts taken from SDSS.
1461903 remaining galaxies need redshifts.
Initializing v5.0 of SimpleRedshiftGuesser
Assigning missing redshifts... 
Assigning missing redshifts complete.
Quick NN uses: 384016.
Random draw uses: 1077887.
Quick NN bailed: 2067. Affected: 0.001413910498849787
3997361 quiescent galaxies, 3708626 star-forming galaxies
Galprops pickling took 3.8703 seconds
Output file will be /mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v5 BGS Y3 <19.5.dat
Building output file string... done
Writing output files... done
Time for file writing: 41.68934202194214


input> FLUXLIM: 1, COLOR: 1, STELLAR_MASS: 0 
input> z: 0.001000-0.799994, frac_area: 0.233920
input> wcen OFF
input> Bsat OFF
input> SECOND_PARAMETER= 0
Allocating space for [7705986] galaxies
Done reading in from [/mount/sirocco1/imw2293/GROUP_CAT/OUTPUT/Simple v5 BGS Y3 <19.5.dat]
sorting galaxies...
done sorting galaxies.
Starting inverse-sham...
Done inverse-sham.
Building KD-tree...
Done building KD-tree. 7705986
iter 1 ngroups=6525751 fsat=0.151414 (kdtime=337.14 349.34)
iter 2 ngroups=6512721 fsat=0.154049 (kdtime=205.30 220.93)
iter 3 ngroups=6689052 fsat=0.129736 (kdtime=255.60 272.81)
iter 4 ngroups=6539370 fsat=0.147791 (kdtime=183.42 198.55)
iter 5 ngroups=6507368 fsat=0.154407 (kdtime=223.45 238.40)


Post-processing...
Post-processing done.


In [None]:
import pickle

# The pickle is unpacked into two items: (app_mag_bins, the_map).
# NOTE: pickle.load can execute arbitrary code; only load a map file that was
# produced by this project.
with open('SimpleRedshiftGuesserMap.pkl', 'rb') as f:
    loaded_map = pickle.load(f)
app_mag_bins, the_map = loaded_map

In [None]:
# Display the map unpacked from SimpleRedshiftGuesserMap.pkl (a bare last
# expression is rendered as the cell output).
the_map

In [None]:
# Entries of the_map to inspect; one histogram figure is shown per entry.
indexes = [10, 11, 30, 45]
for i in indexes:
    bin_values = the_map[i]
    # NOTE(review): the title reads app_mag_bins[i-1] for map entry i --
    # presumably entry i is labelled by the preceding bin edge; confirm
    # against the code that builds the map.
    bin_label = app_mag_bins[i-1]
    plt.hist(bin_values, bins=20)
    plt.title(f'app mag ~ {bin_label}')
    plt.show()

# Tests

## Python Unit Tests

In [None]:
import pyutils as pyu

# Build an apparent-magnitude -> z map from synthetic inputs and print it.
# A seeded generator is used so the random factors, and therefore the printed
# map, are the same on every run (the previous np.random.rand call was unseeded).
n_test_samples = 10000
test_rng = np.random.default_rng(0)
test_mags = np.linspace(12.0, 20.0, n_test_samples)
test_z = np.linspace(0.0, 0.5, n_test_samples) * test_rng.random(n_test_samples)

# NOTE: this rebinds app_mag_bins / the_map, replacing any values previously
# loaded from SimpleRedshiftGuesserMap.pkl in this kernel session.
app_mag_bins, the_map = pyu.build_app_mag_to_z_map_new(test_mags, test_z)

print(the_map)

In [None]:
def _print_head(path, num_lines=10):
    """Print the first `num_lines` lines of the text file at `path`.

    Lines are echoed verbatim (no extra newline is appended). If the file has
    fewer than `num_lines` lines, the remaining reads return '' and print
    nothing.
    """
    with open(path, 'r') as f:
        for _ in range(num_lines):
            print(f.readline(), end='')


SV3_test = BGSGroupCatalog("SV3 Test", pyutils.Mode.SIMPLE_v4, 19.5, 21.0, num_passes=10, drop_passes=3, data_cut='sv3', sdss_fill=False)
SV3_test.GF_props = cat.GF_PROPS_VANILLA.copy()

SV3_test.preprocess()

# Read IAN_BGS_SV3_MERGED_FILE and print its first RA values so they can be
# compared by eye with the text files printed below, to ensure no precision is
# lost between the merged FITS file and SV3_test.preprocess_file and the like.
merged_table = Table.read(IAN_BGS_SV3_MERGED_FILE, format='fits')
print(merged_table['RA'][0:10])

# First few lines of SV3_test.preprocess_file
_print_head(SV3_test.preprocess_file)
#_print_head(SV3_test.preprocess_file + "~")

# First few lines of the galprops file, whose path is derived from the GF
# output path by swapping ".out" for "_galprops.dat".
galprops_file = SV3_test.GF_outfile.replace(".out", "_galprops.dat")
_print_head(galprops_file)
#_print_head(galprops_file + "~")


## GF Tests

In [None]:
# Only needs to be run once, unless you want to change the test data
#catalog = TestGroupCatalog("Test")
#catalog.create_test_dat_files() 

In [None]:
# Print arrays in full; this setting persists for the rest of the kernel session.
np.set_printoptions(threshold=sys.maxsize)

# Expected properties of the test catalog, shared by the scenarios below.
N_TEST_GALAXIES = 246
N_TEST_QUIESCENT = 129
N_BASELINE_GROUPS = 200


def _run_test_catalog(**gf_overrides):
    """Build a fresh TestGroupCatalog("Test"), run the group finder and postprocess.

    Each keyword argument is written into the catalog's GF_props (in the order
    given) before the group finder runs.

    Returns:
        (catalog, catalog.all_data) after run_group_finder(silent=True) and
        postprocess() have been called.
    """
    test_catalog = TestGroupCatalog("Test")
    for prop_name, prop_value in gf_overrides.items():
        test_catalog.GF_props[prop_name] = prop_value
    test_catalog.run_group_finder(silent=True)
    test_catalog.postprocess()
    return test_catalog, test_catalog.all_data


def _check_test_catalog(data, exact_group_count=True):
    """Assert the invariants every scenario shares.

    With exact_group_count=True the number of unique 'igrp' values must equal
    N_BASELINE_GROUPS; otherwise it only has to be at least that many.
    """
    n_groups = len(np.unique(data['igrp']))
    if exact_group_count:
        assert n_groups == N_BASELINE_GROUPS, f"expected {N_BASELINE_GROUPS} groups, got {n_groups}"
    else:
        assert n_groups >= N_BASELINE_GROUPS, f"expected >= {N_BASELINE_GROUPS} groups, got {n_groups}"
    assert len(data) == N_TEST_GALAXIES, f"expected {N_TEST_GALAXIES} rows, got {len(data)}"
    n_quiescent = data['quiescent'].sum()
    assert n_quiescent == N_TEST_QUIESCENT, f"expected {N_TEST_QUIESCENT} quiescent, got {n_quiescent}"


# Baseline vanilla group finder test
catalog, df = _run_test_catalog()
baseline_total_mass = df['M_halo'].sum()
_check_test_catalog(df)
assert np.isclose(df['weight'].sum(), N_TEST_GALAXIES * 1.0) # no weights, just 1 per gal
m1 = df['M_halo'].to_numpy()

# Test that when omega0 are 0, the others don't matter
catalog, df = _run_test_catalog(
    omegaL_sf=123,
    sigma_sf=345,
    omegaL_q=456,
    sigma_q=678,
    omega0_sf=0.0,
    omega0_q=0.0,
)
_check_test_catalog(df)
assert np.isclose(df['weight'].sum(), N_TEST_GALAXIES * 1.0) # no weights, just 1 per gal
assert np.isclose(df['M_halo'].sum(), baseline_total_mass)
m2 = df['M_halo'].to_numpy()

# Non-zero omega0_sf: weights are expected to drop below 1 per galaxy.
# (A 'colors' = 1 override is available here but currently left disabled.)
catalog, df = _run_test_catalog(
    omegaL_sf=10.0,
    sigma_sf=3.0,
    omegaL_q=0.0,
    sigma_q=0.0,
    omega0_sf=10.0,
    omega0_q=0.0,
)
_check_test_catalog(df, exact_group_count=False) # these parameters make assigned halos smaller
assert df['weight'].sum() < N_TEST_GALAXIES
# TODO BUG I feel like this should be true, but it's not. Weighting doesn't preserve the halo mass function
#assert np.isclose(df['M_halo'].sum(), baseline_total_mass)
m3 = df['M_halo'].to_numpy()

# Compare the halo mass distributions of the three scenarios side by side.
plt.hist(
    np.stack([np.log10(m1), np.log10(m2), np.log10(m3)], axis=-1),
    label=['baseline', 'omega0 = 0', 'omega0_sf = 10'],
)
plt.xlabel('log10(M_halo)')
plt.ylabel('Number of galaxies')
plt.legend()


print("All tests passed")

In [None]:
# RA/Dec bounding box of whatever catalog `df` currently refers to (the last
# DataFrame assigned by an earlier cell), passed to pp.examine_area with df.
ra_min, ra_max = np.min(df.RA), np.max(df.RA)
dec_min, dec_max = np.min(df.Dec), np.max(df.Dec)
pp.examine_area(ra_min, ra_max, dec_min, dec_max, df)
