In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import astropy.coordinates as coord
import astropy.units as u
import astropy.io.fits as fits
from astropy.table import Table,join,vstack,unique,QTable
from astropy.table import Table,join,vstack,unique,QTable
import sys
from urllib.parse import urljoin

if './SelfCalGroupFinder/py/' not in sys.path:
    sys.path.append('./SelfCalGroupFinder/py/')
from pyutils import *
from dataloc import *
from photoz import *
import groupcatalog as gc
from nnanalysis import *

%load_ext autoreload
%autoreload 2

In [61]:
# Minimum NTILE_MINE a galaxy needs to be kept by the sample-cut cell further down
KEEP_PASSES = 1
#APP_MAG_CUT = 19.5 # BGS BRIGHT, though 19.54 for some cameras. I don't know if FLUX_R has been corrected for this.
APP_MAG_CUT = 20.175 # BGS FAINT, and in SV3 it is 20.3 instead
# Redshift window applied to observed galaxies (Z_MIN < z < Z_MAX)
Z_MIN = 0.001
Z_MAX = 0.8

# This corresponds to 8.33 square degrees and empirically makes sense by looking at the randoms
# Used by find_tiles_for_galaxies() as the angular distance within which a tile covers a galaxy
TILE_RADIUS = 5862.0 * u.arcsec # arcsec

def find_tiles_for_galaxies(tiles_df, gals_df, num_tiles_to_find):
    """
    For every galaxy, look up its nearest tiles and count how many of them cover it.

    Parameters
    ----------
    tiles_df : pandas.DataFrame
        Tile centers, with columns RA and Dec (in degrees) and TILEID.
    gals_df : pandas.DataFrame
        Galaxies, with columns RA and Dec (in degrees).
    num_tiles_to_find : int
        Number of nearest tiles to examine per galaxy.

    Returns
    -------
    ntiles_inside : numpy.ndarray of int, shape (num_galaxies,)
        How many of the examined tiles have their center within TILE_RADIUS of the galaxy.
    nearest_tile_ids : numpy.ndarray of int, shape (num_galaxies, num_tiles_to_find)
        TILEID of the 1st, 2nd, ... nearest tile of each galaxy.

    Raises
    ------
    ValueError
        If more nearest tiles are requested than there are tiles; the neighbor
        search cannot return an n-th neighbor that does not exist.
    """
    num_galaxies = len(gals_df.RA)
    num_tiles = len(tiles_df.RA)
    if num_tiles_to_find > num_tiles:
        raise ValueError(
            f"Requested {num_tiles_to_find} nearest tiles but only {num_tiles} tiles are available."
        )

    tiles_coord = coord.SkyCoord(ra=tiles_df.RA.to_numpy()*u.degree, dec=tiles_df.Dec.to_numpy()*u.degree, frame='icrs')
    gals_coord = coord.SkyCoord(ra=gals_df.RA.to_numpy()*u.degree, dec=gals_df.Dec.to_numpy()*u.degree, frame='icrs')

    # Positional TILEID lookup, hoisted out of the loop (idx below are positions into tiles_df)
    tile_ids = tiles_df.TILEID.to_numpy()

    # Structure for resultant data
    nearest_tile_ids = np.zeros((num_galaxies, num_tiles_to_find), dtype=int)
    ntiles_inside = np.zeros((num_galaxies), dtype=int)

    for n in range(num_tiles_to_find):
        # nthneighbor is 1-based; storekdtree lets later iterations reuse the tree stored on tiles_coord
        idx, d2d, d3d = coord.match_coordinates_sky(gals_coord, tiles_coord, nthneighbor=n+1, storekdtree='kdtree_tiles')
        nearest_tile_ids[:,n] = tile_ids[idx]
        ntiles_inside += (d2d < TILE_RADIUS).astype(int)

    return ntiles_inside, nearest_tile_ids

def add_mag_columns(table):
    """
    Derive magnitude, luminosity and quiescence columns from FLUX_R, FLUX_G and Z
    and append them to `table` in place. Returns nothing.

    Columns added, in order: APP_MAG_R, APP_MAG_G, ABS_MAG_R, ABS_MAG_R_K,
    ABS_MAG_G, ABS_MAG_G_K, LOG_L_GAL, QUIESCENT.
    """
    mag_r = get_app_mag(table['FLUX_R'])
    mag_g = get_app_mag(table['FLUX_G'])
    color_gr = mag_g - mag_r

    # When Z really contains masked entries, work on the underlying raw values
    redshift = table['Z'].data.data if np.ma.is_masked(table['Z']) else table['Z']

    absmag_r = app_mag_to_abs_mag(mag_r, redshift)
    absmag_r_kcorr = k_correct(absmag_r, redshift, color_gr, band='r')
    absmag_g = app_mag_to_abs_mag(mag_g, redshift)
    absmag_g_kcorr = k_correct(absmag_g, redshift, color_gr, band='g')
    log_lum = abs_mag_r_to_log_solar_L(absmag_r_kcorr)
    is_quiescent = is_quiescent_BGS_gmr(log_lum, absmag_g_kcorr - absmag_r_kcorr)

    new_columns = (
        ('APP_MAG_R', mag_r),
        ('APP_MAG_G', mag_g),
        ('ABS_MAG_R', absmag_r),
        ('ABS_MAG_R_K', absmag_r_kcorr),
        ('ABS_MAG_G', absmag_g),
        ('ABS_MAG_G_K', absmag_g_kcorr),
        ('LOG_L_GAL', log_lum),
        ('QUIESCENT', is_quiescent),
    )
    for column_name, values in new_columns:
        table.add_column(values, name=column_name)


def add_photz_columns(table_file, phot_z_file):
    """
    Reads an astropy table and adds columns from the legacy survey file we built (photo-z, etc.).

    The FITS file at `table_file` is read, left-joined on TARGETID with the pickled
    pandas DataFrame at `phot_z_file` (whose index is TARGETID), and written back to
    `table_file`, overwriting it. Z_LEGACY_BEST is renamed to Z_PHOT; the table's own
    RA/DEC are kept and the phot-z file's copies are dropped.

    If the table already has Z_PHOT (i.e. this has been run before), the previously
    merged columns are removed first so the join does not produce duplicated columns.
    """

    table = Table.read(table_file, format='fits') # astropy

    # NOTE: unpickling can execute arbitrary code; only load phot-z files we built ourselves.
    with open(phot_z_file, 'rb') as phot_z_handle:
        phot_z_table = pickle.load(phot_z_handle)
    phot_z_table['TARGETID'] = phot_z_table.index # in the DataFrame TARGETID is the index, not a column, so copy it over so the conversion keeps it

    if 'Z_PHOT' in table.columns:
        print("Z_PHOT already in table, replacing it.")
        # Everything the phot-z file contributes (besides the join key and RA/DEC, which are
        # handled by the renames below) must go, otherwise join() would emit *_1 / *_2 columns.
        previously_merged = {'Z_PHOT', 'RELEASE', 'BRICKID', 'OBJID', 'REF_CAT'}
        previously_merged.update(c for c in phot_z_table.columns if c not in ('TARGETID', 'RA', 'DEC'))
        table.remove_columns([c for c in table.colnames if c in previously_merged])

    # Merge in the photo-z and whatever else info we took from Legacy Surveys sweeps

    # -99.0 is the "no photo-z" sentinel in Z_LEGACY_BEST
    percent_complete = (phot_z_table['Z_LEGACY_BEST'] != -99.0).sum() / len(phot_z_table)
    print(f"Phot-z file has phot-z for {percent_complete:.2%} of targets.")

    final_table = join(table, QTable.from_pandas(phot_z_table), join_type='left', keys="TARGETID")

    print(len(table))
    print(len(phot_z_table))
    print(len(final_table))

    final_table.rename_column('Z_LEGACY_BEST', 'Z_PHOT')
    # RA/DEC exist in both inputs, so join() suffixed them; keep the table's own (_1) values
    final_table.rename_column('RA_1', 'RA')
    final_table.rename_column('DEC_1', 'DEC')
    final_table.remove_columns(['RA_2', 'DEC_2'])
    print(final_table.columns)

    # TODO I should switch to having the merged file be pickle.dump of a DataFrame.
    # Only thing is NEAREST_TILEIDS cannot be a list
    final_table.write(table_file, format='fits', overwrite=True)



def table_to_df(table: Table):
    """
    Build a pandas DataFrame from selected columns of a merged astropy table.

    This does not work for all purposes yet.
    """
    # TODO why not use to_pandas()?

    # Big endian vs little endian regression in pandas: RA/DEC are cast to "<f8".
    # Convert more of these fields like this as needed if using pandas with this data.
    frame_data = {
        'SPECTYPE': table['SPECTYPE'].data.data,
        'Dec': table['DEC'].astype("<f8"),
        'RA': table['RA'].astype("<f8"),
        'z': table['Z'].data.data,
        'TARGETID': table['TARGETID'],
    }

    mag_r = get_app_mag(table['FLUX_R'])
    mag_g = get_app_mag(table['FLUX_G'])

    # FLUX_*, SDSS colors and ABS_MAG_SDSS are deliberately not exported at the moment
    frame_data.update({
        'APP_MAG_R': mag_r,
        'APP_MAG_G': mag_g,
        'G_R_APPARENT': mag_g - mag_r,
        'PROB_OBS': table['PROB_OBS'],
        'UNOBSERVED': table['Z'].mask,  # masked redshift == not observed
        'DELTACHI2': table['DELTACHI2'].data.data,
        'NTILE': table['NTILE'],
        'DN4000': table['DN4000'].data.data,
    })

    return pd.DataFrame(frame_data)

def _read_tiles_df(tiles_file, faflavor=None):
    """
    Read a tiles CSV into a DataFrame with columns RA, Dec, FAFLAVOR and TILEID.

    If `faflavor` is given, only tiles with that FAFLAVOR are returned and the
    index is reset to 0..n-1.
    """
    tiles_table = Table.read(tiles_file, format='csv')
    tiles_table.keep_columns(['TILEID', 'FAFLAVOR', 'TILERA', 'TILEDEC'])
    tiles_df = pd.DataFrame({
        'RA': tiles_table['TILERA'].astype("<f8"),
        'Dec': tiles_table['TILEDEC'].astype("<f8"),
        'FAFLAVOR': tiles_table['FAFLAVOR'],
        'TILEID': tiles_table['TILEID'],
    })
    if faflavor is not None:
        tiles_df = tiles_df[tiles_df.FAFLAVOR == faflavor].reset_index(drop=True)
    return tiles_df

def read_tiles_Y1_main():
    """Return the 'mainbright' tiles listed in BGS_TILES_FILE."""
    return _read_tiles_df(BGS_TILES_FILE, faflavor='mainbright')

def read_tiles_Y3_file():
    """Return every tile listed in BGS_Y3_TILES_FILE, with no FAFLAVOR filtering."""
    return _read_tiles_df(BGS_Y3_TILES_FILE)

def read_tiles_Y3_main():
    """Return the 'mainbright' tiles listed in BGS_Y3_TILES_FILE."""
    return _read_tiles_df(BGS_Y3_TILES_FILE, faflavor='mainbright')

In [3]:
def fix_columns_in_phot_z_file(f):
    """
    Normalize the REF_CAT column of a pickled photo-z DataFrame, rewriting the file in place.

    REF_CAT is converted to 2-byte strings, and the placeholder values b'  ' and b'na'
    are replaced by the empty byte string b''. The rebuilt column ends up as the last
    column of the DataFrame.

    Parameters
    ----------
    f : str or path-like
        Path of the pickle file; it is read and then overwritten.
    """
    # Context managers guarantee the handles are closed (and the output flushed) even on error.
    # NOTE: unpickling can execute arbitrary code; only use files we built ourselves.
    with open(f, 'rb') as infile:
        phot_z_table = pickle.load(infile)

    phot_z_table['REF_CAT_NEW'] = phot_z_table['REF_CAT'].astype('S2')
    phot_z_table.loc[(phot_z_table.REF_CAT_NEW == b'  '), 'REF_CAT_NEW'] = b''
    phot_z_table.loc[(phot_z_table.REF_CAT_NEW == b'na'), 'REF_CAT_NEW'] = b''
    phot_z_table.drop('REF_CAT', inplace=True, axis=1)
    phot_z_table.rename(columns={'REF_CAT_NEW':'REF_CAT'}, inplace=True)

    with open(f, 'wb') as outfile:
        pickle.dump(phot_z_table, outfile)

# Run this after building photo-z file
#fix_columns_in_phot_z_file(IAN_PHOT_Z_FILE_NOSPEC)

# Building Merged Files

## Build SV3 Merged File
This requires Y3 Merged file to be built first, because we add in Y3 galaxies to supplement the NN catalog.

In [None]:
# Load the SV3 catalog (BGS_SV3_ANY_FULL_FILE) and the tile list
sv3_table = Table.read(BGS_SV3_ANY_FULL_FILE, format='fits')
tiles = read_tiles_Y3_file()
# Keep only the tiles whose FAFLAVOR is 'sv3bright'
SV3_tiles = tiles.loc[tiles.FAFLAVOR == 'sv3bright']
# Show which columns are available before they are trimmed
print(sv3_table.columns)

In [7]:
# Filter to needed columns only and save
# TILELOCID
# TODO Investigate ZTILEID and NUMOBS
# NOTE: this edits sv3_table in place; after the rename below 'Z_not4clus' no longer exists,
# so re-running this cell without reloading sv3_table is expected to fail.
sv3_table.keep_columns(['TARGETID', 'SPECTYPE', 'DEC', 'RA', 'Z_not4clus', 'ZTILEID', 'NUMOBS', 'FLUX_R', 'FLUX_G', 'PROB_OBS', 'ZWARN', 'DELTACHI2', 'NTILE', 'TILES', 'TILEID', 'MASKBITS'])
sv3_table.rename_column('Z_not4clus', 'Z')

# find_tiles_for_galaxies() reads the columns RA and Dec, hence the DEC -> Dec rename
sv3_df = sv3_table.to_pandas()
sv3_df.rename(columns={'DEC': 'Dec'}, inplace=True)

In [8]:
# Count how many SV3 tiles cover each galaxy (NTILE_MINE) and record its 10 nearest tile IDs
ntiles_inside, nearest_tile_ids = find_tiles_for_galaxies(SV3_tiles, sv3_df, 10)
sv3_table.add_column(ntiles_inside, name="NTILE_MINE")
sv3_table.add_column(nearest_tile_ids, name="NEAREST_TILEIDS")

add_mag_columns(sv3_table)

# overwrite takes a boolean; the string 'True' only worked because any non-empty string is truthy
sv3_table.write(IAN_BGS_SV3_MERGED_NOY3_FILE, format='fits', overwrite=True)
# Free the large intermediates; later cells re-read the file from disk
del sv3_table
del sv3_df

In [None]:
add_photz_columns(IAN_BGS_SV3_MERGED_NOY3_FILE, IAN_PHOT_Z_FILE_NOSPEC) # For SV3 Analysis we want to analyze pure photo-z's, so use the version without external spec-z

In [None]:
# Now add Y3 galaxies to supplement the NN catalog, especially valuable at the edges of the regions
# They won't go into the main catalog because their NTILE_MINE is < 10
sv3_table: Table = Table.read(IAN_BGS_SV3_MERGED_NOY3_FILE, format='fits')
y3_table: Table = Table.read(IAN_BGS_Y3_MERGED_FILE, format='fits')

# Let's cut Y3 data down to targets somewhat close to SV3 regions
# No point in keeping rest and slowing down code
gals_coord = coord.SkyCoord(ra=y3_table['RA']*u.degree, dec=y3_table['DEC']*u.degree, frame='icrs')
close_array = gc.get_objects_near_sv3_regions(gals_coord, 2.5) # 2.5 deg radius is generously around the center of each SV3 region

y3_table = y3_table[close_array]

print(f"{len(y3_table)} galaxies will be added for SV3 NN catalog")
# Guard for the claim above: none of the supplement rows may reach the NTILE_MINE >= 10 cut
assert (y3_table['NTILE_MINE'] > 9).sum() == 0, "Y3 shouldn't add any galaxies that will go into the catalog"

# if the y3 table doesn't have prob_obs as is the case in JURA, add a prob_obs with 0.5 for everything
if 'PROB_OBS' not in y3_table.columns:
    print("Adding PROB_OBS column with 0.5 for all Y3 galaxies")
    y3_table.add_column(np.full(len(y3_table), 0.5), name="PROB_OBS")

# Remove rows from y3_table that have TARGETID already in sv3_table
y3_in_sv3 = np.isin(y3_table['TARGETID'], sv3_table['TARGETID'])
print(f"Removing {y3_in_sv3.sum()} (of {len(y3_table)}) Y3 galaxies that are already in SV3 catalog")
y3_table.remove_rows(y3_in_sv3)

#colname = 'NEAREST_TILEIDS'
#print(sv3_table[colname].shape)
#print(y3_table[colname].shape)

# TILES is dropped from both tables before stacking
sv3_table.remove_column('TILES')
y3_table.remove_column('TILES')

# join_type='outer' keeps columns that exist in only one of the two tables
combined = vstack([sv3_table, y3_table], join_type='outer')

# Ensure resultant data is what we want
print(sv3_table.columns)
print(y3_table.columns)
print(combined.columns)


In [13]:
# Save the SV3 + Y3-supplement table, replacing any existing IAN_BGS_SV3_MERGED_FILE
combined.write(IAN_BGS_SV3_MERGED_FILE, format='fits', overwrite=True)

## Make Empty Photo-z Ledger file
 
See photoz.py for the code that populates this file.

In [9]:
# INITIALIZE LIGHTWEIGHT DESI BGS PHOTO-Z TABLE
# Don't re-run! Will overwrite the file.
# The code below is disabled by wrapping it in a bare triple-quoted string, so running this
# cell executes none of it. To re-enable, remove the surrounding triple quotes.
"""
desi_table = Table.read(IAN_BGS_Y3_MERGED_FILE, format='fits')
desi_table2 = Table.read(IAN_BGS_SV3_MERGED_NOY3_FILE, format='fits')

assert len(np.unique(desi_table['TARGETID'])) == len(desi_table), "There are duplicate TARGETIDs in the Y3 file"
assert len(np.unique(desi_table2['TARGETID'])) == len(desi_table2), "There are duplicate TARGETIDs in the SV3 file"

desi_table.keep_columns(['TARGETID', 'RA', 'DEC'])
desi_table2.keep_columns(['TARGETID', 'RA', 'DEC'])

desi_targets_table = vstack([desi_table, desi_table2], join_type='inner')
desi_targets_table = unique(desi_targets_table, 'TARGETID')
desi_targets_table['Z_LEGACY_BEST'] = -99.0

# add columns for 'RELEASE', 'BRICKID', 'OBJID', 'REF_CAT', 'MATCH_DIST' with no values
desi_targets_table.add_column(np.zeros(len(desi_targets_table), dtype=int), name='RELEASE')
desi_targets_table.add_column(np.zeros(len(desi_targets_table), dtype=int), name='BRICKID')
desi_targets_table.add_column(np.zeros(len(desi_targets_table), dtype=int), name='OBJID')
desi_targets_table.add_column(np.zeros(len(desi_targets_table), dtype='S2'), name='REF_CAT')
desi_targets_table.add_column(np.full(len(desi_targets_table), 999999, dtype=float), name='MATCH_DIST')


desi_targets_table = desi_targets_table.to_pandas()
desi_targets_table.set_index('TARGETID', inplace=True)
pickle.dump(desi_targets_table, open(IAN_PHOT_Z_FILE, 'wb'))
"""


In [None]:
# Read BGS_IMAGES_FOLDER + "terminal.txt" into an array of strings, 1 per line
# (the with-block closes the file even if reading fails)
with open(BGS_IMAGES_FOLDER + "terminal.txt", "r") as f:
    lines = f.readlines()

# Now filter it down to ones like this:
#Start processing brick #X
#Matched 0 out of Z
#start_lines = [line for line in lines if "Start processing brick" in line]
matched_lines = [line for line in lines if "Matched" in line]

#print(len(start_lines))
print(len(matched_lines))

# Extract the matched count (the token right after "Matched") from each line
#start_lines = [line.split()[3] for line in start_lines]
matched_lines = [int(line.split()[1]) for line in matched_lines]

# Strip away # and convert to int
#start_lines = [int(line[1:]) for line in start_lines]

# The index is the brick number for matched_lines
# Find the brick numbers where the value is 0 and remember those indexes
zero_indexes = [brick for brick, n_matched in enumerate(matched_lines) if n_matched == 0]

#pickle.dump(zero_indexes, open(BRICKS_TO_SKIP_S_FILE, 'wb'))

In [None]:
#pickle.dump([], open(BRICKS_TO_SKIP_N_FILE, 'wb'))

## Build Y1 Merged File

In [None]:
# Pull the columns we need out of the fastspecfit catalog (first table extension).
# The context manager closes the file at the same point the explicit close() did,
# and also when a column lookup raises.
with fits.open(BGS_FASTSPEC_FILE, memmap=True) as hdul:
    #print(hdul[1].columns)
    data = hdul[1].data
    fastspecfit_id = data['TARGETID']
    DN4000 = data['DN4000'] # TODO there is also DN4000_OBS and DN4000_MODEL (and inverse variance)
    FSF_G = data['ABSMAG01_SDSS_G']
    FSF_R = data['ABSMAG01_SDSS_R']

print(len(fastspecfit_id))
print(len(DN4000))

# Small table keyed by TARGETID, joined onto the main catalog in a later cell
fastspecfit_table = Table([fastspecfit_id, DN4000, FSF_G, FSF_R], names=('TARGETID', 'DN4000', 'ABSMAG01_SDSS_G', 'ABSMAG01_SDSS_R'))


In [None]:
# Load the Y1 catalog (BGS_ANY_FULL_FILE) and list its columns
main_table = Table.read(BGS_ANY_FULL_FILE, format='fits')
print(main_table.columns)

In [None]:
# ALREADY DONE FOR US; only needed to do this in Iron v1.2 due to a bug.
# Prob obs file
#p_table = Table.read(BGS_PROB_OBS_FILE, format='fits')
#print(len(p_table))

# Join them all on TARGETID
#joined_table = join(main_table, p_table, keys="TARGETID")
#print(len(joined_table))

to_join = main_table
#to_join = p_table

# The lost galaxies will not have fastspecfit rows I think
# (left join: every row of to_join is kept, with or without a fastspecfit match)
final_table = join(to_join, fastspecfit_table, join_type='left', keys="TARGETID")
print(len(final_table))

# Sanity check that everything went as intended
assert len(final_table) == len(main_table)

# Filter to needed columns only and save
final_table.keep_columns(['TARGETID', 'SPECTYPE', 'DEC', 'RA', 'Z_not4clus', 'FLUX_R', 'FLUX_G', 'BITWEIGHTS', 'PROB_OBS', 'ZWARN', 'DELTACHI2', 'NTILE', 'TILES', 'DN4000', 'ABSMAG01_SDSS_G', 'ABSMAG01_SDSS_R', 'MASKBITS'])
final_table.rename_column('Z_not4clus', 'Z')

add_mag_columns(final_table)

# overwrite takes a boolean; the string 'True' only worked because any non-empty string is truthy
final_table.write(IAN_BGS_MERGED_FILE, format='fits', overwrite=True)

# Free the large tables; later cells re-read the merged file from disk
del main_table
#del p_table
del fastspecfit_table
del final_table

In [None]:
def add_NTILE_MINE_to_table(table_file):
    """
    Add (or refresh) the NTILE_MINE and NEAREST_TILEIDS columns of a merged FITS file.

    The file is read, each galaxy's 15 nearest Y1 'mainbright' tiles are looked up with
    find_tiles_for_galaxies(), and the file is overwritten with the two columns added.
    Existing copies of either column are replaced.
    """
    tiles_df = read_tiles_Y1_main()
    table = Table.read(table_file, format='fits')
    galaxies_df = table_to_df(table)

    ntiles_inside, nearest_tile_ids = find_tiles_for_galaxies(tiles_df, galaxies_df, 15)

    # Drop whichever of the two columns is already there, so a partially updated file also works
    stale_columns = [name for name in ('NTILE_MINE', 'NEAREST_TILEIDS') if name in table.colnames]
    if stale_columns:
        table.remove_columns(stale_columns)
    table.add_column(ntiles_inside, name="NTILE_MINE")
    table.add_column(nearest_tile_ids, name="NEAREST_TILEIDS")

    # overwrite takes a boolean; the string 'True' only worked because any non-empty string is truthy
    table.write(table_file, format='fits', overwrite=True)

In [None]:
# Add NTILE_MINE / NEAREST_TILEIDS to the Y1 merged file (rewrites the file in place)
add_NTILE_MINE_to_table(IAN_BGS_MERGED_FILE)
#add_NTILE_MINE_to_table(IAN_BGS_MERGED_FILE_OLD)

In [None]:
# Merge the photo-z columns into the Y1 merged file (rewrites the file in place)
add_photz_columns(IAN_BGS_MERGED_FILE, IAN_PHOT_Z_FILE_WSPEC)

## Build Y3 Merged File

In [None]:
# Don't have all files needed to use the above pipeline yet, so just working with what we have

# Main file
main_table = Table.read(BGS_Y3_ANY_FULL_FILE, format='fits')
print(len(main_table))

# Filter to needed columns only and save
# TODO need PROB_OBS
main_table.keep_columns(['TARGETID', 'SPECTYPE', 'DEC', 'RA', 'Z_not4clus', 'FLUX_R', 'FLUX_G', 'ZWARN', 'DELTACHI2', 'NTILE', 'TILES', 'MASKBITS'])
main_table.rename_column('Z_not4clus', 'Z')

# find_tiles_for_galaxies() only needs the positions, under the column names RA and Dec
galaxies_df = pd.DataFrame({
    'Dec': main_table['DEC'],
    'RA': main_table['RA'],
    })

tiles_BGS = read_tiles_Y3_main()

ntiles_inside, nearest_tile_ids = find_tiles_for_galaxies(tiles_BGS, galaxies_df, 10)

main_table.add_column(ntiles_inside, name="NTILE_MINE")
main_table.add_column(nearest_tile_ids, name="NEAREST_TILEIDS")

add_mag_columns(main_table)

# overwrite takes a boolean; the string 'True' only worked because any non-empty string is truthy
main_table.write(IAN_BGS_Y3_MERGED_FILE, format='fits', overwrite=True)

# Free the large intermediates
del main_table
del tiles_BGS
del galaxies_df

In [None]:
# Merge the photo-z columns into the Y3 merged file (rewrites the file in place)
add_photz_columns(IAN_BGS_Y3_MERGED_FILE, IAN_PHOT_Z_FILE_WSPEC)

# Examine data in a Merged BGS File (SV3, Y1, Y3, whatever)

In [62]:
# Pick one
# Whichever line is uncommented decides the `table` that all examination cells below work on.
#table = Table.read(IAN_BGS_MERGED_FILE, format='fits')
table = Table.read(IAN_BGS_SV3_MERGED_FILE, format='fits')
#table = Table.read(IAN_BGS_SV3_MERGED_NOY3_FILE, format='fits')
#table = Table.read(IAN_BGS_Y3_MERGED_FILE, format='fits')

In [None]:
# See three equivalent ways of determining which rows are for unobserved galaxies:
# the ZWARN sentinel, the mask on Z, and the Z sentinel value.
# NOTE: `two` reads table['Z'].mask, so this cell presumes Z was loaded as a masked column.
one=table['ZWARN'] == 999999
two=table['Z'].mask
three=table['Z'] == 999999.0
assert(np.all(one == two))
assert(np.all(one == three))

## Cut to the galaxy data we actually need

In [None]:
# TODO this gets easily out of sync with the .py file that does the 'production' filtering
#
# This cell unpacks `table` into per-column arrays, builds the quality / magnitude / redshift
# filters, prints per-pass statistics, and finally cuts every array down to the `keep` rows.
# Everything is derived from `table` again on each run, so the cell can be re-run safely.

if np.ma.is_masked(table['Z']):
    print("Masked table")
    z_obs = table['Z'].data.data.astype("<f8")
    obj_type = table['SPECTYPE'].data.data
    unobserved = table['Z'].mask # the masked values are what is unobserved
    deltachi2 = table['DELTACHI2'].data.data
    maskbits = table['MASKBITS'].data.data
else:
    print("Unmasked table")
    # SV3 version didn't do this
    z_obs = table['Z']
    obj_type = table['SPECTYPE']
    unobserved = table['Z'].astype("<i8") == 999999 # 999999 is the "no redshift" sentinel
    deltachi2 = table['DELTACHI2']
    maskbits = table['MASKBITS']

dec = table['DEC'].astype("<f8")
ra = table['RA'].astype("<f8")
z_phot = table['Z_PHOT'].astype("<f8")
target_id = table['TARGETID']
app_mag_r = get_app_mag(table['FLUX_R'])
app_mag_g = get_app_mag(table['FLUX_G'])
flux_r = table['FLUX_R'].astype("<f8")
flux_g = table['FLUX_G'].astype("<f8")
g_r_apparent = app_mag_g - app_mag_r
#sdss_g_r = table['ABSMAG_SDSS_G'] - table['ABSMAG_SDSS_R']
#G_R_JM1 = table['ABSMAG01_SDSS_G'] - table['ABSMAG01_SDSS_R']
p_obs = table['PROB_OBS']
ntiles = table['NTILE'].astype("<i8")
#tiles = table['TILES']
#ztileid = table['ZTILEID']
tile_id = table['TILEID']
#numobs = table['NUMOBS']
#tile_locid = table['TILELOCID']
ntiles_mine = table['NTILE_MINE']
tileids = table['NEAREST_TILEIDS'][:,0].astype("<i8") # TODO there are 10 here, we want NTILES_MINE many...
#abs_mag_sdss = table['ABSMAG_SDSS_R']
#dn4000 = table['DN4000'].data.data
ref_cat = table['REF_CAT']


before_count = len(dec)
print(before_count, "objects in FITS file")

# TODO BUG Can we be mistaking STARS for GALAXIES?
# Make filter array (True/False values)
PASSES_REQUIRED = [1,2,3,4,10]

galaxy_observed_filter = obj_type == b'GALAXY'
app_mag_filter = app_mag_r < APP_MAG_CUT
redshift_filter = z_obs > Z_MIN
redshift_hi_filter = z_obs < Z_MAX
deltachi2_filter = deltachi2 > 40
#abs_mag_sdss_filter = abs_mag_sdss < 100
#observed_requirements = np.all([galaxy_observed_filter, app_mag_filter, redshift_filter, redshift_hi_filter, deltachi2_filter, abs_mag_sdss_filter], axis=0)
observed_requirements = np.all([galaxy_observed_filter, app_mag_filter, redshift_filter, redshift_hi_filter, deltachi2_filter], axis=0)

# Observed galaxies with a poor redshift fit (DELTACHI2 <= 40) are counted as unobserved
treat_as_unobserved = np.all([galaxy_observed_filter, app_mag_filter, np.invert(deltachi2_filter)], axis=0)

unobserved = np.all([app_mag_filter, np.logical_or(unobserved, treat_as_unobserved)], axis=0)
keep = np.all([np.logical_or(observed_requirements, unobserved)], axis=0)

print("\nWhole sample:")
print(f"There are {len(obj_type):,} objects in the entire sample, of which {np.sum(galaxy_observed_filter):,} are observed galaxies.")

for n in PASSES_REQUIRED:
    n_pass_filter = ntiles_mine >= n
    n_pass_filter_old = ntiles >= n
    unobserved_n = np.all([n_pass_filter, unobserved], axis=0)
    observed_requirements_n = np.all([n_pass_filter, observed_requirements], axis=0)
    keepn = np.all([np.logical_or(observed_requirements_n, unobserved_n)], axis=0)

    print(f"\n{n}-pass analysis (NTILE_MINE):")
    print(f"There are {np.sum(observed_requirements_n):,} galaxies in the <{APP_MAG_CUT} mag sample that pass our quality checks.")
    print(f"There are {np.sum(unobserved_n):,} unobserved galaxies, including bad observed galaxies.")
    print(f"This {n}-pass catalog would have {np.sum(keepn):,} galaxies ({np.sum(unobserved_n) / np.sum(keepn) * 100:.2f}% lost).")

    # We've demonstrated this is definitely not what we want
    #unobserved_n_old = np.all([n_pass_filter_old, unobserved], axis=0)
    #observed_requirements_n_old = np.all([n_pass_filter_old, observed_requirements], axis=0)
    #keepn_old = np.all([np.logical_or(observed_requirements_n_old, unobserved_n_old)], axis=0)
    #print(f"\n{n}-pass analysis (NTILE):")
    #print(f"There are {np.sum(observed_requirements_n_old):,} galaxies in the bright (<{APP_MAG_CUT} mag) sample that pass our quality checks.")
    #print(f"There are {np.sum(unobserved_n_old):,} unobserved galaxies, including bad observed galaxies.")
    #print(f"This {n}-pass catalog would have {np.sum(keepn_old):,} galaxies ({np.sum(unobserved_n_old) / np.sum(keepn_old) * 100:.2f}% lost).")

# FOR PARTS BELOW SET WHAT YOU WANT TO KEEP!
# Active choice: observed galaxies only, with at least KEEP_PASSES covering tiles
keep = np.all([keep, ntiles_mine >= KEEP_PASSES, ~unobserved], axis=0)
#keep = np.all([keep, ntiles_mine >= KEEP_PASSES], axis=0)

# Apply the cut to every per-galaxy array so they all stay the same length
obj_type = obj_type[keep]
dec = dec[keep]
ra = ra[keep]
z_phot = z_phot[keep]
z_obs = z_obs[keep]
target_id = target_id[keep]
flux_r = flux_r[keep]
flux_g = flux_g[keep] # was missing: flux_g kept its pre-cut length, unlike every other array
app_mag_r = app_mag_r[keep]
app_mag_g = app_mag_g[keep]
g_r_apparent = g_r_apparent[keep]
p_obs = p_obs[keep]
unobserved = unobserved[keep]
deltachi2 = deltachi2[keep]
ntiles = ntiles[keep]
#tiles = tiles[keep]
#ztileid = ztileid[keep]
ntiles_mine = ntiles_mine[keep]
tileids = tileids[keep]
tile_id = tile_id[keep]
#numobs = numobs[keep]
#tile_locid = tile_locid[keep]
#abs_mag_sdss = abs_mag_sdss[keep]
#sdss_g_r = sdss_g_r[keep]
#G_R_JM1 = G_R_JM1[keep]
#dn4000 = dn4000[keep]
ref_cat = ref_cat[keep]
maskbits = maskbits[keep]
indexes_not_assigned = np.argwhere(unobserved)

after_count = len(dec)

print(f"\nAfter all filters we have {after_count:,} of the original {before_count:,} rows.")

In [25]:
# Make maps
# Build position DataFrames for galaxies covered by at least 1 (all kept rows), 2, 3 and 4 tiles,
# using the arrays left behind by the sample-cut cell.
two_pass_filter = ntiles_mine >= 2 
three_pass_filter = ntiles_mine >= 3 
four_pass_filter = ntiles_mine >= 4 

ra2 = ra[two_pass_filter]
dec2 = dec[two_pass_filter]
tileids2 =  tileids[two_pass_filter]
unobserved2 = unobserved[two_pass_filter]

ra3 = ra[three_pass_filter]
dec3 = dec[three_pass_filter]
tileids3 =  tileids[three_pass_filter]
unobserved3 = unobserved[three_pass_filter]

ra4 = ra[four_pass_filter]
dec4 = dec[four_pass_filter]
tileids4 =  tileids[four_pass_filter]
unobserved4 = unobserved[four_pass_filter]

# The 'z_assigned_flag' column is filled with the unobserved flag; TILEID is the nearest tile
one_pass_df = pd.DataFrame({'RA': ra, 'Dec': dec, 'z_assigned_flag': unobserved, 'TILEID': tileids})
two_pass_df = pd.DataFrame({'RA': ra2, 'Dec': dec2, 'z_assigned_flag': unobserved2, 'TILEID': tileids2})
three_pass_df = pd.DataFrame({'RA': ra3, 'Dec': dec3, 'z_assigned_flag': unobserved3, 'TILEID': tileids3})
four_pass_df = pd.DataFrame({'RA': ra4, 'Dec': dec4, 'z_assigned_flag': unobserved4, 'TILEID': tileids4})

#fig=make_map(ra, dec)
#ra_4 = ra[four_pass_filter]
#dec_4 = dec[four_pass_filter]
#print(f"Number of 4-pass galaxies: {len(ra_4)}, number of 3-pass galaxies: {len(ra)}")
#fig=make_map(ra_4, dec_4, fig=fig, alpha=0.1)

In [None]:
# NOTE(review): tiles_BGS is only created in the "Build Y3 Merged File" cell, which also deletes it
# (del(tiles_BGS)), so on a fresh top-to-bottom run this raises NameError. Load the tile set that
# matches the chosen `table` before calling this.
plot_positions(one_pass_df, three_pass_df, tiles_df=tiles_BGS, DEG_LONG=2, split=False)


In [None]:
# See where missing photo-z's are
# -99.0 is the "no photo-z" sentinel value of Z_PHOT
idx_no_zphot = z_phot == -99.0
no_photoz_df = pd.DataFrame({'RA': ra[idx_no_zphot], 'Dec': dec[idx_no_zphot], 'z_assigned_flag': unobserved[idx_no_zphot], 'TILEID': tileids[idx_no_zphot]})
#plot_positions(one_pass_df, no_photoz_df, tiles_df=tiles_BGS, DEG_LONG=3, split=False)

# Map of all kept galaxies, with the NTILE_MINE >= 10 subset overlaid on the same figure
fig=make_map(ra, dec)
#ra_4 = ra[four_pass_filter]
#dec_4 = dec[four_pass_filter]
#print(f"Number of 4-pass galaxies: {len(ra_4)}, number of 3-pass galaxies: {len(ra)}")
#fig=make_map(ra[idx_no_zphot], dec[idx_no_zphot], fig=fig, alpha=0.1)
fig=make_map(ra[ntiles_mine >= 10 ], dec[ntiles_mine >= 10 ], fig=fig, alpha=0.2)



### Siena Galaxy Atlas Analysis (SGA)

In [None]:
# List the distinct REF_CAT values among the kept galaxies
np.unique(ref_cat)

In [None]:
# Understand how many galaxies are affected by Siena Galaxy Atlas (SGA) Masks
# All percentages below are relative to the number of kept galaxies (len(maskbits)).
has_a_maskbit = maskbits != 0
idx_with_masks = np.flatnonzero(maskbits)  # not used further in this cell
print(f"{np.sum(has_a_maskbit):,} galaxies ({np.sum(has_a_maskbit) / len(maskbits) * 100:.2f}%) have a maskbit set.")

unobserved_with_maskbits = np.logical_and(has_a_maskbit, unobserved)
print(f"{np.sum(unobserved_with_maskbits):,} galaxies ({np.sum(unobserved_with_maskbits) / len(maskbits) * 100:.2f}%) have a maskbit set and are unobserved.")

# See https://www.legacysurvey.org/dr9/bitmasks/
# https://github.com/legacysurvey/legacypipe/blob/master/py/legacypipe/bits.py
BITMASK_SGA = 0x1000 
sga_collision = (maskbits & BITMASK_SGA) != 0
print(f"{np.sum(sga_collision):,} galaxies ({np.sum(sga_collision) / len(maskbits) * 100:.2f}%) have a SGA collision.")

sga_collision_unobs = np.logical_and(sga_collision, unobserved)
print(f"{np.sum(sga_collision_unobs):,} galaxies ({np.sum(sga_collision_unobs) / len(maskbits) * 100:.2f}%) have a SGA collision and are unobserved.")

# NOTE(review): the disabled line below would not work as written; np.logical_or takes two inputs
# (the third positional argument is the output array). Chain it or use np.isin instead.
#sga_central = np.logical_or(ref_cat == b'L3', ref_cat == b'G2', ref_cat == b'GE')
sga_central = ref_cat == b'L3'
print(f"{np.sum(sga_central):,} galaxies ({np.sum(sga_central) / len(maskbits) * 100:.2f}%) are SGA centrals.")

to_remove = sga_collision & ~sga_central
print(f"{np.sum(to_remove):,} galaxies ({np.sum(to_remove) / len(maskbits) * 100:.2f}%) have a SGA collision and are not SGA centrals.")

# Export the positions of the removal candidates for visual inspection
sga_ra = ra[to_remove]
sga_dec = dec[to_remove]
df = pd.DataFrame({'RA': sga_ra, 'Dec': sga_dec})
df.to_csv(OUTPUT_FOLDER + f'sga_collisions.csv', index=False)

# Inspecting these, I see that most are galaxies, not HII regions. Not sure if we want to remove

### Photo-Z vs Spec-Z Analysis 

In [None]:
# Photo-z coverage: overall, among observed galaxies, and among unobserved galaxies
# (-99.0 is the "no photo-z" sentinel value)
print(f"There are {(z_phot != -99.0).sum():,} ({(z_phot != -99.0).sum() / len(z_phot):.1%}) targets with phot-z")

# Indexes of observed galaxies that have a photo-z; reused by the next cell
good_idx = np.flatnonzero((z_phot != -99.0) & ~unobserved)
print(f"Amongst observed galaxies there are {len(good_idx):,} ({len(good_idx) / np.sum(~unobserved) * 100:.2f}%) galaxies with photo-z.")

# NOTE: when the sample cut drops all unobserved galaxies, np.sum(unobserved) is 0 and the
# percentage below is a division by zero.
unobserved_with_photz = np.flatnonzero((z_phot != -99.0) & unobserved)
print(f"Amongst unobserved galaxies there are {len(unobserved_with_photz):,} ({len(unobserved_with_photz) / np.sum(unobserved) * 100:.2f}%) with photo-z.")

In [None]:
# Histogram of photo-z minus spec-z for observed galaxies that have a photo-z
delta_z = z_phot[good_idx] - z_obs[good_idx]
plt.hist(delta_z, bins=500, range=(-0.1, 0.1))
plt.yscale("log")
plt.title("Photo-z Quality")
plt.ylabel("Count")
plt.xlabel("z_phot - z_spec")

# add bars for my z_thresh
plt.axvline(-SIM_Z_THRESH, color='red')
plt.axvline(SIM_Z_THRESH, color='red')

percentiles = np.percentile(delta_z, [16, 50, 84])
print(f"Median delta z: {percentiles[1]:.4f}, 16th percentile: {percentiles[0]:.4f}, 84th percentile: {percentiles[2]:.4f}")
# add bars for the percentiles
#plt.axvline(percentiles[0], color='green')
#plt.axvline(percentiles[2], color='green')



# What % fall within SIM_Z_THRESH of the true redshift?
within_5_milli = np.abs(delta_z) < SIM_Z_THRESH
print(f"{np.sum(within_5_milli) / len(delta_z) * 100:.2f}% of galaxies have a photometric redshift within {SIM_Z_THRESH} of the spectroscopic redshift.")

# Now look only at quiescent galaxies less than 10^9 solar luminosities
# TODO 
#luminosity = abs_mag_r_to_log_solar_L(app_mag_to_abs_mag_k(app_mag_r, z_obs, g_r_apparent))


# SV3 Analysis

SV3 is composed of 20 regions where 10 or 11 exposures each were taken, almost completely on top of each other.  Our SV3 analysis takes the inner part of each of these regions (NTILE_MINE >= 10) as the data set.  

Then, we can eliminate 1 tile from each of these regions to make test sets in order to view our systematics as a function of NTILE_MINE. The order they are eliminated in matters; we need to go backwards in time.

In [None]:
# Make a DataFrame filtered down to the galaxies we want to keep
#
# All intermediate masks carry an sv3_ prefix on purpose: the sample-cut cell above defines
# unobserved, keep, app_mag_filter, ... and the Nearest Neighbor Analysis further down still
# needs those arrays. Reusing the same names here would silently replace them with arrays of a
# different length and meaning.
sv3_merged_table = Table.read(IAN_BGS_SV3_MERGED_FILE, format='fits')
sv3_merged_table.remove_column('NEAREST_TILEIDS') # dropped before the to_pandas() conversion
sv3_df = sv3_merged_table.to_pandas()
print(len(sv3_df))
sv3_df['app_mag'] = get_app_mag(sv3_df['FLUX_R'])

sv3_never_observed = sv3_merged_table['Z'].astype("<i8") == 999999 # 999999 is the "no redshift" sentinel
sv3_is_galaxy = sv3_df['SPECTYPE'] == b'GALAXY'
sv3_z_lo_ok = sv3_df['Z'] > Z_MIN
sv3_z_hi_ok = sv3_df['Z'] < Z_MAX
sv3_deltachi2_ok = sv3_df['DELTACHI2'] > 40
sv3_mag_ok = sv3_df['app_mag'] < 20.3 # SV3 BGS FAINT limit (see the note on APP_MAG_CUT)
sv3_observed_requirements = np.all([sv3_is_galaxy, sv3_mag_ok, sv3_z_lo_ok, sv3_z_hi_ok, sv3_deltachi2_ok], axis=0)
# Observed galaxies with a poor redshift fit are counted as unobserved
sv3_treat_as_unobserved = np.all([sv3_is_galaxy, sv3_mag_ok, np.invert(sv3_deltachi2_ok)], axis=0)

sv3_unobserved = np.all([sv3_mag_ok, np.logical_or(sv3_never_observed, sv3_treat_as_unobserved)], axis=0)
sv3_df['OBSERVED'] = np.invert(sv3_unobserved)
sv3_keep = np.all([np.logical_or(sv3_observed_requirements, sv3_unobserved)], axis=0)
# Only the inner part of each region, covered by at least 10 tiles
sv3_keep = np.all([sv3_keep, sv3_df['NTILE_MINE'] >= 10], axis=0)

sv3_df = sv3_df.loc[sv3_keep]
sv3_df.reset_index(drop=True, inplace=True)
print(len(sv3_df))

# Initialize new columns for observed as function of N pass
# OBSERVED_i = would the galaxy still be observed if only the first i tiles of its region existed
for i in range(0, 12):
    sv3_df[f'OBSERVED_{i}'] = sv3_df['OBSERVED']

for FAINT in [False, True]:

    # Split at r = 19.5: bright sample first, then the faint sample
    if not FAINT:
        mag_filter = sv3_df['app_mag'] < 19.5
    else:
        mag_filter = sv3_df['app_mag'] > 19.5

    # NOTE(review): 138.192 is presumably the total area in square degrees - confirm
    print(f"{len(sv3_df[mag_filter]) / 138.192} galaxies per sq degree")

    for patch_number in range(len(gc.sv3_regions_sorted)):
        tilelist = gc.sv3_regions_sorted[patch_number]

        row_selector = np.logical_and(sv3_df['TILEID'].isin(tilelist), mag_filter)

        # Remove tiles from the end of the list backwards. A galaxy whose TILEID is the removed
        # tile becomes unobserved for that pass count and, through `prev`, for all lower ones.
        for i in reversed(range(len(tilelist))):
            tileid = tilelist[i]
            observed_by_this_tile = sv3_df.loc[row_selector, 'TILEID'] == tileid
            prev = sv3_df.loc[row_selector, f'OBSERVED_{i+1}']
            sv3_df.loc[row_selector, f'OBSERVED_{i}'] = np.where(observed_by_this_tile, False, prev)


In [None]:
# Fraction of SV3 targets observed as a function of the number of passes,
# split into bright (app_mag < 19.5) and faint (app_mag > 19.5) samples.
observed_faint = np.zeros(12, dtype=int)
observed_bright = np.zeros(12, dtype=int)

total_faint = np.sum(sv3_df['app_mag'] > 19.5)
total_bright = np.sum(sv3_df['app_mag'] < 19.5)

# Index i corresponds to the OBSERVED_i column built in the previous cell
for i in range(0, 12):
    observed_faint[i] = np.sum(sv3_df.loc[sv3_df['app_mag'] > 19.5, f'OBSERVED_{i}'])
    observed_bright[i] = np.sum(sv3_df.loc[sv3_df['app_mag'] < 19.5, f'OBSERVED_{i}'])


# Solid lines: this notebook's result. Dashed lines: hard-coded comparison values labelled "PAPER".
plt.plot(observed_bright / total_bright, color='b', label="BGS BRIGHT ME")
plt.plot(observed_faint / total_faint, color='orange', label="BGS FAINT ME")
plt.plot([1,2,3,4], [.29, .52, 0.68, .81], '--', color='b', label="BGS BRIGHT PAPER")
plt.plot([1,2,3,4], [.15, .32, 0.47, .62], '--', color='orange', label="BGS FAINT PAPER")
plt.xlabel("Number of passes")
plt.ylabel("Fraction of targets observed")
plt.title("SV3 BGS Completeness")
plt.xticks(np.arange(0, 12))
plt.legend()

In [None]:
# Bright sample (app_mag < 19.5): number of unobserved targets, then the total number of targets
print(sv3_df.loc[sv3_df['app_mag'] < 19.5, 'OBSERVED'].count() - sv3_df.loc[sv3_df['app_mag'] < 19.5, 'OBSERVED'].sum())
print(sv3_df.loc[sv3_df['app_mag'] < 19.5, 'OBSERVED'].count())

The above numbers do not seem to track with the Y1 data. In Y1 no region has more than 4 passes so how do I have a fiber incompleteness better than the above number?

Also Figure 17 of https://iopscience.iop.org/article/10.3847/1538-3881/accff8/pdf disagrees with my above analysis. So what is above is wrong.

# Nearest Neighbor Analysis (for SV3 Merged File)

The nearest-neighbor catalog consists of all the actually observed galaxies in SV3, plus my Y3 supplement. 

Can run analysis on entire sample or a subset of 'pretend-to-be unobserved' galaxies (~obs_7p).

In [None]:
# MAKE SURE that above KEEP_PASSES is set to 1 and you filter out unobserved galaxies as we need a source of 'truth'
# This cell relies on unobserved, app_mag_r, app_mag_g, z_obs, ntiles_mine and tile_id exactly as
# produced by the "Cut to the galaxy data we actually need" cell.
assert KEEP_PASSES == 1 and np.sum(unobserved) == 0

# k-corrected absolute magnitudes in r and g, and the quiescent flag from their color
gmr = app_mag_g - app_mag_r
Rk = app_mag_to_abs_mag_k(app_mag_r, z_obs, gmr, band='r')
Gk = app_mag_to_abs_mag_k(app_mag_g, z_obs, gmr, band='g')
quiescent = is_quiescent_BGS_gmr(None, Gk-Rk)

# We will only consider the observed galaxies in SV3 as we need a source of truth for the analysis.
# This is over 98% so this analysis should be representative.
# We then pretend to have not observed about 20% of galaxies, as is the case in the main survey.
# TODO BUG Ashley says my method is wrong for doing this.
in_10p_zone = ntiles_mine >= 10
# obs_7p is the negation of what drop_SV3_passes(3, ...) returns
obs_7p = ~gc.drop_SV3_passes(3, tile_id, unobserved)

print(np.sum(quiescent))
print(np.sum(in_10p_zone))
print(np.sum(obs_7p))

In [None]:
tile_id.mask.sum() # BUG prevents using LOST_GALAXIES_ONLY=True. Is it because of Y3 supplement has no TILE_ID?

In [None]:
# Nearest-neighbor analysis with the original NNAnalyzer, restricted to the 10-pass inner regions
obj = NNAnalyzer(dec, ra, z_obs, app_mag_r, Rk, None, g_r_apparent, quiescent, obs_7p, p_obs)
#obj = NNAnalyzer(dec[obs_10p], ra[obs_10p], z_obs[obs_10p], app_mag_r[obs_10p], Rk[obs_10p], None, g_r_apparent[obs_10p], quiescent[obs_10p], obs_7p, p_obs[obs_10p])
#obj.set_row_locator( np.logical_and(obs_10p, app_mag_r < 19.5) ) # 10p inner regions and BRIGHT only
obj.set_row_locator(in_10p_zone) # 10p inner regions only (no magnitude cut is applied here)
obj.find_nn_properties(LOST_GALAXIES_ONLY=False) # True (look only at lost ones) would be better,
# but the galaxy count is so small that binning makes it hard to interpret. 
# Plus, our method of making the lost galaxies is sus.
# TODO make True after using CIC and see if its OK
obj.make_bins()

print(np.sum(obj.all_ang_bincounts))
print(np.sum(obj.all_sim_z_bincounts))

In [None]:
# Same analysis as the NNAnalyzer cell, but with the cloud-in-cell (CIC) implementation.
# The binned result is saved so it can be reloaded without recomputing.
newobj = NNAnalyzer_cic.from_data(dec, ra, z_obs, app_mag_r, Rk, g_r_apparent, quiescent, obs_7p, p_obs)
#newobj.set_row_locator( np.logical_and(obs_10p, app_mag_r < 19.5) ) # 10p inner regions and BRIGHT only
newobj.set_row_locator(in_10p_zone)
newobj.find_nn_properties(LOST_GALAXIES_ONLY=False) 
newobj.make_bins()
newobj.save(OUTPUT_FOLDER + 'BGS_cic_binned_data.pkl')

# Report the totals of the CIC object just built (previously this printed `obj`'s totals,
# a copy-paste slip that hid any difference between the two implementations).
print(np.sum(newobj.all_ang_bincounts))
print(np.sum(newobj.all_sim_z_bincounts))

In [None]:
# Round-trip check: the analyzer reloaded from disk must carry the same bin counts as the one in memory.
loaded_nna = NNAnalyzer_cic.from_results_file(OUTPUT_FOLDER + 'BGS_cic_binned_data.pkl')
for counts_attr in ('all_ang_bincounts', 'all_sim_z_bincounts'):
    assert np.all(getattr(loaded_nna, counts_attr) == getattr(newobj, counts_attr))

In [None]:
# Compare the original and CIC binning: per-bin differences may be non-zero,
# but the totals should agree, so each difference array must sum to ~0.
delta = obj.all_ang_bincounts - newobj.all_ang_bincounts
delta_z = obj.all_sim_z_bincounts - newobj.all_sim_z_bincounts

ang_total_diff = np.sum(delta)
assert np.isclose(ang_total_diff, 0), f"Sum of differences is {ang_total_diff}"

z_total_diff = np.sum(delta_z)
assert np.isclose(z_total_diff, 0), f"Sum of differences is {z_total_diff}"
# Any residual difference could come from the clipping of the data in the CIC method

In [None]:
# p_obs is messed up for SV3 so these don't look great
# (drawn from the original NNAnalyzer instance `obj`)
obj.plot_angdist_pobs_per_zbin_cc()

In [None]:
# These look OK 
# (original NNAnalyzer instance `obj`; compare with the CIC version in the next cell)
obj.plot_angdist_appmag_per_zbin_cc()

In [None]:
# These look OK 
# (CIC instance `newobj`; same plot as made for `obj`)
newobj.plot_angdist_appmag_per_zbin_cc()

In [None]:
# Distribution of the element-wise difference in frac_aa between the original and CIC analyzers
d = obj.frac_aa - newobj.frac_aa
j = plt.hist(np.ravel(d), bins=10)

In [None]:
# TODO de-dupe with pre_process_BGS(...)

# Examine photo-z NN relation:
# for every bright, pretend-unobserved galaxy, find its NUM_NEIGHBORS nearest
# pretend-observed galaxies on the sky and record their catalog index and angular distance.
NUM_NEIGHBORS = 30
unobs_7p = ~obs_7p
bright = app_mag_r[in_10p_zone] < 19.5
use = bright & unobs_7p
# NOTE(review): `bright` has one entry per in_10p_zone row while obs_7p/unobs_7p are built from the
# full arrays; this (and the [in_10p_zone][obs_7p] indexing below) only lines up if every row is
# inside the 10-pass zone -- confirm.
neighbor_indexes = np.zeros(shape=(NUM_NEIGHBORS, use.sum()), dtype=np.int32) # indexes point to CATALOG locations
ang_distances = np.zeros(shape=(NUM_NEIGHBORS, use.sum()))

# Galaxies that may serve as neighbors, and the galaxies needing neighbors.
# Both are the same for every n, so they are built once rather than on each loop iteration.
catalog = coord.SkyCoord(ra=ra[in_10p_zone][obs_7p]*u.degree, dec=dec[in_10p_zone][obs_7p]*u.degree, frame='icrs')
to_match = coord.SkyCoord(ra=ra[in_10p_zone][use]*u.degree, dec=dec[in_10p_zone][use]*u.degree, frame='icrs')

print(f"Finding nearest {NUM_NEIGHBORS} neighbors... ", end='\r')   
for n in range(0, NUM_NEIGHBORS):
    # nthneighbor is 1-based; storekdtree lets astropy reuse the KD-tree between calls
    idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=n+1, storekdtree='sv3')
    neighbor_indexes[n] = idx
    ang_distances[n] = d2d.to(u.arcsec).value
print(f"Finding nearest {NUM_NEIGHBORS} neighbors... done!")   


In [32]:
# Observed redshifts of the selected (`use`) galaxies, treated as the truth in the cells below
ztruth = z_obs[in_10p_zone][use]
# Photometric redshifts of the same galaxies, taken directly from z_phot.
# NOTE(review): an earlier comment here described a fake photo-z (ztruth plus a Gaussian draw with
# sigma 0.01); that is not what this line does.
zphot = z_phot[in_10p_zone][use]


In [None]:
# Flag the pretend-unobserved galaxies so plot_positions can show them separately.
# Bracket assignment is used because attribute-style assignment (df.col = ...) only updates a
# column that already exists; on a missing column it silently sets a plain Python attribute.
one_pass_df['z_assigned_flag'] = ~obs_7p
# The map is anchored at the position stored under index label 0 of one_pass_df.
plot_positions(one_pass_df, tiles_df=None, DEG_LONG=2.5, split=True, ra_min=one_pass_df.RA[0], dec_min=one_pass_df.Dec[0])

In [None]:
# Sweep the photo-z matching threshold. For each threshold, and for each number of nearest
# neighbors considered, measure how often a neighbor's redshift is the right one for the
# pretend-lost galaxy under several selection strategies.
paramspace = np.arange(0.001, 0.05, 0.001)
DETAIL_PLOT_IDX = 26 # position in paramspace whose threshold gets the detailed diagnostic plots

# [n, i] = fraction correct when using the first photo-z matched neighbor among the n+1 nearest,
# at threshold paramspace[i]. Read by later cells.
cumulative_percent_correct_by_n_zp_firstonly = np.zeros((NUM_NEIGHBORS, len(paramspace)) , dtype=float)

# Loop invariants: redshifts of the neighbor catalog and the x axis used by every plot
catalog_z = z_obs[in_10p_zone][obs_7p]
neighbor_ranks = range(1, NUM_NEIGHBORS+1)

for i, PHOTOZ_MATCHING_THRESHOLD in enumerate(paramspace):

    this_neighbor_correct = np.zeros(shape=(NUM_NEIGHBORS, len(ztruth)), dtype=bool)
    z_phot_neighbor_match = np.zeros(shape=(NUM_NEIGHBORS, len(ztruth)), dtype=bool)
    cumulative_percent_z_phot_neighbor_match = []
    z_phot_first_neighbor_match_idx = np.ones(len(ztruth), dtype=int) * 999 # sentinel value for no match
    z_phot_match_correct = np.zeros(shape=(NUM_NEIGHBORS, len(ztruth)), dtype=bool)
    delta_z = np.zeros(shape=(NUM_NEIGHBORS, len(ztruth)), dtype=float)

    percent_correct_by_n = []
    percent_correct_by_n_zp = []
    cumulative_percent_correct_by_n = []
    cumulative_percent_correct_by_n_zp = []
    cumulative_percent_correct_by_n_zp_closestonly = []
    first_matched_but_incorrect = []

    for n in range(0, NUM_NEIGHBORS):

        # Redshift of each galaxy's (n+1)th nearest neighbor
        z_neighbor = catalog_z[neighbor_indexes[n]]
        this_neighbor_correct[n] = close_enough(ztruth, z_neighbor)
        percent_correct_by_n.append(this_neighbor_correct[n].sum() / len(ztruth))
        any_neighbor_correct = this_neighbor_correct[0:n+1].max(axis=0) # max will be True if any neighbor is True
        cumulative_percent_correct_by_n.append(any_neighbor_correct.sum() / len(ztruth))

        z_phot_neighbor_match[n] = close_enough(zphot, z_neighbor, threshold=PHOTOZ_MATCHING_THRESHOLD)
        any_z_phot_match = z_phot_neighbor_match[0:n+1].max(axis=0) # will be True if z_phot matches a neighbor
        cumulative_percent_z_phot_neighbor_match.append(any_z_phot_match.sum() / len(ztruth))
        z_phot_match_correct[n] = z_phot_neighbor_match[n] & this_neighbor_correct[n]
        any_neighbor_zp_correct = z_phot_match_correct[0:n+1].max(axis=0) # True if any photo-z matched neighbor is also correct
        percent_correct_by_n_zp.append(z_phot_match_correct[n].sum() / len(ztruth))
        cumulative_percent_correct_by_n_zp.append(any_neighbor_zp_correct.sum() / len(ztruth))

        # Now only consider the closest neighbor, by photo-z matching.
        # The absolute difference is required: argmin over the signed difference would pick the
        # most negative delta-z (neighbor far above zphot) instead of the smallest separation.
        delta_z[n] = np.abs(zphot - z_neighbor)
        best_match_idx = np.argmin(delta_z[0:n+1], axis=0, keepdims=True)
        closest_delta_z_correct = np.take_along_axis(z_phot_match_correct, best_match_idx, axis=0)[0] # True if the best zphot match is correct
        cumulative_percent_correct_by_n_zp_closestonly.append(closest_delta_z_correct.sum() / len(ztruth))

        # Now only consider the first neighbor in order of angular distance
        z_phot_first_neighbor_match_idx = np.minimum(z_phot_first_neighbor_match_idx, np.where(z_phot_neighbor_match[n],n,999))
        has_match = z_phot_first_neighbor_match_idx != 999

        # if z_phot_first_neighbor_match_idx is 999 for a row, first_correct will be False
        # if z_phot_first_neighbor_match_idx is an index for a row, first_correct will be the z_phot_match_correct value for that row at that index
        first_correct = np.zeros(len(ztruth), dtype=bool)
        first_correct[has_match] = z_phot_match_correct[z_phot_first_neighbor_match_idx[has_match], np.arange(len(ztruth))[has_match]]
        cumulative_percent_correct_by_n_zp_firstonly[n,i] = (first_correct.sum() / len(ztruth))

        first_matched_but_incorrect.append(cumulative_percent_z_phot_neighbor_match[n] - cumulative_percent_correct_by_n_zp_firstonly[n,i])

    if i == DETAIL_PLOT_IDX:
        # All labels that embed the threshold are f-strings so the value is actually rendered
        plt.figure(figsize=(10, 5))
        plt.plot(neighbor_ranks, percent_correct_by_n, label="This Neighbor is correct z")
        plt.plot(neighbor_ranks, percent_correct_by_n_zp, label=f"This Neighbor photo-z matched and correct (thresh={PHOTOZ_MATCHING_THRESHOLD:.3})")
        plt.plot(neighbor_ranks, cumulative_percent_correct_by_n, label="Any Neighbor has correct z")
        plt.plot(neighbor_ranks, cumulative_percent_correct_by_n_zp, label=f"Any neighbor photo-z matched and correct (thresh={PHOTOZ_MATCHING_THRESHOLD:.3})")
        plt.plot(neighbor_ranks, cumulative_percent_correct_by_n_zp_closestonly, label=f"Minimum delta-z matched neighbor correct (thresh={PHOTOZ_MATCHING_THRESHOLD:.3})")
        plt.plot(neighbor_ranks, cumulative_percent_correct_by_n_zp_firstonly[:,i], label=f"First photo-z matched neighbor correct (thresh={PHOTOZ_MATCHING_THRESHOLD:.3})")
        plt.plot(neighbor_ranks, cumulative_percent_z_phot_neighbor_match, label=f"Any Neighbor photo-z matched (thresh={PHOTOZ_MATCHING_THRESHOLD:.3})")
        plt.ylabel("Fraction")
        plt.xlabel("Nth Nearest Neighbor")
        plt.xticks(np.arange(1, NUM_NEIGHBORS+1))
        plt.ylim(0, 0.8)
        #plt.axhline(y=0.0125, color='r', linestyle='--', label="Random Chance")
        plt.legend()

        plt.figure(figsize=(10, 5))
        # Stacked bar chart: first photo-z matched neighbor correct (bottom) vs matched-but-incorrect (top)
        plt.bar(neighbor_ranks, cumulative_percent_correct_by_n_zp_firstonly[:,i], label=f"First photo-z matched neighbor correct (thresh={PHOTOZ_MATCHING_THRESHOLD:.3})")
        plt.bar(neighbor_ranks, first_matched_but_incorrect, label=f"First photo-z matched neighbor incorrect (thresh={PHOTOZ_MATCHING_THRESHOLD:.3})", bottom = cumulative_percent_correct_by_n_zp_firstonly[:,i])

        plt.ylabel("Fraction")
        plt.xlabel("Nth Nearest Neighbor")
        plt.xticks(np.arange(1, NUM_NEIGHBORS+1))
        plt.ylim(0, 1.0)
        plt.legend()





In [None]:
# Locate the (neighbor count, threshold) combination with the highest fraction correct
best_idx = np.unravel_index(
    np.argmax(cumulative_percent_correct_by_n_zp_firstonly),
    cumulative_percent_correct_by_n_zp_firstonly.shape,
)
best = np.max(cumulative_percent_correct_by_n_zp_firstonly)

print(cumulative_percent_correct_by_n_zp_firstonly.shape)
print(best_idx)
# Row index is the 0-based neighbor count, column index selects the threshold in paramspace
print(f"Best photo-z match threshold: {paramspace[best_idx[1]]:.3f}, Neighbors={best_idx[0]+1}, {best:.2%}")


In [None]:
# Fraction correct vs number of neighbors for a hand-picked set of thresholds.
plt.figure(figsize=(10, 5))

# (column in the results array, line colour). Columns beyond the swept range are skipped:
# the results array has one column per entry of paramspace, so indexing e.g. column 99
# unconditionally raises IndexError when paramspace is shorter than that.
curves_to_plot = [
    (4,  [0.1, 0.8, 0]),
    (9,  [0.2, 0.7, 0]),
    (19, [0.3, 0.6, 0]),
    (26, [0.4, 0.5, 0]),
    (39, [0.5, 0.4, 0]),
    (49, [0.6, 0.3, 0]),
    (59, [0.7, 0.2, 0]),
    (99, [0.8, 0.1, 0]),
]
n_thresholds = cumulative_percent_correct_by_n_zp_firstonly.shape[1]
for col, curve_color in curves_to_plot:
    if col >= n_thresholds:
        continue
    # Column 26 is drawn dashed; it is the threshold singled out for the detailed plots
    plt.plot(range(1, NUM_NEIGHBORS+1), cumulative_percent_correct_by_n_zp_firstonly[:,col],
             linestyle='--' if col == 26 else '-',
             label=f"thresh={paramspace[col]:.3}", color=curve_color)
plt.ylabel("Fraction Correct")
plt.xlabel("Nth Nearest Neighbor")
plt.xticks(np.arange(1, NUM_NEIGHBORS+1))
plt.ylim(0, 0.45)
#plt.axhline(y=0.0125, color='r', linestyle='--', label="Random Chance")
plt.legend()

In [None]:
# One curve per swept threshold: fraction correct (first photo-z matched neighbor) vs neighbor count.
# The red channel of the line colour grows with the threshold index.
plt.figure(figsize=(10, 5))
for i, thresh in enumerate(paramspace):
    color = [i/(len(paramspace)+1),0.5,0]
    plt.plot(
        range(1, NUM_NEIGHBORS+1),
        cumulative_percent_correct_by_n_zp_firstonly[:,i],
        label=f"Photo-z Match Threshold {thresh:.3f}",
        color=color,
    )
plt.ylabel("Fraction Correct")
plt.xlabel("Nth Nearest Neighbor")
plt.xticks(np.arange(1, NUM_NEIGHBORS+1))
plt.ylim(0, 0.8)

# Color Analysis

Lesson from this analysis: the BGS data, working with my 0.1^(G-R) with GAMA k-corrections, does not seem to need a separate G-R split per logLgal bin; the global 0.76 split seems to work for all bins.

In [None]:
# Absolute magnitudes in r and g from app_mag_to_abs_mag, and the resulting colour
R = app_mag_to_abs_mag(app_mag_r, z_obs)
G = app_mag_to_abs_mag(app_mag_g, z_obs)
G_R = G - R

In [None]:
# It doesn't matter if you use g_r_apparent or G_R, as the difference between the two bands is the same!

# Three k-correction variants applied to the same absolute magnitudes, each giving a g-r colour.
# NOTE(review): this overwrites the Gk / Rk computed in the Nearest Neighbor Analysis setup cell,
# and a later cell reads Rk -- its value depends on which of the two cells ran last.
Gk = k_correct_bgs(G, z_obs, g_r_apparent, band='g')
Rk = k_correct_bgs(R, z_obs, g_r_apparent, band='r')
G_R_k_BGS1 = Gk - Rk

# GAMA variant
Gk_GAMA = k_correct_gama(G, z_obs, g_r_apparent, band='g')
Rk_GAMA = k_correct_gama(R, z_obs, g_r_apparent, band='r')
G_R_k_GAMA = Gk_GAMA - Rk_GAMA

# BGS v2 variant
Gk_BGS2 = k_correct_bgs_v2(G, z_obs, g_r_apparent, band='g')
Rk_BGS2 = k_correct_bgs_v2(R, z_obs, g_r_apparent, band='r')
G_R_k_BGS2 = Gk_BGS2 - Rk_BGS2

In [None]:
# Overlay the normalised g-r distributions obtained from the different k-correction recipes
bins = np.linspace(0, 2.0, 200)

plt.figure()
# Disabled curves kept for reference:
#junk=plt.hist(g_r_apparent, bins=bins, label="g-r", histtype='step', density=True)
#junk=plt.hist(sdss_g_r, bins=bins, label='From LSS Pipeline (JM?)', histtype='step', density=True)
#junk=plt.hist(G_R, bins=bins, label="G-R", histtype='step', density=True)
color_variants = (
    (G_R_JM1, "0.1^(G-R) JM"),
    (G_R_k_BGS1, "0.1^(G-R) BGS poly v1"),
    (G_R_k_GAMA, "0.1^(G-R) GAMA poly"),
    (G_R_k_BGS2, "0.1^(G-R) BGS poly v2"),
)
for color_values, color_label in color_variants:
    junk = plt.hist(color_values, bins=bins, label=color_label, histtype='step', density=True)
plt.xlabel("g-r")
plt.ylabel("Count")
plt.legend()
plt.xlim(0.2, 1.3)
plt.title("Comparison of g-r computed a few ways")
plt.tight_layout()
plt.ylim(0,3.5)

In [None]:
# Can see global GLOBAL_RED_COLOR_CUT=0.76 here
# (histogram of the GAMA-style k-corrected colour, zoomed to 0.5-1.0 around the cut)
junk=plt.hist(G_R_k_GAMA, bins=300, alpha=0.5, label="0.1^(G-R) GAMA-style")
plt.legend()
plt.xlim(0.5, 1.0)

In [None]:
# Quiescent fraction from the apparent-colour guess used for lost galaxies...
lost_gal_guess = is_quiescent_lost_gal_guess(g_r_apparent)
print(lost_gal_guess.sum() / len(g_r_apparent))
assert len(G_R_k_GAMA) == len(g_r_apparent)
# ...versus the fraction from the k-corrected colour classifier (no logLgal supplied)
bgs_gmr_quiescent = is_quiescent_BGS_gmr(None, G_R_k_GAMA)
print(bgs_gmr_quiescent.sum() / len(G_R_k_GAMA))

In [None]:
# Re-import the pyutils names before inspecting the luminosity bin edges and per-bin colour cuts.
# NOTE(review): pyutils is already star-imported in the first cell; presumably this repeat is here
# to refresh the names after editing the module -- confirm before removing.
from pyutils import *
print(BGS_LOGLGAL_BINS)
print(BINWISE_RED_COLOR_CUT)

In [None]:
# Spot check of the classifier on three hand-picked (first argument, colour) pairs.
# NOTE(review): the first argument is presumably logLgal -- confirm against pyutils.
is_quiescent_BGS_gmr(np.array([5.8, 9.0, 14.5]), np.array([0.5, 0.9, 0.9]))

In [None]:
# Get logLgal bins
# NOTE(review): Rk is assigned in more than one cell above (NN analysis setup and the k-correction
# cell); which value is used here depends on execution order.
log_L_gal = abs_mag_r_to_log_solar_L(Rk) 
logLgal_bin_idx = np.digitize(log_L_gal, BGS_LOGLGAL_BINS)
# 0 is less than the lowest, len(BGS_LOGLGAL_BINS) is greater than the highest entry in BGS_LOGLGAL_BINS

In [None]:
# Range of the luminosities and of the bin indices they were assigned, then the binned histogram
for summarized in (log_L_gal, logLgal_bin_idx):
    print(np.min(summarized))
    print(np.max(summarized))
plt.hist(log_L_gal, bins=BGS_LOGLGAL_BINS, align='mid')
#plt.yscale('log')

In [None]:
# One histogram of the k-corrected colour (BGS v1) per logLgal bin index.
# np.digitize yields indices 0..len(BGS_LOGLGAL_BINS), hence the +1 in the range.
n_bin_indices = len(BGS_LOGLGAL_BINS) + 1
for i in range(n_bin_indices):
    galaxy_idx_for_this_bin = (logLgal_bin_idx == i)

    plt.figure(dpi=80, figsize=(10, 6))
    junk = plt.hist(
        G_R_k_BGS1[galaxy_idx_for_this_bin],
        bins=np.arange(0,1.3,0.02),
        label=f"0.1^(G-R) Bin {i}",
        align='mid',
    )
    plt.legend()
    plt.xlim(0.4, 1.2)
    plt.xticks(np.arange(0.4, 1.2, 0.04))

In [None]:
# Pick the two absolute-magnitude arrays compared in the next cell:
# mag1 is the catalog value (plotted there as ABSMAG_SDSS_R), mag2 is the R computed in this notebook.
mag1 = abs_mag_sdss
mag2 = R

In [None]:
# Compare Absolute Magnitudes: my values (mag2) against the catalog ABSMAG_SDSS_R (mag1).
# I believe the difference comes from how each is k-corrected.
bins = np.linspace(-25, -10, 100)
my_counts, my_bins, my_p = plt.hist(mag2, label="my abs_mag", bins=bins, alpha=0.5)
alex_counts, alex_bins, alex_p = plt.hist(mag1, label="ABSMAG_SDSS_R", bins=bins, alpha=0.5)
plt.xlabel("Absolute Mag")
plt.ylabel("Count")
plt.title("Compare Absolute Mags")
#plt.yscale('log')
plt.legend()

# Left edge of the most populated bin in each histogram
alex_peak = alex_bins[np.argmax(alex_counts)]
my_peak = my_bins[np.argmax(my_counts)]
print(f"The peak is shifted from ABSMAG_SDSS_R {alex_peak:.1f} to my {my_peak:.1f}")


In [None]:
# Sky map of the galaxy positions currently loaded in ra / dec
fig=make_map(ra, dec)

## Dn4000 Comparison (BGS, SDSS)

In [None]:
# Load the SDSS catalog and its galaxy-property companion file (space-delimited, no header row),
# then combine them row by row on their default integer indexes.
SDSS_COLUMNS = ('RA', 'Dec', 'z', 'logLgal', 'V_max', 'quiescent', 'chi')
SDSS_GALPROPS_COLUMNS = ('Mag_g', 'Mag_r', 'sigma_v', 'Dn4000', 'concentration', 'log_M_star')

sdss = pd.read_csv(SDSS_v1_DAT_FILE, delimiter=' ', names=SDSS_COLUMNS, index_col=False)
sdss_galprops = pd.read_csv("../data/sdss_galprops_v1.0.dat", delimiter=' ', names=SDSS_GALPROPS_COLUMNS)
sdss = sdss.merge(sdss_galprops, left_index=True, right_index=True)


In [None]:
# Dn4000 distributions of BGS and SDSS on shared bins, log-scaled counts
dn4000_bins = np.linspace(-0.5, 5.0, 100)
for dn4000_values, opacity, survey_label in ((dn4000, 0.6, "BGS Y1"), (sdss.Dn4000, 0.8, "SDSS")):
    plt.hist(dn4000_values, bins=dn4000_bins, alpha=opacity, label=survey_label)
plt.yscale('log')
plt.legend()
plt.xlabel('Dn4000')
plt.ylabel('Count')

In [None]:
# Same Dn4000 comparison on a linear scale, zoomed to 0.9-2.5
dn4000_bins = np.linspace(-0.5, 5.0, 100)
for dn4000_values, opacity, survey_label in ((dn4000, 0.6, "BGS Y1"), (sdss.Dn4000, 0.8, "SDSS")):
    plt.hist(dn4000_values, bins=dn4000_bins, alpha=opacity, label=survey_label)
plt.legend()
plt.xlabel('Dn4000')
plt.ylabel('Count')
plt.xlim(0.9,2.5)

In [None]:
# Cross-match every BGS galaxy to its nearest SDSS galaxy on the sky.
# Note: neighbor_indexes and ang_distances are also assigned by the photo-z
# nearest-neighbor cell earlier in the notebook; running this cell replaces those values.
BGS_catalog = coord.SkyCoord(ra=ra*u.degree, dec=dec*u.degree, frame='icrs')
sdss_catalog = coord.SkyCoord(ra=sdss.RA.to_numpy()*u.degree, dec=sdss.Dec.to_numpy()*u.degree, frame='icrs')

neighbor_indexes, d2d, d3d = coord.match_coordinates_sky(BGS_catalog, sdss_catalog, storekdtree='sdss')
ang_distances = d2d.arcsec

# Accept a pair as the same object when the separation is under 3 arcsec
match_found_filter = ang_distances < 3.0
sdss_indexes = neighbor_indexes[match_found_filter]
sdss_matches = sdss.iloc[sdss_indexes].Dn4000.to_numpy()
bgs_matches = dn4000[match_found_filter]

In [None]:
# Fraction of matched pairs whose Dn4000 values agree within a series of tolerances
for tolerance in (0.05, 0.1, 0.2, 0.3):
    print(f"{np.isclose(bgs_matches, sdss_matches, atol=tolerance).sum() / len(bgs_matches)} of the matches are within {tolerance} of each other.")


In [None]:
# Sky map of the BGS positions, with the SDSS positions drawn faintly onto the same figure
fig=make_map(ra, dec)
fig=make_map(sdss.RA.to_numpy(), sdss.Dec.to_numpy(), fig=fig, alpha=0.05)

In [None]:
# Matched pairs: SDSS Dn4000 (x) against BGS Dn4000 (y), both axes limited to 1-2.3
plt.scatter(sdss_matches, bgs_matches, s=1, alpha=.2)
plt.xlabel("SDSS Dn4000")
plt.ylabel("BGS Dn4000")
plt.xlim(1, 2.3)
plt.ylim(1, 2.3)

In [None]:
# Table of matched Dn4000 pairs with the fractional BGS-vs-SDSS difference,
# binned by the SDSS value; each bin is labelled with its left edge.
bins = np.linspace(-1, 5, 60)
labels = bins[:-1]

df = pd.DataFrame({'SDSS_Dn4000': sdss_matches, 'BGS_Dn4000': bgs_matches})
df['diff_frac'] = df['BGS_Dn4000'].sub(df['SDSS_Dn4000']).div(df['SDSS_Dn4000'])
df['dn4000_sdssbin'] = pd.cut(x=sdss_matches, bins=bins, labels=labels, include_lowest=True)

In [None]:
# Mean and scatter of the fractional Dn4000 difference in each SDSS Dn4000 bin.
plt.figure(dpi=80)
# observed=False keeps one row per category (including empty bins), so the result always has the
# same length as `labels`. Relying on the implicit default is fragile: pandas warns that the
# default is changing, and with observed=True empty bins are dropped and errorbar() fails on
# mismatched lengths.
binned_diff_frac = df.groupby('dn4000_sdssbin', observed=False).diff_frac
diff_mean = binned_diff_frac.mean()
diff_std = binned_diff_frac.std()

plt.errorbar(labels, diff_mean, yerr=diff_std)
plt.xlabel("SDSS Dn4000")
plt.ylabel("< (BGS-SDSS) / SDSS >")
plt.xlim(0.8, 2.4)
plt.ylim(-0.75, 0.75)
plt.draw()

## Dn4000 Lgal Bin Analysis

Run Color Analysis and Dn4000 Comparison first

In [None]:
# Make a plot of Dn4000 in each logLgal bin, two panels per row
n_rows = len(BGS_LOGLGAL_BINS)//2
fig,axes=plt.subplots(dpi=80, figsize=(10, 3*len(BGS_LOGLGAL_BINS)//2), ncols=2, nrows=n_rows)
axes = np.ravel(axes) # flat view so the panels can be addressed by bin number

for i in range(0, len(BGS_LOGLGAL_BINS)-1):
    # np.digitize index i+1 selects values from BGS_LOGLGAL_BINS[i] up to BGS_LOGLGAL_BINS[i+1]
    galaxy_idx_for_this_bin = logLgal_bin_idx == i+1

    junk=axes[i].hist(dn4000[galaxy_idx_for_this_bin], bins=np.arange(1,2.2,0.02), label=f"Dn4000 for logLgal Bin {i+1}", align='mid')
    axes[i].legend()
    axes[i].set_xlim(1, 2.2)
    axes[i].set_xticks(np.arange(1, 2.2, 0.1))

    # draw a vertical line at get_SDSS_Dcrit evaluated at the lower edge of this bin
    axes[i].axvline(x=get_SDSS_Dcrit(BGS_LOGLGAL_BINS[i]), color='r', linestyle='-')

# Restore the grid layout that plt.subplots produced: (rows, 2 columns).
# The previous reshape to (2, rows) transposed the shape and scrambled which Axes sat in which cell.
axes = np.reshape(axes, (n_rows, 2))


# Randoms Analysis for Footprint

In [36]:
# Surface density of the random catalog; dividing a count of randoms by this gives an area in square degrees.
RANDOMS_DENSITY = 2500 # per square degree, Ashley Ross paper on LSS pipeline or elsewhere in docs

In [37]:
# Read the Y3 randoms file and keep only the columns this analysis uses.
# (A bare `rtable.columns` statement used to sit between these two lines; as it was not the last
# expression of the cell it displayed nothing, so it has been dropped. Put it on the last line
# of a cell to actually inspect the available columns.)
rtable = Table.read(BGS_Y3_RAND_FILE, format='fits')
rtable.keep_columns(['LOCATION', 'FIBER', 'TARGETID', 'RA', 'DEC', 'PRIORITY', 'TILEID', 'TILELOCID', 'NTILE', 'TILES'])

In [38]:
# Pull the needed columns out of the randoms table as 64-bit float / integer arrays
r_ra = rtable['RA'].astype("<f8")
r_dec = rtable['DEC'].astype("<f8")
r_tileid = rtable['TILEID'].astype("<i8")
r_ntiles = rtable['NTILE'].astype("<i8")

# TODO TILEID is just one, TILES has them all... look at rtable['TILES'][135000] for example
randoms_df = pd.DataFrame({'RA': r_ra, 'Dec': r_dec, 'NTILE': r_ntiles, 'TILEID': r_tileid})

# Randoms with 3-pass coverage according to the catalog's own NTILE column
three_pass_filter = r_ntiles >= 3
r_ra3 = r_ra[three_pass_filter]
r_dec3 = r_dec[three_pass_filter]

# Footprint area = number of randoms / randoms per square degree; then as a fraction of the sphere
onepass_footprint = len(r_dec) / RANDOMS_DENSITY # in degrees squared
threepass_footprint = len(r_dec3) / RANDOMS_DENSITY # in degrees squared
onepass_frac_area = onepass_footprint / DEGREES_ON_SPHERE
threepass_frac_area = threepass_footprint / DEGREES_ON_SPHERE

# My estimation procedure, which we don't use
estimate = estimate_frac_area(r_ra, r_dec)
estimate3 = estimate_frac_area(r_ra3, r_dec3)

In [None]:
# Footprints derived from the catalog NTILE column. They are labelled incorrect on purpose:
# the NTILE_MINE-based recalculation further down supersedes them.
print("INCORRECT RESULTS THAT USE NTILE, NOT NTILE_MINE")
print(f"BGS 1pass Footprint calculated from randoms is {onepass_footprint} square degrees or frac_area={onepass_frac_area}")
print(f"BGS 3pass Footprint calculated from randoms is {threepass_footprint} square degrees or frac_area={threepass_frac_area}")
#print(f"BGS 1pass Footprint estimated from my algorithm: frac_area={estimate}")
#print(f"BGS 3pass Footprint estimated from my algorithm: frac_area={estimate3}")

### Make map showing why we cannot use NTILE >= 3

In [None]:
# Make maps
# Tile definitions come from the Y3 main-survey reader; the Y1 and SV3 alternatives are left commented out.
#tiles_BGS = read_tiles_Y1_file()
tiles_BGS = read_tiles_Y3_main()
#tiles_BGS = tiles_BGS.loc[tiles_BGS.FAFLAVOR == 'sv3bright']

# Plot the randoms, with the NTILE >= 3 subset passed as the second set of points, over the tiles
# (original note: returns the set of tile_id associated with them -- not verified here)
plot_positions(randoms_df, randoms_df[randoms_df.NTILE >= 3], tiles_df=tiles_BGS, DEG_LONG=10, ra_min=180, dec_min=-5, split=False)

### Make map with NTILE_MINE

Uses a NN lookup of the tiles for each 'galaxy'.

In [None]:
# LOAD FROM LAST TIME WE DID THIS
# The file is opened in a with-block so the handle is closed deterministically.
# Only load pickles you produced yourself: unpickling can execute arbitrary code.
# NOTE(review): this reads BIN_FOLDER + "randoms_df.p", whereas the 'make it anew' cell writes
# OUTPUT_FOLDER + "randoms_df_y3kibo.pkl" -- confirm that this is the intended file.
with open(BIN_FOLDER + "randoms_df.p", "rb") as randoms_file:
    randoms_df = pickle.load(randoms_file)

In [None]:
# TO MAKE IT ANEW
# For each random point, look at its NTILE_MIN nearest tiles and count how many of them
# lie within one tile radius.
NTILE_MIN = 10
ntiles_inside, nearest_tile_ids = find_tiles_for_galaxies(tiles_BGS, randoms_df, NTILE_MIN)
randoms_df['NTILE_MINE'] = ntiles_inside

# Fraction and number of randoms covered by at least 3 tiles
print(np.sum(ntiles_inside >= 3) / len(ntiles_inside))
print(np.sum(ntiles_inside >= 3))

# Not sure if I really need the nearest tile IDs for anything
# Write inside a with-block so the file is flushed and closed before anything tries to read it back.
with open(OUTPUT_FOLDER + "randoms_df_y3kibo.pkl", "wb") as randoms_file:
    pickle.dump(randoms_df, randoms_file)

In [None]:
# Plot the randoms, with the NTILE_MINE >= 3 subset passed as the second set of points, over the tiles
# (original note: returns the set of tile_id associated with them -- not verified here)
plot_positions(randoms_df, randoms_df[randoms_df.NTILE_MINE >= 3], tiles_df=tiles_BGS, DEG_LONG=5, split=False)

In [None]:
# Recalculate the footprint for 1 through 10 passes using NTILE_MINE instead of the catalog NTILE
for i in range(1, 11):
    n_pass_filter = randoms_df.NTILE_MINE >= i
    n_randoms_in_footprint = int(n_pass_filter.sum())
    n_pass_footprint = n_randoms_in_footprint / RANDOMS_DENSITY # in degrees squared
    n_pass_frac_area = n_pass_footprint / DEGREES_ON_SPHERE
    print(f"BGS {i}pass Footprint calculated from randoms is {n_pass_footprint} square degrees or frac_area={n_pass_frac_area}")
    