In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as c
import h5py
from astropy.wcs import WCS
import astropy.coordinates as coord
import astropy.units as u
from pyutils import *
import types
import numpy.ma as ma
import sys
from random import randint
from matplotlib.patches import Circle
from ctypes import c_uint64
import pickle
from astropy.table import Table

#ROOT_FOLDER = "/Volumes/Seagate Backup Plus Drive/galaxy-groups-data/"
#ROOT_FOLDER = "/mnt/f/galaxy-groups-data/"
ROOT_FOLDER = "bin/"
BIG_FILES_FOLDER="/export/sirocco2/tinker/DESI/MXXL_MOCKS/"


In [None]:
%load_ext autoreload
%autoreload 2

## Basic read-in of HDF5 data from MXXL


In [None]:
# Number of leading rows read by the quick-look cells below (they slice [0:DATA_CUT_INDEX]).
# The trailing values are alternatives kept from earlier runs.
DATA_CUT_INDEX = 300000 #21201544 #3000000 

In [None]:
# Open the 3-pass weights file read-only and list its top-level members plus the
# members of its 'Data' and 'Weight' groups. The handle is left open: later cells
# read datasets from `weights` directly.
weights = h5py.File(BIG_FILES_FOLDER + 'weights_3pass.hdf5', 'r')
print(list(weights))
print(list(weights['Data']))
print(list(weights['Weight']))


In [None]:
# Common PLT helpers
# Colour sequence of the active matplotlib style, so a series keeps its colour across figures.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

def get_color(i):
    """Return the i-th colour of the style's colour cycle, wrapping around past its end."""
    return colors[i % len(colors)]

# Resolution passed to the figures created by plots().
DPI = 1200


# Shared bins for various purposes
# Each *_labels array is the matching bin-edge array without its last edge,
# i.e. one value (the lower edge) per bin.
Mhalo_bins = np.logspace(10, 15.5, 40)
Mhalo_labels = Mhalo_bins[:-1]

L_gal_bins = np.logspace(6, 12.5, 40)
L_gal_labels = L_gal_bins[:-1]

# Experiments on MXXL Data Directly

## Simple plots of basic data

In [None]:
# Galaxy-type flag for the first DATA_CUT_INDEX rows, shown as a histogram.
# Note: plt.hist returns a (counts, edges, patches) tuple; the name `bins` is
# rebound by several later cells.
small_gal_type = weights['Data/galaxy_type'][0:DATA_CUT_INDEX] # 0 1 2 3 possible
bins = plt.hist(small_gal_type, bins=50)

In [None]:
# Observed redshift for the first DATA_CUT_INDEX rows, shown as a histogram.
small_z_obs = weights['Data/z_obs'][0:DATA_CUT_INDEX]
bins = plt.hist(small_z_obs, bins=50)
plt.xlabel("$z_{obs}$")
plt.title("Histogram of Observed Redshifts")


In [None]:
# Sky positions for the first DATA_CUT_INDEX rows; the map cell below multiplies
# them by u.degree, i.e. treats them as degrees.
ra = weights['Data/ra'][0:DATA_CUT_INDEX]
dec = weights['Data/dec'][0:DATA_CUT_INDEX]

In [None]:
# Build a map of the galaxies

# The positions are converted to radians below before plotting; RA is first
# wrapped at 180 degrees so that it is centred on zero.
ra_angles = coord.Angle(ra*u.degree)
ra_angles = ra_angles.wrap_at(180*u.degree)
dec_angles = coord.Angle(dec*u.degree)

fig = plt.figure(figsize=(12,9))
ax = fig.add_subplot(111, projection="mollweide")
# Very low alpha so that point density, not individual markers, is what shows.
ax.scatter(ra_angles.radian, dec_angles.radian, alpha=0.002)
# This looks like Alex' paper, good
# TODO how to get frac_area from this?


In [None]:
# Which galaxies carry an MXXL halo ID of 0, and what galaxy types are they?
mxxl_halo_id = weights['Data/mxxl_id'][0:DATA_CUT_INDEX]
missing_halo = mxxl_halo_id == 0
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(f"Fraction of galaxies with MXXL halo ID 0: {np.mean(missing_halo):.4f}")
# TODO why do 2.5% of galaxies have 0 for the MXXL Halo ID? This may be messing us up

# flatnonzero gives a flat 1-D index array (argwhere would give shape (N, 1)),
# so weird_types stays 1-D like small_gal_type.
weird_indexes = np.flatnonzero(missing_halo)
weird_types = small_gal_type[weird_indexes]
trash = plt.hist(weird_types)

In [None]:
# Apparent magnitude for the first DATA_CUT_INDEX rows, shown as a histogram.
small_app_mag = weights['Data/app_mag'][0:DATA_CUT_INDEX]
bins = plt.hist(small_app_mag, bins=50)
plt.xlabel("Apparent Mag")
plt.title("Histogram of Apparent Mags")

In [None]:
# Catalogue-provided absolute magnitude and g-r colour for the first DATA_CUT_INDEX rows.
small_abs_mag = weights['Data/abs_mag'][0:DATA_CUT_INDEX]
small_colours = weights['Data/g_r'][0:DATA_CUT_INDEX]

In [None]:
# Calculating luminosity distances from the cosmology is a bit slow
# Absolute magnitudes recomputed from apparent magnitude and redshift with the
# pyutils helper; compared against the catalogue values in the next cell.
my_abs_mag = app_mag_to_abs_mag(small_app_mag, small_z_obs)
#my_abs_mag_k = app_mag_to_abs_mag_k(small_app_mag, small_z_obs, small_colours)


In [None]:
# Compare my_abs_mag to abs_mag. 
# Both histograms are drawn on the same axes; each call picks its own 50 bins,
# so the two sets of bin edges are not guaranteed to coincide.
x = plt.hist(my_abs_mag, label="my abs_mag", bins=50)
y = plt.hist(small_abs_mag, label="alex abs_mag", bins=50)
#z = plt.hist(my_abs_mag_k, label="my k abs_mag", bins=50)
plt.xlabel("Absolute Mag")
plt.ylabel("Count")
plt.title("Compare Absolute Mags")
plt.legend()

In [None]:
# At what distance (luminosity distance) would the objects appear to be 19.5 mag?
# V_max is computed twice: once from the recomputed absolute magnitudes and once
# from the catalogue ones, both with a 19.5 limit.
v_max = get_max_observable_volume(my_abs_mag, small_z_obs, 19.5)
v_max2 = get_max_observable_volume(small_abs_mag, small_z_obs, 19.5)

bins = plt.hist(np.log10(v_max), label="my abs_mag", bins=50)
bins = plt.hist(np.log10(v_max2), label="alex abs_mag", bins=50)
plt.title("Compare V_max")
plt.legend()
# NOTE(review): the label says [Mpc]; if the helper returns a volume this is
# presumably Mpc^3 -- confirm against pyutils.
plt.xlabel("log(V_max) [Mpc]")
plt.ylabel("Count")

### What is a reasonable z fudge factor for 'close enough' redshifts, given galaxies' peculiar velocities $v_{\mathrm{pec}}$?

Galaxies move at hundreds of km/s usually, or thousands in a rich cluster.

Two galaxies moving at 600 km/s towards each other along LOS but at same cosmological redshift would have a total redshift difference of 0.004. This suggests a z +/- 0.002 is totally reasonable. In richer areas this could be as high as z +/- 0.010. 

Adopting z +/- 0.003 for now seems fine. Can refine later.

In [None]:
# What is a reasonable z +/- fudge factor for 'close enough' redshifts? 
# Consider peculiar velocities.
# Convert each trial redshift offset into a velocity via astropy's Doppler-redshift equivalency.
z_test = [0.001, 0.002, 0.003, 0.005, 0.01] * u.dimensionless_unscaled
v_pec = z_test.to(u.km / u.s, u.equivalencies.doppler_redshift())
for z_value, velocity in zip(z_test, v_pec):
    print(f"z={z_value:.3f} is {velocity:.0f}")



## Get Truth Abs Mag for Correcting

In [None]:
# Full-catalogue apparent magnitudes and observed redshifts, restricted to
# galaxies brighter than the magnitude cut that have a positive redshift.
APP_MAG_CUT = 19.5
app_mag = weights['Data/app_mag'][:]
z_obs = weights['Data/z_obs'][:]

bright_filter = app_mag < APP_MAG_CUT
redshift_filter = z_obs > 0
keep = bright_filter & redshift_filter

app_mag, z_obs = app_mag[keep], z_obs[keep]

# Note: this rebinds my_abs_mag, previously computed from the DATA_CUT_INDEX subset.
my_abs_mag = app_mag_to_abs_mag(app_mag, z_obs)

In [None]:
# Normalised distribution of the recomputed absolute magnitudes over 100 evenly
# spaced edges (99 bins), plotted and written to disk.
# np.min/np.max are used instead of the builtins, which would iterate the whole
# array one Python object at a time.
bins = np.linspace(np.min(my_abs_mag), np.max(my_abs_mag), 100)
densities, bins  = np.histogram(my_abs_mag, bins=bins, density=True)
t = plt.hist(my_abs_mag, bins, density=True)

# Two arrays are written back to back into one file: densities first, then the
# bin edges. They must be read back in the same order.
with open('bin/abs_mag_weight.npy', 'wb') as f:
    np.save(f, densities, allow_pickle=False)
    np.save(f, bins, allow_pickle=False)

In [None]:
# Read back the two arrays in the order they were saved: densities, then bin edges.
with open('bin/abs_mag_weight.npy', 'rb') as f:
    densities = np.load(f)
    bins = np.load(f)

# There is one density per bin, i.e. one fewer than the number of edges; plot
# against the lower edges. Slicing with [:-1] keeps working if the bin count changes.
plt.plot(bins[:-1], densities)
#plt.yscale('log')

## Examine map of apparent mag to z distribution

In [None]:
# Inputs for the apparent-magnitude -> redshift-distribution map.
# Build the map all the way to 20th mag
APP_MAG_CUT = 20.0
app_mag = weights['Data/app_mag'][:]
z_obs = weights['Data/z_obs'][:]

# Keep galaxies brighter than the cut that have a positive redshift.
bright_filter = app_mag < APP_MAG_CUT
redshift_filter = z_obs > 0
keep = bright_filter & redshift_filter
app_mag, z_obs = app_mag[keep], z_obs[keep]

In [None]:
# Build the map with the pyutils helper, then sanity-check it visually.
app_mag_bins, the_map = build_app_mag_to_z_map(app_mag, z_obs)

# Figure 1: how many galaxies fall in each apparent-magnitude bin (log y axis).
counts, app_mag_bins_2 = np.histogram(app_mag, bins=app_mag_bins, density=False)
plt.figure()
t = plt.hist(app_mag, app_mag_bins, density=False)
plt.yscale('log')

# Figure 2: normalised histograms of three entries of the map, overlaid.
plt.figure()
for map_index in (0, 50, 100):
    trash = plt.hist(the_map[map_index], bins=30, density=True)

### Density of Galaxies per square degree

In [None]:
# Galaxy counts divided by 14000 (presumably the footprint area in deg^2 -- TODO confirm).
app_mag = weights['Data/app_mag'][:]
n_brighter_195 = np.sum(app_mag < 19.5)
n_between_195_20 = np.sum((app_mag > 19.5) & (app_mag < 20.0))
print(f"There are ~{n_brighter_195 / 14000:.0f} galaxies/deg^2 < 19.5 mag")
print(f"There are ~{n_between_195_20 / 14000:.0f} galaxies/deg^2 between 19.5 and 20.0 mag")

## Nearest Neighbor Angular Separation and Same-Halo Analysis


In [None]:
# Full-catalogue columns for the nearest-neighbour analysis.
# NOTE: `input` shadows the builtin of the same name; later cells read it.
input = weights
APP_MAG_CUT = 20.0

dec = input['Data/dec'][:]
ra = input['Data/ra'][:]
z_obs = input['Data/z_obs'][:]
app_mag = input['Data/app_mag'][:]
sim_halo_id = input['Data/mxxl_id'][:]

# Boolean masks: brighter than the cut, and positive redshift.
# (An optional sky-region restriction, 120 < ra < 270 and 0 < dec < 45, was used
# in earlier runs and is currently disabled.)
bright_filter = app_mag < APP_MAG_CUT
redshift_filter = z_obs > 0
keep = bright_filter & redshift_filter

dec, ra = dec[keep], ra[keep]
z_obs, app_mag = z_obs[keep], app_mag[keep]
sim_halo_id = sim_halo_id[keep]


len(dec)

In [None]:
# Pick one fiber-assignment realization: bit BIT_CHOICE of the first bitweight word.
BIT_CHOICE = 0
FIBER_ASSIGNED_SELECTOR = 1 << BIT_CHOICE
# True where the selected bit is set, restricted to the galaxies that passed the cuts.
fassigned = (input['Weight/bitweight0'][:] & FIBER_ASSIGNED_SELECTOR).astype(bool)[keep] # choose 1 of the 2048 fiber assignment realizations with this bitstring
fnotassigned = ~fassigned
# argwhere returns an (N, 1) array of positions into the post-cut arrays.
indexes_not_assigned = np.argwhere(fnotassigned)

# Fraction of the kept galaxies that got a fiber in this realization.
print(fassigned.sum() / len(dec))

# Previously computed P_obs for every galaxy in the file, then cut like the other columns.
with open('bin/prob_obs.npy', 'rb') as f:
    prob_obs = np.load(f)
prob_obs_cut = prob_obs[keep]

### Calculate P_obs

In [None]:
def bitsum(bitstring):
    """Count the set bits of an integer after reinterpreting it as an unsigned 64-bit value."""
    as_unsigned = c_uint64(bitstring).value
    return format(as_unsigned, "b").count("1")

# Element-wise version of bitsum for arrays.
v_bitsum = np.vectorize(bitsum)

def summate(a):
    """Return the total number of set bits over all integers in a."""
    return v_bitsum(a).sum()


In [None]:
# Skip this if prob_obs was already loaded from bin/prob_obs.npy.
# The set bits are counted with a byte lookup table instead of one Python call
# per value, which is what made the earlier np.vectorize version take ~8 minutes.

# Read all 32 64-bitstrings into memory from the file
num_bitstrings = 32
galaxy_count = len(input['Weight/bitweight0'])
bitweights = np.empty((num_bitstrings, galaxy_count), dtype='i8')

# Number of set bits for every possible byte value 0..255.
byte_popcount = np.array([bin(b).count("1") for b in range(256)], dtype=np.uint8)
set_bit_totals = np.zeros(galaxy_count, dtype=np.int64)

for i in range(num_bitstrings):
    bitweights[i] = input['Weight/bitweight{0}'.format(i)][:]
    # Reinterpret each 64-bit word as its 8 raw bytes; the number of set bits in
    # the word is the sum of the per-byte counts, whatever the sign or byte order.
    word_bytes = bitweights[i].view(np.uint8).reshape(galaxy_count, 8)
    set_bit_totals += byte_popcount[word_bytes].sum(axis=1, dtype=np.int64)

# 32 words x 64 bits = 2048 bits per galaxy; the fraction set is P_obs.
prob_obs = set_bit_totals / 2048

with open('bin/prob_obs.npy', 'wb') as f:
    np.save(f, prob_obs)


In [None]:
# Dump the 32 bitweight words of one galaxy: binary form, set-bit count, and
# whether the bit picked by bit_selector (value 2) is set in that word.
specimen = 123
bit_selector = c_uint64(2).value
print(format(bit_selector, '064b'))
print('')
for i in range(num_bitstrings):
    value = bitweights[i, specimen]
    converted = c_uint64(value).value
    set_bit_count = format(bitsum(value), '2.0f')
    selector_hit = bool(converted & bit_selector)
    print(format(converted, '064b'), set_bit_count, selector_hit)

print("Averaged Probability of being targetted: ", prob_obs[specimen])

In [None]:
# P_obs restricted to the galaxies that passed the magnitude/redshift cut.
prob_obs_cut = prob_obs[keep]

# np.linspace(0,1) uses its default of 50 points, i.e. 49 bins over [0, 1].
pobs_bins_temp = np.linspace(0,1)
trash=plt.hist(prob_obs, bins=pobs_bins_temp, label="All galaxies")
trash2=plt.hist(prob_obs_cut, bins=pobs_bins_temp, label=f"Galaxies below {APP_MAG_CUT} mag")
plt.yscale('log')
plt.legend()

In [None]:
# P_obs of the galaxies that did NOT pass the cut, overlaid on those that did.
# Reuses pobs_bins_temp from the previous cell. `keep` also requires z_obs > 0,
# so the complement is not purely a magnitude selection.
prob_obs_dim = prob_obs[np.invert(keep)]
trash=plt.hist(prob_obs_dim, bins=pobs_bins_temp, alpha=0.5, label=f"Galaxies above {APP_MAG_CUT} mag")
trash2=plt.hist(prob_obs_cut, bins=pobs_bins_temp, alpha=0.5, label=f"Galaxies below {APP_MAG_CUT} mag")
plt.yscale('log')
plt.xlabel('$P_{obs}$')
plt.ylabel("Count")
plt.legend()

### Same Halo / Similar z Analysis

What fraction of the time is the nearest neighbor in the same halo?

What is the distribution of angular distances to the nearest neighbor?

What fraction of the time is the nearest neighbor at a similar enough redshift?

In [None]:
# Now bin so that things with ang distances higher than the max we care about are thrown out
# 25 log-spaced edges from 3 arcsec to 3600 arcsec.
BIN_COUNT = 25
bins = np.logspace(np.log10(3), np.log10(60*60), BIN_COUNT)
print("Angular Distance Bin Markers", bins)

# Redshift bin edges come from the guesser so both use the same binning.
# Nothing can be higher than the rightmost bin value.
# (The literal list that used to be assigned here was overwritten on the next
# line and never used, so it was removed.)
z_bins = SimpleRedshiftGuesser.z_bins
print("Redshift Bin Markers", z_bins)

POBS_BIN_COUNT = 25
POBS_bins = np.linspace(0.01, 1.0, POBS_BIN_COUNT)
print("Pobs Bin Markers", POBS_bins)

# True: match galaxies without a fiber against the fiber-assigned ones only.
# False: match every galaxy against the full sample (which contains itself).
LOST_GALAXIES_ONLY = True

if LOST_GALAXIES_ONLY:
    treename = 'mxxl_same_halo_analysis_fiberassigned_b' + str(BIT_CHOICE)
    catalog = coord.SkyCoord(ra=ra[fassigned]*u.degree, dec=dec[fassigned]*u.degree, frame='icrs')
    sim_halo_id_catalog = sim_halo_id[fassigned]
    z_obs_catalog = z_obs[fassigned]
else:
    treename = 'mxxl_same_halo_analysis_all'
    catalog = coord.SkyCoord(ra=ra*u.degree, dec=dec*u.degree, frame='icrs')
    sim_halo_id_catalog = sim_halo_id
    z_obs_catalog = z_obs


In [None]:
# Get NN's angular distance distribution and same halo truth from MXXL

# Though this is binned by z of the target and not the NN, it shouldn't be able to affect results
# by reciprocity of NN
z_bin = np.digitize(z_obs, z_bins)

if LOST_GALAXIES_ONLY:
    nn_bins = np.arange(5)+1
else:
    nn_bins = [2]#[2,3,4] # this means closest 3. '1' will find the same object.

n_ang_bins = len(bins)
# Counts start at one (and get +1 below) so that the fraction plots never divide by zero.
all_ang_bincounts = np.ones((len(z_bins), len(nn_bins), n_ang_bins))
all_same_halo_bincounts = np.zeros((len(z_bins), len(nn_bins), n_ang_bins))
all_same_z_bincounts = np.zeros((len(z_bins), len(nn_bins), n_ang_bins))
all_sim_z_bincounts = np.zeros((len(z_bins), len(nn_bins), n_ang_bins))

for i in range(len(z_bins)):
    # The target selection depends only on the z-bin, so build it once per i
    # rather than once per (i, j).
    if LOST_GALAXIES_ONLY:
        target_mask = (z_bin == i) & fnotassigned
    else:
        target_mask = z_bin == i
    to_match = coord.SkyCoord(ra=ra[target_mask]*u.degree, dec=dec[target_mask]*u.degree, frame='icrs')
    target_halo_id = sim_halo_id[target_mask]
    target_z = z_obs[target_mask]

    for j in range(len(nn_bins)):
        idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=nn_bins[j], storekdtree=treename)

        same_halo = target_halo_id == sim_halo_id_catalog[idx]
        same_z = np.isclose(target_z, z_obs_catalog[idx], rtol=0, atol=0.000001)
        sim_z = np.isclose(target_z, z_obs_catalog[idx], rtol=0, atol=SIM_Z_THRESH)

        angdist_bin_ind = np.digitize(d2d.to(u.arcsec).value, bins)

        # np.digitize returns index len(bins) for separations at or beyond the last
        # edge, which makes bincount one entry longer than the row it is stored in.
        # Slicing drops that overflow bin, i.e. too-distant neighbours are thrown out.
        all_ang_bincounts[i][j] = np.bincount(angdist_bin_ind, minlength=n_ang_bins)[:n_ang_bins] + 1 # avoids divide by 0, won't hurt statistics
        all_same_halo_bincounts[i][j] = np.bincount(angdist_bin_ind, minlength=n_ang_bins, weights=same_halo.astype(int))[:n_ang_bins]
        all_same_z_bincounts[i][j] = np.bincount(angdist_bin_ind, minlength=n_ang_bins, weights=same_z.astype(int))[:n_ang_bins]
        all_sim_z_bincounts[i][j] = np.bincount(angdist_bin_ind, minlength=n_ang_bins, weights=sim_z.astype(int))[:n_ang_bins]
    

In [None]:
# Totals over all z-bins, neighbour ranks and angular bins. The "studied" total
# includes the +1 added to every angular bin when the counts were filled.
print("Galaxies studied: {0}. Same halo: {1}. Similar z: {2}".format(np.sum(all_ang_bincounts), np.sum(all_same_halo_bincounts), np.sum(all_sim_z_bincounts)))

In [None]:
# TODO 
#for b in range(len(all_same_halo_bincounts)):
#    print(all_same_halo_bincounts[b], all_same_z_bincounts[b], len(all_same_z_bincounts))

# Displays True when the same-halo counts and the (near-)identical-redshift
# counts agree in every bin.
np.all(np.isclose(all_same_halo_bincounts, all_same_z_bincounts))

In [None]:
def getlabel(index, z_bins):
    """Return the legend text for bin `index` of the edge list `z_bins`.

    Bin 0 is labelled "< first edge"; any other bin is labelled
    "previous edge - this edge".
    """
    upper_edge = z_bins[index]
    if index == 0:
        return "< {0}".format(upper_edge)
    return "{0} - {1}".format(z_bins[index-1], upper_edge)

In [None]:
# Plots for nearest-neighbor angular distances and same-halo analysis
# For each neighbour rank (at most the first five) three figures are drawn, each
# with one curve per redshift bin: raw counts, same-halo fraction, similar-z fraction.

for j in range(len(nn_bins)):
    if j >= 5:
        continue
    # 1-based neighbour rank, matching the "NN-<rank>" text printed below.
    nn_rank = j + 1

    plt.figure()
    for i in range(len(z_bins)):
        label = getlabel(i, z_bins)
        # This call must sit inside the loop: outside it only the last z-bin is drawn.
        plt.plot(bins, all_ang_bincounts[i][j], label=label, color=get_color(i))

    plt.title(f"Nearest Neighbor {nn_rank} Ang. Distance Distribution")
    plt.ylabel("Count")
    plt.xlabel("Angular Distance (arcsec)")
    plt.yscale('log')
    plt.xscale('log')
    plt.legend()
    plt.draw()

    plt.figure()
    for i in range(len(z_bins)):
        label = getlabel(i, z_bins)
        plt.plot(bins, all_same_halo_bincounts[i][j]/all_ang_bincounts[i][j], label=label, color=get_color(i))
        print("Total fraction of nearest neighbors in same halo (z {0}, NN-{1}): {2:.3f}".format(label, j+1, np.sum(all_same_halo_bincounts[i][j]) / np.sum(all_ang_bincounts[i][j])))

    plt.title(f"Nearest Neighbor {nn_rank} Same Halo Fraction")
    plt.ylabel("NN Same Halo Fraction")
    plt.xlabel("Angular Distance (arcsec)")
    plt.xscale('log')
    plt.legend()
    plt.draw()

    plt.figure()
    for i in range(len(z_bins)):
        label = getlabel(i, z_bins)
        plt.plot(bins, all_sim_z_bincounts[i][j]/all_ang_bincounts[i][j], label=label, color=get_color(i))

        print("Total fraction of nearest neighbors at sim z (z {0}, NN-{1}): {2:.3f}".format(label, j+1, np.sum(all_sim_z_bincounts[i][j]) / np.sum(all_ang_bincounts[i][j])))

    plt.title(f"Nearest Neighbor {nn_rank} Sim z Fraction")
    plt.ylabel("NN Sim z Fraction")
    plt.xlabel("Angular Distance (arcsec)")
    plt.xscale('log')
    plt.legend()
    plt.draw()

#print("What fraction of the time is the NN >19.5 mag?")

### Color plots of NN Same Halo in z / ang distance / P_obs space

In [None]:
# Same-halo / similar-z counts of the nearest neighbour, split by the target's
# P_obs bin, the neighbour's redshift bin and the angular separation bin.
POBS_bin = np.digitize(prob_obs_cut, POBS_bins)

if LOST_GALAXIES_ONLY: 
    nn_bins = [1]
else:
    nn_bins = [2] # since catalog includes the targets in this case

# Counts start at one (and get +1 below) so that the fractions never divide by zero.
all_ang_bincounts_2 = np.ones((POBS_BIN_COUNT, len(nn_bins), len(z_bins), BIN_COUNT))
all_same_halo_bincounts_2 = np.zeros((POBS_BIN_COUNT, len(nn_bins), len(z_bins), BIN_COUNT))
all_sim_z_bincounts_2 = np.zeros((POBS_BIN_COUNT, len(nn_bins), len(z_bins), BIN_COUNT))

for i in range(len(POBS_bins)):
    # The target selection depends only on the P_obs bin.
    if LOST_GALAXIES_ONLY:
        target_mask = (POBS_bin == i) & fnotassigned
    else:
        target_mask = POBS_bin == i
    to_match = coord.SkyCoord(ra=ra[target_mask]*u.degree, dec=dec[target_mask]*u.degree, frame='icrs')

    for j in range(len(nn_bins)):
        idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=nn_bins[j], storekdtree=treename)
        same_halo = sim_halo_id[target_mask] == sim_halo_id_catalog[idx]
        sim_z = np.isclose(z_obs[target_mask], z_obs_catalog[idx], rtol=0, atol=SIM_Z_THRESH)

        nn_z_bin_ind = np.digitize(z_obs_catalog[idx], z_bins)
        angdist_bin_ind = np.digitize(d2d.to(u.arcsec).value, bins)

        for zb in range(len(z_bins)):
            right_z_bin = nn_z_bin_ind == zb
            # Separations at or beyond the last edge land in an extra overflow bin;
            # [:BIN_COUNT] drops it so the result always fits the preallocated row.
            all_ang_bincounts_2[i][j][zb] = np.bincount(angdist_bin_ind, minlength=BIN_COUNT, weights=right_z_bin.astype(int))[:BIN_COUNT] + 1 # avoids divide by 0, won't hurt statistics
            all_same_halo_bincounts_2[i][j][zb] = np.bincount(angdist_bin_ind, minlength=BIN_COUNT, weights=(same_halo & right_z_bin).astype(int))[:BIN_COUNT]
            all_sim_z_bincounts_2[i][j][zb] = np.bincount(angdist_bin_ind, minlength=BIN_COUNT, weights=(sim_z & right_z_bin).astype(int))[:BIN_COUNT]

In [None]:
# Per-cell fractions; the denominator is never zero because every count got +1.
frac_same = all_same_halo_bincounts_2 / all_ang_bincounts_2
frac_sim_z = all_sim_z_bincounts_2 / all_ang_bincounts_2
# Digitizing against [0, 0.3, 1.0] yields 1 for fractions in [0, 0.3), 2 for
# [0.3, 1.0) and 3 for fractions of 1.0 or more.
success_bins = [0,0.3,1.0]
frac_same_binned = np.digitize(frac_same, bins=success_bins)
#for i in range(len(frac_same_binned)):
#    frac_same_binned[i] = success_bins[frac_same_binned[i-1]]
# 0/1 flags: fraction above one half, and (via the pyutils helper) fraction near 0.5.
frac_same_over50 = (frac_same > 0.5).astype(int)
frac_at50 = close_enough(frac_same, 0.5, threshold=0.05).astype(int)

In [None]:
# One row of four colour maps per neighbour-redshift bin, all on a
# (angular distance, P_obs) grid: thresholded same-halo fraction, same-halo
# fraction, similar-z fraction and raw counts.
# squeeze=False keeps `axes` two-dimensional even when there is only one z-bin.
fig, axes = plt.subplots(nrows=len(z_bins), ncols=4, figsize=(24, 4*len(z_bins)), squeeze=False)

for zb in range(len(z_bins)):
    density = all_ang_bincounts_2[:,0,zb,:] #/ np.sum(all_ang_bincounts_2[:,0,zb,:])
    print(f"Galaxies in this z-bin: {np.sum(density)}")

    z_label = getlabel(zb, z_bins)
    # (values, extra pcolor keywords, title) for the four panels, left to right.
    # NOTE(review): the first title says "Over 40%" but frac_same_binned is
    # thresholded at 0.3 -- confirm which is intended.
    panels = [
        (frac_same_binned[:,0,zb,:], dict(cmap='RdYlGn'),
         f"Nearest Neighbor Same Halo Over 40% (NN z {z_label})"),
        (frac_same[:,0,zb,:], dict(cmap='RdYlGn', norm=c.Normalize(vmin=0, vmax=0.8)),
         f"Nearest Neighbor Same Halo (NN z {z_label})"),
        (frac_sim_z[:,0,zb,:], dict(cmap='RdYlGn', norm=c.Normalize(vmin=0, vmax=0.8)),
         f"Nearest Neighbor Sim z Fraction (NN z {z_label})"),
        (density, dict(cmap='YlGn', norm=c.LogNorm(vmin=10, vmax=5000)),
         f"Counts (NN z {z_label})"),
    ]

    for panel_ax, (values, pcolor_style, title) in zip(axes[zb], panels):
        cplot = panel_ax.pcolor(bins, POBS_bins, values, shading='auto', **pcolor_style)
        fig.colorbar(cplot, ax=panel_ax)
        panel_ax.set_title(title)
        panel_ax.set_ylabel("Lost Galaxy $P_{obs}$")
        panel_ax.set_xlabel("Angular Distance (arcsec) to NN")
        panel_ax.set_xscale('log')

    # Only the thresholded panel is zoomed in on small separations.
    axes[zb][0].set_xlim(3,250)

    # Overlay the pyutils threshold line on the three fraction panels.
    nn_30_line = get_NN_30_line(z_bins[zb]-0.01, POBS_bins)
    for panel_ax in axes[zb][:3]:
        panel_ax.scatter(nn_30_line, POBS_bins)

fig.tight_layout() 


In [None]:
# Cumulative success table for the first neighbour rank: for every angular edge,
# the similar-z fraction and the share of galaxies whose neighbour lies below it.
print("Using z +/- {0} values as a success metric:\n ".format(SIM_Z_THRESH))

# Sum over z-bins, then accumulate over angular bins. Bin i holds separations
# below bins[i], so "up to bins[i]" must include bin i itself. The previous
# slice 0:i stopped one bin short and made the first row 0/0.
ang_cumulative = np.cumsum(np.sum(all_ang_bincounts[:,0,:], axis=0))
sim_z_cumulative = np.cumsum(np.sum(all_sim_z_bincounts[:,0,:], axis=0))

for i in range(BIN_COUNT):
    arcsec = bins[i]
    frac = sim_z_cumulative[i] / ang_cumulative[i]
    frac_assigned = ang_cumulative[i] / ang_cumulative[-1]
    print("  Up to {0:.1f}\": Success frac: {1:.3f}. Assigned frac: {2:.3f}".format(arcsec, frac, frac_assigned))


### Examine velocities between neighbors

In [None]:
# TODO


### Fancy Algorithm

In [None]:
# Number of nearest catalogue neighbours looked up for every galaxy without a fiber.
NUM_NEIGHBORS = 20
# Sky positions of the galaxies that were not assigned a fiber (the ones to be matched).
fancy_to_match = coord.SkyCoord(ra=ra[fnotassigned]*u.degree, dec=dec[fnotassigned]*u.degree, frame='icrs')

In [None]:
# Row n of each array holds, for every galaxy in fancy_to_match, its (n+1)-th
# nearest catalogue neighbour: the catalogue index and the separation in arcsec.
neighbor_indexes = np.zeros(shape=(NUM_NEIGHBORS, len(fancy_to_match)), dtype=np.int32) # indexes point to CATALOG locations
ang_distances = np.zeros(shape=(NUM_NEIGHBORS, len(fancy_to_match)))

# end='\r' lets the "done!" line below overwrite this progress line.
print(f"Finding nearest {NUM_NEIGHBORS} neighbors... ", end='\r')   
for n in range(0, NUM_NEIGHBORS):
    # One match per neighbour rank; the k-d tree is cached under `treename`.
    idx, d2d, d3d = coord.match_coordinates_sky(fancy_to_match, catalog, nthneighbor=n+1, storekdtree=treename)
    neighbor_indexes[n] = idx # TODO is that right?
    ang_distances[n] = d2d.to(u.arcsec).value
print(f"Finding nearest {NUM_NEIGHBORS} neighbors... done!")   

In [None]:
# Let the FancyRedshiftGuesser pick one of the precomputed neighbours for every
# galaxy without a fiber, and tally how often the pick matches the true redshift
# (within SIM_Z_THRESH) and the true halo.
with FancyRedshiftGuesser(NUM_NEIGHBORS, debug=False) as scorer:
    halo_matches = 0
    z_matches = 0

    print(f"Assinging missing redshifts... ")   
    total_to_match = len(fancy_to_match)

    # TODO don't loop?
    # j indexes the fancy_to_match sized arrays, i the master (post-cut) arrays.
    for j, i in enumerate(indexes_not_assigned):

        if j % 10000 == 0:
            print(f"{j}/{total_to_match} complete", end='\r')

        # Candidate neighbours of this galaxy: catalogue indexes, redshifts, separations.
        neighbors = neighbor_indexes[:,j]
        neighbors_z = z_obs_catalog[neighbors]
        neighbors_ang_dist = ang_distances[:,j]
        my_prob_obs = prob_obs_cut[i]
        my_app_mag = app_mag[i]

        winning_num = scorer.choose_winner(neighbors_z, neighbors_ang_dist, my_prob_obs, my_app_mag, z_obs[i])
        winner_index = neighbors[winning_num]

        # Track total correct
        z_chosen = z_obs_catalog[winner_index] 
        halo_chosen = sim_halo_id_catalog[winner_index]
        if np.isclose(z_chosen, z_obs[i], rtol=0, atol=SIM_Z_THRESH):
            z_matches += 1
        if halo_chosen == sim_halo_id[i]:
            halo_matches += 1

    # Number of galaxies processed, as reported by the final progress line.
    j = len(indexes_not_assigned)
    print(f"{j}/{total_to_match} complete")



In [None]:
# Success rates of the guesser over all galaxies without a fiber.
print(f"Halo matches: {halo_matches / len(fancy_to_match)}")
print(f"z matches: {z_matches / len(fancy_to_match)}")

In [None]:
# View results from a run of the FancyRedshiftGuesser. Must put in the right filename (number)
filename = 'bin/redshift_guesser_1691466513.171286.npy'
# Four arrays are read back to back from the one file, in this order.
with open(filename, 'rb') as f:
    quick_nn = np.load(f)
    quick_correct = np.load(f)
    nn_used = np.load(f)
    nn_correct = np.load(f)

# The +1 in each denominator keeps the ratio finite when a counter is zero.
print(f"Quick NN uses: {quick_nn}. Success: {quick_correct / (quick_nn+1)}")
print(f"NN bin uses: {nn_used}. Success: {nn_correct / (nn_used+1)}")

### Galaxy Pairs Angular Separation and Same-Halo Analysis
Continuation of the above.

THIS IS AN N^2 CALCULATION: do not run it on the full sky. Cut the data down to a small sample first.


In [None]:
# THIS IS N^2 CALCULATION do not run on full sky.
# Counts start at one so the same-halo fraction below never divides by zero.
total_bincounts = np.ones((len(z_bins), BIN_COUNT))
total_same_halo_bincounts = np.zeros((len(z_bins), BIN_COUNT))

# Attach units once instead of on every iteration.
ra_deg = ra*u.degree
dec_deg = dec*u.degree

# Examine each galaxy in the sample pair once
for i in range(len(ra)-1):
    ang_distance = coord.angular_separation(ra_deg[i], dec_deg[i], ra_deg[i+1:], dec_deg[i+1:]).to(u.arcsec)

    same_halo = sim_halo_id[i] == sim_halo_id[i+1:]

    angdist_bin_ind = np.digitize(ang_distance.value, bins)
    # minlength pads to BIN_COUNT when no pair reaches the higher bins (without it
    # the result can be shorter, which either fails to add or silently broadcasts);
    # the slice drops the overflow bin for separations beyond the last edge.
    bincounts = np.bincount(angdist_bin_ind, minlength=BIN_COUNT)[:BIN_COUNT]
    same_halo_bincounts = np.bincount(angdist_bin_ind, weights=same_halo.astype(int), minlength=BIN_COUNT)[:BIN_COUNT]

    # Pairs are filed under the redshift bin of galaxy i. A separate name is used
    # so the per-galaxy `z_bin` array from the nearest-neighbour cell is not clobbered.
    pair_z_bin = np.digitize(z_obs[i], z_bins)
    total_bincounts[pair_z_bin] += bincounts
    total_same_halo_bincounts[pair_z_bin] += same_halo_bincounts

fraction_same_halo = total_same_halo_bincounts / total_bincounts

In [None]:
# Plots for galaxy pairs
# Figure 1: pair counts per angular-separation bin, one curve per redshift bin.
plt.figure()
for i in range(len(z_bins)):
    plt.plot(bins, total_bincounts[i], label=getlabel(i, z_bins))
plt.legend()
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Angular Separation (arcsec)')
plt.ylabel('Count of Galaxies Pairs')
plt.title("Galaxy Pair Counts (by ang separation and z)")
plt.draw()

# Figure 2: fraction of those pairs that share a halo.
plt.figure()
for i in range(len(z_bins)):
    plt.plot(bins, fraction_same_halo[i], label=getlabel(i, z_bins))
plt.legend()
plt.xscale('log')
plt.xlabel('Angular Separation (arcsec)')
plt.ylabel('Fraction Pair in Same Halo')
plt.ylim(-0.01, 1.0)
plt.title("Fraction Pair in Same Halo (by ang separation and z)")
plt.draw()

# Post Group Founder Analysis

In [None]:
def process(filename):
    """Load one group-finder output and derive the per-bin summaries used by plots().

    Reads `filename` and its companion file (same path with ".out" replaced by
    "_galprops.dat"), both space-delimited without a header, and joins them row
    by row on their index.

    Parameters
    ----------
    filename : str
        Path of the group-finder ".out" file.

    Returns
    -------
    types.SimpleNamespace
        With attributes filename (base name without directory and the last four
        characters), all_data, centrals, Mhalo_bins, labels, loglcen_means,
        loglcen_scatter, L_gal_bins, L_gal_labels, f_sat and Lgal_counts.
    """
    filename_props = filename.replace(".out", "_galprops.dat")

    df = pd.read_csv(filename, delimiter=' ', names=('RA', 'Dec', 'z', 'L_gal', 'V_max', 'P_sat', 'M_halo', 'N_sat', 'L_tot', 'igrp', 'unknown'))
    galprops = pd.read_csv(filename_props, delimiter=' ', names=('app_mag', 'g_r', 'galaxy_type', 'mxxl_halo_mass', 'fiber_assigned_0', 'assigned_halo_mass', 'z_obs', 'mxxl_halo_id', 'assigned_halo_id'), dtype={'mxxl_halo_id': np.int32, 'assigned_halo_id': np.int32})
    all_data = pd.merge(df, galprops, left_index=True, right_index=True)

    # Drop bad data, should have been cleaned up earlier though!
    orig_count = len(all_data)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not trigger pandas' SettingWithCopyWarning.
    all_data = all_data[all_data.M_halo != 0].copy()
    new_count = len(all_data)
    if (orig_count != new_count):
        print("Dropped {0} bad galaxies".format(orig_count - new_count))

    # A row is flagged as a satellite when its index differs from its igrp value.
    # The original row index survives the filtering above, so the comparison
    # still refers to positions in the input file.
    all_data['is_sat'] = (all_data.index != all_data.igrp).astype(int)
    all_data['is_sat_truth'] = np.logical_or(all_data.galaxy_type == 1, all_data.galaxy_type == 3).astype(int)
    all_data['logLgal'] = np.log10(all_data.L_gal)

    # Bin by halo mass using the shared notebook-level edges.
    all_data['Mh_bin'] = pd.cut(x = all_data['M_halo'], bins = Mhalo_bins, labels = Mhalo_labels, include_lowest = True)

    # observed=False keeps one row per bin, including empty ones, so the results
    # line up with the label arrays regardless of the pandas default.
    centrals = all_data[all_data.index == all_data.igrp]
    centrals_by_mass = centrals.groupby('Mh_bin', observed=False)
    loglcen_means = centrals_by_mass.logLgal.mean()
    loglcen_scatter = centrals_by_mass.logLgal.std()

    # Compute f_sat(Lgal)
    all_data['Lgal_bin'] = pd.cut(x = all_data['L_gal'], bins = L_gal_bins, labels = L_gal_labels, include_lowest = True)

    by_luminosity = all_data.groupby('Lgal_bin', observed=False)
    f_sat = by_luminosity.is_sat.mean()
    Lgal_counts = by_luminosity.RA.count()

    dataset = types.SimpleNamespace()
    dataset.filename = filename[filename.rfind('/')+1 : len(filename)-4]
    dataset.all_data = all_data
    dataset.Mhalo_bins = Mhalo_bins
    dataset.labels = Mhalo_labels
    dataset.centrals = centrals
    dataset.loglcen_means = loglcen_means
    dataset.loglcen_scatter = loglcen_scatter
    dataset.L_gal_bins = L_gal_bins
    dataset.L_gal_labels = L_gal_labels
    dataset.f_sat = f_sat
    dataset.Lgal_counts = Lgal_counts

    return dataset

    

def _plot_lcen_mean(frames):
    """Draw one figure of mean log central luminosity vs. halo mass, with the scatter as error bars."""
    plt.figure(dpi=DPI)
    for f in frames:
        plt.errorbar(f.labels, f.loglcen_means, yerr=f.loglcen_scatter, label=f.name, color=f.color)
    plt.xscale('log')
    plt.xlabel('$M_{halo}$')
    plt.ylabel('$log(L_{cen})$')
    plt.title("Central Luminosity vs. Halo Mass")
    plt.legend()
    plt.draw()


def _plot_lcen_scatter(frames):
    """Draw one figure of the scatter in log central luminosity vs. halo mass."""
    plt.figure(dpi=DPI)
    for f in frames:
        plt.plot(f.labels, f.loglcen_scatter, color=f.color, label=f.name)
    plt.xscale('log')
    plt.xlabel('$M_{halo}$')
    plt.ylabel('$\\sigma(\\log(L_{cen})$')
    plt.title("Central Luminosity Scatter vs. Halo Mass")
    plt.legend()
    plt.draw()


def plots(*frames):
    """Draw the comparison figures for any number of processed datasets.

    Each frame is a namespace as returned by process(), which must additionally
    carry `name`, `color` and `marker` attributes. Frames whose name contains
    '20' are drawn on their own figures for the central-luminosity plots.
    Finally the overall satellite fraction of every frame is printed.
    """
    frames_without_20 = [f for f in frames if '20' not in f.name]
    frames_with_20 = [f for f in frames if '20' in f.name]

    _plot_lcen_mean(frames_without_20)
    if frames_with_20:
        _plot_lcen_mean(frames_with_20)

    _plot_lcen_scatter(frames_without_20)
    if frames_with_20:
        _plot_lcen_scatter(frames_with_20)

    # Every frame is checked for the column. The previous check used the loop
    # variable left over from the last loop, i.e. only the last frame, and was
    # unbound when called without frames.
    if frames and all('N_sat' in f.all_data.columns for f in frames):
        plt.figure(dpi=DPI)
        for f in frames:
            # observed=False: one mean per bin, matching the length of f.labels.
            Nsat_means = f.all_data.groupby('Mh_bin', observed=False).N_sat.mean()
            plt.plot(f.labels, Nsat_means, f.marker, label=f.name, color=f.color)
        plt.loglog()    
        plt.ylabel("$<N_{sat}>$")    
        plt.xlabel('$M_{halo}$')
        plt.title("Mean Number of Satellites by Halo Mass")
        plt.legend()
        plt.draw()

    plt.figure(dpi=DPI)
    for f in frames:
        plt.plot(f.L_gal_labels, f.f_sat, f.marker, label=f.name, color=f.color)
    #truth_f_sat = frames[0].all_data.groupby('Lgal_bin').is_sat_truth.mean()
    #plt.plot(frames[0].L_gal_labels, truth_f_sat, 'k', label="MXXL Truth")
    plt.xscale('log')
    plt.xlabel("$L_{gal}$")
    plt.ylabel("$f_{sat}$")
    plt.title("Satellite fraction vs Galaxy Luminosity")
    plt.legend()
    plt.draw()

    # Zoomed satellite-fraction plot with the satellite counts as bars on a second y axis.
    fig,ax1=plt.subplots()
    fig.set_dpi(DPI)
    for f in frames:
        ax1.plot(f.L_gal_labels, f.f_sat, f.marker, label=f.name, color=f.color)
    #truth_f_sat = frames[0].all_data.groupby('Lgal_bin').is_sat_truth.mean()
    #ax1.plot(frames[0].L_gal_labels, truth_f_sat, 'k', label="MXXL Truth")
    ax1.set_xscale('log')
    ax1.set_xlabel("$L_{gal}$")
    ax1.set_ylabel("$f_{sat}$")
    ax1.set_title("Satellite fraction vs Galaxy Luminosity")
    ax1.legend()
    ax1.set_xlim(3E8,1E11)
    ax1.set_ylim(0.1,0.5)
    ax2 = ax1.twinx()
    for idx, f in enumerate(frames):
        # Each luminosity bin is split into len(frames) side-by-side bars.
        widths = np.diff(f.L_gal_bins) / len(frames)
        sat_counts = f.all_data[f.all_data.is_sat == True].groupby('Lgal_bin', observed=False).size()
        ax2.bar(f.L_gal_labels+(widths*idx), sat_counts, width=widths, color=f.color, alpha=0.5)
    ax2.set_ylabel('$N_{sat}$')
    fig.tight_layout()


    print("TOTAL f_sat: ")
    for f in frames:
        print(f"  {f.name}:  {f.all_data['is_sat'].sum() / f.all_data['is_sat'].count():.3f}")
        if 'is_sat_truth' in f.all_data.columns:
            print(f"  MXXL Truth: {f.all_data['is_sat_truth'].sum() / f.all_data['is_sat_truth'].count():.3f}")

# It gives same result as NFW version! Good
def get_vir_radius_mine(halo_mass):
    """
    Estimate a halo radius from its mass, in kpc.

    The radius is that of a sphere whose mean density equals 200 times the
    z=0 matter density (critical density times Omega_m) of the cosmology
    returned by get_MXXL_cosmology().

    halo_mass must carry mass units (the result is converted to kpc), e.g. an
    astropy Quantity in solar masses. Returns plain floats with units stripped.
    """
    cosmology = get_MXXL_cosmology()
    matter_density = (cosmology.critical_density(0) * cosmology.Om(0))
    enclosed_density = 200*matter_density
    # M = (4/3) * pi * r^3 * rho  =>  r = (3 M / (4 pi rho))^(1/3)
    radius_cubed = (3/(4*math.pi)) * halo_mass / enclosed_density
    radius = np.power(radius_cubed, (1/3))
    return radius.to(u.kpc).value

def post_process(frame):
    """
    Add derived halo-size and distance columns to frame.all_data (in place).

    Columns written:
      mxxl_halo_vir_radius_guess         radius from get_vir_radius_mine, in kpc
      mxxl_halo_vir_radius_guess_arcsec  the same radius as an angle at the row's 'z'
      ldist_true                         z_to_ldist() evaluated at the row's 'z_obs'
    """
    galaxies: pd.DataFrame = frame.all_data

    # Calculate additional halo properties.
    # mxxl_halo_mass is scaled by 1E10 and tagged as solar masses before use.
    halo_masses = galaxies.loc[:, 'mxxl_halo_mass'].to_numpy() * 1E10 * u.solMass
    galaxies.loc[:, 'mxxl_halo_vir_radius_guess'] = get_vir_radius_mine(halo_masses)

    # TODO comoving or proper?
    cosmology = FlatLambdaCDM(H0=73, Om0=0.25, Ob0=0.045, Tcmb0=2.725, Neff=3.04)
    assigned_z = galaxies.loc[:, 'z'].to_numpy()
    arcsec_per_kpc = cosmology.arcsec_per_kpc_proper(assigned_z).to(u.arcsec / u.kpc).value
    galaxies.loc[:, 'mxxl_halo_vir_radius_guess_arcsec'] = galaxies.loc[:, 'mxxl_halo_vir_radius_guess'] * arcsec_per_kpc

    # Luminosity distance to z_obs
    true_z = galaxies.z_obs.to_numpy()
    galaxies.loc[:, 'ldist_true'] = z_to_ldist(true_z)



## Loading existing datasets

In [None]:
# Placeholder handles for every dataset. Each starts with only a display name,
# which doubles as the pickle file name under ROOT_FOLDER; the load / process
# cells below replace a handle with the real dataset.
# NOTE: `all` shadows the Python builtin of the same name for the rest of the notebook.
all = types.SimpleNamespace(name="All <19.5")
all20 = types.SimpleNamespace(name="All <20")
fiberonly = types.SimpleNamespace(name="Fiber Assigned Only <19.5")
fiberonly20 = types.SimpleNamespace(name="Fiber Assigned Only <20")
nn_kd = types.SimpleNamespace(name="Nearest Neighbor <19.5")
nn_kd20 = types.SimpleNamespace(name="Nearest Neighbor <20")
fancy_1 = types.SimpleNamespace(name="Fancy v1 <19.5")
fancy_6 = types.SimpleNamespace(name="Fancy v6 <19.5")
simple_1 = types.SimpleNamespace(name="Simple v1 <19.5")
simple_1_20 = types.SimpleNamespace(name="Simple v1 <20")
simple_2 = types.SimpleNamespace(name="Simple v2 <19.5")
simple_2_20 = types.SimpleNamespace(name="Simple v2 <20")
simple_2_mix = types.SimpleNamespace(name="Simple v2 mix")
simple_3 = types.SimpleNamespace(name="Simple v3 <19.5")
simple_3_20 = types.SimpleNamespace(name="Simple v3 <20")

In [None]:
# Replace a placeholder namespace with the pickled dataset stored under its
# display name in ROOT_FOLDER (written by the "Process New Datasets" cells).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook produced itself.
with open(ROOT_FOLDER + all.name, 'rb') as f:    
    all = pickle.load(f)
# The remaining loads are disabled; uncomment the datasets you need.
#with open(ROOT_FOLDER + all20.name, 'rb') as f:    
#    all20 = pickle.load(f)
#with open(ROOT_FOLDER + fiberonly.name, 'rb') as f:    
#    fiberonly = pickle.load(f)
#with open(ROOT_FOLDER + fiberonly20.name, 'rb') as f:    
#    fiberonly20 = pickle.load(f)
#with open(ROOT_FOLDER + nn_kd.name, 'rb') as f:    
#    nn_kd = pickle.load(f)
#with open(ROOT_FOLDER + nn_kd20.name, 'rb') as f:    
#   nn_kd20 = pickle.load(f)
#with open(ROOT_FOLDER + simple_1.name, 'rb') as f:    
#    simple_1 = pickle.load(f)
#with open(ROOT_FOLDER + simple_1_20.name, 'rb') as f:    
#    simple_1_20 = pickle.load(f)
#with open(ROOT_FOLDER + simple_2.name, 'rb') as f:    
#    simple_2 = pickle.load(f)
#with open(ROOT_FOLDER + simple_2_20.name, 'rb') as f:    
#    simple_2_20 = pickle.load(f)
#with open(ROOT_FOLDER + simple_3.name, 'rb') as f:    
#    simple_3 = pickle.load(f)
#with open(ROOT_FOLDER + simple_3_20.name, 'rb') as f:    
#    simple_3_20 = pickle.load(f)
    

## Process New Datasets

In [None]:
all = process(ROOT_FOLDER + "mxxl_3pass_all.out")
all.name = "All <19.5"
all.color = get_color(0)
all.marker = '-'
post_process(all)
with open(ROOT_FOLDER + all.name, 'wb') as f:
    pickle.dump(all, f)


In [None]:
all20 = process(ROOT_FOLDER + "mxxl_3pass_all20.out")
all20.name = "All <20"
all20.color = get_color(0)
all20.marker = '--'
post_process(all20)
with open(ROOT_FOLDER + all20.name, 'wb') as f:
    pickle.dump(all20, f)
del(all20)

In [None]:
fiberonly = process(ROOT_FOLDER + "mxxl_3pass_fiberonly.out")
fiberonly.name = "Fiber Assigned Only <19.5"
fiberonly.color = get_color(1)
fiberonly.marker = '-'
post_process(fiberonly)
with open(ROOT_FOLDER + fiberonly.name, 'wb') as f:
    pickle.dump(fiberonly, f)


In [None]:
fiberonly20 = process(ROOT_FOLDER + "mxxl_3pass_fiberonly20.out")
fiberonly20.name = "Fiber Assigned Only <20"
fiberonly20.color = get_color(1)
fiberonly20.marker = '--'
post_process(fiberonly20)
with open(ROOT_FOLDER + fiberonly20.name, 'wb') as f:
    pickle.dump(fiberonly20, f)
del(fiberonly20)

In [None]:
nn_kd = process(ROOT_FOLDER + "mxxl_3pass_nn_kd.out")
nn_kd.name = "Nearest Neighbor <19.5"
nn_kd.color = get_color(2)
nn_kd.marker = '-'
post_process(nn_kd)
with open(ROOT_FOLDER + nn_kd.name, 'wb') as f:
    pickle.dump(nn_kd, f)


In [None]:
nn_kd20 = process(ROOT_FOLDER + "mxxl_3pass_nn_kd20.out")
nn_kd20.name = "Nearest Neighbor <20"
nn_kd20.color = get_color(2)
nn_kd20.marker = '--'
post_process(nn_kd20)
with open(ROOT_FOLDER + nn_kd20.name, 'wb') as f:
    pickle.dump(nn_kd20, f)
del(nn_kd20)

In [None]:
fancy_1 = process(ROOT_FOLDER + "mxxl_3pass_fancy_1.out")
post_process(fancy_1)
fancy_1.name = "Fancy v1 <19.5"
fancy_1.color = get_color(3)
fancy_1.marker = '-'
with open(ROOT_FOLDER + fancy_1.name, 'wb') as f:
    pickle.dump(fancy_1, f)
del(fancy_1)

In [None]:
fancy_6 = process(ROOT_FOLDER + "mxxl_3pass_fancy_6.out")
post_process(fancy_6)
fancy_6.name = "Fancy v6 <19.5"
fancy_6.color = get_color(4)
fancy_6.marker = '-'
with open(ROOT_FOLDER + fancy_6.name, 'wb') as f:
    pickle.dump(fancy_6, f)
del(fancy_6)

In [None]:
simple_1 = process(ROOT_FOLDER + "mxxl_3pass_simple_1.out")
post_process(simple_1)
simple_1.name = "Simple v1 <19.5"
simple_1.color = get_color(5)
simple_1.marker = '-'
with open(ROOT_FOLDER + simple_1.name, 'wb') as f:
    pickle.dump(simple_1, f)


In [None]:
simple_1_20 = process(ROOT_FOLDER + "mxxl_3pass_simple_1_20.out")
post_process(simple_1_20)
simple_1_20.name = "Simple v1 <20"
simple_1_20.color = get_color(5)
simple_1_20.marker = '--'
with open(ROOT_FOLDER + simple_1_20.name, 'wb') as f:
    pickle.dump(simple_1_20, f)
del(simple_1_20)

In [None]:
simple_2 = process(ROOT_FOLDER + "mxxl_3pass_simple_2.out")
post_process(simple_2)
simple_2.name = "Simple v2 <19.5"
simple_2.color = get_color(6)
simple_2.marker = '-'
with open(ROOT_FOLDER + simple_2.name, 'wb') as f:
    pickle.dump(simple_2, f)

In [None]:
simple_2_20 = process(ROOT_FOLDER + "mxxl_3pass_simple_2_20.out")
post_process(simple_2_20)
simple_2_20.name = "Simple v2 <20"
simple_2_20.color = get_color(6)
simple_2_20.marker = '--'
with open(ROOT_FOLDER + simple_2_20.name, 'wb') as f:
    pickle.dump(simple_2_20, f)


In [None]:
# OLD ONE
simple_2_mix = process(ROOT_FOLDER + "mxxl_3pass_simple_2_mix.out")
post_process(simple_2_mix)
simple_2_mix.name = "Simple v2 mix"
simple_2_mix.color = get_color(6)
simple_2_mix.marker = '-.'
with open(ROOT_FOLDER + simple_2_mix.name, 'wb') as f:
    pickle.dump(simple_2_mix, f)


In [None]:
# Process the Simple v3 (<19.5) group finder output and cache it as a pickle.
simple_3 = process(ROOT_FOLDER + "mxxl_3pass_simple_3.out")
post_process(simple_3)
simple_3.name = "Simple v3 <19.5"
simple_3.color = get_color(7)
simple_3.marker = '-'
# Save under simple_3's own name. Writing to simple_2.name (as before) clobbered
# the Simple v2 pickle and left no Simple v3 pickle for the load cell to find.
with open(ROOT_FOLDER + simple_3.name, 'wb') as f:
    pickle.dump(simple_3, f)

In [None]:
simple_3_20 = process(ROOT_FOLDER + "mxxl_3pass_simple_3_20.out")
post_process(simple_3_20)
simple_3_20.name = "Simple v3 <20"
simple_3_20.color = get_color(7)
simple_3_20.marker = '--'
with open(ROOT_FOLDER + simple_3_20.name, 'wb') as f:
    pickle.dump(simple_3_20, f)

In [None]:
# TODO figure out
#def save_processed_data(frame):
#    frame.all_data.to_feather(ROOT_FOLDER + frame.name)


## View plots on the data

In [None]:
# Type in whatever datasets you want included in the generated plots
#plots(all, fiberonly, nn_kd, simple_1)
plots(all,fiberonly, nn_kd, simple_2)


### What effect does Fiber Assignment have on group finder properties?

In [None]:
# Halo Masses (in group finder abundance matching)
def group_finder_centrals_halo_masses_plots(all_to_use, comparisons):
    """
    Compare the group-finder halo masses of centrals between datasets.

    Produces one figure with two panels: log10 of each comparison's normalized
    halo-mass histogram relative to `all_to_use` (left), and the normalized
    histograms themselves (right). Then, for each comparison, prints the
    fraction of its centrals whose sky position coincides (to 0.0001 arcsec)
    with a central of `all_to_use`.

    Parameters
    ----------
    all_to_use : dataset namespace
        Reference dataset; provides .all_data (with M_halo, igrp, RA, Dec) and
        .Mhalo_bins, the bin edges used for every dataset.
    comparisons : re-iterable of dataset namespaces
        Each needs .all_data, .name, .color and .marker.
    """
    mhalo_bins = all_to_use.Mhalo_bins
    n_bins = len(mhalo_bins)

    def select_centrals(dataset):
        # Rows whose index equals their group id (igrp) are treated as the centrals.
        return dataset.all_data[dataset.all_data.index == dataset.all_data.igrp]

    def binned_density(centrals):
        bin_ind = np.digitize(centrals.M_halo, mhalo_bins)
        # minlength guarantees one entry per bin edge even when the highest bins
        # are empty; without it the array could be shorter than mhalo_bins and
        # the plot / ratio below would fail on mismatched shapes.
        bincounts = np.bincount(bin_ind, minlength=n_bins)[0:n_bins]
        return bincounts / np.sum(bincounts)

    all_centrals = select_centrals(all_to_use)
    all_density = binned_density(all_centrals)
    comparison_centrals = [select_centrals(comparison) for comparison in comparisons]

    fig,axes=plt.subplots(nrows=1, ncols=2, figsize=(12,4))
    fig.set_dpi(DPI)
    axes[0].set_xscale('log')
    axes[0].set_ylim(-0.2, 0.2)
    axes[0].set_xlim(5E10,2E15)
    axes[0].set_xlabel('$M_{halo}$')
    axes[0].set_ylabel('Normalized log(Comparison / All)')
    axes[0].axline((3E10,0), (3E15,0), linestyle='--', color='k')
    axes[0].set_title("Group Finder Halo Masses of Centrals")

    axes[1].plot(mhalo_bins, all_density, label="All Galaxies")
    axes[1].set_xscale('log')
    axes[1].set_yscale('log')
    axes[1].set_xlim(5E10,2E15)
    axes[1].set_xlabel('$M_{halo}$')
    axes[1].set_ylabel('Density of Galaxies')
    axes[1].set_title("Group Finder Halo Masses of Centrals")

    for comparison, centrals in zip(comparisons, comparison_centrals):
        density = binned_density(centrals)

        axes[0].plot(mhalo_bins, np.log10(density / all_density), linestyle=comparison.marker, color=comparison.color, label=comparison.name)
        axes[1].plot(mhalo_bins, density, linestyle=comparison.marker, color=comparison.color, label=comparison.name)

    axes[0].legend()
    axes[1].legend()

    # Look up each comparison's centrals among the reference centrals.
    # The reference catalog is the same for every comparison, so build it once.
    catalog = coord.SkyCoord(ra=all_centrals.RA.to_numpy()*u.degree, dec=all_centrals.Dec.to_numpy()*u.degree, frame='icrs')
    for comparison, centrals in zip(comparisons, comparison_centrals):
        to_match = coord.SkyCoord(ra=centrals.RA.to_numpy()*u.degree, dec=centrals.Dec.to_numpy()*u.degree, frame='icrs')
        idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=1, storekdtree=False)

        perfect_match = np.isclose(d2d.to(u.arcsec).value, 0, rtol=0.0, atol=0.0001)
        # 0.0001 arcsec precision on matching doesn't hit floating point noise. You get same with 0.001
        print(f"What fraction of centrals in '{comparison.name}' are centrals in 'all'? {np.sum(perfect_match) / len(d2d)}")



In [None]:
import astropy
print(np.__version__)
print(astropy.__version__)

In [None]:
group_finder_centrals_halo_masses_plots(all, [simple_2])

## Compare assigned implied abs mags to truth from MXXL

In [None]:
run_to_check = simple_2

not_assigned = np.invert(run_to_check.all_data.fiber_assigned_0.astype(bool))
app_mags = run_to_check.all_data.app_mag[not_assigned].to_numpy()
my_assigned_abs_mag = app_mag_to_abs_mag(app_mags, run_to_check.all_data.z[not_assigned].to_numpy())
my_raw_abs_mag = app_mag_to_abs_mag(app_mags, run_to_check.all_data.z_obs[not_assigned].to_numpy())

print(len(my_raw_abs_mag), len(my_assigned_abs_mag))

In [None]:
# Compare absolute mags. Using my way of computing for both.
abs_mag_bins = np.linspace(-27, -10, num=50)
plt.figure(dpi=DPI)
x = plt.hist(my_raw_abs_mag, bins=abs_mag_bins, label="Truth", alpha=0.5)
y = plt.hist(my_assigned_abs_mag, bins=abs_mag_bins, label=f"{run_to_check.name} Assigned", alpha=0.5)
plt.xlabel("Absolute Mag")
plt.ylabel("Count")
#plt.yscale('log')
plt.title("Compare Lost Galaxies Abs Mags")
plt.legend()

## Test Purity and Completeness

In [None]:
def build_interior_bin_labels(bin_edges):
    """
    Build one "low - high" label per bin from consecutive bin edges.

    Edges are rendered in scientific notation with two decimals, so N edges
    yield N-1 labels (and fewer than two edges yield an empty list).
    """
    return [
        f"{bin_edges[k]:.2e} - {bin_edges[k+1]:.2e}"
        for k in range(len(bin_edges) - 1)
    ]

def test_purity_and_completeness(*sets):
    """
    Print and store satellite/central purity and completeness for each dataset.

    For every dataset `s` (needs .name and .all_data with is_sat, is_sat_truth
    and Lgal_bin columns) this prints the overall numbers and stores the
    per-luminosity-bin curves on `s`:

      s.keep,  s.purity_g          assigned satellites that are true satellites
      s.keep2, s.completeness_g    true satellites that were assigned satellite
      s.keep3, s.purity_c_g        assigned centrals that are true centrals
      s.keep4, s.completeness_c_g  true centrals that were assigned central

    Each keep* is the np.nonzero() index tuple of the Lgal_bin positions that
    have a non-empty denominator; the matching curve holds one value per kept
    bin. purity_complete_plots() uses keep* to index s.L_gal_bins.
    """

    def counts_by_lgal_bin(rows):
        # observed=False keeps one entry per Lgal_bin category, including empty
        # ones. The arrays are combined and indexed positionally, so every
        # subset must yield the same bins in the same order regardless of the
        # pandas default for `observed`.
        return rows.groupby('Lgal_bin', observed=False).size().to_numpy()

    def binned_fraction(selected, correct):
        # Per-bin len(correct) / len(selected), restricted to populated bins.
        selected_counts = counts_by_lgal_bin(selected)
        populated = np.nonzero(selected_counts)
        fraction = counts_by_lgal_bin(correct)[populated] / selected_counts[populated]
        return populated, fraction

    for s in sets:
        print(s.name)
        data = s.all_data

        assigned_sats = data[data.is_sat == True]
        print(f"Purity of sats: {np.sum(assigned_sats.is_sat_truth) / len(assigned_sats.index):.3f}")

        true_sats = data[data.is_sat_truth == True]
        print(f"Completeness of sats: {np.sum(true_sats.is_sat) / len(true_sats.index):.3f}")

        assigned_centrals = data[data.is_sat == False]
        print(f"Purity of centrals: {1 - (np.sum(assigned_centrals.is_sat_truth) / len(assigned_centrals.index)):.3f}")

        true_centrals = data[data.is_sat_truth == False]
        print(f"Completeness of centrals: {1 - (np.sum(true_centrals.is_sat) / len(true_centrals.index)):.3f}")

        s.keep, s.purity_g = binned_fraction(
            assigned_sats, assigned_sats[assigned_sats.is_sat_truth == True])
        s.keep2, s.completeness_g = binned_fraction(
            true_sats, true_sats[true_sats.is_sat == True])
        s.keep3, s.purity_c_g = binned_fraction(
            assigned_centrals, assigned_centrals[assigned_centrals.is_sat_truth == False])
        s.keep4, s.completeness_c_g = binned_fraction(
            true_centrals, true_centrals[true_centrals.is_sat == False])


def purity_complete_plots(*sets):
    """
    Draw a 2x2 grid of purity / completeness curves versus galaxy luminosity.

    Top row: centrals; bottom row: satellites. Left column: purity; right
    column: completeness. Every dataset must already carry the keep* index
    tuples and *_g curves stored by test_purity_and_completeness(), plus
    L_gal_bins, marker, name and color. Only the top-left panel gets a legend.
    """
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
    fig.set_dpi(DPI/2)

    # (row, column, title, attribute holding the bin indices, attribute holding the curve)
    panels = (
        (1, 0, 'Satellite Purity', 'keep', 'purity_g'),
        (1, 1, 'Satellite Completeness', 'keep2', 'completeness_g'),
        (0, 0, 'Central Purity', 'keep3', 'purity_c_g'),
        (0, 1, 'Central Completeness', 'keep4', 'completeness_c_g'),
    )

    for row, col, title, _, _ in panels:
        panel = axes[row][col]
        panel.set_title(title)
        panel.set_xscale('log')
        panel.set_xlabel('$L_{gal}$')
        panel.set_xlim(2E8,1E11)
        panel.set_ylim(0.4,1.0)

    for s in sets:
        for row, col, _, keep_attr, curve_attr in panels:
            luminosities = s.L_gal_bins[getattr(s, keep_attr)]
            axes[row][col].plot(luminosities, getattr(s, curve_attr), s.marker, label=f"{s.name}", color=s.color)

    axes[0][0].legend()
    fig.tight_layout()


In [None]:
test_purity_and_completeness(all, simple_2)

In [None]:
purity_complete_plots(all, simple_2)

## Find fraction of time the NN is in the same halo, similar z, etc

There is another version of this directly on the MXXL data above.

In [None]:
def resulting_halo_analysis(*sets):
    """
    Print how well lost (fiber_assigned_0 == 0) galaxies were assigned.

    For each dataset, reports the number of lost galaxies, then, after dropping
    rows whose assigned_halo_id is 0, the fraction whose assigned halo id
    matches mxxl_halo_id, whose assigned halo mass matches mxxl_halo_mass to a
    relative 1e-3, and whose assigned z lies within several absolute
    tolerances of z_obs.
    """
    z_tolerances = (0.01, 0.005, 0.003, 0.001)

    for data in sets:

        print(data.name)

        #same_halo_mass = np.isclose(data.all_data['assigned_halo_mass'], data.all_data['mxxl_halo_mass'], atol=0.0, rtol=1e-03)
        #same_mxxl_halo = data.all_data['assigned_halo_mass']
        #data.all_data['same_mxxl_halo'] = same_mxxl_halo

        frame = data.all_data
        lost = frame[frame.fiber_assigned_0 == 0]
        print(len(lost), "lost galaxies")

        # TODO understand this MXXL quirk
        lost = lost[lost['assigned_halo_id'] != 0]
        print(len(lost), "lost galaxies after removing ones with no MXXL halo ID (no idea why)")

        same_halo_id = np.equal(lost['assigned_halo_id'], lost['mxxl_halo_id'])
        same_halo_id_fraction = np.sum(same_halo_id) / len(same_halo_id)
        print("Fraction of time assigned halo ID is the same as the galaxy's actual halo ID: {0:.3f}".format(same_halo_id_fraction))

        same_halo_mass = np.isclose(lost['assigned_halo_mass'], lost['mxxl_halo_mass'], atol=0.0, rtol=1e-03)
        same_halo_mass_fraction = np.sum(same_halo_mass) / len(same_halo_mass)
        print("Fraction of time assigned halo mass is 'the same' as the galaxy's actual halo mass: {0:.3f}".format(same_halo_mass_fraction))

        # Same check at progressively tighter absolute redshift tolerances.
        for z_thresh in z_tolerances:
            similar_z = np.isclose(lost['z'], lost['z_obs'], atol=z_thresh, rtol=0.0)
            print("Fraction of time assigned z is the target z +/- {0:.3f}:".format(z_thresh), np.sum(similar_z) / len(similar_z))

        # TODO as a function of redshift. But we essentially already have this from the direct MXXL data plots

        #z_bins = np.linspace(min(data.all_data.z), max(data.all_data.z), 20)
        #z_labels = z_bins[0:len(z_bins)-1] 
        #data.all_data['z_bin'] = pd.cut(x = data.all_data['z'], bins = z_bins, labels = z_labels, include_lowest = True)

        #groupby_z = lost_galaxies.groupby('z_bin')['same_halo_mass'].sum() / lost_galaxies.groupby('z_bin')['same_halo_mass'].count()

        #plt.plot(z_labels, groupby_z)
        #plt.xlabel('$z_{eff}$ (effective/assigned redshift)')
        #plt.ylabel('Fraction Assigned Halo = True Host Halo')


In [None]:
resulting_halo_analysis(simple_2)

## Galaxy Neighborhood Examiner

In [None]:
data = fancy_1.all_data


In [None]:
lost_galaxies = data.loc[data['fiber_assigned_0'] == 0]
#lost_galaxies_2 = nn.all_data.loc[nn.all_data['fiber_assigned_0'] == 0]
obs_galaxies = data.loc[data['fiber_assigned_0'] == 1]
#obs_galaxies_2 = nn.all_data.loc[nn.all_data['fiber_assigned_0'] == 1]
print("Lost galaxies: ", len(lost_galaxies), "Observed Galaxies: ", len(obs_galaxies))

# TODO could use angular size / redshift relation as part of this :-)
def getsize(z):
    """
    Return the scatter-marker size to use for a galaxy at redshift z.

    Lower-redshift galaxies get larger markers. The size is taken from the
    first step whose upper redshift bound exceeds z; anything at z >= 0.6
    (or NaN) gets the smallest marker.
    """
    # (upper redshift bound, marker size). The original chain tested
    # `z < 0.2` twice, which made the size-75 step unreachable; the 120 step
    # now ends at 0.15 so every size in the progression is used.
    size_steps = (
        (0.05, 300),
        (0.1, 200),
        (0.15, 120),
        (0.2, 75),
        (0.3, 45),
        (0.4, 25),
        (0.5, 15),
        (0.6, 8),
    )
    for upper_z, size in size_steps:
        if z < upper_z:
            return size
    return 3

nearby_angle = coord.Angle('5m')

def neighbor_exam(target):
    """
    Count observed galaxies near `target` that sit at a similar true distance.

    Neighbors are the rows of the module-level `obs_galaxies` inside an RA/Dec
    box of half-width `nearby_angle` around the target. A neighbor is "close"
    when its ldist_true is within 20 (same units as ldist_true) of the
    target's z_to_ldist(z_obs).

    Returns (n_close, n_nearby, n_close / n_nearby). When the box is empty the
    result is (0, 0, nan).
    """
    target_dist_true = z_to_ldist(target.z_obs)

    # These locals are referenced by name (@...) in the query string below.
    ra_max = (coord.Angle(target.RA*u.degree) + nearby_angle).value
    ra_min = (coord.Angle(target.RA*u.degree) - nearby_angle).value
    dec_max = (coord.Angle(target.Dec*u.degree) + nearby_angle).value
    dec_min = (coord.Angle(target.Dec*u.degree) - nearby_angle).value

    nearby = obs_galaxies.query('RA < @ra_max and RA > @ra_min and Dec < @dec_max and Dec > @dec_min')

    if len(nearby) == 0:
        # No observed neighbors: the fraction is undefined. Return nan explicitly
        # instead of relying on a 0/0 division (which also emitted a RuntimeWarning).
        return (0, 0, np.nan)

    close_neighbors = np.isclose(nearby.ldist_true.to_numpy(), target_dist_true, rtol=0.0, atol=20)
    n_close = np.sum(close_neighbors)

    return (n_close, len(nearby), n_close/len(nearby))
    


In [None]:
catalog = coord.SkyCoord(ra=data.RA.to_numpy()*u.degree, dec=data.Dec.to_numpy()*u.degree, frame='icrs')

# This is too slow when called 1 at a time, not using. 
# TODO Could be faster when batched for the whole sample?
def neighbors_within(max_angle: coord.Angle, to_match: coord.SkyCoord, catalog: coord.SkyCoord, treekey: str):
    """
    Collect the catalog neighbors of `to_match` closer than `max_angle` on the sky.

    Walks the 1st, 2nd, ... nearest catalog entries (via
    coord.match_coordinates_sky with increasing nthneighbor) and stops at the
    first one that is not inside `max_angle`. At most 99 neighbors are examined.

    Parameters
    ----------
    max_angle : angular radius to search within.
    to_match : the single position to search around (the loop compares its
        separation as one value, so an array of positions is not supported).
    catalog : positions to search in.
    treekey : passed to match_coordinates_sky as `storekdtree`.

    Returns
    -------
    (neighbor_ind, neighbor_dist) : parallel lists of catalog indices and
        angular separations, nearest first, all strictly inside `max_angle`.
    """
    neighbor_ind = []
    neighbor_dist = []

    # cap at 100 for now, TODO remove when safe
    for nth in range(1, 100):
        idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=nth, storekdtree=treekey)
        # Stop BEFORE recording a neighbor that lies outside the radius;
        # previously the first too-distant neighbor was appended as well.
        if not d2d < max_angle:
            break
        neighbor_ind.append(idx)
        neighbor_dist.append(d2d)

    return neighbor_ind, neighbor_dist

def examine_around(target):
    """
    Plot the sky neighborhood of one galaxy, color-coded by redshift agreement.

    `target` is one row of the module-level `data` frame. Every other galaxy of
    `data` inside an RA/Dec box of half-width `nearby_angle` around it is drawn:

      * filled markers = observed (fiber_assigned_0 == 1), 'x' = unobserved
      * get_color(2)   = redshift passes close_enough(target.z_obs, z)
      * get_color(0)   = any other redshift
      * faded markers  = app_mag > 19.5 ("dim")
      * translucent discs on observed galaxies = mxxl_halo_vir_radius_guess_arcsec

    Marker sizes come from getsize(z) and every galaxy is labeled with its z.
    The first observed neighbor whose z equals the target's assigned z exactly
    is ringed as "Assigned". Nothing is plotted (a message is printed instead)
    unless the box holds more than one neighbor.
    """

    def split_bright_dim(galaxies):
        # Dim means app_mag > 19.5; everything else stays in the bright set.
        is_dim = galaxies.app_mag > 19.5
        return galaxies.loc[np.invert(is_dim)], galaxies.loc[is_dim]

    target_observed = target.fiber_assigned_0
    #target = data.loc[index]

    target_pos = coord.SkyCoord(ra=target.RA*u.degree, dec=target.Dec*u.degree, frame='icrs')

    # These locals are referenced by name (@...) in the query strings below.
    z_eff = target.z
    #target_dist_true = z_to_ldist(target.z_obs)

    ra_max = (coord.Angle(target.RA*u.degree) + nearby_angle).value
    ra_min = (coord.Angle(target.RA*u.degree) - nearby_angle).value
    dec_max = (coord.Angle(target.Dec*u.degree) + nearby_angle).value
    dec_min = (coord.Angle(target.Dec*u.degree) - nearby_angle).value

    # TODO replace with a non-angular search so all redshifts are treated equally
    #indexes, angular_distances = neighbors_within(nearby_angle, target_pos, catalog, 'treekey_nnkd')
    #nearby = data.iloc[indexes]
    nearby = data.query('RA < @ra_max and RA > @ra_min and Dec < @dec_max and Dec > @dec_min')
    nearby = nearby.drop(target.name) # drop the target itself from this df

    nearby_obs = nearby.loc[nearby['fiber_assigned_0'] == 1]
    nearby_unobs = nearby.loc[nearby['fiber_assigned_0'] == 0]

    # z_match ends up either an empty DataFrame (no match) or a single row
    # (Series); len() > 0 distinguishes the two further down.
    z_match = nearby_obs.query('z == @z_eff')
    #assert len(z_match) == 1, len(z_match) # TODO need a better way to verify which row is the one that we assigned the z from
    if len(z_match) > 0:
        z_match = z_match.iloc[0]
    #nearby_obs = nearby_obs.drop(z_match.name)

    # Observed neighbors: good-z vs other, each split into bright and dim.
    good_obs_z_filter = list(map(lambda a: close_enough(target.z_obs, a), nearby_obs.z))
    nearby_obs_good_z, nearby_obs_good_z_dim = split_bright_dim(nearby_obs.loc[good_obs_z_filter])

    if len(good_obs_z_filter) > 0:
        nearby_obs_other = nearby_obs.loc[np.invert(good_obs_z_filter)]
    else:
        nearby_obs_other = nearby_obs
    nearby_obs_other, nearby_obs_other_dim = split_bright_dim(nearby_obs_other)

    # Unobserved neighbors: same four-way split.
    good_unobs_z_filter = list(map(lambda a: close_enough(target.z_obs, a), nearby_unobs.z))
    nearby_unobs_good_z = nearby_unobs.loc[good_unobs_z_filter]

    if good_unobs_z_filter:
        nearby_unobs_other = nearby_unobs.loc[np.invert(good_unobs_z_filter)]
    else:
        nearby_unobs_other = nearby_unobs_good_z # empty df
    nearby_unobs_other, nearby_unobs_other_dim = split_bright_dim(nearby_unobs_other)
    nearby_unobs_good_z, nearby_unobs_good_z_dim = split_bright_dim(nearby_unobs_good_z)

    if target_observed:
        title = "Observed Galaxy {0}: z_true={1:.3f}, z_NN={2:.3f}".format(target.name, target.z_obs, target.z)
    else:
        title = "Lost Galaxy {0}: z_true={1:.3f}, z_NN={2:.3f}".format(target.name, target.z_obs, target.z)

    if len(nearby) > 1:

        fig,ax = plt.subplots(1)
        fig.set_size_inches(10,10)
        ax.set_aspect('equal')

        # Add virial radii or MXXL Halos to the observed galaxies
        for k in range(len(nearby_obs)):
            current = nearby_obs.iloc[k]
            radius = current.mxxl_halo_vir_radius_guess_arcsec / 3600 # arcsec to degrees, like the plot
            circ = Circle((current.RA,current.Dec), radius, color=get_color(0), alpha=0.10)
            ax.add_patch(circ)

        textsize = 9
        dimalpha = 0.4

        # Dim subsets use the same color as their bright counterpart, only faded.
        # (The observed dim subsets previously had the two colors swapped.)
        plt.scatter(nearby_obs_other.RA, nearby_obs_other.Dec, s=list(map(getsize, nearby_obs_other.z)), color=get_color(0), label="Obs ({0})".format(len(nearby_obs_other)))
        if len(nearby_obs_other_dim) > 0:
            plt.scatter(nearby_obs_other_dim.RA, nearby_obs_other_dim.Dec, s=list(map(getsize, nearby_obs_other_dim.z)), color=get_color(0), alpha=dimalpha, label="Obs dim ({0})".format(len(nearby_obs_other_dim)))

        plt.scatter(nearby_obs_good_z.RA, nearby_obs_good_z.Dec, s=list(map(getsize, nearby_obs_good_z.z)), color=get_color(2), label="Obs good z ({0})".format(len(nearby_obs_good_z)))
        if len(nearby_obs_good_z_dim) > 0:
            plt.scatter(nearby_obs_good_z_dim.RA, nearby_obs_good_z_dim.Dec, s=list(map(getsize, nearby_obs_good_z_dim.z)), color=get_color(2), alpha=dimalpha, label="Obs good z dim ({0})".format(len(nearby_obs_good_z_dim)))

        plt.scatter(nearby_unobs_other.RA, nearby_unobs_other.Dec, marker='x', s=list(map(getsize, nearby_unobs_other.z)), color=get_color(0), label="Unobs ({0})".format(len(nearby_unobs_other)))
        if len(nearby_unobs_other_dim) > 0:
            plt.scatter(nearby_unobs_other_dim.RA, nearby_unobs_other_dim.Dec, marker='x', s=list(map(getsize, nearby_unobs_other_dim.z)), color=get_color(0), alpha=dimalpha, label="Unobs dim ({0})".format(len(nearby_unobs_other_dim)))

        plt.scatter(nearby_unobs_good_z.RA, nearby_unobs_good_z.Dec, marker='x', s=list(map(getsize, nearby_unobs_good_z.z)), color=get_color(2), label="Unobs good z ({0})".format(len(nearby_unobs_good_z)))
        if len(nearby_unobs_good_z_dim) > 0:
            plt.scatter(nearby_unobs_good_z_dim.RA, nearby_unobs_good_z_dim.Dec, marker='x', s=list(map(getsize, nearby_unobs_good_z_dim.z)), color=get_color(2), alpha=dimalpha, label="Unobs good z dim ({0})".format(len(nearby_unobs_good_z_dim)))

        # redshift data labels
        for k in range(len(nearby_obs)):
            plt.text(nearby_obs.iloc[k].RA, nearby_obs.iloc[k].Dec, "{0:.3f}".format(nearby_obs.iloc[k].z), size=textsize)
        for k in range(len(nearby_unobs)):
            plt.text(nearby_unobs.iloc[k].RA, nearby_unobs.iloc[k].Dec, "{0:.3f}".format(nearby_unobs.iloc[k].z), size=textsize)

        # Circle assigned one
        if len(z_match) > 0:
            plt.scatter(z_match.RA, z_match.Dec, color=get_color(3), facecolors='none', s=getsize(z_match.z)*2, label="Assigned")
            plt.text(z_match.RA, z_match.Dec, "{0:.3f}".format(z_match.z), size=textsize)

        # Target galaxy
        if target_observed:
            plt.scatter(target.RA, target.Dec, s=getsize(target.z_obs), color=get_color(1), label="Target")
        else:
            plt.scatter(target.RA, target.Dec, s=getsize(target.z_obs), marker='X', color=get_color(1), label="Target")
        plt.text(target.RA, target.Dec, "{0:.3f}".format(target.z_obs), size=textsize)

        plt.xlim(ra_min, ra_max)
        plt.ylim(dec_min, dec_max)
        plt.xlabel('RA')
        # Dec is the vertical axis; this used to call xlabel a second time,
        # which replaced 'RA' and left the y axis unlabeled.
        plt.ylabel('Dec')
        plt.legend()
        plt.title(title)
        plt.draw()

    else:
        print("Skipping empty plot for {0}".format(title))

In [None]:
# Draw neighborhood plots for a random handful of galaxies from GALAXY_POOL.
PLOTS_TO_MAKE = 10
GALAXY_POOL = lost_galaxies

#START_INDEX = 777
#for i in range(START_INDEX, START_INDEX + PLOTS_TO_MAKE):
#    index = lost_galaxies.index[i]
#    examine_around(index)
print("Number of galaxies to choose from: ", len(GALAXY_POOL))
# np.random.randint excludes its upper bound, so pass len(GALAXY_POOL) itself;
# using len - 1 meant the last galaxy in the pool could never be drawn.
indexes = np.random.randint(0, len(GALAXY_POOL), size=PLOTS_TO_MAKE)
for i in indexes:
    target = GALAXY_POOL.iloc[i]
    examine_around(target)

## Idea: analyze entire neighborhood and look for groups of similar z galaxies, choose a z from the biggest group

In [None]:
MAX = 300
close = np.empty(MAX)
total = np.empty(MAX)
frac = np.empty(MAX)
for i in range(0,MAX):
    target = lost_galaxies.iloc[i]
    close[i], total[i], frac[i] = neighbor_exam(target)

In [None]:
max_finished = 81408
finished_close = close[0:max_finished]
finished_total = total[0:max_finished]
with open(ROOT_FOLDER + 'mxxl_lostgal_neighborhood_close.npy', 'wb') as f:
    np.save(f, finished_close)
with open(ROOT_FOLDER + 'mxxl_lostgal_neighborhood_total.npy', 'wb') as f:
    np.save(f, finished_total)

In [None]:
with open(ROOT_FOLDER + 'mxxl_lostgal_neighborhood_close.npy', 'rb') as f:
    close = np.load(f)

with open(ROOT_FOLDER + 'mxxl_lostgal_neighborhood_total.npy', 'rb') as f:
    total = np.load(f)

frac = close / total

In [None]:
bins = np.linspace(0,30,31)
trash = plt.hist(close, bins=bins)
plt.title("Lost Galaxies Neighbors at ~Correct z")
plt.xlabel("Count of Similar z Neighbors")
plt.ylabel("Count of Lost Galaxies")
print("Hopeless Fraction: ", np.sum(close==0) / len(close))
print("Essentially Hopeless Fraction: ", (np.sum(close==0) + np.sum(close==1)) / len(close))

In [None]:
viable = close > 1
frac[viable]
trash=plt.hist(frac[viable], bins=30)
plt.title("Viable Lost Galaxies: Correct z Neighbor Fraction")
plt.xlabel("Fraction with Similar z")
plt.ylabel("Count of Viable Lost Galaxies")

# UCHUU


### Experiments

In [None]:
#[('R_MAG_APP', '>f4'), ('R_MAG_ABS', '>f4'), ('G_R_REST', '>f4'), ('G_R_OBS', '>f4'), ('DEC', '>f8'), ('HALO_MASS', '>f4'), ('CEN', '>i4'), ('RES', '>i4'), ('RA', '>f8'), ('Z_COSMO', '>f4'), ('Z', '>f4'), ('STATUS', '>i4'), ('FIRST_ACC_SCALE', '>f4'), ('M_ACC', '>f4'), ('M_VIR_ALL', '>f4'), ('R_VIR', '>f4'), ('V_PEAK', '>f4'), ('R_S', '>f4'), ('V_RMS', '>f4'), ('NGC', '>f4'), ('SGC', '>f4'), ('HALO_ID', '>i8'), ('PID', '>i8')]))

filename='/export/sirocco2/tinker/DESI/UCHUU_MOCKS/BGS_LC_Uchuu.fits'
u_table = Table.read(filename, format='fits')

In [None]:
APP_MAG_CUT = 19.5

In [None]:
# Pull the columns of interest out of the UCHUU table.
# NOTE: `ra` and `dec` reuse names that earlier MXXL cells also assign.
dec = u_table['DEC']
ra = u_table['RA']
z_obs = u_table['Z']
app_mag = u_table['R_MAG_APP']
g_r = u_table['G_R_REST'] # TODO before using ensure it should be rest and not observed
central = u_table['CEN']
uchuu_halo_mass = u_table['HALO_MASS']
uchuu_halo_id = u_table['HALO_ID']

# Keep only rows brighter than APP_MAG_CUT that also have a positive redshift.
bright_filter = app_mag < APP_MAG_CUT 
redshift_filter = z_obs > 0 
keep = np.all([bright_filter, redshift_filter], axis=0)

# Apply the same row mask to every column so they stay aligned.
dec = dec[keep]
ra = ra[keep]
z_obs = z_obs[keep]
app_mag = app_mag[keep]
g_r = g_r[keep]
central = central[keep]
uchuu_halo_mass = uchuu_halo_mass[keep]
uchuu_halo_id = uchuu_halo_id[keep]


### Reading UCHUU truth data

In [None]:
# From the FITS file:
#[('R_MAG_APP', '>f4'), ('R_MAG_ABS', '>f4'), ('G_R_REST', '>f4'), ('G_R_OBS', '>f4'), ('DEC', '>f8'), ('HALO_MASS', '>f4'), ('CEN', '>i4'), ('RES', '>i4'), ('RA', '>f8'), ('Z_COSMO', '>f4'), ('Z', '>f4'), ('STATUS', '>i4'), ('FIRST_ACC_SCALE', '>f4'), ('M_ACC', '>f4'), ('M_VIR_ALL', '>f4'), ('R_VIR', '>f4'), ('V_PEAK', '>f4'), ('R_S', '>f4'), ('V_RMS', '>f4'), ('NGC', '>f4'), ('SGC', '>f4'), ('HALO_ID', '>i8'), ('PID', '>i8')]))

def read_uchuu(filename):
    """
    Read a UCHUU FITS catalog into a dataset namespace.

    Returns a SimpleNamespace with:
      filename  the path that was read
      all_data  the table converted to a pandas DataFrame
    """
    table = Table.read(filename, format='fits')
    return types.SimpleNamespace(filename=filename, all_data=table.to_pandas())

def process_uchuu(uchuu, app_mag_cut=19.5):
    """
    Processes the uchuu data so it is a dataframe similar to the ones we build from the group finder outputs.

    Then we can generate the same plots with it to compare.

    The dataset is modified in place and nothing is returned:
      - uchuu.all_data is replaced by the filtered, renamed and augmented DataFrame
        (adds is_sat, logLgal, L_gal, M_halo, Mh_bin and Lgal_bin columns).
      - Binned summaries are attached as attributes: Mhalo_bins, labels, centrals,
        loglcen_means, loglcen_scatter, L_gal_bins, L_gal_labels, f_sat, Lgal_counts.

    The function may be called again on an already-processed dataset (for example with a
    tighter app_mag_cut); the summaries are then recomputed from the current rows.

    Relies on the notebook-level Mhalo_bins / Mhalo_labels / L_gal_bins / L_gal_labels and on
    abs_mag_r_to_log_solar_L (presumably provided by the pyutils star import - confirm).

    Parameters:
      uchuu       - dataset namespace with an all_data DataFrame, as produced by read_uchuu
      app_mag_cut - faintest apparent magnitude to keep (inclusive); defaults to 19.5
    """
    df = uchuu.all_data
    print(len(df))

    # The magnitude column is R_MAG_APP in the raw table and app_mag once a previous pass has renamed it
    mag_col = 'R_MAG_APP' if 'R_MAG_APP' in df.columns else 'app_mag'

    # remove all rows that are dimmer than our threshold. Drop so we don't waste memory:
    # reassigning right away releases the dataset's reference to the unfiltered frame
    df = df[df[mag_col] <= app_mag_cut].reset_index(drop=True)
    uchuu.all_data = df

    # Remove columns we don't care about (only the ones that are actually present)
    unused = [col for col in ('NGC', 'SGC', 'PID', 'RES', 'V_RMS', 'R_S', 'G_R_REST') if col in df.columns]
    if unused:
        df = df.drop(columns=unused)

    # Drop bad data
    df = df[df.HALO_MASS >= 0].reset_index(drop=True)
    uchuu.all_data = df

    print(len(df))

    # For total parity with other ones, missing P_sat, N_sat, L_tot, g_R (and MXXL props of course)
    # rename ignores keys that are absent, so this is a no-op on an already-processed frame
    df = df.rename(columns={"DEC": "Dec", "Z": "z", "V_PEAK": "V_max", "R_MAG_APP": "app_mag"})
    df['is_sat'] = (df.CEN == 0).astype(int)

    df['logLgal'] = abs_mag_r_to_log_solar_L(df.R_MAG_ABS)
    df['L_gal'] = np.power(10, df.logLgal)
    df['M_halo'] = df.HALO_MASS * 10**10 # TODO check this
    df['Mh_bin'] = pd.cut(x = df['M_halo'], bins = Mhalo_bins, labels = Mhalo_labels, include_lowest = True)

    # observed=False keeps one entry per bin (empty bins included) so the results stay aligned with the bin labels
    centrals = df[df.CEN == 1]
    loglcen_by_bin = centrals.groupby('Mh_bin', observed=False).logLgal
    loglcen_means = loglcen_by_bin.mean()
    loglcen_scatter = loglcen_by_bin.std()

    # Compute f_sat(Lgal)
    df['Lgal_bin'] = pd.cut(x = df['L_gal'], bins = L_gal_bins, labels = L_gal_labels, include_lowest = True)
    by_lgal = df.groupby('Lgal_bin', observed=False)
    f_sat = by_lgal.is_sat.mean()
    Lgal_counts = by_lgal.RA.count()

    uchuu.all_data = df
    uchuu.Mhalo_bins = Mhalo_bins
    uchuu.labels = Mhalo_labels
    uchuu.centrals = centrals
    uchuu.loglcen_means = loglcen_means
    uchuu.loglcen_scatter = loglcen_scatter
    uchuu.L_gal_bins = L_gal_bins
    uchuu.L_gal_labels = L_gal_labels
    uchuu.f_sat = f_sat
    uchuu.Lgal_counts = Lgal_counts



In [None]:
# Load the full Uchuu catalogue into a dataset namespace (uchuu.filename, uchuu.all_data).
# This is the same FITS file the Experiments section reads into u_table.
uchuu = read_uchuu('/export/sirocco2/tinker/DESI/UCHUU_MOCKS/BGS_LC_Uchuu.fits')

In [None]:
# Filters and augments uchuu.all_data in place and attaches the binned summaries to uchuu; returns nothing.
# Re-run the read cell above first if you want to start again from the raw table.
process_uchuu(uchuu)

In [None]:
# Label, colour and marker for this dataset (presumably consumed by the plotting helpers - confirm)
uchuu.name = "Uchuu <19.5"
uchuu.color = get_color(8)
uchuu.marker = '-'
#post_process(uchuu)
# Pickle the processed dataset into ROOT_FOLDER, using the dataset name as the file name.
# NOTE(review): the name contains a space and '<', which some platforms (e.g. Windows) reject in file names.
with open(ROOT_FOLDER + uchuu.name, 'wb') as f:
    pickle.dump(uchuu, f)

In [None]:
# Headline counts for the processed sample: centrals are the CEN == 1 rows, is_sat flags the CEN == 0 rows
print("UCHUU Statistics for < 19.5 r-mag sample")
print(f"Number of centrals  : {uchuu.centrals.shape[0]}")
print(f"Number of satellites: {uchuu.all_data['is_sat'].sum()}")

In [None]:
# Compare the Uchuu truth dataset against two other datasets using the shared plotting helper.
# NOTE(review): `all` and `simple_2` are not defined in this section; presumably `all` is a dataset from an
# earlier cell that shadows the builtin all() - confirm it exists before running this on a fresh kernel.
plots(all,simple_2, uchuu)

In [None]:
#UCHUU Statistics for < 19.5 r-mag sample
#Number of centrals  : 25329452
#Number of satellites: 6946979