In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as c
from scipy import special
import h5py
from astropy.wcs import WCS
import astropy.coordinates as coord
import astropy.units as u
from pyutils import *
import types
import numpy.ma as ma
import sys
from random import randint
from matplotlib.patches import Circle
from astropy.modeling import models
import pyarrow.feather as feather

# Directory prefix for the HDF5 catalog and the group finder output files read below.
# NOTE(review): this is a machine-specific absolute path; pick one of the
# commented alternatives (or your own location) when running elsewhere.
ROOT_FOLDER = "/Volumes/Seagate Backup Plus Drive/galaxy-groups-data/"
#ROOT_FOLDER = "/mnt/f/galaxy-groups-data/"
#ROOT_FOLDER = "bin/"


## Basic read-in of HDF5 data from MXXL


In [None]:
# Number of leading catalog rows read by the exploratory cells below.
# The commented numbers are alternative cut values kept for reference.
DATA_CUT_INDEX = 300000 #21201544 #3000000 

In [None]:
# Open the catalog read-only and list its top-level entries, followed by the
# members of the 'Data' and 'Weight' groups.
weights = h5py.File(ROOT_FOLDER + 'weights_3pass.hdf5', 'r')
print(list(weights))
print(list(weights['Data']))
print(list(weights['Weight']))


In [None]:
# Pick one fiber assignment realization: bit BIT_CHOICE of the first bitweight word.
BIT_CHOICE = 0
SELECTOR = 2**BIT_CHOICE

# choose 1 of the 2048 fiber assignment realizations with this bitstring
_first_word = weights['Weight/bitweight0'][0:DATA_CUT_INDEX]
assigned = np.array(_first_word & SELECTOR).astype(bool)

print(np.sum(assigned), "galaxies were assigned a fiber")
print(np.sum(~assigned), "galaxies were NOT assigned a fiber")

In [None]:
# Common PLT helpers
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

def get_color(i):
    """Return entry ``i`` of the matplotlib color cycle, wrapping around when ``i`` exceeds its length."""
    return colors[i % len(colors)]

# Test of nearest neighbors implementations


In [None]:

# NOTE: `input` shadows the Python builtin; the name is kept because other cells read it.
input = weights
_head = slice(0, DATA_CUT_INDEX)

# Read the cut quantities first so the selection mask can be built once.
app_mag = input['Data/app_mag'][_head]
z_obs = input['Data/z_obs'][_head]

bright_filter = app_mag < 19.5 # True where the galaxy is brighter than the magnitude limit
redshift_filter = z_obs > 0 # True where the observed redshift is positive
keep = bright_filter & redshift_filter

# Apply the same mask to every per-galaxy column (app_mag itself is left unfiltered, as before).
dec = input['Data/dec'][_head][keep]
ra = input['Data/ra'][_head][keep]
z_obs = z_obs[keep]
sim_halo_mass = input['Data/halo_mass'][_head][keep]
sim_halo_id = input['Data/mxxl_id'][_head][keep]

count = len(dec)
print(count, "galaxies in HDF5 file")

# choose 1 of the 2048 fiber assignment realizations with this bitstring
# (`assigned` comes from the realization-selection cell and has DATA_CUT_INDEX rows)
fiber_assigned_0 = assigned.astype(bool)[keep]
fiber_not_assigned_0 = ~fiber_assigned_0
indexes_not_assigned = np.argwhere(fiber_not_assigned_0)
print(np.sum(fiber_assigned_0), "galaxies were assigned a fiber")

In [None]:
# Astropy NN Search with kdtrees
# The catalog holds the galaxies that received a fiber; every galaxy without a
# fiber is matched to its nearest catalog member on the sky.
catalog = coord.SkyCoord(ra=ra[fiber_assigned_0]*u.degree, dec=dec[fiber_assigned_0]*u.degree, frame='icrs')
z_cat = z_obs[fiber_assigned_0]
halo_mass_cat = sim_halo_mass[fiber_assigned_0]
to_match = coord.SkyCoord(ra=ra[fiber_not_assigned_0]*u.degree, dec=dec[fiber_not_assigned_0]*u.degree, frame='icrs')

idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, storekdtree=False)

# Boolean masking preserves order, so row j of to_match is the j-th galaxy
# selected by fiber_not_assigned_0, and idx[j] is its nearest neighbor in the
# catalog. This replaces the earlier per-galaxy assert inside a Python loop.
assert np.allclose(ra[fiber_not_assigned_0], to_match.ra.value)

z_true_lost = z_obs[fiber_not_assigned_0]   # simulation redshift of each lost galaxy
z_from_nn = z_cat[idx]                      # redshift borrowed from its nearest neighbor

# Mhalo - (Mhalo of the NN galaxy)
halo_delta = (sim_halo_mass[fiber_not_assigned_0] - halo_mass_cat[idx]).astype(float)

# z_eff_a: z_obs with the lost galaxies replaced by their neighbor's redshift.
# z_err_a: fractional error of that replacement (0 for galaxies that kept their own z).
z_eff_a = np.copy(z_obs)
z_err_a = np.zeros(len(z_obs))
z_err_a[fiber_not_assigned_0] = np.abs(z_true_lost - z_from_nn) / z_true_lost
z_eff_a[fiber_not_assigned_0] = z_from_nn


In [None]:
# Examine fractional error in assigned redshifts 
# Only galaxies without a fiber are histogrammed; z_err_a stays 0 for the rest.
# NOTE(review): a lost galaxy whose neighbor has exactly the same z_obs also has
# error 0, and log10(0) = -inf cannot be binned — confirm this never occurs.
plt.hist(np.log10(z_err_a[fiber_not_assigned_0]), bins=50)
plt.yscale('log')
plt.xlabel('log(fractional error) from known simulation value')
plt.ylabel('Count')
# Sum of all fractional errors and the number of galaxies with a non-zero error.
print(np.sum(z_err_a))
print(np.count_nonzero(z_err_a))

# convert to km/s and think about velocity dispersions of galaxies


In [None]:
# Wrap the fractional errors as dimensionless quantities and convert them to
# velocities with astropy's doppler_redshift equivalency.
# NOTE(review): z_err_a holds |dz| / z, not a redshift difference — confirm this
# is the quantity that should be fed to the redshift-to-velocity conversion.
z_assigned_error = z_err_a[fiber_not_assigned_0] * u.dimensionless_unscaled
velocity_error = z_assigned_error.to(u.km / u.s, u.equivalencies.doppler_redshift())

plt.hist(np.log10(velocity_error.value), bins=50)
plt.yscale('log')
plt.xlabel('log(fractional error as km/s) from known simulation value')
plt.ylabel('Count')
# Same two summary numbers as the previous cell (computed on z_err_a, not on the velocities).
print(np.sum(z_err_a))
print(np.count_nonzero(z_err_a))

# Experiments on MXXL Data Directly

## Simple plots of basic data

In [None]:
# Histogram of the galaxy_type flag for the first DATA_CUT_INDEX rows.
# `bins` receives the return value of plt.hist and is reassigned by later cells.
small_gal_type = weights['Data/galaxy_type'][0:DATA_CUT_INDEX] # 0 1 2 3 possible
bins = plt.hist(small_gal_type, bins=50)

In [None]:
# Histogram of observed redshift for the first DATA_CUT_INDEX rows (no magnitude or z cut applied).
small_z_obs = weights['Data/z_obs'][0:DATA_CUT_INDEX]
bins = plt.hist(small_z_obs, bins=50)
plt.xlabel("$z_{obs}$")
plt.title("Histogram of Observed Redshifts")


In [None]:
# Re-read the unfiltered coordinates. This overwrites the magnitude/redshift
# filtered `ra` and `dec` arrays created by the nearest-neighbor test cells.
ra = weights['Data/ra'][0:DATA_CUT_INDEX]
dec = weights['Data/dec'][0:DATA_CUT_INDEX]

In [None]:
# Build a map of the galaxies

# Wrap RA at 180 degrees before converting to radians for the mollweide axes.
ra_angles = coord.Angle(ra*u.degree)
ra_angles = ra_angles.wrap_at(180*u.degree)
dec_angles = coord.Angle(dec*u.degree)

fig = plt.figure(figsize=(12,9))
ax = fig.add_subplot(111, projection="mollweide")
# Very low alpha so that dense regions remain distinguishable in the scatter.
ax.scatter(ra_angles.radian, dec_angles.radian, alpha=0.002)
# This looks like Alex' paper, good
# TODO how to get frac_area from this?


In [None]:
#plt.plot(ra, dec)

In [None]:
# Fraction of the first DATA_CUT_INDEX galaxies whose MXXL halo ID is 0
# (the value is displayed as the cell output).
mxxl_halo_id = weights['Data/mxxl_id'][0:DATA_CUT_INDEX]
np.sum(mxxl_halo_id == 0) / len(mxxl_halo_id)

# TODO why do 2.5% of galaxies have 0 for the MXXL Halo ID? This may be messing us up

In [None]:
# Histogram of apparent magnitude for the first DATA_CUT_INDEX rows.
small_app_mag = weights['Data/app_mag'][0:DATA_CUT_INDEX]
bins = plt.hist(small_app_mag, bins=50)
plt.xlabel("Apparent Mag")
plt.title("Histogram of Apparent Mags")

In [None]:
# Absolute magnitudes stored in the catalog, used as the reference for the comparison below.
small_abs_mag = weights['Data/abs_mag'][0:DATA_CUT_INDEX]

In [None]:
# Calculating luminosity distances from the cosmology is a bit slow
# app_mag_to_abs_mag is not defined in this notebook; presumably it comes from
# the `pyutils` star import — verify.
my_abs_mag = app_mag_to_abs_mag(small_app_mag, small_z_obs)


In [None]:
# Compare my_abs_mag to abs_mag. 
# Both histograms are drawn on the same axes, the catalog values on top of the recomputed ones.
x = plt.hist(my_abs_mag, label="my abs_mag", bins=50)
y = plt.hist(small_abs_mag, label="alex abs_mag", bins=50)
plt.xlabel("Absolute Mag")
plt.ylabel("Count")
plt.title("Compare Absolute Mags")
plt.legend()

In [None]:
# At what distance (luminosity distance) would the objects appear to be 19.5 mag?
# get_max_observable_volume is not defined in this notebook (presumably from
# `pyutils` — verify); it is evaluated once per set of absolute magnitudes.
v_max = get_max_observable_volume(my_abs_mag, small_z_obs, 19.5)
v_max2 = get_max_observable_volume(small_abs_mag, small_z_obs, 19.5)

bins = plt.hist(np.log10(v_max), label="my abs_mag", bins=50)
bins = plt.hist(np.log10(v_max2), label="alex abs_mag", bins=50)
plt.title("Compare V_max")
plt.legend()
# NOTE(review): the label says Mpc; if the helper returns a volume the unit
# would be Mpc^3 — confirm against the helper.
plt.xlabel("log(V_max) [Mpc]")
plt.ylabel("Count")

### What is a reasonable z fudge factor for 'close enough' redshifts given galaxies $v_{\mathrm{pec}}$?

Galaxies move at hundreds of km/s usually, or thousands in a rich cluster.

Two galaxies moving at 600 km/s towards each other along LOS but at same cosmological redshift would have a total redshift difference of 0.004. This suggests a z +/- 0.002 is totally reasonable. In richer areas this could be as high as z +/- 0.010. 

Adopting z +/- 0.003 for now seems fine. Can refine later.

In [None]:
# What is a reasonable z +/- fudge factor for 'close enough' redshifts? 
# Consider peculiar velocities.
# Convert a few candidate redshift tolerances to velocities with astropy's
# doppler_redshift equivalency and print them side by side.
z_test = [0.001, 0.002, 0.003, 0.005, 0.01] * u.dimensionless_unscaled
v_pec = z_test.to(u.km / u.s, u.equivalencies.doppler_redshift())
for i in range(len(z_test)):
    print(f"z={z_test[i]:.3f} is {v_pec[i]:.0f}")



## Nearest Neighbor Angular Separation and Same-Halo Analysis


In [None]:
# What fraction of the time are galaxy pairs in the same halo?
# As a function of angular separation and redshift.

# TODO need to load all data and then manually cut it down into a region of the sky instead of this
# TODO then maybe switch to using k-nearest neighbors instead of the full n^2 pairs within the region

# NOTE: `input` shadows the Python builtin; the name is kept because later cells read it.
input = weights
DATA_CUT = 21201545 # all of the data
APP_MAG_CUT = 19.5
_all_rows = slice(0, DATA_CUT)

# Read the two quantities the selection is based on, then build the mask.
app_mag = input['Data/app_mag'][_all_rows]
z_obs = input['Data/z_obs'][_all_rows]

bright_filter = app_mag < APP_MAG_CUT # True where brighter than the magnitude limit
redshift_filter = z_obs > 0 # True where the observed redshift is positive
# Optional sky-patch cuts (disabled):
#location_filter_1 = ra < 270.0
#location_filter_2 = ra > 120.0
#location_filter_3 = dec > 0.0
#location_filter_4 = dec < 45.0
#keep = np.all([bright_filter, redshift_filter, location_filter_1, location_filter_2, location_filter_3, location_filter_4], axis=0)
keep = bright_filter & redshift_filter

# Apply the mask to every per-galaxy column.
dec = input['Data/dec'][_all_rows][keep]
ra = input['Data/ra'][_all_rows][keep]
z_obs = z_obs[keep]
app_mag = app_mag[keep]
sim_halo_id = input['Data/mxxl_id'][_all_rows][keep]


len(dec)

In [None]:
# Bit 0 of the first bitweight word selects one realization; both masks are
# then restricted to the galaxies that passed the `keep` selection.
fassigned = (input['Weight/bitweight0'][0:DATA_CUT] & 1).astype(bool) # choose 1 of the 2048 fiber assignment realizations with this bitstring
fnotassigned = np.invert(fassigned)

fassigned = fassigned[keep]
fnotassigned = fnotassigned[keep]

### Calculate IIP Weights

In [None]:
# Load the cached IIP array written by the computation cell below.
# The path is relative to the working directory, not to ROOT_FOLDER.
with open('bin/iip.npy', 'rb') as f:
    iip = np.load(f)

In [None]:
# Skip this if iips were loaded OK. Takes ~8 minutes.

# Read all 32 64-bitstrings into memory from the file
# (32 words x 64 bits = the 2048 realizations that the IIP is normalized by).
num_bitstrings = 32
galaxy_count = len(input['Weight/bitweight0'])
# One row per bitweight word, one column per galaxy, stored as signed 64-bit integers.
bitweights = np.empty((num_bitstrings, galaxy_count), dtype='i8')
for i in range(num_bitstrings):
    bitweights[i] = input['Weight/bitweight{0}'.format(i)][:]

def bitsum(bitstring):
    """Count the set bits of one 64-bit bitweight word.

    The words are held in a signed 'i8' array, so a word whose top bit is set
    arrives here as a negative number. ``bin()`` renders a negative integer as
    ``'-0b'`` plus the bits of its magnitude, which is not the stored bit
    pattern (``bin(-1)`` has a single '1' although all 64 stored bits are set).
    Masking to 64 bits first recovers the two's-complement pattern.

    Parameters: bitstring -- an integer (Python or numpy) holding one word.
    Returns: the number of 1 bits, between 0 and 64.
    """
    return bin(int(bitstring) & 0xFFFFFFFFFFFFFFFF).count("1")
v_bitsum = np.vectorize(bitsum)

def summate(a):
    """Return the total number of set bits over all words in the array ``a``."""
    return np.sum(v_bitsum(a))

# For every galaxy (one column of bitweights), total the set bits over its 32
# words and divide by the 2048 realizations.
iip = np.apply_along_axis(summate, 0, bitweights) / 2048

# Cache the result so that the load cell can be used instead next time.
with open('bin/iip.npy', 'wb') as f:
    np.save(f, iip)

In [None]:
# Restrict the IIPs to the galaxies selected by `keep`, then overlay the
# histograms of the full and the restricted set.
iip_195 = iip[keep]
trash=plt.hist(iip, bins=10)
trash2=plt.hist(iip_195, bins=10)

### Same Halo / Similar z Analysis

What fraction of the time are nearest neighbors in the same halo?

What is the distribution of angular distances?

What fraction of the time are nearest neighbors at a similar enough redshift?

In [None]:
# Now bin so that things with ang distances higher than the max we care about are thrown out
# Angular bin edges: BIN_COUNT log-spaced values from 3 arcsec to 3600 arcsec (one degree).
BIN_COUNT = 25
bins = np.logspace(np.log10(3), np.log10(60*60), BIN_COUNT)
print("Angular Distance Bin Markers", bins)

# Redshift bin edges used with np.digitize.
z_bins = [0.1, 0.2, 0.3, 1.0]
print("Redshift Bin Markers", z_bins)

#IIP_BIN_COUNT = 7
#IIP_bins = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
IIP_BIN_COUNT = 20
IIP_bins = np.linspace(0.03, 0.60, IIP_BIN_COUNT)
print("IIP Bin Markers", IIP_bins)

# Absolute redshift difference below which two galaxies count as being at a 'similar z'.
SIM_Z_THRESH = 0.003

# True: match only the galaxies without a fiber against the fiber-assigned ones.
# False: match every galaxy against the whole selected sample.
LOST_GALAXIES_ONLY = False

# `treename` is handed to match_coordinates_sky as storekdtree, so each catalog
# variant gets its own stored tree name.
if LOST_GALAXIES_ONLY:
    treename = 'mxxl_same_halo_analysis_fiberassigned'
    catalog = coord.SkyCoord(ra=ra[fassigned]*u.degree, dec=dec[fassigned]*u.degree, frame='icrs')
    sim_halo_id_catalog = sim_halo_id[fassigned]
    z_obs_catalog = z_obs[fassigned]
else:
    treename = 'mxxl_same_halo_analysis_all'
    catalog = coord.SkyCoord(ra=ra*u.degree, dec=dec*u.degree, frame='icrs')
    sim_halo_id_catalog = sim_halo_id
    z_obs_catalog = z_obs


In [None]:
# Get NN's angular distance distribution and same halo truth from MXXL

# Though this is binned by z of the target and not the NN, it shouldn't be able to affect results
# by reciprocity of NN
z_bin = np.digitize(z_obs, z_bins)

if LOST_GALAXIES_ONLY:
    nn_bins = [1]#[1,2,3]
else:
    nn_bins = [2]#[2,3,4] # this means closest 3. '1' will find the same object.

# Result arrays are indexed [redshift bin, neighbor rank, angular bin].
# Galaxies with z >= z_bins[-1] get digitize index len(z_bins) and are not analysed.
n_ang_bins = len(bins)
all_ang_bincounts = np.ones((len(z_bins), len(nn_bins), n_ang_bins))
all_same_halo_bincounts = np.zeros((len(z_bins), len(nn_bins), n_ang_bins))
all_same_z_bincounts = np.zeros((len(z_bins), len(nn_bins), n_ang_bins))
all_sim_z_bincounts = np.zeros((len(z_bins), len(nn_bins), n_ang_bins))

for i in range(len(z_bins)):
    for j in range(len(nn_bins)):
        if LOST_GALAXIES_ONLY:
            filter = np.all([z_bin == i, fnotassigned], axis=0)
        else:
            filter = z_bin == i
        if not np.any(filter):
            # Nothing to match in this redshift bin; keep the initial ones/zeros.
            continue
        to_match = coord.SkyCoord(ra=ra[filter]*u.degree, dec=dec[filter]*u.degree, frame='icrs')
        idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=nn_bins[j], storekdtree=treename)

        same_halo = sim_halo_id[filter] == sim_halo_id_catalog[idx]
        same_z = np.isclose(z_obs[filter], z_obs_catalog[idx], rtol=0, atol=0.000001)
        sim_z = np.isclose(z_obs[filter], z_obs_catalog[idx], rtol=0, atol=SIM_Z_THRESH)
        
        # np.digitize returns len(bins) for separations at or beyond the last
        # edge. np.bincount would then return len(bins)+1 entries, which cannot
        # be stored in a len(bins)-sized row, so every bincount below is cut to
        # len(bins): neighbors farther than the last edge are thrown out.
        bin_ind = np.digitize(d2d.to(u.arcsec).value, bins)
        
        bincounts = np.bincount(bin_ind, minlength=n_ang_bins)[:n_ang_bins] + 1 # avoids divide by 0, won't hurt statistics
        all_ang_bincounts[i][j] = bincounts

        bincounts2 = np.bincount(bin_ind, minlength=n_ang_bins, weights=same_halo.astype(int))[:n_ang_bins]
        all_same_halo_bincounts[i][j] = bincounts2

        bincount3 = np.bincount(bin_ind, minlength=n_ang_bins, weights=same_z.astype(int))[:n_ang_bins]
        all_same_z_bincounts[i][j] = bincount3

        bincount4 = np.bincount(bin_ind, minlength=n_ang_bins, weights=sim_z.astype(int))[:n_ang_bins]
        all_sim_z_bincounts[i][j] = bincount4
    

In [None]:
# Totals over all redshift bins. The 'studied' count includes the +1 that was
# added to every angular bin to avoid dividing by zero.
print("Galaxies studied: {0}. Same halo: {1}. Similar z: {2}".format(np.sum(all_ang_bincounts), np.sum(all_same_halo_bincounts), np.sum(all_sim_z_bincounts)))

In [None]:
# TODO 
#for b in range(len(all_same_halo_bincounts)):
#    print(all_same_halo_bincounts[b], all_same_z_bincounts[b], len(all_same_z_bincounts))

# Displays True when the same-halo counts equal the same-z (|dz| <= 1e-6) counts in every bin.
np.all(np.isclose(all_same_halo_bincounts, all_same_z_bincounts))

In [None]:
# Plots for nearest-neighbor angular distances and same-halo analysis
def getlabel(index, z_bins):
    """Build the legend label for one redshift bin.

    Parameters:
        index  -- position of the bin in ``z_bins``.
        z_bins -- sequence of redshift bin edges.

    Returns "< edge" for the first bin and "lower - upper" for any other bin.
    The label is derived from the ``index`` argument, not from whatever loop
    variable ``i`` happens to exist in the notebook namespace.
    """
    if index == 0:
        label = "< {0}".format(z_bins[index])
    else:
        label = "{0} - {1}".format(z_bins[index-1], z_bins[index])
    return label

# Figure 1: raw nearest-neighbor counts per angular bin, one color per redshift bin.
# Neighbor rank 0 is drawn solid (and carries the legend entry), rank 1 dashed;
# higher ranks are not drawn. The loop variable must stay named `i`.
plt.figure()
for i in range(len(z_bins)):
    label = getlabel(i, z_bins)
    line_color = get_color(i)
    #trash = plt.hist(d2d.to(u.arcsec).value, histtype='step', bins=bins, label=label, density=True)
    for j in range(len(nn_bins)):
        counts = all_ang_bincounts[i][j]
        if j == 0:
            plt.plot(bins, counts, label=label, color=line_color)
        elif j == 1:
            plt.plot(bins, counts, '--', color=line_color)
plt.title("Nearest Neighbor Ang. Distance Distribution")
plt.ylabel("Count")
plt.xlabel("Angular Distance (arcsec)")
plt.yscale('log')
plt.xscale('log')
plt.legend()
plt.draw()

# Figure 2: fraction of nearest neighbors that share the target's halo.
plt.figure()
for i in range(len(z_bins)):
    label = getlabel(i, z_bins)
    line_color = get_color(i)

    for j in range(len(nn_bins)):
        same_halo_counts = all_same_halo_bincounts[i][j]
        counts = all_ang_bincounts[i][j]
        if j == 0:
            plt.plot(bins, same_halo_counts/counts, label=label, color=line_color)
        elif j == 1:
            plt.plot(bins, same_halo_counts/counts, '--', color=line_color)

        print("Total fraction of nearest neighbors in same halo (z {0}, NN-{1}): {2:.3f}".format(label, j+1, np.sum(same_halo_counts) / np.sum(counts)))

plt.title("Nearest Neighbor Same Halo Fraction")
plt.ylabel("NN Same Halo Fraction")
plt.xlabel("Angular Distance (arcsec)")
plt.xscale('log')
plt.legend()
plt.draw()

# Figure 3: fraction of nearest neighbors within SIM_Z_THRESH of the target redshift.
plt.figure()
for i in range(len(z_bins)):
    label = getlabel(i, z_bins)
    line_color = get_color(i)

    for j in range(len(nn_bins)):
        sim_z_counts = all_sim_z_bincounts[i][j]
        counts = all_ang_bincounts[i][j]
        if j == 0:
            plt.plot(bins, sim_z_counts/counts, label=label, color=line_color)
        elif j == 1:
            plt.plot(bins, sim_z_counts/counts, '--', color=line_color)

        print("Total fraction of nearest neighbors at sim z (z {0}, NN-{1}): {2:.3f}".format(label, j+1, np.sum(sim_z_counts) / np.sum(counts)))

plt.title("Nearest Neighbor Sim z Fraction")
plt.ylabel("NN Sim z Fraction")
plt.xlabel("Angular Distance (arcsec)")
plt.xscale('log')
plt.legend()
plt.draw()

#print("What fraction of the time is the NN >19.5 mag?")

In [None]:
# Same nearest-neighbor analysis as above, but binned by the target's IIP
# instead of its redshift.
IIP_bin = np.digitize(iip_195, IIP_bins)

if LOST_GALAXIES_ONLY: 
    nn_bins = [1] # the catalog excludes the lost targets, so the 1st neighbor is another galaxy
else:
    nn_bins = [2] # targets are part of the catalog; neighbor '1' would be the object itself

# Result arrays are indexed [IIP bin, neighbor rank, angular bin].
# Galaxies with IIP >= IIP_bins[-1] get digitize index len(IIP_bins) and are not analysed.
all_ang_bincounts_2 = np.ones((IIP_BIN_COUNT, len(nn_bins), BIN_COUNT))
all_same_halo_bincounts_2 = np.zeros((IIP_BIN_COUNT, len(nn_bins), BIN_COUNT))
all_sim_z_bincounts_2 = np.zeros((IIP_BIN_COUNT, len(nn_bins), BIN_COUNT))

for i in range(len(IIP_bins)):
    for j in range(len(nn_bins)):
        if LOST_GALAXIES_ONLY:
            filter = np.all([IIP_bin == i, fnotassigned], axis=0)
        else:
            filter = IIP_bin == i
        if not np.any(filter):
            # No galaxy falls in this IIP bin; keep the initial ones/zeros.
            continue
        to_match = coord.SkyCoord(ra=ra[filter]*u.degree, dec=dec[filter]*u.degree, frame='icrs')
        idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=nn_bins[j], storekdtree=treename)
        same_halo = sim_halo_id[filter] == sim_halo_id_catalog[idx]
        sim_z = np.isclose(z_obs[filter], z_obs_catalog[idx], rtol=0, atol=SIM_Z_THRESH)

        # np.digitize returns len(bins) for separations at or beyond the last
        # edge; np.bincount would then return BIN_COUNT+1 entries, which cannot
        # be stored in a BIN_COUNT-sized row. Cut every bincount to BIN_COUNT so
        # that those far neighbors are dropped.
        bin_ind = np.digitize(d2d.to(u.arcsec).value, bins)
        
        bincounts = np.bincount(bin_ind, minlength=BIN_COUNT)[:BIN_COUNT] + 1 # avoids divide by 0, won't hurt statistics
        all_ang_bincounts_2[i][j] = bincounts

        bincounts2 = np.bincount(bin_ind, minlength=BIN_COUNT, weights=same_halo.astype(int))[:BIN_COUNT]
        all_same_halo_bincounts_2[i][j] = bincounts2

        bincounts3 = np.bincount(bin_ind, minlength=BIN_COUNT, weights=sim_z.astype(int))[:BIN_COUNT]
        all_sim_z_bincounts_2[i][j] = bincounts3

In [None]:
# Per-bin fractions, plus the share of all counted neighbors that falls in each
# (IIP, angular distance) cell. Only neighbor rank 0 is plotted.
frac_same = all_same_halo_bincounts_2 / all_ang_bincounts_2
frac_sim_z = all_sim_z_bincounts_2 / all_ang_bincounts_2
density = all_ang_bincounts_2[:,0,:] / np.sum(all_ang_bincounts_2[:,0,:])

# Density map on a logarithmic color scale.
plt.figure()
cplot = plt.pcolor(bins, IIP_bins, density, shading='auto', cmap='RdYlGn', norm=c.LogNorm(vmin=0.0001, vmax=0.2))
plt.colorbar(cplot)
plt.title("Nearest Neighbor Densities")
plt.ylabel("IIP")
plt.xlabel("Angular Distance (arcsec)")
plt.xscale('log')
plt.draw()

# Same-halo fraction, linear color scale clipped to [0, 0.8].
plt.figure()
cplot = plt.pcolor(bins, IIP_bins, frac_same[:,0,:], shading='auto', cmap='RdYlGn', norm=c.Normalize(vmin=0, vmax=0.8))
plt.colorbar(cplot)
plt.title("Nearest Neighbor Same Halo Fraction")
plt.ylabel("IIP")
plt.xlabel("Angular Distance (arcsec)")
plt.xscale('log')
plt.draw()

# Similar-z fraction, same color scale as the same-halo map.
plt.figure()
cplot = plt.pcolor(bins, IIP_bins, frac_sim_z[:,0,:], shading='auto', cmap='RdYlGn', norm=c.Normalize(vmin=0, vmax=0.8))
plt.colorbar(cplot)
plt.title("Nearest Neighbor Sim z Fraction")
plt.ylabel("IIP")
plt.xlabel("Angular Distance (arcsec)")
plt.xscale('log')
plt.draw()

In [None]:
# Cumulative success table: for each angular bin edge, how often the nearest
# neighbor within that separation has a similar z, and what share of all
# neighbors lies within that separation. Only neighbor rank 0 is used.
print("Using z +/- {0} values as a success metric:\n ".format(SIM_Z_THRESH))
total_neighbors = np.sum(all_ang_bincounts[:,0,:])
for i in range(BIN_COUNT):
    arcsec = bins[i]
    # np.digitize puts separations in [bins[k-1], bins[k]) into index k, so
    # "up to bins[i]" covers indices 0..i inclusive. The previous slice 0:i
    # stopped one bin short and produced 0/0 for i == 0.
    tot = np.sum(all_ang_bincounts[:,0,0:i+1])
    frac = np.sum(all_sim_z_bincounts[:,0,0:i+1]) / tot
    frac_assigned = tot / total_neighbors
    print("  Up to {0:.1f}\": Success frac: {1:.3f}. Assigned frac: {2:.3f}".format(arcsec, frac, frac_assigned))


### Galaxy Pairs Angular Separation and Same-Halo Analysis
Continuation of the above.

**Warning:** this is an N^2 calculation. Do not run it on the full sky; cut the data down to a small region first.


In [None]:
# THIS IS N^2 CALCULATION do not run on full sky.
# Rows are redshift bins of the first galaxy of each pair, columns are angular
# separation bins. Counts start at 1 so the fraction below never divides by zero.
total_bincounts = np.ones((len(z_bins), BIN_COUNT))
total_same_halo_bincounts = np.zeros((len(z_bins), BIN_COUNT))

# Examine each galaxy in the sample pair once
for i in range(len(ra)-1):
    # Redshift row for galaxy i. A separate name is used so that the per-galaxy
    # `z_bin` array built by the nearest-neighbor cell is not overwritten.
    z_index = int(np.digitize(z_obs[i], z_bins))
    if z_index >= len(z_bins):
        # z >= z_bins[-1]: there is no row for this galaxy (previously an IndexError).
        continue

    ang_distance = coord.angular_separation(ra[i]*u.degree, dec[i]*u.degree, ra[i+1:len(ra)]*u.degree, dec[i+1:len(ra)]*u.degree).to(u.arcsec)
        
    same_halo = sim_halo_id[i] == sim_halo_id[i+1:len(ra)]
    #print("Same halo fraction for {0}:".format(i), np.sum(same_halo) / len(same_halo))

    bin_ind = np.digitize(ang_distance.value, bins)
    #print(bin_ind)
    # minlength pads the result when no pair reaches the upper bins (otherwise
    # the addition below fails on a shape mismatch); the slice drops the
    # overflow bin holding separations beyond the last edge.
    bincounts = np.bincount(bin_ind, minlength=BIN_COUNT)[0:BIN_COUNT]
    same_halo_bincounts = np.bincount(bin_ind, weights=same_halo.astype(int), minlength=BIN_COUNT)[0:BIN_COUNT]

    total_bincounts[z_index] = total_bincounts[z_index] + bincounts
    total_same_halo_bincounts[z_index] = total_same_halo_bincounts[z_index] + same_halo_bincounts
    #print(total_same_halo_bincounts)

#print("Total counts in each bin:", total_bincounts)

fraction_same_halo = total_same_halo_bincounts / total_bincounts
#print(fraction_same_halo)

In [None]:
# Plots for galaxy pairs
# One legend label per redshift row: "< first edge", then "lower - upper".
pair_labels = ["< {0}".format(z_bins[0])]
pair_labels += ["{0} - {1}".format(z_bins[k-1], z_bins[k]) for k in range(1, len(z_bins))]

# Pair counts per angular separation bin.
plt.figure()
for i, label in enumerate(pair_labels):
    plt.plot(bins, total_bincounts[i], label=label)
plt.legend()
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Angular Separation (arcsec)')
plt.ylabel('Count of Galaxies Pairs')
plt.title("Galaxy Pair Counts (by ang separation and z)")
plt.draw()

# Fraction of those pairs that share a halo.
plt.figure()
for i, label in enumerate(pair_labels):
    plt.plot(bins, fraction_same_halo[i], label=label)
plt.legend()
plt.xscale('log')
plt.xlabel('Angular Separation (arcsec)')
plt.ylabel('Fraction Pair in Same Halo')
plt.ylim(-0.01, 1.0)
plt.title("Fraction Pair in Same Halo (by ang separation and z)")
plt.draw()

# Post Group Finder Analysis

In [None]:
def process(filename):
    """Load one group finder output file plus its galaxy-properties companion.

    Parameters:
        filename -- path to the space-delimited group finder output
                    ("....out"). The companion file is expected next to it,
                    with the ".out" suffix replaced by "_galprops.dat".

    Returns a ``types.SimpleNamespace`` with:
        filename      -- base name of ``filename`` without directory and last 4 characters
        all_data      -- merged per-galaxy DataFrame (rows with M_halo == 0 removed)
                         with added columns is_sat, is_sat_truth, logLgal, Mh_bin, Lgal_bin
        bins, labels  -- 30 log-spaced M_halo bin edges and the 29 lower edges used as bin labels
        centrals      -- rows whose index equals their group id (igrp)
        loglcen_means, loglcen_scatter -- mean / std of log10(L_gal) of centrals per M_halo bin
        bins_Lgal, labels_Lgal         -- same as bins / labels but for L_gal
        f_sat, Lgal_counts             -- satellite fraction and galaxy count per L_gal bin
    """
    # Only the trailing ".out" is swapped; a plain str.replace would also rewrite
    # any ".out" occurring in a directory name. Paths without the suffix keep
    # the original replace behavior.
    if filename.endswith(".out"):
        filename_props = filename[:-len(".out")] + "_galprops.dat"
    else:
        filename_props = str.replace(filename, ".out", "_galprops.dat")

    df = pd.read_csv(filename, delimiter=' ', names=('RA', 'Dec', 'z', 'L_gal', 'V_max', 'P_sat', 'M_halo', 'N_sat', 'L_tot', 'igrp', 'unknown'))
    galprops = pd.read_csv(filename_props, delimiter=' ', names=('app_mag', 'g_r', 'galaxy_type', 'mxxl_halo_mass', 'fiber_assigned_0', 'assigned_halo_mass', 'z_obs', 'mxxl_halo_id', 'assigned_halo_id'), dtype={'mxxl_halo_id': np.int32, 'assigned_halo_id': np.int32})
    # Both files are joined row by row on their positional index.
    all_data = pd.merge(df, galprops, left_index=True, right_index=True)

    # Drop bad data, should have been cleaned up earlier though!
    # .copy() gives an independent frame so the column assignments below do not
    # act on a slice of the merged frame. The index is deliberately NOT reset:
    # it is compared against igrp below.
    orig_count = len(all_data)
    all_data = all_data[all_data.M_halo != 0].copy()
    new_count = len(all_data)
    if (orig_count != new_count):
        print("Dropped {0} bad galaxies".format(orig_count - new_count))

    # Group finder verdict: a galaxy is a satellite when it is not its own group's index.
    all_data['is_sat'] = (all_data.index != all_data.igrp).astype(int)
    # Reference flag taken from the simulation's galaxy_type (1 or 3).
    all_data['is_sat_truth'] = np.logical_or(all_data.galaxy_type == 1, all_data.galaxy_type == 3).astype(int)
    all_data['logLgal'] = np.log10(all_data.L_gal)

    bins = np.logspace(np.log10(min(all_data.M_halo)), np.log10(max(all_data.M_halo)), 30)
    labels = bins[0:len(bins)-1] # each bin is labelled with its lower edge, not its middle
    all_data['Mh_bin'] = pd.cut(x = all_data['M_halo'], bins = bins, labels = labels, include_lowest = True)
    
    centrals = all_data[all_data.index == all_data.igrp]
    # observed=False keeps one entry per bin, including empty ones, so the
    # results always line up with `labels` when plotted.
    loglcen_means = centrals.groupby('Mh_bin', observed=False).logLgal.mean()
    loglcen_scatter = centrals.groupby('Mh_bin', observed=False).logLgal.std()

    # Compute f_sat(Lgal)
    bins_Lgal = np.logspace(np.log10(min(all_data.L_gal)), np.log10(max(all_data.L_gal)), 30)
    labels_Lgal = bins_Lgal[0:len(bins_Lgal)-1] # lower edge of each bin, as above
    all_data['Lgal_bin'] = pd.cut(x = all_data['L_gal'], bins = bins_Lgal, labels = labels_Lgal, include_lowest = True)
    
    f_sat = all_data.groupby('Lgal_bin', observed=False).is_sat.mean()
    Lgal_counts = all_data.groupby('Lgal_bin', observed=False).RA.count()

    dataset = types.SimpleNamespace()
    dataset.filename = filename[filename.rfind('/')+1 : len(filename)-4]
    dataset.all_data = all_data
    dataset.bins = bins
    dataset.labels = labels
    dataset.centrals = centrals
    dataset.loglcen_means = loglcen_means
    dataset.loglcen_scatter = loglcen_scatter
    dataset.bins_Lgal = bins_Lgal
    dataset.labels_Lgal = labels_Lgal
    dataset.f_sat = f_sat
    dataset.Lgal_counts = Lgal_counts

    return dataset

def plots(*frames):
    """Draw the standard comparison figures for one or more processed datasets.

    Parameters:
        *frames -- namespaces returned by process(), each additionally carrying
                   the attributes ``name`` (legend label), ``color`` and
                   ``marker`` (matplotlib format string) set by the caller.

    Frames whose ``name`` contains '20' are drawn in separate figures from the
    others for the central-luminosity plots. The first frame (frames[0])
    supplies the simulation "truth" satellite fraction. Returns None; figures
    are drawn through pyplot and the total satellite fractions are printed.
    """
    
    # Mean central luminosity vs halo mass, frames without '20' in their name.
    plt.figure()
    for f in frames:
        if ('20' not in f.name):
            plt.errorbar(f.labels, f.loglcen_means, yerr=f.loglcen_scatter, label=f.name, color=f.color)
    plt.xscale('log')
    plt.xlabel('$M_{halo}$')
    plt.ylabel('$log(L_{cen})$')
    plt.title("Central Luminosity vs. Halo Mass")
    plt.legend()
    plt.draw()

    # Same figure for the frames with '20' in their name.
    plt.figure()
    for f in frames:
        if ('20' in f.name):
            plt.errorbar(f.labels, f.loglcen_means, yerr=f.loglcen_scatter, label=f.name, color=f.color)
    plt.xscale('log')
    plt.xlabel('$M_{halo}$')
    plt.ylabel('$log(L_{cen})$')
    plt.title("Central Luminosity vs. Halo Mass")
    plt.legend()
    plt.draw()

    # Scatter of the central luminosity vs halo mass, again split by name.
    plt.figure()    
    for f in frames:
        if ('20' not in f.name):
            plt.plot(f.labels, f.loglcen_scatter, f.marker, color=f.color, label=f.name)
    plt.xscale('log')
    plt.xlabel('$M_{halo}$')
    plt.ylabel('$\\sigma(\\log(L_{cen})$')
    plt.title("Central Luminosity Scatter vs. Halo Mass")
    plt.legend()
    plt.draw()

    plt.figure()    
    for f in frames:
        if ('20' in f.name):
            plt.plot(f.labels, f.loglcen_scatter, f.marker, color=f.color, label=f.name)
    plt.xscale('log')
    plt.xlabel('$M_{halo}$')
    plt.ylabel('$\\sigma(\\log(L_{cen})$')
    plt.title("Central Luminosity Scatter vs. Halo Mass")
    plt.legend()
    plt.draw()

    """     
    plt.figure()
    for f in frames:
        plt.scatter(f.centrals.M_halo, f.centrals.L_gal, alpha=0.002)
    plt.loglog()
    plt.xlabel('M_halo / h')
    plt.ylabel('L_gal / $h^2$)')
    plt.draw() 
    """

    # Mean number of satellites per halo mass bin.
    plt.figure()
    for f in frames:
        # observed=False keeps empty bins so the series lines up with f.labels.
        Nsat_means = f.all_data.groupby('Mh_bin', observed=False).N_sat.mean()
        plt.plot(f.labels, Nsat_means, f.marker, label=f.name, color=f.color)
        #plt.hist(f.centrals.N_sat, np.arange(0,50,1), alpha=0.5)
    plt.loglog()    
    plt.ylabel("$<N_{sat}>$")    
    plt.xlabel('$M_{halo}$')
    plt.title("Mean Number of Satellites by Halo Mass")
    plt.legend()
    plt.draw()

    # Satellite fraction per luminosity bin; the black line is the simulation
    # truth taken from the first frame.
    plt.figure()
    for f in frames:
        plt.plot(f.labels_Lgal, f.f_sat, f.marker, label=f.name, color=f.color)
    truth_f_sat = frames[0].all_data.groupby('Lgal_bin', observed=False).is_sat_truth.mean()
    plt.plot(frames[0].labels_Lgal, truth_f_sat, 'k')
    #centrals = frames[0].all_data['galaxy_type' == 0 or 'galaxy_type' == 2].groupby('Lgal_bin').count()
    #sats = frames[0].all_data['galaxy_type' == 1 or 'galaxy_type' == 3].groupby('Lgal_bin').count()
    #truth_f_sat = sats / (centrals + sats)
    plt.xscale('log')
    plt.xlabel("$L_{gal}$")
    plt.ylabel("$f_{sat}$")
    plt.title("Satellite fraction vs Galaxy Luminosity")
    plt.legend()
    plt.draw()
    
    # Number of galaxies per luminosity bin.
    plt.figure()
    for f in frames:
        plt.plot(f.labels_Lgal, f.Lgal_counts, f.marker, label=f.name, color=f.color)
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel("$L_{gal}$")
    plt.ylabel("Count of Galaxies")
    plt.title("Galaxy Luminosity Counts")
    plt.legend()
    plt.draw()

    print("TOTAL f_sat: ")
    for f in frames:
        print(f.filename, f.all_data['is_sat'].sum() / f.all_data['is_sat'].count())

    # Numerator and denominator both come from frames[0]; the denominator used
    # to be taken from the leftover loop variable, i.e. the last frame.
    truth_flags = frames[0].all_data['is_sat_truth']
    print("MXXL Truth", truth_flags.sum() / truth_flags.count())

    # Normalized distribution of group finder halo masses.
    plt.figure()
    for f in frames:
        bin_ind = np.digitize(f.all_data.M_halo, f.bins)
        # minlength guarantees one entry per bin edge even when the top bins are
        # empty; the slice drops the overflow bin at or above the last edge.
        bincounts = np.bincount(bin_ind, minlength=len(f.bins))[0:len(f.bins)]
        plt.plot(f.bins, bincounts / np.sum(bincounts), f.marker, label=f.name, color=f.color) 
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('$M_{halo}$')
    plt.ylabel('Density of Halo Mass')
    plt.title("Fiber Loss on Group Finder Halo Masses")
    plt.legend()
    plt.draw()

In [None]:
def get_vir_radius(halo_mass):
    """Return the virial radius, as a plain number in kpc, of astropy's NFW model built with ``mass=halo_mass``."""
    nfw_profile = models.NFW(mass=halo_mass)
    radius_kpc = nfw_profile.r_virial.to(u.kpc)
    return radius_kpc.value

def post_process(frame):
    """Add derived halo and distance columns to ``frame.all_data`` in place.

    New columns: 'mxxl_halo_vir_radius_guess' (kpc), the same radius converted
    to an angle in 'mxxl_halo_vir_radius_guess_arcsec', and 'ldist_true' (the
    result of z_to_ldist applied to z_obs). Returns None.
    FlatLambdaCDM and z_to_ldist are not defined in this notebook; presumably
    they come from the `pyutils` star import — verify.
    """
    df: pd.DataFrame = frame.all_data
    
    # Calculate additional halo properties
    # NOTE(review): assumes mxxl_halo_mass is stored in units of 1E9 solar masses — confirm.
    halo_masses = df['mxxl_halo_mass'].to_numpy() * 1E9 * u.solMass
    df.loc[:, 'mxxl_halo_vir_radius_guess'] = get_vir_radius(halo_masses)

    _cosmo = FlatLambdaCDM(H0=73, Om0=0.25, Ob0=0.045, Tcmb0=2.725, Neff=3.04) 
    # TODO comoving or proper?
    assigned_z = df['z'].to_numpy()
    as_per_kpc = _cosmo.arcsec_per_kpc_proper(assigned_z)
    arcsec_per_kpc_value = as_per_kpc.to(u.arcsec / u.kpc).value
    df.loc[:, 'mxxl_halo_vir_radius_guess_arcsec'] =  df['mxxl_halo_vir_radius_guess'] * arcsec_per_kpc_value

    # Luminosity distance to z_obs
    df.loc[:, 'ldist_true'] = z_to_ldist(df['z_obs'].to_numpy())



In [None]:
# Process the two "all" group finder outputs and attach the display attributes
# (name, color, marker) that plots() reads.
# NOTE(review): the variable name `all` shadows the Python builtin all() for the rest of the notebook.
all = process(ROOT_FOLDER + "mxxl_3pass_all.out")
all20 = process(ROOT_FOLDER + "mxxl_3pass_all20.out")
all.name = "All"
all.color = get_color(0)
all.marker = '-'
all20.name = "All <20"
all20.color = get_color(0)
all20.marker = '--'

In [None]:
# Add the derived halo radius and distance columns to both "all" datasets (modifies them in place).
post_process(all)
post_process(all20)

In [None]:
# Process the two "fiberonly" group finder outputs; they share color 1 and differ by line style.
fiberonly = process(ROOT_FOLDER + "mxxl_3pass_fiberonly.out")
fiberonly20 = process(ROOT_FOLDER + "mxxl_3pass_fiberonly20.out")
fiberonly.name = "Fiber Assigned Only"
fiberonly.color = get_color(1)
fiberonly.marker = '-'
fiberonly20.name = "Fiber Assigned Only <20"
fiberonly20.color = get_color(1)
fiberonly20.marker = '--'

In [None]:
# Add the derived columns to both "fiberonly" datasets (modifies them in place).
post_process(fiberonly)
post_process(fiberonly20)

In [None]:
# Process the nearest-neighbor ("nn_kd") group finder output; its display attributes are set in the next cell.
nn_kd = process(ROOT_FOLDER + "mxxl_3pass_nn_kd.out")


In [None]:
# Add the derived columns, then attach the display attributes that plots() reads.
post_process(nn_kd)
nn_kd.name = "Nearest Neighbor"
nn_kd.color = get_color(2)
nn_kd.marker = '-'


In [None]:
# Same as for nn_kd, for the "nn_kd20" output: load, add derived columns, set display attributes.
nn_kd20 = process(ROOT_FOLDER + "mxxl_3pass_nn_kd20.out")
post_process(nn_kd20)
nn_kd20.name = "Nearest Neighbor <20"
nn_kd20.color = get_color(2)
nn_kd20.marker = '--'

In [None]:
# TODO figure out
#def save_processed_data(frame):
#    frame.all_data.to_feather(ROOT_FOLDER + frame.name)


In [None]:
# Fraction of galaxies in the "all" dataset whose mxxl_halo_id is 0 (shown as the cell output).
ids = all.all_data['mxxl_halo_id'].to_numpy()
np.sum([ids == 0]) / len(ids)

In [None]:
# Draw the comparison figures. Currently only the nearest-neighbor dataset is
# passed, so it also serves as frames[0], the source of the "truth" curves.
#plots(all, fiberonly, nn_kd, all20, fiberonly20, nn_kd20)
plots(nn_kd)
# BLUE: ALL     ORANGE: FIBER ASSIGNED ONLY     GREEN: NEAREST NEIGHBOR

In the above plots, the NN ones have some galaxies at higher $L_{gal}$ than the 'all' sample. This is because some of the assigned redshifts imply a larger luminosity than any galaxy seen in MXXL.

### What effect does Fiber Assignment have on group finder properties?

In [None]:
# Halo Masses (in group finder abundance matching)
# Compare the distribution of group finder halo masses of centrals between the
# full sample and the fiber-assigned-only sample, on one common set of bins.
all_to_use = all
fiberonly_to_use = fiberonly

# Both samples are binned with the edges of the sample chosen in all_to_use
# (the name `all` was previously referenced directly for the fiber-only one).
mass_bins = all_to_use.bins
n_mass_bins = len(mass_bins)

all_centrals = all_to_use.all_data[all_to_use.all_data.index == all_to_use.all_data.igrp]
#loglcen_means = centrals.groupby('Mh_bin').logLgal.mean()
bin_ind = np.digitize(all_centrals.M_halo, mass_bins)
# minlength pads empty top bins so both density arrays always have one entry
# per edge; the slice drops the overflow bin at or above the last edge.
all_bincounts = np.bincount(bin_ind, minlength=n_mass_bins)[0:n_mass_bins]
all_density = all_bincounts / np.sum(all_bincounts)


fo_centrals = fiberonly_to_use.all_data[fiberonly_to_use.all_data.index == fiberonly_to_use.all_data.igrp]
#loglcen_means = centrals.groupby('Mh_bin').logLgal.mean()
bin_ind = np.digitize(fo_centrals.M_halo, mass_bins)
fo_bincounts = np.bincount(bin_ind, minlength=n_mass_bins)[0:n_mass_bins]
fo_density = fo_bincounts / np.sum(fo_bincounts)

# Log ratio of the two normalized distributions.
plt.figure()
plt.plot(mass_bins, np.log10(fo_density / all_density)) 
plt.xscale('log')
plt.ylim(-0.2, 0.2)
plt.xlabel('$M_{halo}$')
plt.ylabel('Normalized log(Fiberonly / All)')
plt.title("Effects of Fiber Loss on Group Finder Halo Masses")
plt.draw()

# The two normalized distributions themselves.
plt.figure()
plt.plot(mass_bins, all_density, label="All Galaxies") 
plt.plot(mass_bins, fo_density, label="Fiber-Assigned Only") 
plt.xscale('log')
plt.yscale('log')
plt.xlabel('$M_{halo}$')
plt.ylabel('Density of Galaxies')
plt.title("Effects of Fiber Loss on Group Finder Halo Masses")
plt.legend()
plt.draw()

In [None]:
# Look up the centrals from all in fiberonly
# Each fiber-only central is matched to its nearest central of the full sample on the sky.
# Note that `catalog`, `to_match`, `idx` and `d2d` from the earlier nearest-neighbor cells are overwritten here.
catalog = coord.SkyCoord(ra=all_centrals.RA.to_numpy()*u.degree, dec=all_centrals.Dec.to_numpy()*u.degree, frame='icrs')
to_match = coord.SkyCoord(ra=fo_centrals.RA.to_numpy()*u.degree, dec=fo_centrals.Dec.to_numpy()*u.degree, frame='icrs')
idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=1, storekdtree='all_fo_matching')

In [None]:
# A fiber-only central counts as found when its nearest full-sample central lies within 0.0001 arcsec.
perfect_match = np.isclose(d2d.to(u.arcsec).value, 0, rtol=0.0, atol=0.0001) 
# 0.0001 arcsec precision on matching doesn't hit floating point noise. You get same with 0.001
print("What fraction of centrals in \'fiberonly\' are centrals in \'all\'?", np.sum(perfect_match) / len(d2d))

## Compare NN-assigned implied abs mags to truth from MXXL

In [None]:
# For the galaxies without a fiber, compute absolute magnitudes twice from the
# same apparent magnitudes: once with the `z` column of the nn_kd dataset and
# once with the `z_obs` column.
not_assigned = np.invert(nn_kd.all_data.fiber_assigned_0.astype(bool))
app_mags = nn_kd.all_data.app_mag[not_assigned].to_numpy()
my_assigned_abs_mag = app_mag_to_abs_mag(app_mags, nn_kd.all_data.z[not_assigned].to_numpy())
my_raw_abs_mag = app_mag_to_abs_mag(app_mags, nn_kd.all_data.z_obs[not_assigned].to_numpy())

# Both arrays come from the same selection, so the two lengths should agree.
print(len(my_raw_abs_mag), len(my_assigned_abs_mag))

In [None]:
# Compare absolute mags. Using my way of computing for both.
# Semi-transparent histograms so that the overlap of the two distributions stays visible.
x = plt.hist(my_raw_abs_mag, label="Truth", bins=50, alpha=0.5)
y = plt.hist(my_assigned_abs_mag, label="NN Assigned", bins=50, alpha=0.5)
plt.xlabel("Absolute Mag")
plt.ylabel("Count")
plt.yscale('log')
plt.title("Compare NN-assigned Abs Mags")
plt.legend()

## Find fraction of time the NN is in the same halo, similar z, etc

There is another version of this directly on the MXXL data above.

In [None]:
#x=plt.hist(nn.all_data['assigned_halo_mass'], bins=50)
#x=plt.hist(nn.all_data['mxxl_halo_mass'], bins=50)
#x=plt.hist(all.all_data['mxxl_halo_mass'], bins=30) # should be exact same as above
#plt.yscale('log')

def nn_halo_analysis(*sets, z_thresholds=(0.003, 0.001)):
    """Print how well the assigned properties of lost galaxies match their true ones.

    For every dataset passed in, the galaxies with ``fiber_assigned_0 == 0`` are
    selected and the following fractions are printed:

    * ``assigned_halo_id`` equal to ``mxxl_halo_id``
    * ``assigned_halo_mass`` within a relative tolerance of 1e-3 of ``mxxl_halo_mass``
    * ``z`` within ``+/- z_thresh`` of ``z_obs``, once per entry of ``z_thresholds``

    Parameters
    ----------
    *sets
        Objects exposing a ``name`` attribute and an ``all_data`` DataFrame that
        holds the columns listed above.
    z_thresholds : iterable of float, keyword-only
        Absolute redshift tolerances to report. The default reproduces the two
        tolerances that used to be hard-coded.

    Returns
    -------
    None
        Results are only printed.
    """

    def fraction_true(mask):
        # np.sum on a boolean mask counts its True entries. An empty selection has
        # no meaningful fraction, so report NaN instead of dividing by zero.
        if len(mask) == 0:
            return float('nan')
        return np.sum(mask) / len(mask)

    for dataset in sets:

        print(dataset.name)

        lost_galaxies = dataset.all_data[dataset.all_data.fiber_assigned_0 == 0]
        print(len(lost_galaxies), "lost galaxies")

        # TODO understand this MXXL quirk
        lost_galaxies = lost_galaxies[lost_galaxies['assigned_halo_id'] != 0]
        print(len(lost_galaxies), "lost galaxies after removing ones with no MXXL halo ID (no idea why)")

        same_halo = np.equal(lost_galaxies['assigned_halo_id'], lost_galaxies['mxxl_halo_id'])
        print("Fraction of time NN-assigned halo ID is the same as the galaxy's actual halo ID: {0:.3f}".format(fraction_true(same_halo)))

        # Halo masses are floats, so "the same" means equal to within 0.1% (relative).
        same_halo_mass = np.isclose(lost_galaxies['assigned_halo_mass'], lost_galaxies['mxxl_halo_mass'], atol=0.0, rtol=1e-03)
        print("Fraction of time NN-assigned halo mass is \'the same\' as the galaxy's actual halo mass: {0:.3f}".format(fraction_true(same_halo_mass)))

        # TODO as a function of redshift
        for z_thresh in z_thresholds:
            similar_z = np.isclose(lost_galaxies['z'], lost_galaxies['z_obs'], atol=z_thresh, rtol=0.0)
            print("Fraction of time NN-assigned z is the target z +/- {0:.3f}:".format(z_thresh), fraction_true(similar_z))

        # Sketch for the per-redshift TODO above (not wired up yet):
        #z_bins = np.linspace(min(data.all_data.z), max(data.all_data.z), 20)
        #z_labels = z_bins[0:len(z_bins)-1]
        #data.all_data['z_bin'] = pd.cut(x = data.all_data['z'], bins = z_bins, labels = z_labels, include_lowest = True)
        #groupby_z = lost_galaxies.groupby('z_bin')['same_halo_mass'].sum() / lost_galaxies.groupby('z_bin')['same_halo_mass'].count()
        #plt.plot(z_labels, groupby_z)
        #plt.xlabel('$z_{eff}$ (effective/assigned redshift)')
        #plt.ylabel('Fraction Assigned Halo = True Host Halo')


In [None]:
# Print the NN-assignment diagnostics for the nn_kd catalog. nn_halo_analysis accepts
# any number of catalogs as positional arguments (e.g. the commented-out nn_kd20).
nn_halo_analysis(nn_kd)#, nn_kd20)

# TODO could compare z_cos instead
# NOTE(review): presumably "instead of z_obs" as the reference redshift — confirm.


## Galaxy Neighborhood Examiner

In [None]:
# Catalog inspected by the neighborhood tools below (examine_around reads this global).
data = nn_kd.all_data


In [None]:
# Partition the catalog on its fiber_assigned_0 flag: 0 -> lost, 1 -> observed.
fiber_flag = data['fiber_assigned_0']
lost_galaxies = data.loc[fiber_flag == 0]
obs_galaxies = data.loc[fiber_flag == 1]
print("Lost galaxies: ", len(lost_galaxies), "Observed Galaxies: ", len(obs_galaxies))

def close_enough(target_z, z_arr, threshold=0.003):
    """Tell whether ``z_arr`` lies strictly within ``threshold`` of ``target_z``.

    ``z_arr`` may be a scalar or an array-like supporting arithmetic; the result
    has the matching shape (elementwise for arrays).
    """
    offset = abs(target_z - z_arr)
    return threshold > offset


# TODO could use angular size / redshift relation as part of this :-)
def getsize(z):
    """Return the scatter-marker size to use for a galaxy at redshift ``z``.

    Sizes shrink step-wise with redshift. Each ``(upper, size)`` pair below applies
    to ``z < upper``; anything at or beyond the last threshold gets size 3, and so
    does NaN, because every comparison against it is false.

    The ladder previously tested ``z < 0.2`` twice in a row, which made the
    size-75 step unreachable. The first of those two tests is restored as
    ``z < 0.15`` so that every size in the ladder is used.
    NOTE(review): 0.15 is inferred from the spacing of the other thresholds — confirm.
    """
    size_steps = (
        (0.05, 300),
        (0.1, 200),
        (0.15, 120),
        (0.2, 75),
        (0.3, 45),
        (0.4, 25),
        (0.5, 15),
        (0.6, 8),
    )
    for z_upper, marker_size in size_steps:
        if z < z_upper:
            return marker_size
    return 3

# Half-width of the RA/Dec box searched around a target by neighbor_exam and examine_around.
# NOTE(review): '3m' is presumably parsed by astropy as 3 arcminutes — confirm.
nearby_angle = coord.Angle('3m')

def neighbor_exam(target):
    """Count observed galaxies near ``target`` on the sky and at a similar distance.

    Observed galaxies (rows of ``obs_galaxies``) are selected inside a box of
    ``+/- nearby_angle`` in RA and Dec around the target. A neighbor counts as
    "close" when its ``ldist_true`` is within 20 (in the units of that column)
    of ``z_to_ldist(target.z_obs)``.

    NOTE(review): the box is applied to RA directly, with no cos(Dec) scaling and
    no handling of the RA wrap-around — confirm this is acceptable for the footprint.

    Parameters
    ----------
    target
        Row exposing ``RA``, ``Dec`` (taken as degrees) and ``z_obs``.

    Returns
    -------
    tuple
        ``(n_close, n_nearby, n_close / n_nearby)``. When the box contains no
        observed galaxy the fraction is undefined, so ``(0, 0, nan)`` is returned
        rather than relying on a 0/0 division.
    """
    target_dist_true = z_to_ldist(target.z_obs)

    target_ra = coord.Angle(target.RA*u.degree)
    target_dec = coord.Angle(target.Dec*u.degree)
    ra_max = (target_ra + nearby_angle).value
    ra_min = (target_ra - nearby_angle).value
    dec_max = (target_dec + nearby_angle).value
    dec_min = (target_dec - nearby_angle).value

    nearby = obs_galaxies.query('RA < @ra_max and RA > @ra_min and Dec < @dec_max and Dec > @dec_min')

    if len(nearby) == 0:
        return (0, 0, np.nan)

    close_neighbors = np.isclose(nearby.ldist_true.to_numpy(), target_dist_true, rtol=0.0, atol=20)
    n_close = np.sum(close_neighbors)

    return (n_close, len(nearby), n_close/len(nearby))
    


In [None]:
# Sky positions (ICRS, RA/Dec taken as degrees) for every row of `data`.
# This rebinds the `catalog` name used earlier for the centrals cross-match.
catalog = coord.SkyCoord(ra=data.RA.to_numpy()*u.degree, dec=data.Dec.to_numpy()*u.degree, frame='icrs')

# This is too slow when called 1 at a time, not using. 
# TODO Could be faster when batched for the whole sample?
def neighbors_within(max_angle: coord.Angle, to_match: coord.SkyCoord, catalog: coord.SkyCoord, treekey: str):
    """Collect the catalog neighbors of ``to_match`` closer than ``max_angle``.

    Neighbors are requested one rank at a time (1st nearest, 2nd nearest, ...)
    from ``coord.match_coordinates_sky`` until one is at or beyond ``max_angle``.
    That terminating neighbor is not included in the result; it used to be
    appended even though it lies outside the requested radius.

    Parameters
    ----------
    max_angle
        Search radius; only neighbors strictly closer than this are kept.
    to_match
        The coordinate to search around. The separation is compared as a single
        truth value, so this is expected to be one coordinate, not an array.
    catalog
        Coordinates to search in.
    treekey
        Passed through as ``storekdtree``.

    Returns
    -------
    (list, list)
        The ``idx`` and ``d2d`` values returned by ``match_coordinates_sky`` for
        each accepted neighbor, ordered by neighbor rank.
    """
    neighbor_ind = []
    neighbor_dist = []

    # cap at 99 neighbor ranks for now, TODO remove when safe
    for nth in range(1, 100):
        idx, d2d, d3d = coord.match_coordinates_sky(to_match, catalog, nthneighbor=nth, storekdtree=treekey)
        if not d2d < max_angle:
            break
        neighbor_ind.append(idx)
        neighbor_dist.append(d2d)

    return neighbor_ind, neighbor_dist

def _split_by_z_and_mag(neighbors, target_z, dim_mag):
    """Split ``neighbors`` into ``(good_z, good_z_dim, other, other_dim)`` frames.

    "good z" rows have ``z`` within ``close_enough`` of ``target_z``; "dim" rows
    have ``app_mag > dim_mag``. Works unchanged on an empty frame.
    """
    good_z = np.asarray(close_enough(target_z, neighbors.z.to_numpy()), dtype=bool)
    dim = (neighbors.app_mag > dim_mag).to_numpy()
    return (
        neighbors.loc[good_z & ~dim],
        neighbors.loc[good_z & dim],
        neighbors.loc[~good_z & ~dim],
        neighbors.loc[~good_z & dim],
    )


def examine_around(target):
    """Plot the sky neighborhood of ``target`` with every neighbor's redshift labelled.

    All rows of the global ``data`` inside a ``+/- nearby_angle`` RA/Dec box around
    the target are drawn (the target's own row is dropped from the neighbor set).
    Neighbors are grouped by:

    * observed (``fiber_assigned_0 == 1``, round markers) vs. unobserved
      (``fiber_assigned_0 == 0``, 'x' markers);
    * "good z" (``z`` within ``close_enough`` of the target's ``z_obs``) vs. other;
    * dim (``app_mag > 19.5``, drawn faded) vs. bright.

    Marker size comes from ``getsize``. Observed neighbors additionally get a
    translucent disc whose radius is ``mxxl_halo_vir_radius_guess_arcsec``. The
    first observed neighbor whose ``z`` equals the target's ``z`` is circled as
    the "Nearest Neighbor".

    Nothing is drawn (a message is printed instead) when the box holds at most one
    neighbor.

    Parameters
    ----------
    target
        A row of ``data`` (its ``name`` must be an index label of ``data``)
        exposing ``RA``, ``Dec``, ``z``, ``z_obs`` and ``fiber_assigned_0``.

    Returns
    -------
    None
    """
    dim_mag = 19.5  # neighbors with app_mag above this are drawn as "dim"
    textsize = 9
    dimalpha = 0.4

    target_observed = target.fiber_assigned_0
    z_eff = target.z

    target_ra = coord.Angle(target.RA*u.degree)
    target_dec = coord.Angle(target.Dec*u.degree)
    ra_max = (target_ra + nearby_angle).value
    ra_min = (target_ra - nearby_angle).value
    dec_max = (target_dec + nearby_angle).value
    dec_min = (target_dec - nearby_angle).value

    # TODO replace with a non-angular search so all redshifts are treated equally
    # (neighbors_within(nearby_angle, target_pos, catalog, 'treekey_nnkd') is the
    # angular alternative, currently too slow to use here)
    nearby = data.query('RA < @ra_max and RA > @ra_min and Dec < @dec_max and Dec > @dec_min')
    nearby = nearby.drop(target.name) # drop the target itself from this df

    nearby_obs = nearby.loc[nearby['fiber_assigned_0'] == 1]
    nearby_unobs = nearby.loc[nearby['fiber_assigned_0'] == 0]

    # TODO need a better way to verify which row is the one that we assigned the z from
    z_candidates = nearby_obs.loc[nearby_obs.z == z_eff]
    z_match = z_candidates.iloc[0] if len(z_candidates) > 0 else None

    obs_good_z, obs_good_z_dim, obs_other, obs_other_dim = _split_by_z_and_mag(nearby_obs, target.z_obs, dim_mag)
    unobs_good_z, unobs_good_z_dim, unobs_other, unobs_other_dim = _split_by_z_and_mag(nearby_unobs, target.z_obs, dim_mag)

    title_prefix = "Observed" if target_observed else "Lost"
    title = title_prefix + " Galaxy {0}: z_true={1:.3f}, z_NN={2:.3f}".format(target.name, target.z_obs, target.z)

    if len(nearby) <= 1:
        print("Skipping empty plot for {0}".format(title))
        return

    fig, ax = plt.subplots(1)
    fig.set_size_inches(10,10)
    ax.set_aspect('equal')

    # Add virial radii or MXXL Halos to the observed galaxies
    for ra, dec, radius_arcsec in zip(nearby_obs.RA, nearby_obs.Dec, nearby_obs.mxxl_halo_vir_radius_guess_arcsec):
        radius = radius_arcsec / 3600 # arcsec to degrees, like the plot
        ax.add_patch(Circle((ra, dec), radius, color=get_color(0), alpha=0.10))

    def scatter_group(frame, label, color_index, **style):
        # One legend entry per group; the label carries the group's size.
        ax.scatter(frame.RA, frame.Dec, s=list(map(getsize, frame.z)), color=get_color(color_index),
                   label=label.format(len(frame)), **style)

    # Color encodes the redshift class (0 = other, 2 = good z), alpha encodes dim,
    # marker encodes observed vs. unobserved. The two observed "dim" groups used to
    # have their colors swapped relative to this scheme.
    scatter_group(obs_other, "Obs ({0})", 0)
    scatter_group(obs_other_dim, "Obs dim ({0})", 0, alpha=dimalpha)
    scatter_group(obs_good_z, "Obs good z ({0})", 2)
    scatter_group(obs_good_z_dim, "Obs good z dim ({0})", 2, alpha=dimalpha)
    scatter_group(unobs_other, "Unobs ({0})", 0, marker='x')
    scatter_group(unobs_other_dim, "Unobs dim ({0})", 0, marker='x', alpha=dimalpha)
    scatter_group(unobs_good_z, "Unobs good z ({0})", 2, marker='x')
    scatter_group(unobs_good_z_dim, "Unobs good z dim ({0})", 2, marker='x', alpha=dimalpha)

    # redshift data labels
    for frame in (nearby_obs, nearby_unobs):
        for ra, dec, z in zip(frame.RA, frame.Dec, frame.z):
            ax.text(ra, dec, "{0:.3f}".format(z), size=textsize)

    # Circle nearest neighbor
    if z_match is not None:
        ax.scatter(z_match.RA, z_match.Dec, color=get_color(3), facecolors='none', s=getsize(z_match.z)*2, label="Nearest Neighbor")
        ax.text(z_match.RA, z_match.Dec, "{0:.3f}".format(z_match.z), size=textsize)

    # Target galaxy: default round marker when observed, 'X' when lost
    target_style = {} if target_observed else {'marker': 'X'}
    ax.scatter(target.RA, target.Dec, s=getsize(target.z_obs), color=get_color(1), label="Target", **target_style)
    ax.text(target.RA, target.Dec, "{0:.3f}".format(target.z_obs), size=textsize)

    ax.set_xlim(ra_min, ra_max)
    ax.set_ylim(dec_min, dec_max)
    ax.set_xlabel('RA')
    ax.set_ylabel('Dec')  # was a second xlabel call, which overwrote 'RA' and left y unlabeled
    ax.legend()
    ax.set_title(title)
    plt.draw()

In [None]:
PLOTS_TO_MAKE = 10
GALAXY_POOL = lost_galaxies
OBSERVED = False

# Deterministic alternative: walk a fixed window of the pool instead of sampling it.
#START_INDEX = 777
#for i in range(START_INDEX, START_INDEX + PLOTS_TO_MAKE):
#    examine_around(lost_galaxies.iloc[i])
print("Number of galaxies to choose from: ", len(GALAXY_POOL))
# np.random.randint excludes its upper bound, so the bound must be len(GALAXY_POOL).
# With len - 1 the last galaxy could never be drawn, and a one-row pool raised.
indexes = np.random.randint(0, len(GALAXY_POOL), size=PLOTS_TO_MAKE)
for i in indexes:
    target = GALAXY_POOL.iloc[i]
    examine_around(target)

## Idea: analyze entire neighborhood and look for groups of similar z galaxies, choose a z from the biggest group

In [None]:
MAX = 300
# Never index past the end of lost_galaxies when the sample is smaller than MAX.
n_targets = min(MAX, len(lost_galaxies))
# NaN-initialise the arrays: if the loop is interrupted, unfinished entries are
# recognisably empty instead of holding whatever memory np.empty handed back.
close = np.full(n_targets, np.nan)
total = np.full(n_targets, np.nan)
frac = np.full(n_targets, np.nan)
for i in range(n_targets):
    target = lost_galaxies.iloc[i]
    close[i], total[i], frac[i] = neighbor_exam(target)

In [None]:
# Persist the entries computed so far (the first `max_finished` of each array).
max_finished = 81408
finished_close = close[0:max_finished]
finished_total = total[0:max_finished]
arrays_to_save = {
    'mxxl_lostgal_neighborhood_close.npy': finished_close,
    'mxxl_lostgal_neighborhood_total.npy': finished_total,
}
for npy_name, finished_values in arrays_to_save.items():
    with open(ROOT_FOLDER + npy_name, 'wb') as f:
        np.save(f, finished_values)

In [None]:
# Reload the saved neighbor counts and rebuild the close/total fraction from them.
def _load_neighborhood_array(npy_name):
    """Read one saved array from ROOT_FOLDER."""
    with open(ROOT_FOLDER + npy_name, 'rb') as f:
        return np.load(f)

close = _load_neighborhood_array('mxxl_lostgal_neighborhood_close.npy')
total = _load_neighborhood_array('mxxl_lostgal_neighborhood_total.npy')

frac = close / total

In [None]:
# Distribution of the number of similar-z observed neighbors per lost galaxy,
# in unit-wide bins from 0 to 30.
bins = np.linspace(0,30,31)
trash = plt.hist(close, bins=bins)
plt.title("Lost Galaxies Neighbors at ~Correct z")
plt.xlabel("Count of Similar z Neighbors")
plt.ylabel("Count of Lost Galaxies")

n_lost = len(close)
n_without_neighbor = np.sum(close==0)
n_single_neighbor = np.sum(close==1)
print("Hopeless Fraction: ", n_without_neighbor / n_lost)
print("Essentially Hopeless Fraction: ", (n_without_neighbor + n_single_neighbor) / n_lost)

In [None]:
# "Viable" lost galaxies have more than one similar-z observed neighbor; show what
# fraction of all their observed neighbors those similar-z ones make up.
viable = close > 1
viable_frac = frac[viable]
trash=plt.hist(viable_frac, bins=30)
plt.title("Viable Lost Galaxies: Correct z Neighbor Fraction")
plt.xlabel("Fraction with Similar z")
plt.ylabel("Count of Viable Lost Galaxies")