# Survey non-uniformity check pipeline:

The pipeline version will be in `rail_pipeline/[...]/survey_nonuniform.ipynb`. 

This notebook writes out each step in the pipeline explicitly for testing purposes.

1. Use FlowCreator to generate a sample, ignoring the semi-major (minor) axes.
2. Degrade using ObsCondition
3. Apply QuantityCut
...

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# rail creation modules

from pzflow.examples import get_example_flow
from rail.creation.engines.flowEngine import FlowCreator
from rail.creation.degradation.spectroscopic_degraders import (
    InvRedshiftIncompleteness,
    LineConfusion,
)
from rail.creation.degradation.lsst_error_model import LSSTErrorModel
from rail.core.stage import RailStage

In [3]:
# degradation modules
from rail.creation.degradation import observing_condition_degrader
from rail.creation.degradation.observing_condition_degrader import ObsCondition

In [4]:
# quantity cut modules
from rail.creation.degradation.quantityCut import QuantityCut

In [7]:
# column mapper
from rail.core.utilStages import ColumnMapper

In [6]:
# deredden:
from rail.tools.utilPhotometry import Dereddener

In [8]:
# BPZ:
from rail.estimation.algos.bpz_lite import BPZliteInformer, BPZliteEstimator

In [9]:
# classifier:
from rail.estimation.algos.uniform_binning import UniformBinningClassifier

In [5]:
# Keep a reference to the data store shared by the RAIL stages
DS = RailStage.data_store
# Set `allow_overwrite` on the data store's class — presumably so re-running a cell
# can replace an existing entry instead of raising; confirm against the RAIL docs
DS.__class__.allow_overwrite = True

## Pre-defined variables:

## Step 1: RAIL.creation

### 1.1 Creation stage:
In this example, we load a pre-trained flow:

In [8]:
# Inputs of the creation stage: the pre-trained flow file, the number of
# objects to draw and the name given to the stage.
flow_file = "/global/u2/q/qhang/desc/rail_base/src/rail/examples_data/goldenspike_data/data/pretrained_flow.pkl"
n_samples = 30
name = "flow_engine_train"

In [9]:
## set up the creation stage; it is configured to draw `n_samples` objects from the flow
flow_stage_config = {"name": name, "model": flow_file, "n_samples": n_samples}
flowCreator_truth = FlowCreator.make_stage(**flow_stage_config)

In [8]:
# Inspect the model held by the creation stage
flowCreator_truth.get_data("model")

<pzflow.flow.Flow at 0x7fef10e435d0>

In [10]:
# Draw `n_samples` objects with a fixed seed, then print the catalogue and
# the path stored on the returned handle
samples_truth = flowCreator_truth.sample(n_samples, seed=0)
print(samples_truth())
print("Data was written to ", samples_truth.path)

Inserting handle into data store.  output_flow_engine_train: inprogress_output_flow_engine_train.pq, flow_engine_train
    mag_z_lsst  mag_y_lsst  mag_r_lsst  mag_i_lsst  redshift  mag_u_lsst  \
0    24.314548   23.934925   24.733057   24.470266  1.563762   24.547361   
1    24.104399   23.863514   24.899925   24.695818  1.195636   24.839766   
2    24.254185   24.201284   25.619396   24.678802  0.927820   26.561853   
3    23.510565   22.997267   25.324879   24.375275  1.116096   27.487751   
4    24.710445   24.600668   25.670317   24.854555  0.692564   27.339170   
5    24.345797   24.225451   25.062435   24.550554  0.480316   27.643591   
6    24.148859   23.869160   24.755489   24.613178  1.241605   24.625473   
7    24.128815   23.898533   25.791168   24.870123  1.013499   27.354488   
8    22.838263   22.556526   23.877661   23.555597  1.145932   24.100914   
9    24.432600   23.893427   24.866381   24.626150  1.505956   24.904629   
10   23.664469   23.409687   24.461821   24.0

### 1.2 Degradation:
Use the degradation to specify a set of systematic maps, and generate observed magnitudes and magnitude errors:
- Generate magnitude error according to survey condition maps supplied, generate degraded mag + magerr
    - Note: galactic extinction is not yet applied to the cosmoDC2 magnitudes; the pipeline probably needs to be extended to do this
    - Note: currently `magerrscale` cannot be applied
    - Note: the detection limit can also be specified via `sigLim` (e.g. 3 for a 3-sigma limit; the cell below passes `sigLim: 0`);
- Select data with i-band detection and flag non-detection in other bands as `np.nan`, apply further cuts such as the gold cut $i<23.5$.

In [11]:
# Locations of the input survey-condition maps, the mask and the output directory.

base_path = "/pscratch/sd/q/qhang/rubin_baseline_v2/MAF-1year/"

# nside of these maps:
nside = 128


def _maf_map_path(metric, band):
    """Return the path under `base_path` of the `metric` map for `band`."""
    return base_path + "baseline_v2_0_10yrs_" + metric + "_" + band + "_and_nightlt365_HEAL.fits"


# seeing maps, one per band:
(seeing_u, seeing_g, seeing_r,
 seeing_i, seeing_z, seeing_y) = [_maf_map_path("Median_seeingFwhmEff", band) for band in "ugrizy"]

# coadd depth maps, one per band:
(coaddm5_u, coaddm5_g, coaddm5_r,
 coaddm5_i, coaddm5_z, coaddm5_y) = [_maf_map_path("CoaddM5", band) for band in "ugrizy"]

#EBV = base_path + "ebv_map.fits"

# the observing year and number of visits per year are set to 1 in the degrader,
# because coadd depth maps are supplied

# mask:
maskdir = base_path + "../wfd_footprint_nvisitcut_500_nside_128.fits"

# weight: for now we supply uniform weight

# systematic map to examine: the i-band coadd depth map
sys_to_check = _maf_map_path("CoaddM5", "i")
sys = "CoaddM5"

# directory to save all the data:
savedir = "/pscratch/sd/q/qhang/PZflow-samples/DC2-test/"

In [12]:
# change column name:
renameDict={
    "u": "mag_u_lsst",
    "g": "mag_g_lsst",
    "r": "mag_r_lsst",
    "i": "mag_i_lsst",
    "z": "mag_z_lsst",
    "y": "mag_y_lsst",
}

In [27]:
# Create the degrader.
# The full set of `map_dict` entries is spelled out below.
# Note: number of years and visits set to 1 because coadd depth are supplied for m5
seeing_by_band = dict(u=seeing_u, g=seeing_g, r=seeing_r, i=seeing_i, z=seeing_z, y=seeing_y)
depth_by_band = dict(u=coaddm5_u, g=coaddm5_g, r=coaddm5_r, i=coaddm5_i, z=coaddm5_z, y=coaddm5_y)

obs_map_dict = {
    "renameDict": renameDict,
    "theta": seeing_by_band,
    "m5": depth_by_band,
    "nYrObs": 1.,
    "nVisYr": {band: 1. for band in "ugrizy"},
    #"EBV": EBV, # waiting for new version of obs_cond!
    "sigLim": 0,
    "ndFlag": np.nan,
    # point for now, but for more realistic error set to auto/gaap
    "extendedType": "point",
    # the two axis columns are empty for now, but realistic errors will need them
    "majorCol": "",
    "minorCol": "",
    "decorrelate": True,
    "highSNR": False,
}

y1_degrader = ObsCondition.make_stage(
    map_dict=obs_map_dict,
    nside=nside,
    mask=maskdir,
    weight="",
)

In [28]:
# Placeholder (disabled): computing the semi-major and semi-minor axes is not done yet;
# the degrader above is configured with empty `majorCol` / `minorCol`.
#fullCat = get_semi_major_minor(catalog, scale=lsstError_calibration["semi_major_minor_scale"])

# run the degrader on the truth sample; the result is the degraded catalogue
data_degraded = y1_degrader(samples_truth.data)

Assigning pixels.
Inserting handle into data store.  output: inprogress_output.pq, ObsCondition


In [29]:
# Show the degraded catalogue
data_degraded.data

Unnamed: 0,mag_z_lsst,mag_z_lsst_err,mag_y_lsst,mag_y_lsst_err,mag_r_lsst,mag_r_lsst_err,mag_i_lsst,mag_i_lsst_err,redshift,mag_u_lsst,mag_u_lsst_err,mag_g_lsst,mag_g_lsst_err,pixel
0,24.474985,0.227802,23.564373,0.231154,24.750506,0.108209,24.520709,0.114184,1.563762,24.641481,0.248077,24.51727,0.084882,89707
1,24.127345,0.1533,23.662074,0.16049,24.880483,0.116298,24.718288,0.13439,1.195636,25.286566,0.432219,25.39606,0.165874,164102
2,24.249842,0.182119,23.689696,0.229649,25.678677,0.228554,24.837254,0.154672,0.92782,26.408635,0.598139,26.167483,0.316908,151752
3,23.527924,0.08777,23.140666,0.113604,25.490785,0.147245,24.466179,0.098284,1.116096,26.464244,0.871219,26.606591,0.415789,130042
4,24.587437,0.182458,24.371956,0.257418,25.541929,0.14309,24.89386,0.109007,0.692564,34.729902,8.225078,26.351095,0.276344,129459
5,24.304432,0.22806,24.262165,0.276321,25.028425,0.111326,24.412153,0.095518,0.480316,27.14013,1.075688,26.628484,0.482261,173487
6,24.349448,0.229708,23.763181,0.244388,24.852383,0.100612,24.500648,0.115591,1.241605,24.638835,0.240506,24.900562,0.109815,89129
7,23.84799,0.16886,23.697801,0.20027,25.71079,0.198079,24.660039,0.123714,1.013499,,,27.130444,0.52156,156156
8,22.824117,0.062477,22.480659,0.081176,23.86628,0.049522,23.514087,0.058499,1.145932,24.090855,0.138931,24.037739,0.061709,104009
9,24.353579,0.190932,23.753803,0.278736,24.807212,0.126191,24.352545,0.108016,1.505956,25.085179,0.399094,24.928578,0.117662,90715


### 1.3 Quantity cut:

In [42]:
# i-band selection: a magnitude threshold plus a magnitude-error threshold
# derived from the required signal-to-noise ratio
SNR = 5  # snr>5 in i-band
max_i_mag_err = 2.5/np.log(10) * 1/SNR
i_band_cuts = {
    "mag_i_lsst": 24.1,
    "mag_i_lsst_err": max_i_mag_err,
}
gold_cut_y1 = QuantityCut.make_stage(name="cuts", cuts=i_band_cuts)

In [43]:
# Apply the cuts to the degraded catalogue
samples_gold_w_errs = gold_cut_y1(data_degraded)

Inserting handle into data store.  output_cuts: inprogress_output_cuts.pq, cuts


In [44]:
# Show the catalogue that remains after the cuts
samples_gold_w_errs.data

Unnamed: 0,mag_z_lsst,mag_z_lsst_err,mag_y_lsst,mag_y_lsst_err,mag_r_lsst,mag_r_lsst_err,mag_i_lsst,mag_i_lsst_err,redshift,mag_u_lsst,mag_u_lsst_err,mag_g_lsst,mag_g_lsst_err,pixel
8,22.824117,0.062477,22.480659,0.081176,23.86628,0.049522,23.514087,0.058499,1.145932,24.090855,0.138931,24.037739,0.061709,104009
11,23.29123,0.077327,23.158427,0.124524,23.637027,0.042954,23.517618,0.059106,0.137048,24.498258,0.222444,24.039017,0.07136,189070
12,20.451312,0.008742,20.333458,0.012133,21.375486,0.007516,20.69632,0.007212,0.795977,21.601408,0.019815,21.67346,0.010757,160097
13,23.768597,0.114088,23.843481,0.17254,24.695478,0.065159,23.909308,0.05421,0.663661,27.874823,1.72412,26.103089,0.320227,162781
14,22.297572,0.027774,22.195507,0.05797,23.457138,0.028245,22.669153,0.019927,0.924681,23.858143,0.077463,23.891401,0.046806,158228
16,23.257588,0.073311,23.381947,0.144242,23.838942,0.053087,23.515698,0.051574,0.12032,25.956482,0.543682,24.29604,0.058492,137492
17,22.645041,0.043998,22.104024,0.056074,23.237451,0.028655,22.936592,0.02877,1.53359,23.429627,0.07439,23.31453,0.046242,95765
19,23.231206,0.062809,23.043206,0.194908,24.378445,0.065259,23.543002,0.047579,0.596799,26.499608,0.822404,25.802498,0.177974,131215
21,21.855282,0.020951,21.71641,0.041657,23.011152,0.023085,22.277727,0.018745,0.437434,,,24.654993,0.115289,123062
23,23.293637,0.093073,23.228206,0.14155,24.166588,0.071154,23.539692,0.05833,0.65171,,,25.761161,0.284405,182923


### 1.4 Column mapper:

In [None]:
# Rename the magnitude-error columns from `mag_<band>_lsst_err` to `mag_err_<band>_lsst`.
# (The entries must be comma-separated; without the commas the cell is a SyntaxError.)
columns = {
    "mag_g_lsst_err": "mag_err_g_lsst",
    "mag_i_lsst_err": "mag_err_i_lsst",
    "mag_r_lsst_err": "mag_err_r_lsst",
    "mag_u_lsst_err": "mag_err_u_lsst",
    "mag_y_lsst_err": "mag_err_y_lsst",
    "mag_z_lsst_err": "mag_err_z_lsst",
}
colmapper = ColumnMapper.make_stage(name='column_mapper', columns=columns)
# run the mapper on the catalogue that passed the quantity cuts
rename_data = colmapper(samples_gold_w_errs)

### 1.5 de-redden:

In [None]:
# directory handed to the dereddener as `dustmap_dir`
dustmap_dir = "./"

dereddener = Dereddener.make_stage(name='dereddener', dustmap_dir=dustmap_dir)
dereddener.fetch_map()

# De-redden the output of the column-mapper step. The original call passed
# `mags_data`, a name that is never defined in this notebook.
deredden_data = dereddener(rename_data)

## Step 2: RAIL.estimation

Below is an example of using `BPZ_lite` to estimate redshifts for the above sample
- First de-redden the magnitudes
- Point estimate of the BPZ redshifts by extracting redshift mode

In [None]:
# this is for the BPZ prior:

# BPZ settings. The estimator cell further down assigns the same names with the same
# values; they are defined here as well so this cell runs on a fresh kernel
# (previously they were used before being assigned).
band_names = ['u','g','r','i','z','y']
band_err_names = ['u_err','g_err','r_err','i_err','z_err','y_err']
prior_band = 'i'
output = savedir + "BPZ_lite_photoz.hdf5"

# NOTE(review): `band_names` / `band_err_names` use short names while `mag_limits`
# is keyed by `mag_<band>_lsst` and the catalogue columns are `mag_<band>_lsst` /
# `mag_err_<band>_lsst` — confirm which naming BPZ expects.
# NOTE(review): `training_data` is not defined in any cell shown here; it must be
# provided before this cell can run.

inform_bpz = BPZliteInformer.make_stage(
    name='inform_bpz',
    hdf5_groupname='',
    #columns_file=inroot+'test_bpz.columns',
    #prior_file='CWW_HDFN_prior.pkl',
    nondetect_val=np.nan, #spectra_file='SED/CWWSB4.list',
    band_names=band_names,
    band_err_names=band_err_names,
    prior_band=prior_band,
    mag_limits=dict(
        mag_u_lsst=27.79,
        mag_g_lsst=29.04,
        mag_r_lsst=29.06,
        mag_i_lsst=28.62,
        mag_z_lsst=27.98,
        mag_y_lsst=27.05,
    ),
    output=output,
    model="bpz.pkl",
)
inform_bpz.inform(training_data)

In [None]:
# BPZ_lite version: configure the estimator, then run it on the de-reddened catalogue

band_names = ['u','g','r','i','z','y']
band_err_names = ['u_err','g_err','r_err','i_err','z_err','y_err']
prior_band = 'i'

output = savedir + "BPZ_lite_photoz.hdf5"

# per-band magnitude limits passed to the estimator
bpz_mag_limits = {
    "mag_u_lsst": 27.79,
    "mag_g_lsst": 29.04,
    "mag_r_lsst": 29.06,
    "mag_i_lsst": 28.62,
    "mag_z_lsst": 27.98,
    "mag_y_lsst": 27.05,
}

estimate_bpz = BPZliteEstimator.make_stage(
    name='estimate_bpz',
    hdf5_groupname='',
    #columns_file=inroot+'test_bpz.columns',
    #prior_file='CWW_HDFN_prior.pkl',
    #spectra_file='SED/CWWSB4.list',
    nondetect_val=np.nan,
    band_names=band_names,
    band_err_names=band_err_names,
    prior_band=prior_band,
    mag_limits=bpz_mag_limits,
    output=output,
    # the model comes from the informer stage
    model=inform_bpz.get_handle('model'),
)

bpz_estimated = estimate_bpz.estimate(deredden_data)
# we can also obtain the point estimate, e.g the mode:
#zmode = pd.DataFrame(data={"pz_point": bpz_estimated().ancil['zmode']})
#bpz_modes = bpz_estimated().mode(grid=zgrid)
# the bpz output will be kept along with the data
# the bpz output will be kept along with the data

## Step 3: RAIL.classifier

We will use uniform (linear) binning in redshift, e.g. 5 bins over $z \in [0.2, 1.2]$.

In [None]:
# Classifier settings: bin on the `zmode` point estimate, 5 bins between z=0.2 and z=1.2;
# `no_assign` is presumably the label for objects outside that range — TODO confirm
binning_config = {
    "point_estimate": "zmode",
    "zmin": 0.2,
    "zmax": 1.2,
    "nbins": 5,
    "no_assign": -99,
}
tomographer = UniformBinningClassifier.make_stage(**binning_config)

In [None]:
# Run the classifier on the BPZ estimates
tomo_bins = tomographer(bpz_estimated)

### TXPipe:

Using methods given by [TXPipe/lens_selector.py](https://github.com/LSSTDESC/TXPipe/blob/ad3844769f097d4e86f8ae090b1e9fbd0e99c801/txpipe/lens_selector.py) and [TXPipe/source_selector.py](https://github.com/LSSTDESC/TXPipe/blob/master/txpipe/source_selector.py). 

Notice for TXPipe, catalogue is derived from metacal or metadetect (e.g. using `riz`), and the end result is an additional column indicating which object is in which tomographic bin.

Having looked at TXPipe, it seems that you can pass on photo-z for objects in the catalogue. If these are passed, then tomographic bins are split given the bin edges in pz, if not, random forest will be used with the limited bands available. 

MetaDetect will have different variants of the data based on which object is detected, and so tomographic bin is determined in each case.

```
# TXpipe related imports

# Stages to run
stages:
    - name: FlowCreator             # Simulate a spectroscopic population
    - name: GridSelection          # Simulate a spectroscopic sample
    - name: TXParqetToHDF          # Convert the spec sample format
    - name: PZPrepareEstimatorLens   # Prepare the p(z) estimator
      classname: Inform_BPZ_lite   
    - name: PZEstimatorLens        # Measure lens galaxy PDFs
      classname: BPZ_lite
      threads_per_process: 1  
    - name: TXMeanLensSelector     # select objects for lens bins from the PDFs
    - name: Inform_NZDirLens       # Prepare the DIR method inputs for the lens sample     
      classname: Inform_NZDir
    - name: PZRailSummarizeLens    # Run the DIR method on the lens sample to find n(z)
      classname: PZRailSummarize  
    - name: PZRailSummarizeSource  # Run the DIR method on the source sample to find n(z)
      classname: PZRailSummarize
    - name: TXSourceSelectorMetadetect  # select and split objects into source bins
    - name: Inform_NZDirSource     # Prepare the DIR method inputs for the source sample
      classname: Inform_NZDir
    - name: TXShearCalibration     # Calibrate and split the source sample tomographically
    - name: TXLensCatalogSplitter  # Split the lens sample tomographically
```

## Step 4: Evaluation

Here we write some simple code to evaluate the shift and scatter of the redshift distribution in each photo-z bin as a function of depth:

In [1]:
# now for each quantile, find the data and sample the redshift distribution:
# NOTE(review): `hp` (presumably healpy), `split_sys_map_quantiles`, `compute_nzstats`
# and `data_degraded_gold_tomo` are not defined or imported in the cells shown here —
# they must be provided before this cell can run.

# split the systematic maps into 20 quantiles
nquantiles = 20
# define the zgrid for nz
zgrid = np.linspace(0, 3, 101)
# number of bootstrap samples to use
nbootstrap = 1000
# which redshift in the data set to use for nz
z_col = 'redshift' # true redshift, but can change here to pz_mode or something
# number of tomographic bins used (cast to a plain int so it can drive `range`)
npzbins = int(data_degraded_gold_tomo["tomo"].max())

# load the specific systematic map & mask to check correlation
mapin = hp.read_map(sys_to_check)
mask = hp.read_map(maskdir)
# quantile contains pixel indices, and meanv is the mean value of the systematic maps in each quantile
quantile, meanv = split_sys_map_quantiles(mapin, mask, nquantiles=nquantiles)

# compute simple summary statistic, per quantile and for each whole tomographic bin
nzstat_summary_split = {}
nzstat_summary_tot = {}  # was never initialised before being filled

for jj in range(npzbins):
    tomo_key = "tomo-%d"%(jj+1)
    nzstat_summary_split[tomo_key] = {}

    # objects assigned to this tomographic bin (bins are labelled from 1)
    in_tomo_bin = data_degraded_gold_tomo["tomo"] == (jj+1)

    for ii in range(nquantiles):
        # NOTE(review): the degraded catalogue displayed earlier has a column named
        # `pixel`, not `pixels` — confirm the column name in `data_degraded_gold_tomo`.
        in_quantile = np.isin(data_degraded_gold_tomo["pixels"], quantile[ii])
        usecat = data_degraded_gold_tomo.loc[in_tomo_bin & in_quantile, :]
        # now for each tomographic bin, return redshift distribution:
        nzstat_summary_split[tomo_key][ii] = compute_nzstats(usecat, z_col,
                                                             zgrid=zgrid, nbootstrap=nbootstrap)

    # compute the tot nz, meanz, sigmaz of this tomographic bin only
    # (the whole catalogue was passed before, so every bin got the same "total")
    nzstat_summary_tot[tomo_key] = compute_nzstats(data_degraded_gold_tomo.loc[in_tomo_bin, :], z_col,
                                                   zgrid=zgrid, nbootstrap=nbootstrap)

In [None]:
# write to file:
# NOTE(review): `write_evaluation_results` is not defined or imported in the cells
# shown here — confirm where it comes from.
outroot = savedir + "test-pz-with-i-band-coadd-Y1.yml"
write_evaluation_results(outroot, meanv, nzstat_summary_split, nzstat_summary_tot)

In [None]:
# Show results in a plot:
# The summaries are read from `nzstat_summary_split` / `nzstat_summary_tot`, the
# dictionaries filled by the evaluation cell. Each summary is indexed as
# [0] = n(z) table, [1] = mean z, [2] = sigma z; the last two are used as
# (value, error) pairs below.
npanels = int(npzbins)
# squeeze=False keeps `axarr` two-dimensional even with a single tomographic bin
fig, axarr = plt.subplots(3, npanels, figsize=[15,10],
                          gridspec_kw={'height_ratios': [3, 1, 1]}, squeeze=False)

# one colour per quantile; no colormap was defined, so viridis is an arbitrary choice
cmap = plt.cm.viridis
quantile_colors = [cmap(q/(nquantiles*1.2)) for q in range(nquantiles)]

## Top row: n(z) for each tomographic bin for each depth group
## Middle row: change in meanz as a function of depth with bootstrap errors
## Bottom row: change in sigmaz as a function of depth with bootstrap errors

for ii in range(npanels):
    tomo_key = "tomo-%d"%(ii+1)
    split_stats = nzstat_summary_split[tomo_key]
    # the total summary is stored once per tomographic bin, not per quantile
    tot_stats = nzstat_summary_tot[tomo_key]

    # top row
    plt.sca(axarr[0,ii])
    for q in range(nquantiles):
        nz = split_stats[q][0]
        # normalise by the sum and the grid spacing
        plt.plot(nz[:,0], nz[:,1]/np.sum(nz[:,1])/(nz[1,0]-nz[0,0]),
                 color=quantile_colors[q])
    plt.text(0.6, 3.5, tomo_key)
    plt.yticks([])
    plt.xlabel("$z$")

    # middle row
    plt.sca(axarr[1, ii])
    meanztot = tot_stats[1]
    for q in range(nquantiles):
        meanz = split_stats[q][1]
        plt.errorbar(meanv[q], meanz[0], yerr=meanz[1], fmt='o',
                     color=quantile_colors[q])
    #dz = 0.005*(1+meanztot[0])
    plt.plot(meanv, np.ones(len(meanv))*meanztot[0], 'k-', alpha=0.5)
    #plt.fill_between([meanv[0], meanv[-1]], [-dz, -dz],
                    #[dz, dz],color='k',alpha=0.2)
    if ii==0:
        plt.ylabel("$\\langle z\\rangle$")
    if ii>0:
        plt.yticks([])
    #plt.xlabel(sys)
    #plt.ylim([-0.015,0.015])
    #plt.xlim([24.6, 25.7])

    # bottom row
    plt.sca(axarr[2, ii])
    sigmaztot = tot_stats[2]
    for q in range(nquantiles):
        sigmaz = split_stats[q][2]
        plt.errorbar(meanv[q], sigmaz[0], yerr=sigmaz[1], fmt='o',
                     color=quantile_colors[q])
    #dz = 0.005
    plt.plot(meanv, np.ones(len(meanv))*sigmaztot[0], 'k-', alpha=0.5)
    #plt.fill_between([meanv[0], meanv[-1]], [-dz, -dz],
                    #[dz, dz],color='k',alpha=0.2)
    if ii==0:
        plt.ylabel("$\\sigma_z$")
    if ii>0:
        plt.yticks([])
    plt.xlabel(sys)
    #plt.ylim([-0.015,0.015])

plt.tight_layout()
plt.savefig(savedir + 'fig.png', bbox_inches='tight')