# BPZ RAIL - DP0.2 (sequilhos mesmo)
Bring it to memory? Not bring to memory? how should I do it?

## Imports

### common libs

In [1]:
import time
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline 

### RAIL


https://lsstdescrail.readthedocs.io/en/latest/source/installation.html

In [2]:
import rail
import qp
import tables_io

from rail.core.data import TableHandle
from rail.core.stage import RailStage
from rail.core.utilStages import ColumnMapper, TableConverter

from rail.creation.degradation import LSSTErrorModel, InvRedshiftIncompleteness, LineConfusion, QuantityCut
from rail.creation.engines.flowEngine import FlowEngine, FlowPosterior

from rail.estimation.algos.bpz_lite import Inform_BPZ_lite, BPZ_lite

from rail.evaluation.evaluator import Evaluator

In [40]:
help(rail)

Help on package rail:

NAME
    rail - RAIL, the Redshift Assesement Infrastructre Layers

PACKAGE CONTENTS
    __main__
    _version
    core (package)
    creation (package)
    estimation (package)
    evaluation (package)
    main
    version

VERSION
    0.96.dev326+ge3e6ed6

FILE
    /home/heloisamengisztki/.local/lib/python3.10/site-packages/rail/__init__.py




### LSST - TAP 

TAP - protocol  to access general table data

https://www.ivoa.net/documents/TAP/

motivação: despadronização dos dados astronoomicos

created based in html protocol and xml.

distinguises tabular and non tabular data

tabular - keyword value from data files and stores it in tables (on column per keyword)
ADQL service discovery followed by data discovery


non tabular - n-dimensional data arrais (images, spectra, data cubes)

 



In [3]:
from lsst.rsp import get_tap_service

## General Configs

In [4]:
pd.set_option('display.max_rows', 20)

In [5]:
service = get_tap_service()

assert service is not None
assert service.baseurl == "https://data.lsst.cloud/api/tap"

In [6]:
DS = RailStage.data_store
DS.__class__.allow_overwrite = True

CURR_DIR = os.getcwd()
RAIL_DIR = os.path.join(os.path.dirname(rail.__file__), '..')
CURR_DIR, RAIL_DIR

('/home/heloisamengisztki/ic-photoz/RAIL/bpz_test_rail',
 '/home/heloisamengisztki/.local/lib/python3.10/site-packages/rail/..')

In [7]:
bands = ['u','g','r','i','z','y']
band_dict = {band:f'mag_{band}_lsst' for band in bands}
rename_dict = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

## Reading DP0.2 data

In [8]:
max_rec = 1000
use_center_coords = "62, -37"
use_radius = "1.0"

In [9]:
bands = ['g', 'i', 'r', 'u', 'y', 'z']

mags = ""
for band in bands:
    mags+= f"scisql_nanojanskyToAbMag({band}_cModelFlux) AS mag_{band}_cModel, {band}_cModelFluxErr, "

columns_query = f"objectId, coord_ra, coord_dec, {mags}detect_isPrimary "

In [10]:
query = "SELECT " + columns_query + \
        "FROM dp02_dc2_catalogs.Object " + \
        "WHERE CONTAINS(POINT('ICRS', coord_ra, coord_dec), CIRCLE('ICRS', " + use_center_coords + ", " + use_radius + ")) = 1 " + \
        "AND detect_isPrimary = 1 " + \
        "AND r_extendedness = 1 " + \
        "AND scisql_nanojanskyToAbMag(r_cModelFlux) > 17.0 " + \
        "AND scisql_nanojanskyToAbMag(r_cModelFlux) < 23.0 "
print(query)

SELECT objectId, coord_ra, coord_dec, scisql_nanojanskyToAbMag(g_cModelFlux) AS mag_g_cModel, g_cModelFluxErr, scisql_nanojanskyToAbMag(i_cModelFlux) AS mag_i_cModel, i_cModelFluxErr, scisql_nanojanskyToAbMag(r_cModelFlux) AS mag_r_cModel, r_cModelFluxErr, scisql_nanojanskyToAbMag(u_cModelFlux) AS mag_u_cModel, u_cModelFluxErr, scisql_nanojanskyToAbMag(y_cModelFlux) AS mag_y_cModel, y_cModelFluxErr, scisql_nanojanskyToAbMag(z_cModelFlux) AS mag_z_cModel, z_cModelFluxErr, detect_isPrimary FROM dp02_dc2_catalogs.Object WHERE CONTAINS(POINT('ICRS', coord_ra, coord_dec), CIRCLE('ICRS', 62, -37, 1.0)) = 1 AND detect_isPrimary = 1 AND r_extendedness = 1 AND scisql_nanojanskyToAbMag(r_cModelFlux) > 17.0 AND scisql_nanojanskyToAbMag(r_cModelFlux) < 23.0 


In [11]:
%%time
results = service.search(query, maxrec=max_rec).to_table().to_pandas()
results.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   objectId          1000 non-null   int64  
 1   coord_ra          1000 non-null   float64
 2   coord_dec         1000 non-null   float64
 3   mag_g_cModel      992 non-null    float64
 4   g_cModelFluxErr   999 non-null    float64
 5   mag_i_cModel      999 non-null    float64
 6   i_cModelFluxErr   1000 non-null   float64
 7   mag_r_cModel      1000 non-null   float64
 8   r_cModelFluxErr   1000 non-null   float64
 9   mag_u_cModel      985 non-null    float64
 10  u_cModelFluxErr   1000 non-null   float64
 11  mag_y_cModel      992 non-null    float64
 12  y_cModelFluxErr   1000 non-null   float64
 13  mag_z_cModel      991 non-null    float64
 14  z_cModelFluxErr   1000 non-null   float64
 15  detect_isPrimary  1000 non-null   bool   
dtypes: bool(1), float64(14), int64(1)
memory us

In [12]:
results

Unnamed: 0,objectId,coord_ra,coord_dec,mag_g_cModel,g_cModelFluxErr,mag_i_cModel,i_cModelFluxErr,mag_r_cModel,r_cModelFluxErr,mag_u_cModel,u_cModelFluxErr,mag_y_cModel,y_cModelFluxErr,mag_z_cModel,z_cModelFluxErr,detect_isPrimary
0,1567973949952790391,61.602992,-37.293849,23.696957,12.548565,21.099163,27.521535,22.430446,14.663127,27.550546,34.653608,20.262099,164.774788,20.507491,81.949101,True
1,1567973949952790668,61.590751,-37.292278,23.125924,26.289981,22.113301,55.264729,22.779456,31.375358,23.013545,74.512305,21.919335,297.971598,22.004578,172.905696,True
2,1567973949952790667,61.589349,-37.292548,22.021820,23.262162,20.059149,49.698642,20.897961,28.538184,22.511459,63.082859,19.516139,264.040413,19.733546,148.953966,True
3,1567973949952791963,61.636723,-37.283918,24.160903,18.812363,21.384871,43.120937,22.578849,24.011901,26.213431,57.266093,20.521930,244.658876,20.860803,129.454169,True
4,1567973949952790537,61.690688,-37.295615,22.045110,237.919748,21.875550,264.995561,22.003669,168.407295,22.973886,882.941371,21.685076,1244.641438,21.791332,720.584251,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1567894785115591245,62.000491,-37.513005,23.621447,19.538482,21.809799,46.369414,22.744800,24.589785,23.822215,61.771403,21.004018,230.937482,21.116708,128.961837,True
996,1567894785115591208,62.102973,-37.514974,22.613510,22.267567,21.093730,50.306923,21.908897,27.285908,22.825432,66.404090,20.643999,249.340109,20.831424,129.675148,True
997,1567894785115591209,62.101056,-37.515979,23.411734,16.536782,21.510806,38.055116,22.434253,20.498258,23.850466,49.942741,20.946592,201.987214,21.217881,100.566062,True
998,1567894785115591421,62.112586,-37.511924,20.695095,43.761440,18.640691,103.553990,19.260028,58.675204,22.634726,123.932048,18.163372,433.248979,18.374916,240.699822,True


---

## Rename Columns

In [25]:
columns_new = {
"objectId": "id",
"coord_ra": "coord_ra",
"coord_dec": "coord_dec",
"mag_g_cModel": "mag_g_lsst",
"g_cModelFluxErr": "mag_err_g_lsst",
"mag_i_cModel": "mag_r_lsst",
"i_cModelFluxErr": "mag_err_r_lsst",
"mag_r_cModel": "mag_i_lsst",
"r_cModelFluxErr": "mag_err_i_lsst",
"mag_u_cModel": "mag_u_lsst",
"u_cModelFluxErr": "mag_err_u_lsst",
"mag_y_cModel": "mag_y_lsst",
"y_cModelFluxErr": "mag_err_y_lsst",
"mag_z_cModel": "mag_z_lsst",
"z_cModelFluxErr": "mag_err_z_lsst",
"detect_isPrimary": "detect_isPrimary"
}

col_remapper_train = ColumnMapper.make_stage(name='col_remapper_train', columns=columns_new)

results_remmaped = col_remapper_train(results)
results_remmaped.data

Inserting handle into data store.  output_col_remapper_train: inprogress_output_col_remapper_train.pq, col_remapper_train


Unnamed: 0,id,coord_ra,coord_dec,mag_g_lsst,mag_err_g_lsst,mag_r_lsst,mag_err_r_lsst,mag_i_lsst,mag_err_i_lsst,mag_u_lsst,mag_err_u_lsst,mag_y_lsst,mag_err_y_lsst,mag_z_lsst,mag_err_z_lsst,detect_isPrimary
0,1567973949952790391,61.602992,-37.293849,23.696957,12.548565,21.099163,27.521535,22.430446,14.663127,27.550546,34.653608,20.262099,164.774788,20.507491,81.949101,True
1,1567973949952790668,61.590751,-37.292278,23.125924,26.289981,22.113301,55.264729,22.779456,31.375358,23.013545,74.512305,21.919335,297.971598,22.004578,172.905696,True
2,1567973949952790667,61.589349,-37.292548,22.021820,23.262162,20.059149,49.698642,20.897961,28.538184,22.511459,63.082859,19.516139,264.040413,19.733546,148.953966,True
3,1567973949952791963,61.636723,-37.283918,24.160903,18.812363,21.384871,43.120937,22.578849,24.011901,26.213431,57.266093,20.521930,244.658876,20.860803,129.454169,True
4,1567973949952790537,61.690688,-37.295615,22.045110,237.919748,21.875550,264.995561,22.003669,168.407295,22.973886,882.941371,21.685076,1244.641438,21.791332,720.584251,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1567894785115591245,62.000491,-37.513005,23.621447,19.538482,21.809799,46.369414,22.744800,24.589785,23.822215,61.771403,21.004018,230.937482,21.116708,128.961837,True
996,1567894785115591208,62.102973,-37.514974,22.613510,22.267567,21.093730,50.306923,21.908897,27.285908,22.825432,66.404090,20.643999,249.340109,20.831424,129.675148,True
997,1567894785115591209,62.101056,-37.515979,23.411734,16.536782,21.510806,38.055116,22.434253,20.498258,23.850466,49.942741,20.946592,201.987214,21.217881,100.566062,True
998,1567894785115591421,62.112586,-37.511924,20.695095,43.761440,18.640691,103.553990,19.260028,58.675204,22.634726,123.932048,18.163372,433.248979,18.374916,240.699822,True


In [14]:
results_remmaped.data["redshift"] = 0
results_remmaped.data

Unnamed: 0,id,coord_ra,coord_dec,mag_g_lsst,mag_err_g_lsst,mag_r_lsst,mag_err_r_lsst,mag_i_lsst,mag_err_i_lsst,mag_u_lsst,mag_err_u_lsst,mag_y_lsst,mag_err_y_lsst,mag_z_lsst,mag_err_z_lsst,detect_isPrimary,redshift
0,1567973949952790391,61.602992,-37.293849,23.696957,12.548565,21.099163,27.521535,22.430446,14.663127,27.550546,34.653608,20.262099,164.774788,20.507491,81.949101,True,0
1,1567973949952790668,61.590751,-37.292278,23.125924,26.289981,22.113301,55.264729,22.779456,31.375358,23.013545,74.512305,21.919335,297.971598,22.004578,172.905696,True,0
2,1567973949952790667,61.589349,-37.292548,22.021820,23.262162,20.059149,49.698642,20.897961,28.538184,22.511459,63.082859,19.516139,264.040413,19.733546,148.953966,True,0
3,1567973949952791963,61.636723,-37.283918,24.160903,18.812363,21.384871,43.120937,22.578849,24.011901,26.213431,57.266093,20.521930,244.658876,20.860803,129.454169,True,0
4,1567973949952790537,61.690688,-37.295615,22.045110,237.919748,21.875550,264.995561,22.003669,168.407295,22.973886,882.941371,21.685076,1244.641438,21.791332,720.584251,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1567894785115591245,62.000491,-37.513005,23.621447,19.538482,21.809799,46.369414,22.744800,24.589785,23.822215,61.771403,21.004018,230.937482,21.116708,128.961837,True,0
996,1567894785115591208,62.102973,-37.514974,22.613510,22.267567,21.093730,50.306923,21.908897,27.285908,22.825432,66.404090,20.643999,249.340109,20.831424,129.675148,True,0
997,1567894785115591209,62.101056,-37.515979,23.411734,16.536782,21.510806,38.055116,22.434253,20.498258,23.850466,49.942741,20.946592,201.987214,21.217881,100.566062,True,0
998,1567894785115591421,62.112586,-37.511924,20.695095,43.761440,18.640691,103.553990,19.260028,58.675204,22.634726,123.932048,18.163372,433.248979,18.374916,240.699822,True,0


## BPZ - RAIL

In [26]:
#add data that is already read in the memory (in the future handler is going to access that)
data = DS.add_data("data", results_remmaped.data, TableHandle)
DS.keys()

dict_keys(['input', 'output_col_remapper_train', 'data', 'model_inform_bpzlite'])

### PRIORS and configs

In [27]:
columns_file =  os.path.join(CURR_DIR, 'configs/bpz.columns')
inform_bpz = Inform_BPZ_lite.make_stage(name='inform_bpzlite', input='inprogress_output_table_conv_train.hdf5', model='trained_BPZ_output.pkl', hdf5_groupname='', columns_file=columns_file)
inform_bpz.config.to_dict()

{'output_mode': 'default',
 'hdf5_groupname': '',
 'save_train': True,
 'zmin': 0.0,
 'zmax': 3.0,
 'nzbins': 301,
 'band_names': ['mag_u_lsst',
  'mag_g_lsst',
  'mag_r_lsst',
  'mag_i_lsst',
  'mag_z_lsst',
  'mag_y_lsst'],
 'band_err_names': ['mag_err_u_lsst',
  'mag_err_g_lsst',
  'mag_err_r_lsst',
  'mag_err_i_lsst',
  'mag_err_z_lsst',
  'mag_err_y_lsst'],
 'nondetect_val': 99.0,
 'data_path': 'None',
 'columns_file': '/home/heloisamengisztki/ic-photoz/RAIL/bpz_test_rail/configs/bpz.columns',
 'spectra_file': 'SED/CWWSB4.list',
 'm0': 20.0,
 'nt_array': [1, 2, 3],
 'mmin': 18.0,
 'mmax': 29.0,
 'init_kt': 0.3,
 'init_zo': 0.4,
 'init_alpha': 1.8,
 'init_km': 0.1,
 'prior_band': 'mag_i_lsst',
 'redshift_col': 'redshift',
 'type_file': '',
 'name': 'inform_bpzlite',
 'input': 'inprogress_output_table_conv_train.hdf5',
 'model': 'trained_BPZ_output.pkl',
 'config': None,
 'aliases': {'model': 'model_inform_bpzlite'}}

In [17]:
%%time
inform_bpz.inform(data)

using 998 galaxies in calculation
best values for fo and kt:
[1.]
[0.3]
minimizing for type 0
[0.4 1.8 0.1] inf
[0.42 1.8  0.1 ] inf
[0.4  1.89 0.1 ] inf
[0.4   1.8   0.105] inf
[0.41333333 1.86       0.095     ] inf
[0.40333333 1.815      0.1025    ] inf
[0.41 1.8  0.1 ] inf
[0.4   1.845 0.1  ] inf
[0.4    1.8    0.1025] inf
[0.40666667 1.83       0.0975    ] inf
[0.40166667 1.8075     0.10125   ] inf
[0.405 1.8   0.1  ] inf
[0.4    1.8225 0.1   ] inf
[0.4     1.8     0.10125] inf
[0.40333333 1.815      0.09875   ] inf
[0.40083333 1.80375    0.100625  ] inf
[0.4025 1.8    0.1   ] inf
[0.4     1.81125 0.1    ] inf
[0.4      1.8      0.100625] inf
[0.40166667 1.8075     0.099375  ] inf
[0.40041667 1.801875   0.1003125 ] inf
[0.40125 1.8     0.1    ] inf
[0.4      1.805625 0.1     ] inf
[0.4       1.8       0.1003125] inf
[0.40083333 1.80375    0.0996875 ] inf
[0.40020833 1.8009375  0.10015625] inf
[0.400625 1.8      0.1     ] inf
[0.4       1.8028125 0.1      ] inf
[0.4        1.8      

  result = getattr(ufunc, method)(*inputs, **kwargs)
  np.max(np.abs(fsim[0] - fsim[1:])) <= fatol):


[0.4        1.80000275 0.1       ] inf
[0.4        1.8        0.10000015] inf
[0.40000041 1.80000183 0.09999985] inf
[0.4000001  1.80000046 0.10000008] inf
[0.40000031 1.8        0.1       ] inf
[0.4        1.80000137 0.1       ] inf
[0.4        1.8        0.10000008] inf
[0.4000002  1.80000092 0.09999992] inf
[0.40000005 1.80000023 0.10000004] inf
[0.40000015 1.8        0.1       ] inf
[0.4        1.80000069 0.1       ] inf
[0.4        1.8        0.10000004] inf
[0.4000001  1.80000046 0.09999996] inf
[0.40000003 1.80000011 0.10000002] inf
[0.40000008 1.8        0.1       ] inf
[0.4        1.80000034 0.1       ] inf
[0.4        1.8        0.10000002] inf
[0.40000005 1.80000023 0.09999998] inf
[0.40000001 1.80000006 0.10000001] inf
[0.40000004 1.8        0.1       ] inf
[0.4        1.80000017 0.1       ] inf
[0.4        1.8        0.10000001] inf
[0.40000003 1.80000011 0.09999999] inf
[0.40000001 1.80000003 0.1       ] inf
[0.40000002 1.8        0.1       ] inf
[0.4        1.80000009 0.

<rail.core.data.ModelHandle at 0x7ff57bc379d0>

___

## Posterior -> Handler -> roda o algoritmo


In [36]:
estimate_bpz = BPZ_lite.make_stage(name='estimate_bpz', hdf5_groupname='', columns_file=columns_file, model = inform_bpz.get_handle('model'))
estimate_bpz.is_parallel()

estimate_bpz.get_data("mag_u_lsst")

KeyError: 'mag_u_lsst'

In [29]:
help(estimate_bpz)

Help on class BPZ_lite in module rail.estimation.algos.bpz_lite:

class BPZ_lite(rail.estimation.estimator.CatEstimator)
 |  BPZ_lite(args, comm=None)
 |  
 |  CatEstimator subclass to implement basic marginalized PDF for BPZ
 |  
 |  Method resolution order:
 |      BPZ_lite
 |      rail.estimation.estimator.CatEstimator
 |      rail.core.stage.RailStage
 |      ceci.stage.PipelineStage
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, args, comm=None)
 |      Constructor, build the CatEstimator, then do BPZ specific setup
 |  
 |  open_model(self, **kwargs)
 |      Load the mode and/or attach it to this Estimator
 |      
 |      Keywords
 |      --------
 |      model : `object`, `str` or `ModelHandle`
 |          Either an object with a trained model,
 |          a path pointing to a file that can be read to obtain the trained model,
 |          or a `ModelHandle` providing access to the trained model.
 |      
 |      Returns
 |      -------
 |      se

In [21]:
bpz_estimated = estimate_bpz.estimate(results_remmaped.data)

Process 0 running estimator on chunk 0 - 1000


KeyError: 'mag_u_lsst'

In [None]:
#help(bpz_estimated())
bpz_estimated().build_tables()

#results_tables = tables_io.convertObj(bpz_estimated().build_tables()['ancil'], tables_io.types.PD_DATAFRAME)
#results_tables

In [None]:
the_eval = Evaluator.make_stage(name=f'bpz_eval', truth=test_data_orig)
print(bpz_estimated, test_data_orig)
result_dict = the_eval.evaluate(bpz_estimated, test_data_orig)

In [None]:
results_tables = tables_io.convertObj(result_dict.data, tables_io.types.PD_DATAFRAME)
results_tables.head()

___
## Resultado pz x spec-z

In [None]:
zmode = bpz_estimated().ancil['zmode']

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(test_data()['redshift'],zmode,s=1,c='k',label='simple bpz mode')
plt.plot([0,3],[0,3],'r--');
plt.xlabel("true redshift")
plt.ylabel("bpz photo-z")

## PIT COM PROBLEMAS

In [None]:
from rail.evaluation.metrics.pit import *
from rail.core.data import QPHandle, TableHandle

pdfs_file =  os.path.join(curr_dir, "output_estimate_bpz.hdf5")

ztrue = test_data_orig()['redshift']
fzdata = DS.read_file('pdfs_data', QPHandle, pdfs_file)

pitobj = PIT(fzdata(), ztrue)
quant_ens, metamets = pitobj.evaluate()
print(quant_ens, metamets)

In [None]:
pit_vals = np.array(pitobj._pit_samps)
print(pit_vals)

pit_out_rate = PITOutRate(pit_vals, quant_ens).evaluate()
print(f"PIT outlier rate of this sample: {pit_out_rate}") 

In [None]:
galid = 1500

zgrid = np.linspace(0, 3., 301)
single_gal = np.squeeze(bpz_estimated()[galid].pdf(zgrid))
print(bpz_estimated()[galid].pdf(zgrid))
single_zmode = zmode[galid]
truez = test_data()['redshift'][galid]

plt.plot(zgrid,single_gal,color='k',label='single pdf')
plt.axvline(single_zmode,color='k', ls='--', label='mode')
plt.axvline(truez,color='r',label='true redshift')
plt.legend(loc='upper right')
plt.xlabel("redshift")
plt.ylabel("p(z)")

### PIPELINES CECI

In [None]:
import ceci
pipe = ceci.Pipeline.interactive()
stages = [flow_engine_train, lsst_error_model_train, inv_redshift,
          line_confusion, quantity_cut, col_remapper_train, table_conv_train,
          flow_engine_test, lsst_error_model_test, col_remapper_test, table_conv_test,  
          inform_knn, inform_fzboost, inform_bpz, estimate_knn, 
          estimate_fzboost, estimate_bpz, point_estimate_test,
          naive_stack_test]
for stage in stages:
    pipe.add_stage(stage)

In [None]:
pipe.initialize(dict(flow=flow_file), dict(output_dir='.', log_dir='.', resume=False), None)
pipe.save('bpz_pipeline.yml')