# TPz lite

_Authors: Andreia Dourado, Bruno Moraes_

_Adapted from [Sam Schmidt's example notebook](https://github.com/LSSTDESC/rail_tpz)._


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import rail
import qp
from rail.core.data import TableHandle, PqHandle, ModelHandle, QPHandle, DataHandle, Hdf5Handle
from rail.core.data import TableHandle
from rail.core.stage import RailStage

### 1. Reading the data

In [None]:
# Read the training-set catalogue from CSV into a pandas DataFrame.
training_set_csv = "/home/andreia.dourado/ic-photoz/andreia_dourado/data_traning_set.csv"
data = pd.read_csv(training_set_csv)

In [None]:
# Show the freshly loaded DataFrame as the cell output.
data

### 2. Applying cuts

In [None]:
# Disabled step, kept for reference only: for each band it would replace
# magnitude errors larger than 0.2 with NaN. Nothing in this cell runs.
#bands = ['u','g', 'r', 'i','z','y']
#for f, filt in enumerate(bands):
   # tx = np.where(data.loc[:, 'magerr_'+filt] > 0.2)[0]
   # data.loc[tx, 'magerr_'+filt] = float('NaN')
   # del tx
#data

#### 2.1 i < 25.3, according to LSST gold sample (https://www.lsst.org/sites/default/files/docs/sciencebook/SB_3.pdf)

In [None]:
# Keep only the rows whose i-band magnitude is below 25.3.
faint_limit_mask = data["mag_i"] < 25.3
data = data.loc[faint_limit_mask]
data

#### 2.2 i > 16, according to detection limit

In [None]:
# Keep only the rows whose i-band magnitude is above 16.
bright_limit_mask = data["mag_i"] > 16
data = data.loc[bright_limit_mask]
data

#### 2.3 Removing NaN values

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")
data

### 3. Run TPz

In [None]:
# Shared RAIL data store used by the stages below.
DS = RailStage.data_store
# The flag is set on the class, not on the instance.
# NOTE(review): presumably this lets an existing key in the store be replaced
# when a cell is re-run -- confirm against the RAIL documentation.
data_store_class = DS.__class__
data_store_class.allow_overwrite = True

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import tables_io

In [None]:
from rail.estimation.algos.tpz_lite import TPZliteInformer

In [None]:
import h5py

Train: 70%

In [None]:
# Draw the training subset as a fraction of the rows that survived the cuts,
# so the split stays at 70% if the catalogue or the cuts change. The previous
# hard-coded size, 18781, is exactly 70% of 26830 rows; asking for a fixed
# count also raises an error when fewer rows than that are available.
# The fixed seed keeps the draw reproducible.
TRAIN_FRACTION = 0.7
training_csv = data.sample(frac=TRAIN_FRACTION, random_state=40)

In [None]:
# Show the sampled training subset as the cell output.
training_csv

To write the HDF5 file correctly, the DataFrame is first converted to a dictionary with the parameter `orient='list'`, which produces a mapping of the form `{column -> [values]}`.

Creating the dictionary:

In [None]:
# Column-oriented mapping: each column name points to the list of its values.
dict_list = training_csv.to_dict(orient="list")


Creating the hdf5 file:

In [None]:
# Destination of the HDF5 copy of the training subset.
hdf5_file_path = '/home/andreia.dourado/ic-photoz/andreia_dourado/training_data.hdf5'

Writing the dictionary values in the hdf5 file:

In [None]:
def write_dict(group, data):
    """Recursively store a (possibly nested) dictionary under an HDF5 group.

    Parameters
    ----------
    group
        Open h5py file or group that receives the entries.
    data : dict
        Mapping to store. Keys are converted with ``str``. A value that is
        itself a dictionary becomes a subgroup and is written recursively;
        any other value is assigned directly to ``group[str(key)]``.
    """
    for name, entry in data.items():
        label = str(name)
        if not isinstance(entry, dict):
            group[label] = entry
            continue
        write_dict(group.create_group(label), entry)


# Open the output file in write mode and dump the column dictionary into it.
with h5py.File(hdf5_file_path, 'w') as hdf5_file:
    write_dict(hdf5_file, dict_list)

In [None]:
# Path of the HDF5 training file that is read back below.
datafile = '/home/andreia.dourado/ic-photoz/andreia_dourado/training_data.hdf5'

Loading the training file into the RAIL data store as `training_data`:

In [None]:
# Read the file at `datafile` through the data store, under the key
# "training_data" and using the TableHandle class.
# NOTE(review): the result is used below as a handle exposing `.data`.
training_data = DS.read_file("training_data", TableHandle, datafile)

In [None]:
# Print the contents held by the training handle as a quick sanity check.
print(training_data.data)

#### 3.1 Setting the parameters

In [None]:
# Per-band configuration passed to the TPZ stages below.
bands = ["g", "i", "r", "u", "y", "z"]

# Limit assigned to every magnitude column. It is stored as a float: the
# previous version stored the string "29", which is not a numeric value.
MAG_LIMIT = 29.0

# Names of the magnitude columns used as training attributes, in band order.
train_atts = [f"mag_{band}" for band in bands]
# Maps each magnitude column to the column holding its error.
new_err_dict = {f"mag_{band}": f"magerr_{band}" for band in bands}
# Maps each magnitude column to its limit.
new_mag_limits = {f"mag_{band}": MAG_LIMIT for band in bands}

# Redshift is also an attribute used in the training, but it has no associated
# error column, so its entry in the error dictionary is set to None.
new_err_dict["redshift"] = None

print(new_err_dict)
print(train_atts)
print(new_mag_limits)

In [None]:
# Show the (column, error column) pairs, including the redshift entry.
new_err_dict.items()

In [None]:
# Keyword arguments for the informer stage. The redshift range is taken from
# the training subset itself.
redshift_column = training_csv['redshift']
tpz_dict = {
    "zmin": min(redshift_column),
    "zmax": max(redshift_column),
    "nzbins": 301,
    "mag_limits": new_mag_limits,
    "bands": train_atts,
    "hdf5_groupname": None,
    "use_atts": train_atts,
    "err_dict": new_err_dict,
    "nrandom": 3,
    "ntrees": 34,
    "minleaf": 30,
}

#### 3.2 Inform method

In [None]:
# Build the informer stage from the settings collected in `tpz_dict`.
# NOTE(review): `model` is presumably the file the trained model is written
# to -- confirm against the rail_tpz documentation.
pz_train = TPZliteInformer.make_stage(
    name='inform_TPZ',
    model='estimator_tpz_run2.pkl',
    **tpz_dict,
)

In [None]:
%%time
# Run the informer on the training handle; the cell magic on the first line
# of this cell reports how long it takes.
pz_train.inform(training_data)

#### 3.3 Estimate stage

Selecting the data:

In [None]:
# Validation set: every row of `data` whose index label was not drawn into
# the training subset.
validation = data.drop(index=training_csv.index)

In [None]:
# Show the held-out validation rows as the cell output.
validation

Writing the HDF5 file, in the same way as for the training set:

In [None]:
# Column-oriented mapping of the validation rows: column name -> list of values.
# This reuses (and overwrites) the name used for the training dictionary.
dict_list = validation.to_dict(orient="list")

In [None]:
# Destination of the HDF5 copy of the validation rows.
hdf5_file_path = '/home/andreia.dourado/ic-photoz/andreia_dourado/test_data.hdf5'

In [None]:
def write_dict(group, data):
    """Recursively store a (possibly nested) dictionary under an HDF5 group.

    Parameters
    ----------
    group
        Open h5py file or group that receives the entries.
    data : dict
        Mapping to store. Keys are converted with ``str``. A value that is
        itself a dictionary becomes a subgroup and is written recursively;
        any other value is assigned directly to ``group[str(key)]``.
    """
    for name, entry in data.items():
        label = str(name)
        if not isinstance(entry, dict):
            group[label] = entry
            continue
        write_dict(group.create_group(label), entry)


# Open the output file in write mode and dump the column dictionary into it.
with h5py.File(hdf5_file_path, 'w') as hdf5_file:
    write_dict(hdf5_file, dict_list)

Loading the test file into the RAIL data store as `test_data`:

In [None]:
# Path of the HDF5 test file that is read back below.
testfile = '/home/andreia.dourado/ic-photoz/andreia_dourado/test_data.hdf5'

In [None]:
# Read the file at `testfile` through the data store, under the key
# "test_data" and using the TableHandle class.
# NOTE(review): the result is used below as a handle exposing `.data`.
test_data = DS.read_file("test_data", TableHandle, testfile)

In [None]:
# Show the contents held by the test handle as the cell output.
test_data.data

Run:

In [None]:
from rail.estimation.algos.tpz_lite import TPZliteEstimator

In [None]:
# Keyword arguments for the estimator stage. The error-column mapping and the
# magnitude limits are the same objects used for training.
test_dict = {
    "hdf5_groupname": None,
    "test_err_dict": new_err_dict,
    "mag_limits": new_mag_limits,
}
test_dict

In [None]:
# Hand the estimator the model handle produced by the informer stage.
trained_model_handle = pz_train.get_handle('model')
test_runner = TPZliteEstimator.make_stage(
    name="test_tpz",
    output="output.hdf5",
    model=trained_model_handle,
    **test_dict,
)

In [None]:
%%time
# Run the estimator on the test handle and keep what it returns; the cell
# magic on the first line of this cell reports how long it takes.
results = test_runner.estimate(test_data)