# Preparação dos conjuntos de treino e teste para o TPZ

_Autores: Andreia Dourado, Bruno Moraes_

__Descrição: Divisão do training set nos conjuntos de treino (70%) e teste (30%), salvos em arquivos .hdf5 no formato compatível com o algoritmo TPZ.__

### 1. Importando as bibliotecas

In [None]:
import pandas as pd
import tables_io
import h5py
import numpy as np

### 2. Lendo o arquivo com os dados:

In [None]:
# Sigma value used to pick the input catalogue; it is interpolated into the file name.
sigma = 10
# Full path of the CSV training set (directory + sigma-dependent file name).
path_data = (
    '/lustre/t0/scratch/users/andreia.dourado/TCC/dp02/object/data/'
    f'training_set_dp02_object_{sigma}sigma.csv'
)
print(path_data)

In [None]:
# Load the CSV training set into a DataFrame and display it.
data_convert = pd.read_csv(filepath_or_buffer=path_data)
data_convert

##### Caso tenha valores infinitos no conjunto:

In [None]:
# Boolean mask, same shape as data_convert, flagging every +inf / -inf entry.
# DataFrame.isin is used instead of np.isinf because the NumPy ufunc raises
# TypeError when the frame holds non-numeric (object/string) columns, whereas
# isin simply reports False for those cells.
inf_mask = data_convert.isin([np.inf, -np.inf])

In [None]:
# Number of rows that contain at least one infinite value in any column.
rows_with_inf = inf_mask.any(axis="columns").sum()
rows_with_inf

In [None]:
# Number of infinite values found in each column (sum taken down the rows).
infs_per_column = inf_mask.sum(axis="index")
infs_per_column

In [None]:
# Replace +inf / -inf with NaN, modifying data_convert in place.
# NOTE(review): no visible cell drops or fills these NaNs afterwards, so they appear
# to be carried into the HDF5 files — confirm the downstream code tolerates NaN.
data_convert.replace(
    to_replace=[np.inf, -np.inf],
    value=np.nan,
    inplace=True,
)

In [None]:
# Display the column labels as read from the CSV (before any renaming).
data_convert.columns

In [None]:
# Map the catalogue's suffixed column names to short names:
# coordinates, redshift, then magnitudes and magnitude errors for bands u, g, r, i, z, y.
columns_map = {
    **{f'coord_{name}dp02_object': name for name in ('ra', 'dec')},
    'redshiftdp01_test_truth': 'redshift',
    **{f'mag_{band}dp02_object': f'mag_{band}' for band in 'ugrizy'},
    **{f'magerr_{band}dp02_object': f'magerr_{band}' for band in 'ugrizy'},
}
# Columns not listed in the mapping keep their original names.
data_convert = data_convert.rename(columns=columns_map)

In [None]:
# Display the column labels again to check the result of the rename.
data_convert.columns

In [None]:
# Keep only the renamed columns, in this fixed order; every other column is discarded.
data_convert = data_convert.loc[
    :,
    [
        'ra', 'dec', 'redshift',
        'mag_u', 'mag_g', 'mag_r', 'mag_i', 'mag_z', 'mag_y',
        'magerr_u', 'magerr_g', 'magerr_r', 'magerr_i', 'magerr_z', 'magerr_y',
    ]
]
data_convert

### 3. Arquivo de treino

Caminho onde serão salvos os arquivos:

In [None]:
# Directory where the train/test HDF5 files are written. The trailing slash matters:
# the output file names are appended to this string directly.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
path_run = '/lustre/t0/scratch/users/andreia.dourado/TCC/dp02/object/runs/'
print(path_run)

#### 3.1 Selecionando a fração de objetos para o treino:

In [None]:
# Number of rows for the training split: 70% of the catalogue, truncated to an integer.
fraction = int(data_convert.shape[0] * 0.7)
fraction

In [None]:
# Draw `fraction` rows at random for training; the fixed seed makes the draw repeatable.
training_csv = data_convert.sample(n=fraction, random_state=40)
training_csv

#### 3.2 Criando o arquivo .hdf5:

In [None]:
# Output path of the training file: the run directory followed by a sigma-tagged file name.
train_file_path = path_run + f'train_file_dp02_object_{sigma}sigma.hdf5'
print(train_file_path)

In [None]:
# Write the training split to HDF5: one dataset per column inside a 'photometry' group.
# Mode 'w' creates the file, overwriting any existing one at that path.
with h5py.File(train_file_path, 'w') as train_file:
    photometry_group = train_file.create_group('photometry')
    for name, series in training_csv.items():
        photometry_group.create_dataset(name, data=series.values)

#### 3.3 Verificando o arquivo:

In [None]:
# Read the training file back with tables_io to check that it was written correctly,
# then display what was loaded.
train_table = tables_io.read(train_file_path, fmt='hdf5')
train_table

In [None]:
# Number of entries stored for 'mag_g' in the 'photometry' group of the training file.
len(train_table['photometry']['mag_g'])

### 4. Arquivo de teste

#### 4.1 Selecionando os objetos restantes:

In [None]:
# The test split is every row whose index label was not drawn for training.
validation = data_convert.drop(index=training_csv.index)
validation

#### 4.1 Criando o arquivo .hdf5:

In [None]:
# Output path of the test file: the run directory followed by a sigma-tagged file name.
test_file_path = path_run + f'test_file_dp02_object_{sigma}sigma.hdf5'
print(test_file_path)

In [None]:
# Write the test split to HDF5: one dataset per column inside a 'photometry' group.
# Mode 'w' creates the file, overwriting any existing one at that path.
# The loop iterates over the columns of `validation` itself (the frame being written)
# rather than those of `training_csv`, so it cannot go out of sync with the data it stores.
with h5py.File(test_file_path, 'w') as test_file:
    photometry_group = test_file.create_group('photometry')
    for column in validation.columns:
        photometry_group.create_dataset(column, data=validation[column].values)

#### 4.2 Verificando o arquivo:

In [None]:
# Read the test file back with tables_io to check that it was written correctly,
# then display what was loaded.
test_table = tables_io.read(test_file_path, fmt='hdf5')
test_table

In [None]:
# Number of entries stored for 'mag_g' in the 'photometry' group of the test file.
len(test_table['photometry']['mag_g'])