# Generating mocks

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Generates mock datasets in Parquet format for testing the CRC pipeline.

Generated files:
1) test_survey_not_supported.parquet
2) test_homogenized_columns_out_of_range.parquet
3) test_survey_col_missing.parquet
4) test_survey_supported_but_wrong_flags.parquet
"""

import numpy as np
import pandas as pd


# Number of rows written to each mock dataset
N_ROWS = 100

# Fixed seed for reproducibility: repeated runs draw the same values
# (change or remove it to get different random data on each run)
np.random.seed(42)


def random_sky_coordinates(n: int):
    """Draw ``n`` random sky positions in degrees.

    RA and DEC are sampled independently, each uniformly over its own
    angular range. This is a deliberately simple scheme: it is uniform in
    the two angles, not in area on the sphere.

    Args:
        n: Number of positions to draw.

    Returns:
        A ``(ra, dec)`` tuple of arrays, each of length ``n``.
    """
    # RA is drawn before DEC so a seeded run always consumes the global
    # NumPy random stream in the same order.
    right_ascension = np.random.uniform(low=0.0, high=360.0, size=n)  # degrees, [0, 360)
    declination = np.random.uniform(low=-90.0, high=90.0, size=n)  # degrees, [-90, 90)
    return right_ascension, declination


# ============================================================
# 1) test_survey_not_supported.parquet
# ------------------------------------------------------------
# Columns: id, ra, dec, z, z_flag, survey
# - id:     "1_001", "1_002", ...
# - z:      float in [0, 1)
# - z_flag: integer in [1, 10]
# - survey: "not_supported" on every row
# ============================================================

ra1, dec1 = random_sky_coordinates(N_ROWS)

# Columns are attached one at a time, in the order the random draws must
# happen (z before z_flag), so a seeded run reproduces the same values.
df1 = pd.DataFrame({"id": [f"1_{row:03d}" for row in range(1, N_ROWS + 1)]})
df1["ra"] = ra1
df1["dec"] = dec1
df1["z"] = np.random.uniform(0.0, 1.0, size=N_ROWS)
df1["z_flag"] = np.random.randint(1, 11, size=N_ROWS)  # upper bound is exclusive -> 1..10
df1["survey"] = ["not_supported"] * N_ROWS

df1.to_parquet("test_survey_not_supported.parquet", index=False)


# ============================================================
# 2) test_homogenized_columns_out_of_range.parquet
# ------------------------------------------------------------
# Columns: id, ra, dec, z, z_flag_homogenized,
#          instrument_type_homogenized, survey
# - id:                          "2_001", "2_002", ...
# - z:                           float in [0, 1)
# - z_flag_homogenized:          integer in [1, 10]
# - instrument_type_homogenized: one of "galaxy", "qso", "star"
# - survey:                      "wrong_homogenized" on every row
# NOTE(review): judging by the file name, these homogenized values are
# presumably meant to fall outside what the pipeline accepts -- confirm
# against the pipeline's validation rules.
# ============================================================

ra2, dec2 = random_sky_coordinates(N_ROWS)

instrument_types = ["galaxy", "qso", "star"]

# Columns are attached one at a time, in the order the random draws must
# happen (z, then flag, then instrument type), so a seeded run reproduces
# the same values.
df2 = pd.DataFrame({"id": [f"2_{row:03d}" for row in range(1, N_ROWS + 1)]})
df2["ra"] = ra2
df2["dec"] = dec2
df2["z"] = np.random.uniform(0.0, 1.0, size=N_ROWS)
df2["z_flag_homogenized"] = np.random.randint(1, 11, size=N_ROWS)  # 1..10
df2["instrument_type_homogenized"] = np.random.choice(instrument_types, size=N_ROWS)
df2["survey"] = ["wrong_homogenized"] * N_ROWS

df2.to_parquet("test_homogenized_columns_out_of_range.parquet", index=False)


# ============================================================
# 3) test_survey_col_missing.parquet
# ------------------------------------------------------------
# Columns: id, ra, dec, z, z_flag   (NO survey column)
# - id:     "3_001", "3_002", ...
# - z:      float in [0, 1)
# - z_flag: integer in [1, 10]
# ============================================================

ra3, dec3 = random_sky_coordinates(N_ROWS)

# Columns are attached one at a time, in the order the random draws must
# happen (z before z_flag), so a seeded run reproduces the same values.
# The "survey" column is intentionally left out of this dataset.
df3 = pd.DataFrame({"id": [f"3_{row:03d}" for row in range(1, N_ROWS + 1)]})
df3["ra"] = ra3
df3["dec"] = dec3
df3["z"] = np.random.uniform(0.0, 1.0, size=N_ROWS)
df3["z_flag"] = np.random.randint(1, 11, size=N_ROWS)  # 1..10

df3.to_parquet("test_survey_col_missing.parquet", index=False)


# ============================================================
# 4) test_survey_supported_but_wrong_flags.parquet
# ------------------------------------------------------------
# Columns: id, ra, dec, z, z_flag, survey
# - id:     "4_001", "4_002", ...
# - z:      float in [0, 1)
# - z_flag: integer in [10, 20]
# - survey: "2DFGRS" on every row
# NOTE(review): judging by the file name, the 10..20 flag range is
# presumably invalid for this survey -- confirm against the pipeline's
# flag definitions.
# ============================================================

ra4, dec4 = random_sky_coordinates(N_ROWS)

# Columns are attached one at a time, in the order the random draws must
# happen (z before z_flag), so a seeded run reproduces the same values.
df4 = pd.DataFrame({"id": [f"4_{row:03d}" for row in range(1, N_ROWS + 1)]})
df4["ra"] = ra4
df4["dec"] = dec4
df4["z"] = np.random.uniform(0.0, 1.0, size=N_ROWS)
df4["z_flag"] = np.random.randint(10, 21, size=N_ROWS)  # upper bound is exclusive -> 10..20
df4["survey"] = ["2DFGRS"] * N_ROWS

df4.to_parquet("test_survey_supported_but_wrong_flags.parquet", index=False)


# Final confirmation, reached only if every to_parquet call above completed
# without raising. The message is Portuguese for
# "Parquet files generated successfully!".
print("Arquivos Parquet gerados com sucesso!")

# Validation

In [None]:
import pandas as pd

In [None]:
# Reload mock 1 from disk and preview its first rows; the bare expression
# on the last line is what the notebook renders as the cell output.
df_1 = pd.read_parquet("test_survey_not_supported.parquet")
df_1.head()

In [None]:
# Reload mock 2 from disk and preview its first rows, including the two
# *_homogenized columns.
df_2 = pd.read_parquet("test_homogenized_columns_out_of_range.parquet")
df_2.head()

In [None]:
# Reload mock 3 from disk and preview its first rows; this file was written
# without a "survey" column.
df_3 = pd.read_parquet("test_survey_col_missing.parquet")
df_3.head()

In [None]:
# Reload mock 4 from disk and preview its first rows (survey "2DFGRS" with
# z_flag values drawn from 10..20).
df_4 = pd.read_parquet("test_survey_supported_but_wrong_flags.parquet")
df_4.head()