# PREPARE DATASET
Notebook para juntar los datos de todos los canales en un único CSV.

In [40]:
import pandas as pd
import os
import re

In [41]:
# Mission number; used to build both the input and the output data paths.
MISSION = 2
# Calendar year whose samples are exported.
YEAR = 2000
# Channel selection mode: "allchannels", "subset" or "target".
CHANNELS = ["allchannels", "subset", "target"][2]
FIRST_CHANNEL_NUMBER = 18  # Only if CHANNELS == "subset"
LAST_CHANNEL_NUMBER = 28  # Only if CHANNELS == "subset"

# Input locations are derived from MISSION so that they stay consistent with
# the output path, which is also built from MISSION.
# NOTE(review): assumes every mission follows the "Mission<N>-ESA" folder
# layout - confirm before switching MISSION.
DATA_PATH = f"../data/Mission{MISSION}-ESA/channels"
CHANNELS_LIST_PATH = f"../data/Mission{MISSION}-ESA/channels.csv"

In [42]:
# Time window: from the first instant of YEAR to the first instant of the
# following year.
start_date = f"{YEAR}-01-01 00:00:00"
end_date = f"{YEAR+1}-01-01 00:00:00"

# Resolve the channel selection for the configured CHANNELS mode.
if CHANNELS == "target":
    # Keep only the channels flagged as "YES" in the Target column.
    channels_info = pd.read_csv(CHANNELS_LIST_PATH)
    is_target = channels_info['Target'] == "YES"
    channels_list = list(channels_info.loc[is_target, 'Channel'])
elif CHANNELS == "allchannels":
    # None means "do not filter": every channel file is used.
    channels_list = None
else:
    # Any other mode is treated as a contiguous, inclusive range of channels.
    channels_list = [
        f"channel_{i}"
        for i in range(FIRST_CHANNEL_NUMBER, LAST_CHANNEL_NUMBER+1)
    ]

# The file name encodes the selection mode (or the channel range) and the year.
if CHANNELS in ("allchannels", "target"):
    dataset_tag = CHANNELS
else:
    dataset_tag = f'channels{FIRST_CHANNEL_NUMBER}_{LAST_CHANNEL_NUMBER}'
output_path = f"../data/Mission{MISSION}-Datasets/dataset_{dataset_tag}_{YEAR}.csv"

In [43]:
def replace_value_x(value):
    """Turn a ``value_<digits>`` label into the integer it carries.

    The string form of ``value`` is tested against the pattern anchored at
    its start; anything that does not begin with ``value_<digits>`` is
    returned unchanged.
    """
    found = re.match(r'value_(\d+)', str(value))
    return value if found is None else int(found.group(1))


def get_channel_list(channel_list_path):
    """Read the channel list CSV into a dict keyed by channel name.

    Each key is a value of the ``Channel`` column and maps to a dict of the
    remaining columns for that row.
    """
    table = pd.read_csv(channel_list_path)
    by_channel = table.set_index('Channel')
    # Transposing makes each channel a column, so to_dict() keys on channels.
    return by_channel.T.to_dict()


def __list_files_in_data_path(_data_path):
    """Return the ``channel_*`` entries of ``_data_path`` in numeric order.

    Entries are ordered by the integer that follows the first underscore, so
    ``channel_2`` sorts before ``channel_10``.
    """
    def channel_number(file_name):
        return int(file_name.split('_')[1])

    channel_files = [
        name for name in os.listdir(_data_path) if name.startswith("channel_")
    ]
    channel_files.sort(key=channel_number)
    return channel_files


def __compose_combined_df(start_date, end_date, channels, _data_path):
    """Load the selected channel files and merge them into one DataFrame.

    Args:
        start_date: Lower bound (inclusive) compared against each frame's
            index.
        end_date: Upper bound (inclusive) compared against each frame's
            index.
        channels: Collection of channel file names to keep, or None to keep
            every channel file found in ``_data_path``.
        _data_path: Directory holding one pickled DataFrame per channel.

    Returns:
        DataFrame whose columns come from the selected channel files,
        concatenated side by side, with the index named "time".

    Raises:
        ValueError: If no channel file in ``_data_path`` matches the
            selection (instead of the opaque error ``pd.concat`` raises on
            an empty list).
    """
    dataframes = []
    for channel in __list_files_in_data_path(_data_path):
        if channels is not None and channel not in channels:
            continue

        # NOTE: unpickling can execute arbitrary code; only point _data_path
        # at trusted data.
        df = pd.read_pickle(f"{_data_path}/{channel}")
        # Label the first column with the name of the file it came from.
        df = df.rename(columns={df.columns[0]: channel})

        # Keep only the rows whose index falls inside [start_date, end_date].
        in_window = (df.index >= start_date) & (df.index <= end_date)
        df = df[in_window]

        # Convert "value_<n>" labels to integers, cell by cell.
        df = df.apply(lambda col: col.map(replace_value_x))

        dataframes.append(df)

    if not dataframes:
        raise ValueError(
            f"No channel files in {_data_path!r} match the requested "
            f"channels: {channels!r}"
        )

    # Place the per-channel frames side by side, aligned on their index.
    combined_df = pd.concat(dataframes, axis=1)
    combined_df.index.name = "time"
    return combined_df


def get_combined_df(start_date, end_date, channels, output_path, _data_path):
    """Build the combined channel DataFrame and optionally save it as CSV.

    Args:
        start_date: Lower bound (inclusive) of the time window.
        end_date: Upper bound (inclusive) of the time window.
        channels: Collection of channel file names to include, or None for
            all of them.
        output_path: Destination CSV path; when None nothing is written.
        _data_path: Directory containing the per-channel pickle files.

    Returns:
        The combined DataFrame. Returning it keeps the result available to
        the caller even when ``output_path`` is None; existing callers that
        ignore the return value are unaffected.
    """
    combined_df = __compose_combined_df(start_date, end_date, channels, _data_path)
    if output_path is not None:
        # The index (named "time") is written as the first CSV column.
        combined_df.to_csv(output_path, index=True)
    return combined_df

In [None]:
# Build the dataset for the configured window and channels, write it to
# output_path, then report where it went.
get_combined_df(
    start_date=start_date,
    end_date=end_date,
    channels=channels_list,
    output_path=output_path,
    _data_path=DATA_PATH,
)
print("Result save:", output_path)