# PREPROCESSING DATA
Notebook para preprocesar los datos de un dataset haciendo interpolación para obtener los datos que faltan.

In [24]:
import pandas as pd
from datetime import datetime
from libraries.utils import read_csv, interpolate_dataframe

In [25]:
# Dataset selection: mission number and calendar year to preprocess.
MISSION, YEAR = 2, 2000

# Interpolation method passed on to the preprocessing step; edit the index to switch.
_INTERPOLATION_METHODS = ("linear", "nearest", "previous", "zero-order", "frequency-previous")
INTERPOLATION_METHOD = _INTERPOLATION_METHODS[4]

# Channel selection encoded in the dataset file name; edit the index to switch.
_CHANNEL_SETS = ("allchannels", "subset", "target")
CHANNELS = _CHANNEL_SETS[2]

# Channel range used in the file name; only relevant when CHANNELS == "subset".
FIRST_CHANNEL_NUMBER = 18
LAST_CHANNEL_NUMBER = 28

In [26]:
# Sample frequency string for each supported mission number.
# A dict lookup (rather than list indexing with MISSION-1) rejects MISSION <= 0,
# which would otherwise wrap around and silently select another mission's value.
_SAMPLE_FREQUENCY_BY_MISSION = {1: "30s", 2: "18s"}
if MISSION not in _SAMPLE_FREQUENCY_BY_MISSION:
    raise ValueError(
        f"Unsupported MISSION {MISSION!r}; expected one of {sorted(_SAMPLE_FREQUENCY_BY_MISSION)}"
    )
sample_frequency = _SAMPLE_FREQUENCY_BY_MISSION[MISSION]

# File-name tag for the selected channels, shared by the input and output paths.
channels_tag = (
    CHANNELS
    if CHANNELS in ("allchannels", "target")
    else f"channels{FIRST_CHANNEL_NUMBER}_{LAST_CHANNEL_NUMBER}"
)

data_path = f"../data/Mission{MISSION}-Datasets/dataset_{channels_tag}_{YEAR}.csv"
esa_anomalies_path = f"../esa-anomalies/anomalies_mission{MISSION}.csv"

# Time window to preprocess: from January 1st of YEAR to January 1st of YEAR+1.
start_date = datetime(YEAR, 1, 1, 0, 0, 0)
if MISSION == 2 and YEAR == 2003:
    # Special case: for Mission 2 in 2003 the window stops at July 1st.
    end_date = datetime(YEAR, 7, 1, 0, 0, 0)
else:
    end_date = datetime(YEAR + 1, 1, 1, 0, 0, 0)

save_path = f"../data/Mission{MISSION}-Preprocessed/data_preprocessed_{channels_tag}_{INTERPOLATION_METHOD}_{YEAR}.csv"

In [27]:
class BBDD:
    """Hold a dataset and the ESA anomalies table, and export (interpolated) date slices.

    Instance attributes set by ``__init__``:
        data_path, esa_anomalies_path: the paths given by the caller.
        esa_anomalies: anomalies table with timezone-naive ``StartTime`` / ``EndTime``.
        original_df: the dataset, with its index converted to datetimes.
    """

    def __init__(self, data_path: str = "ESA-data/ESA-Mission1/dataset.csv", esa_anomalies_path: str = "notebooks/needed_data/anomalies.csv"):
        """Read the anomalies CSV and the dataset CSV from disk.

        Args:
            data_path: Comma-separated dataset file, loaded through the project
                ``read_csv`` helper.
            esa_anomalies_path: CSV with ``StartTime`` and ``EndTime`` columns.
        """
        self.data_path = data_path
        self.esa_anomalies_path = esa_anomalies_path

        anomalies = pd.read_csv(esa_anomalies_path)
        for column in ('StartTime', 'EndTime'):
            # Unparseable values become NaT; the timezone is then dropped.
            parsed = pd.to_datetime(anomalies[column], format='mixed', errors='coerce')
            anomalies[column] = parsed.dt.tz_localize(None)
        self.esa_anomalies = anomalies

        dataset = read_csv(data_path, sep=',')
        dataset.index = pd.to_datetime(dataset.index)
        self.original_df = dataset

    def get_data_between_dates(self, start_date: datetime, end_date: datetime, save_path: str = None) -> pd.DataFrame:
        """Return the rows whose index lies in ``[start_date, end_date]``, both ends inclusive.

        Args:
            start_date: Lower bound of the window.
            end_date: Upper bound of the window.
            save_path: Optional CSV destination for the selection.

        Returns:
            The selected rows of ``original_df``.
        """
        timestamps = self.original_df.index
        in_window = (timestamps >= start_date) & (timestamps <= end_date)
        selection = self.original_df[in_window]
        self.save_df_as_csv(selection, save_path)
        return selection

    def get_data_preprocessed_between_dates(self,
                                            start_date: datetime,
                                            end_date: datetime,
                                            save_path: str = None,
                                            interpolation_method: str = 'previous',
                                            sample_frequency: str = '30s') -> pd.DataFrame:
        """Slice the dataset to the window, interpolate it and validate the result.

        The slice and the anomalies table are passed to the project helper
        ``interpolate_dataframe``. The outcome is validated, its shape is
        printed, and it is saved when ``save_path`` is given.

        Args:
            start_date: Lower bound of the window.
            end_date: Upper bound of the window.
            save_path: Optional CSV destination for the interpolated data.
            interpolation_method: Forwarded to ``interpolate_dataframe``.
            sample_frequency: Forwarded to ``interpolate_dataframe``.

        Returns:
            The interpolated DataFrame.

        Raises:
            ValueError: If the result fails the format check.
        """
        window = self.get_data_between_dates(start_date, end_date)
        result = interpolate_dataframe(
            window,
            start_date,
            end_date,
            self.esa_anomalies,
            interpolation_method=interpolation_method,
            sample_frequency=sample_frequency,
        )

        self.__check_result_format(result)
        print('Formato: OK')
        print(result.shape)
        self.save_df_as_csv(result, save_path)
        return result

    def __check_result_format(self, df: pd.DataFrame):
        """Raise ``ValueError`` unless ``df`` has a DatetimeIndex named 'time' and no NaN."""
        index = df.index
        if not isinstance(index, pd.DatetimeIndex):
            raise ValueError("El índice del DataFrame no es un DateTimeIndex.")

        has_missing = df.isnull().any().any()
        if has_missing:
            raise ValueError("Hay valores NaN en el DataFrame.")

        if index.name != 'time':
            raise ValueError("El índice debe llamarse time")

    def save_df_as_csv(self, df: pd.DataFrame, save_path: str):
        """Write ``df`` (index included) as a ';'-separated CSV; do nothing if ``save_path`` is None."""
        if save_path is None:
            return
        df.to_csv(save_path, index=True, sep=';')

In [28]:
def print_time_difference(interpolated_result):
    """Print the row count and the min / max / mean gap, in seconds, between consecutive index entries.

    Args:
        interpolated_result: DataFrame whose index holds the timestamps to compare.
    """
    gaps = interpolated_result.index.to_series().diff().dropna().dt.total_seconds()
    shortest, longest, average = gaps.min(), gaps.max(), gaps.mean()

    def compact(value):
        # Four decimals, then drop trailing zeros and a dangling decimal point.
        return f"{value:.4f}".rstrip('0').rstrip('.')

    print(f"Filas: {len(interpolated_result)}")
    print(f"Diferencia mínima: {shortest} segundos")
    print(f"Diferencia máxima: {longest} segundos")
    print(f"Diferencia media: {average} segundos")

    summary = " / ".join(compact(value) for value in (shortest, longest, average))
    print("Diferencia mínima / Diferencia máxima / Diferencia media: " + summary)

In [None]:
# Load the dataset and anomalies, interpolate the configured window and save the result.
bbdd = BBDD(data_path=data_path, esa_anomalies_path=esa_anomalies_path)
interpolated_result = bbdd.get_data_preprocessed_between_dates(
    start_date=start_date,
    end_date=end_date,
    save_path=save_path,
    interpolation_method=INTERPOLATION_METHOD,
    sample_frequency=sample_frequency,
)
print(f"Result stored: {save_path}")

# Report the spacing between consecutive timestamps of the interpolated result.
print_time_difference(interpolated_result)