# Preprocessing

#### 1/ Load the dataset and resample the time series

#### 2/ Construction of the dataset according to peak and off-peak hours or according to activity labels

#### 3/ Creation of sequences of length T according to the overlap period

#### Returns a list of preprocessed ``3D-array`` ``[samples, sequence_length, features]`` (i.e. sequences from the time series)

In [1]:
# --- utils libs ---
import numpy as np
import pandas as pd
import datetime
from typing import Optional

from module import load_dataset, time_in_range, segmentDf, create_sequence

In [2]:
def preprocessing(filepath: str, timeframes: list,
                  sequence_length: int, overlap_period: int,
                  resample_period: Optional[str] = None,
                  use_labels: Optional[bool] = False) -> list:
    """
    Build windowed sequences from a time series, one array per timeframe segment.

    1/ Loads the dataset and resamples the time series
    2/ Splits the dataset according to the given timeframes
       (e.g. peak and off-peak hours)
    3/ Creates sequences of length ``sequence_length`` using the overlap period

    Args:
        - filepath: the path to the file to load
        - timeframes: list of tuples indicating the periods of the day, ex:
          timeframes = [(datetime.time(10,0,0), datetime.time(6,0,0)),
                        (datetime.time(12,0,0), datetime.time(13,0,0))]
        - sequence_length: length of each sequence
        - overlap_period: overlap between consecutive sequences
        - resample_period: (optional) the resample period, forwarded as-is to
          ``load_dataset``; if None, a default period of 1 second is expected
          to be used (TODO confirm in ``load_dataset``)
        - use_labels: (False by default) meant to select segmentation by
          activity labels. NOTE(review): this flag is currently not read by
          this function, so passing True has no effect here.
    Returns:
        - list with one entry per dataframe returned by ``segmentDf``; each
          entry is the result of ``create_sequence`` for that dataframe
          (expected to be a 3D-array [samples, sequence_length, features])
    """
    # Load the dataset and resample the time series.
    df_resampled = load_dataset(filepath, resample_period)

    # Split the resampled data into one dataframe per requested timeframe.
    segments = segmentDf(df_resampled, timeframes=timeframes)

    # Window every segment independently so sequences never span two segments.
    return [
        create_sequence(segment, sequence_length, overlap_period)
        for segment in segments
    ]

In [3]:
# Periods of the day used to segment the dataset; defined once and reused in
# the call below so the two cannot drift apart.
timeframes = [(datetime.time(10,0,0), datetime.time(6,0,0)), (datetime.time(12,0,0), datetime.time(13,0,0))]

# One array of sequences is returned per timeframe, in the same order.
sequences = preprocessing(filepath="data/house1_power_blk2_labels.csv",
                          timeframes=timeframes,
                          sequence_length=10, overlap_period=5, resample_period="30min")

### Visualize shape of the sequence

In [4]:
# Shape of the second entry (index 1) of the list returned by preprocessing().
sequences[1].shape

(7, 10, 1)

### Visualize the sequence

In [5]:
# Display the second entry (index 1) of the list returned by preprocessing().
sequences[1]

array([[[3093.],
        [ 981.],
        [ 909.],
        [ 536.],
        [ 547.],
        [ 759.],
        [ 550.],
        [ 670.],
        [5549.],
        [5312.]],

       [[ 759.],
        [ 550.],
        [ 670.],
        [5549.],
        [5312.],
        [3898.],
        [1958.],
        [1856.],
        [ 915.],
        [1329.]],

       [[3898.],
        [1958.],
        [1856.],
        [ 915.],
        [1329.],
        [ 896.],
        [1120.],
        [3738.],
        [1260.],
        [1239.]],

       [[ 896.],
        [1120.],
        [3738.],
        [1260.],
        [1239.],
        [1439.],
        [1422.],
        [ 994.],
        [1172.],
        [1223.]],

       [[1439.],
        [1422.],
        [ 994.],
        [1172.],
        [1223.],
        [ 846.],
        [ 401.],
        [1088.],
        [ 726.],
        [ 480.]],

       [[ 846.],
        [ 401.],
        [1088.],
        [ 726.],
        [ 480.],
        [ 620.],
        [ 729.],
        [ 588.],
    