# Preprocessing

## Preprocessing

####  1/ Loads the dataset and resample timeseries

####  2/ Split a dataframe into train set and test set according to the split rate

####  3/ Standardize Data (Scaling the data)

####  4/ Construction of the dataset according to peak and off-peak hours or according to activity labels

####  5/ Creation of sequences of length T and according to the overlapping period

#### Return preprocessed ``3D-array`` ``[samples, SEQUENCE_LENGTH, features]`` (i.e sequences from the timeseries) , as required for **LSTM** network. We want our network to have memory of **10 days**, so we set ``SEQUENCE_LENGTH = 10``.

In [1]:
# ---- utils libs ----
import numpy as np
import pandas as pd
import datetime
from typing import Optional

# ---- ML libs ----
from sklearn.preprocessing import StandardScaler

from module import load_dataset, time_in_range, segmentDf, create_sequence, train_test_split_dataset

In [2]:
def preprocessing(filepath: str, timeframes: list
                  ,sequence_length: int, overlap_period: int
                  ,resample_period :Optional[str]=None
                  ,use_labels :Optional[bool]=False, split_rate :Optional[float]=0.2) -> np.array:
    """
    1/ Loads the dataset and resample timeseries
    2/ Split a dataframe into train set and test set according to the split rate
    3/ Standardize Data
    4/ Construction of the dataset according to peak and off-peak hours 
    or according to activity labels
    5/ Creation of sequences of length T and according to the overlapping period
    
    Args:
        - filename: the path to the file to load
        - resample_period: (optional) the reasmple period, if None the default period of 1 second will be used
        - df: dataframe with a hands column indicating the instantaneous power and the datetime as index
        - timeframes: list of tuples indicating the periods of the day ex: timeframes = [(datetime.time(10,0,0), datetime.time(6,0,0)), (datetime.time(12,0,0), datetime.time(13,0,0))
        - use_labels: (False by default) use the activities labels
        - sequence_length: length of the sequence
        - overlap_period: overlap the sequences of timeseries
        - split_rate: Rate of the test set size
    Returns: 
        - list of prepocessed 3D-array [samples, sequence_length, features] (i.e sequences from the timeseries) 
    """
    # load dataset and resampled timeseries
    df_resampled = load_dataset(filepath, resample_period)
    
    # split dataframe into train set and test set
    train_df, test_df = train_test_split_dataset(df_resampled)
    
    # Standardize Data
    scaler = StandardScaler()
    scaler_train = scaler.fit(train_df[['mains']])
    scaler_test = scaler.fit(test_df[['mains']])
    
    train_df['mains'] = scaler_train.transform(train_df[['mains']])
    test_df['mains'] = scaler_test.transform(test_df[['mains']])
    
    # --- TRAIN SEQUENCES ----
    # Construction of the dataset according to peak and off-peak hours 
    list_df_train = segmentDf(train_df, timeframes = timeframes)
    
    # init 3D-array [samples, sequence_length, features]
    first_df_train = list_df_train[0]
    list_X_sequence_train, list_y_sequence_train = create_sequence(first_df_train, sequence_length, overlap_period)
    list_df_train.pop(0) # delete the first element of the list of train dataframes
    
    # Creation of sequences of length T and according to the overlapping period
    for df_train_ in list_df_train:
        X_sequences_train, y_sequences_train = create_sequence(df_train_, sequence_length, overlap_period)
        list_X_sequence_train = np.append(list_X_sequence_train, X_sequences_train, axis = 0)
        list_y_sequence_train = np.append(list_y_sequence_train, y_sequences_train, axis = 0)
        
    # --- TEST SEQUENCES ----
    # Construction of the dataset according to peak and off-peak hours 
    list_df_test = segmentDf(test_df, timeframes = timeframes)
    
    # init 3D-array [samples, sequence_length, features]
    first_df_test = list_df_test[0]
    list_X_sequence_test, list_y_sequence_test = create_sequence(first_df_test, sequence_length, overlap_period)
    list_df_test.pop(0) # delete the first element of the list of test dataframes
    
    # Creation of sequences of length T and according to the overlapping period
    for df_test_ in list_df_test:
        X_sequences_test, y_sequences_test = create_sequence(df_test_, sequence_length, overlap_period)
        list_X_sequence_test = np.append(list_X_sequence_test, X_sequences_test, axis = 0)
        list_y_sequence_test = np.append(list_y_sequence_test, y_sequences_test, axis = 0)
    
    return list_X_sequence_train, list_y_sequence_train, list_X_sequence_test, list_y_sequence_test

In [3]:
X_sequence_train, y_sequence_train, X_sequence_test, y_sequence_test = preprocessing(filepath = "data/house1_power_blk2_labels.csv" 
                          ,timeframes = [(datetime.time(10,0,0), datetime.time(6,0,0))
                                         , (datetime.time(11,0,0), datetime.time(14,0,0))
                                         ,(datetime.time(18,0,0), datetime.time(23,0,0))]
                          ,sequence_length = 10, overlap_period = 5, resample_period = "10min")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['mains'] = scaler_train.transform(train_df[['mains']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['mains'] = scaler_test.transform(test_df[['mains']])


### Visualize shape of the sequence

In [4]:
print("---- X_train sequence shape ----")
print(X_sequence_train.shape)

print("\n---- y_train sequence shape ----")
print(y_sequence_train.shape)

print("\n\n---- X_test sequence shape ----")
print(X_sequence_test.shape)

print("\n---- y_test sequence shape ----")
print(y_sequence_test.shape)

---- X_train sequence shape ----
(1506, 10, 1)

---- y_train sequence shape ----
(1506,)


---- X_test sequence shape ----
(380, 10, 1)

---- y_test sequence shape ----
(380,)


### Visualize the sequence

In [5]:
print("---- X_train sequence ----")
X_sequence_train

---- X_train sequence ----


array([[[-0.58211285],
        [-0.77701554],
        [-0.17170798],
        ...,
        [-0.7484932 ],
        [-0.17963086],
        [-0.39513302]],

       [[-0.5916203 ],
        [-0.16536968],
        [-0.7484932 ],
        ...,
        [-0.5726054 ],
        [-0.20656863],
        [-0.61538892]],

       [[-0.77384639],
        [-0.39830217],
        [-0.5726054 ],
        ...,
        [-0.17646171],
        [-0.5916203 ],
        [-0.77701554]],

       ...,

       [[ 1.18151887],
        [ 0.58571876],
        [ 0.47796768],
        ...,
        [ 0.61582568],
        [ 0.61741026],
        [-0.02434253]],

       [[ 0.16580645],
        [ 0.29891073],
        [ 0.61582568],
        ...,
        [ 0.47479853],
        [-0.41256335],
        [-0.40147132]],

       [[ 0.30207988],
        [ 0.54768897],
        [ 0.47479853],
        ...,
        [-0.1891383 ],
        [-0.384041  ],
        [-0.37611813]]])

In [6]:
print("---- y_train sequence ----")
y_sequence_train

---- y_train sequence ----


array([-0.77384639, -0.37294898, -0.36819525, ...,  0.30207988,
       -0.60746604, -0.38879473])

In [7]:
print("---- X_test sequence ----")
X_sequence_test

---- X_test sequence ----


array([[[-0.15110851],
        [-0.11624786],
        [ 0.32267935],
        ...,
        [ 0.21334369],
        [-0.10515584],
        [-0.11624786]],

       [[ 0.33377137],
        [ 0.13569952],
        [ 0.21334369],
        ...,
        [-0.38721015],
        [-0.16536968],
        [-0.56151338]],

       [[-0.55517508],
        [-0.35234951],
        [-0.38721015],
        ...,
        [ 0.03428674],
        [-0.56309795],
        [-0.3713644 ]],

       ...,

       [[ 0.69663899],
        [ 1.21637952],
        [-0.42999367],
        ...,
        [ 0.43676873],
        [ 0.08657771],
        [ 0.39398521]],

       [[ 1.3811753 ],
        [-0.00374306],
        [ 0.43676873],
        ...,
        [ 0.05805536],
        [ 2.3604425 ],
        [-0.17804628]],

       [[ 0.28940328],
        [ 0.23394316],
        [ 0.05805536],
        ...,
        [-0.38879473],
        [-0.19706118],
        [-0.3903793 ]]])

In [8]:
print("---- y_test sequence ----")
y_sequence_test

---- y_test sequence ----


array([-5.55175076e-01, -3.41257482e-01, -5.61513375e-01, -3.71364403e-01,
        2.56127207e-01,  4.57368203e-01, -1.25755313e-01,  4.46276180e-01,
        1.31303858e+00,  3.06833600e-01,  1.43622398e-01, -7.50489204e-02,
       -7.54831498e-01, -4.12563347e-01, -4.17317071e-01, -3.71364403e-01,
       -5.64682525e-01, -1.44770210e-01, -5.78943698e-01, -3.77702702e-01,
       -6.31234665e-01,  3.04656338e+00,  2.07005389e-01, -1.63785108e-01,
       -2.98473963e-01,  8.34085569e-02, -5.63097950e-01,  3.00495301e-01,
        1.32530375e-01, -1.14663290e-01,  3.13371499e+00,  2.02251665e-01,
        1.16684627e-01,  8.78865093e-01, -4.09394197e-01, -7.51662348e-01,
       -2.44598421e-01, -4.33162819e-01, -5.45667628e-01, -5.36160179e-01,
       -3.61856954e-01, -7.18386278e-01, -3.91963875e-01, -3.38499763e-02,
       -2.66782468e-01,  2.16101407e-02, -1.28924463e-01, -4.36331968e-01,
       -1.06740416e-01, -2.92135664e-01,  1.83236768e-01,  3.17925623e-01,
        3.21135916e+00,  