# Preprocessing

## Preprocessing

####  1/ Loads the dataset and resample timeseries

####  2/ Split a dataframe into train set and test set according to the split rate

####  3/ Standardize Data (Scaling the data)

####  4/ Construction of the dataset according to peak and off-peak hours or according to activity labels

####  5/ Creation of sequences of length T and according to the overlapping period

#### Return preprocessed ``3D-array`` ``[samples, SEQUENCE_LENGTH, features]`` (i.e sequences from the timeseries) , as required for **LSTM** network. We want our network to have memory of **10 days**, so we set ``SEQUENCE_LENGTH = 10``.

In [1]:
# ---- utils libs ----
import numpy as np
import pandas as pd
import datetime
from typing import Optional

# ---- ML libs ----
from sklearn.preprocessing import StandardScaler

from module import load_dataset, time_in_range, segmentDf, create_sequence, train_test_split_dataset, convertToSequenceParameters

In [2]:
TIME_STEP = datetime.timedelta(minutes=1, seconds=30) # duration of a step in the resample dataset, originally 1second
DURATION_TIME = datetime.timedelta(minutes=60) # duration of a sequence
OVERLAP_PERIOD_PERCENT = 0.8 # 0.5 <=> 50% overlapping
TIMEFRAMES = [(datetime.time(22,0,0), datetime.time(6,0,0))]

SEQUENCE_LENGTH, OVERLAP_PERIOD = convertToSequenceParameters(TIME_STEP, DURATION_TIME, OVERLAP_PERIOD_PERCENT)

print("\t\tValeur choisie \t Equivalent sequence\nTimestep : \t {}\nDuration :\t {} \t -->  {} \nOverlap :\t {} \t -->  {}".format(TIME_STEP, DURATION_TIME, SEQUENCE_LENGTH, OVERLAP_PERIOD_PERCENT, OVERLAP_PERIOD))

		Valeur choisie 	 Equivalent sequence
Timestep : 	 0:01:30
Duration :	 1:00:00 	 -->  40 
Overlap :	 0.8 	 -->  32


In [3]:
def preprocessing(filepath: str, timeframes: list
                  ,sequence_length: int, overlap_period: int
                  ,resample_period :Optional[str]=None
                  ,use_labels :Optional[bool]=False, split_rate :Optional[float]=0.2) -> np.array:
    """
    1/ Loads the dataset and resample timeseries
    2/ Split a dataframe into train set and test set according to the split rate
    3/ Standardize Data
    4/ Construction of the dataset according to peak and off-peak hours 
    or according to activity labels
    5/ Creation of sequences of length T and according to the overlapping period
    
    Args:
        - filename: the path to the file to load
        - resample_period: (optional) the reasmple period, if None the default period of 1 second will be used
        - df: dataframe with a hands column indicating the instantaneous power and the datetime as index
        - timeframes: list of tuples indicating the periods of the day ex: timeframes = [(datetime.time(10,0,0), datetime.time(6,0,0)), (datetime.time(12,0,0), datetime.time(13,0,0))
        - use_labels: (False by default) use the activities labels
        - sequence_length: length of the sequence
        - overlap_period: overlap the sequences of timeseries
        - split_rate: Rate of the test set size
    Returns: 
        - list of prepocessed 3D-array [samples, sequence_length, features] (i.e sequences from the timeseries) 
    """
    # load dataset and resampled timeseries
    df_resampled = load_dataset(filepath, resample_period)
    
    # split dataframe into train set and test set
    train_df, test_df = train_test_split_dataset(df_resampled)
    
    # Standardize Data
    scaler = StandardScaler()
    scaler_train = scaler.fit(train_df[['mains']])
    scaler_test = scaler.fit(test_df[['mains']])
    
    train_df['mains'] = scaler_train.transform(train_df[['mains']])
    test_df['mains'] = scaler_test.transform(test_df[['mains']])
    
    # --- TRAIN SEQUENCES ----
    # Construction of the dataset according to peak and off-peak hours 
    list_df_train = segmentDf(train_df, timeframes = timeframes)
    
    # init 3D-array [samples, sequence_length, features]
    first_df_train = list_df_train[0]
    list_X_sequence_train, list_y_sequence_train = create_sequence(first_df_train, sequence_length, overlap_period)
    list_df_train.pop(0) # delete the first element of the list of train dataframes
    
    # Creation of sequences of length T and according to the overlapping period
    for df_train_ in list_df_train:
        X_sequences_train, y_sequences_train = create_sequence(df_train_, sequence_length, overlap_period)
        list_X_sequence_train = np.append(list_X_sequence_train, X_sequences_train, axis = 0)
        list_y_sequence_train = np.append(list_y_sequence_train, y_sequences_train, axis = 0)
        
    # ---- TEST SEQUENCES ----
    X_sequences_test, y_sequences_test = create_sequence(test_df, sequence_length, overlap_period)
    
    return list_X_sequence_train, list_y_sequence_train, X_sequences_test, y_sequences_test

In [4]:
X_sequence_train, y_sequence_train, X_sequence_test, y_sequence_test = preprocessing(filepath = "data/house1_power_blk2_labels.csv" 
                          ,timeframes = TIMEFRAMES
                          ,sequence_length = SEQUENCE_LENGTH, overlap_period = OVERLAP_PERIOD, resample_period = TIME_STEP)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['mains'] = scaler_train.transform(train_df[['mains']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['mains'] = scaler_test.transform(test_df[['mains']])


### Visualize shape of the sequence

In [5]:
print("---- X_train sequence shape ----")
print(X_sequence_train.shape)

print("\n---- y_train sequence shape ----")
print(y_sequence_train.shape)

print("\n\n---- X_test sequence shape ----")
print(X_sequence_test.shape)

print("\n---- y_test sequence shape ----")
print(y_sequence_test.shape)

---- X_train sequence shape ----
(1826, 40, 1)

---- y_train sequence shape ----
(1826, 40, 2)


---- X_test sequence shape ----
(1506, 40, 1)

---- y_test sequence shape ----
(1506, 40, 2)


### Visualize the sequence

In [6]:
print("---- X_train sequence ----")
X_sequence_train

---- X_train sequence ----


array([[[-0.59421227],
        [-0.57840191],
        [-0.59263123],
        ...,
        [-0.76338314],
        [-0.62108988],
        [-0.12780661]],

       [[-0.60527952],
        [-0.35231374],
        [-0.38235343],
        ...,
        [-0.79026075],
        [-0.78235557],
        [-0.76022106]],

       [[-0.57682087],
        [-0.58156398],
        [-0.58156398],
        ...,
        [-0.1799808 ],
        [ 0.01448664],
        [-0.19579117]],

       ...,

       [[-0.44085177],
        [-0.43768969],
        [-0.41871726],
        ...,
        [ 4.53150687],
        [ 3.02636047],
        [ 5.81688924]],

       [[-0.64480543],
        [-0.64164335],
        [-0.64954853],
        ...,
        [ 2.67378941],
        [ 2.67378941],
        [-0.24638432]],

       [[ 0.32278869],
        [ 0.24847999],
        [ 3.14177611],
        ...,
        [ 0.33385594],
        [ 0.45717676],
        [ 0.44294743]]])

In [7]:
print("---- y_train sequence ----")
y_sequence_train

---- y_train sequence ----


array([[[Timestamp('2016-03-06 00:00:00'), 0],
        [Timestamp('2016-03-06 00:01:30'), 0],
        [Timestamp('2016-03-06 00:03:00'), 0],
        ...,
        [Timestamp('2016-03-06 00:55:30'), 0],
        [Timestamp('2016-03-06 00:57:00'), 0],
        [Timestamp('2016-03-06 00:58:30'), 0]],

       [[Timestamp('2016-03-06 00:12:00'), 0],
        [Timestamp('2016-03-06 00:13:30'), 0],
        [Timestamp('2016-03-06 00:15:00'), 0],
        ...,
        [Timestamp('2016-03-06 01:07:30'), 0],
        [Timestamp('2016-03-06 01:09:00'), 0],
        [Timestamp('2016-03-06 01:10:30'), 0]],

       [[Timestamp('2016-03-06 00:24:00'), 0],
        [Timestamp('2016-03-06 00:25:30'), 0],
        [Timestamp('2016-03-06 00:27:00'), 0],
        ...,
        [Timestamp('2016-03-06 01:19:30'), 0],
        [Timestamp('2016-03-06 01:21:00'), 0],
        [Timestamp('2016-03-06 01:22:30'), 0]],

       ...,

       [[Timestamp('2016-04-25 04:36:00'), 0],
        [Timestamp('2016-04-25 04:37:30'), 0],
  

In [8]:
print("---- X_test sequence ----")
X_sequence_test

---- X_test sequence ----


array([[[-0.18472391],
        [-0.51041736],
        [-0.51357943],
        ...,
        [-0.33176027],
        [-0.18156184],
        [ 0.03187804]],

       [[-0.39658275],
        [-0.15152215],
        [-0.39342068],
        ...,
        [-0.38077239],
        [-0.2716809 ],
        [-0.28749126]],

       [[ 0.01132457],
        [-0.1973722 ],
        [-0.03136341],
        ...,
        [-0.18472391],
        [-0.02345823],
        [-0.18156184]],

       ...,

       [[ 0.31330247],
        [ 0.28800589],
        [ 0.44927158],
        ...,
        [-0.38235343],
        [-0.54361911],
        [-0.54994326]],

       [[-0.35389478],
        [-0.3855155 ],
        [-0.46614834],
        ...,
        [-0.15152215],
        [-0.35705685],
        [-0.37761032]],

       [[-0.19104806],
        [-0.19421013],
        [-0.3855155 ],
        ...,
        [-0.58314502],
        [-0.57682087],
        [-0.34757063]]])

In [9]:
print("---- y_test sequence ----")
y_sequence_test

---- y_test sequence ----


array([[[Timestamp('2016-04-25 08:48:00'), 0],
        [Timestamp('2016-04-25 08:49:30'), 0],
        [Timestamp('2016-04-25 08:51:00'), 0],
        ...,
        [Timestamp('2016-04-25 09:43:30'), 0],
        [Timestamp('2016-04-25 09:45:00'), 0],
        [Timestamp('2016-04-25 09:46:30'), 0]],

       [[Timestamp('2016-04-25 09:00:00'), 0],
        [Timestamp('2016-04-25 09:01:30'), 0],
        [Timestamp('2016-04-25 09:03:00'), 0],
        ...,
        [Timestamp('2016-04-25 09:55:30'), 0],
        [Timestamp('2016-04-25 09:57:00'), 0],
        [Timestamp('2016-04-25 09:58:30'), 0]],

       [[Timestamp('2016-04-25 09:12:00'), 0],
        [Timestamp('2016-04-25 09:13:30'), 0],
        [Timestamp('2016-04-25 09:15:00'), 0],
        ...,
        [Timestamp('2016-04-25 10:07:30'), 0],
        [Timestamp('2016-04-25 10:09:00'), 0],
        [Timestamp('2016-04-25 10:10:30'), 0]],

       ...,

       [[Timestamp('2016-05-07 21:24:00'), 1],
        [Timestamp('2016-05-07 21:25:30'), 1],
  