# Hyperparameters Fine Tuning

### ==== Import libs ====

In [2]:
# ---- utils libs ----
import numpy as np
import pandas as pd
import datetime
from typing import Optional

# --- Import functions from utils.py ---
import sys
sys.path.insert(0,'../src')

from utils import plot_confusion_matrix, plot_activity_hist, load_dataset, load_aggregate_dataset, time_in_range, segmentDf, create_sequence, train_test_split_dataset, convertToSequenceParameters

# ---- Data Viz libs ---- 
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import hiplot as hip

# ---- ML libs ----
from sklearn.preprocessing import StandardScaler

# ---- Deep Learning libs ----
from tensorflow import keras
from tensorflow.keras import layers

### Optimizer hyperparameters model ###
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

## Preprocessing

####  1/ Load the dataset and resample timeseries
####  2/ Split a dataframe into train set and test set according to the split rate
####  3/ Standardize Data
####  4/ Construction of the dataset according to peak and off-peak hours or according to activity labels
####  5/ Creation of sequences of length T and according to the overlapping period

Returns preprocessed ``3D-array``s of shape ``[samples, SEQUENCE_LENGTH, features]`` (i.e. sequences from the timeseries), as required for an **LSTM** network. We want our network to have a memory of **10 days**, so we set ``SEQUENCE_LENGTH=10``.

In [3]:
def data_preprocessing(
    timeframes: list,
    sequence_length: int,
    overlap_period: int,
    resample_period: Optional[str] = None,
    use_labels: Optional[bool] = False,
    strategy: Optional[str] = "off_peak_time",
    split_rate: Optional[float] = 0.2,
    split_method: Optional[str] = None,
) -> tuple:
    """
    Load the power dataset and turn it into train / test sequences.

    Steps:
    1/ Load the dataset and resample the timeseries
    2/ Split the dataframe into train set and test set according to the split rate
    3/ Standardize the 'mains' column (scaler fitted on the train set only)
    4/ Segment the train set according to the given timeframes (``segmentDf``)
    5/ Create sequences of length ``sequence_length`` with the given overlap

    Args:
        - timeframes: list of tuples indicating the periods of the day, forwarded
          to ``segmentDf``, ex: [(datetime.time(10,0,0), datetime.time(6,0,0)),
          (datetime.time(12,0,0), datetime.time(13,0,0))]
        - sequence_length: length of the sequences, forwarded to ``create_sequence``
        - overlap_period: overlap between sequences, forwarded to ``create_sequence``
        - resample_period: (optional) resample period forwarded to ``load_dataset``
          (per the original notes, None means the default period of 1 second
          -- TODO confirm against utils.load_dataset)
        - use_labels: accepted for backward compatibility; not used by this function
        - strategy: only "off_peak_time" is implemented
        - split_rate: rate of the test set size, forwarded to
          ``train_test_split_dataset``
        - split_method: forwarded to ``train_test_split_dataset``; when equal to
          "random_days" the train / test frames are first cut into contiguous
          runs of rows (using the returned test mask) so that no sequence spans
          two non-adjacent periods
    Returns:
        - tuple ``(train_df, test_df, X_train, y_train, X_test, y_test)`` where the
          X / y entries are the stacked outputs of ``create_sequence``
          (presumably 3D-arrays [samples, sequence_length, features] -- verify
          against utils.create_sequence)
    Raises:
        - ValueError: if ``strategy`` is not supported, or if no train / test
          segment is available to build sequences from
    """
    # Fail fast: previously an unsupported strategy went through the whole
    # (expensive) loading pipeline and then silently returned None.
    if strategy != "off_peak_time":
        raise ValueError(
            f"Unsupported strategy {strategy!r}: only 'off_peak_time' is implemented"
        )

    # Display preprocessing parameters
    print("\n---- Post Processing Parameters ----")
    print("TIMEFRAMES = ", timeframes)
    print("SEQUENCE_LENGTH = ", sequence_length)
    print("RESAMPLE_PERIOD = ", resample_period)
    print("OVERLAP_PERIOD = ", overlap_period)
    print("STRATEGY = ", strategy)

    # load dataset with labels and resampled timeseries
    print("")
    print("")
    print("#### Loading and Resampling Data... ####")
    print("")
    df_resampled = load_dataset("house1_power_blk2_labels.zip", resample_period)

    print("#### Creating Train and Test set... ####")
    print("")
    print("")
    # split dataframe into train set and test set
    train_df, test_df, mask_test = train_test_split_dataset(
        df_resampled, method=split_method, split_rate=split_rate
    )

    # Standardize Data: the scaler is fitted on the train set only so that no
    # statistics from the test set leak into training.
    print("#### Rescaling Data... ####")
    print("")
    print("")
    scaler_train = StandardScaler().fit(train_df.loc[:, ['mains']])
    train_df.loc[:, 'mains'] = scaler_train.transform(train_df.loc[:, ['mains']])
    test_df.loc[:, 'mains'] = scaler_train.transform(test_df.loc[:, ['mains']])

    by_random_days = split_method == "random_days"
    if by_random_days:
        # NOTE(review): assumes mask_test flags, row by row of the resampled
        # dataframe, which rows went to the test set -- confirm in utils.
        test_rows = np.asarray(mask_test, dtype=bool)

    # ---- TEST SEQUENCES ----
    print("#### Creating Test Sequence... ####")
    print("")
    print("")
    if by_random_days:
        X_sequences_test, y_sequences_test = _stack_sequences(
            _split_contiguous_runs(test_df, test_rows),
            sequence_length,
            overlap_period,
        )
    else:
        X_sequences_test, y_sequences_test = create_sequence(
            test_df, sequence_length, overlap_period
        )

    print("Strategy chosen : ", strategy)
    print("")
    print("#### Creating Train Sequence... ####")
    print("")
    print("")
    # --- TRAIN SEQUENCES ----
    # Construction of the dataset according to peak and off-peak hours
    if by_random_days:
        list_df_train = []
        for chunk in _split_contiguous_runs(train_df, ~test_rows):
            list_df_train.extend(segmentDf(chunk, timeframes=timeframes))
    else:
        list_df_train = segmentDf(train_df, timeframes=timeframes)

    # Creation of sequences of length T and according to the overlapping period
    list_X_sequence_train, list_y_sequence_train = _stack_sequences(
        list_df_train, sequence_length, overlap_period
    )

    print("---- X_train sequence shape ----")
    print(list_X_sequence_train.shape)

    print("\n---- y_train sequence shape ----")
    print(list_y_sequence_train.shape)

    print("\n\n---- X_test sequence shape ----")
    print(X_sequences_test.shape)

    print("\n---- y_test sequence shape ----")
    print(y_sequences_test.shape)

    return train_df, test_df, list_X_sequence_train, list_y_sequence_train, X_sequences_test, y_sequences_test


def _split_contiguous_runs(df, selected_rows):
    """Cut ``df`` into one frame per contiguous run of True in ``selected_rows``.

    ``df`` is expected to hold exactly the rows flagged True in
    ``selected_rows`` (same order). A leading cut at position 0 is dropped so
    that no empty first frame is produced.
    """
    selected_rows = np.asarray(selected_rows, dtype=bool)
    # A row starts a new run when its flag differs from the previous row's flag;
    # restricting to the selected rows gives positions relative to ``df``.
    run_starts = (selected_rows != np.roll(selected_rows, 1))[selected_rows]
    cut_points = np.flatnonzero(run_starts)
    cut_points = cut_points[cut_points > 0]
    # Plain positional slicing instead of np.split(df, ...), which relies on
    # DataFrame.swapaxes (deprecated in recent pandas releases).
    bounds = [0, *cut_points.tolist(), len(df)]
    return [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]


def _stack_sequences(frames, sequence_length, overlap_period):
    """Run ``create_sequence`` on every frame and stack the results along axis 0."""
    if not frames:
        raise ValueError("No dataframe segment available to build sequences from")
    x_parts, y_parts = [], []
    for frame in frames:
        x_seq, y_seq = create_sequence(frame, sequence_length, overlap_period)
        x_parts.append(x_seq)
        y_parts.append(y_seq)
    # Single concatenation instead of np.append in a loop (which re-copies the
    # accumulated array at every iteration).
    return np.concatenate(x_parts, axis=0), np.concatenate(y_parts, axis=0)


## Build a custom convolutional autoencoder architecture

We will build a convolutional reconstruction autoencoder model. The model will take input of shape (``batch_size``, ``sequence_length``, ``num_features``) and return output of the same shape.

In [5]:
# To complete

## Build custom metric

In [7]:
# Call the evaluation function, which returns a score (e.g. AUC)

In [4]:
# To complete

### Define objective function

In [3]:
# To complete

def objective(trial):
    """
    Optuna objective: fit a random forest with sampled hyperparameters and score it.

    Args:
        - trial: optuna trial used to sample ``n_estimators`` (int in [1, 5])
          and ``criterion`` ("gini" or "entropy")
    Returns:
        - the value returned by ``custom_metric(y_valid_RF, y_pred, X_valid_RF)``

    NOTE(review): placeholder cell -- it relies on notebook-level names
    (``X_train_RF``, ``y_train_RF``, ``X_valid_RF``, ``y_valid_RF`` and
    ``custom_metric``) that are not defined in the visible cells; they must
    exist before a study is run.
    """
    # Imported here because the notebook's import cell only pulls in
    # sklearn.preprocessing; without this the call below raises NameError.
    from sklearn.ensemble import RandomForestClassifier

    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 5),
        'criterion': trial.suggest_categorical('criterion', ["gini", "entropy"]),
    }

    forest = RandomForestClassifier(**sampled_params)
    forest.fit(X_train_RF, y_train_RF)

    y_pred = forest.predict(X_valid_RF)

    return custom_metric(y_valid_RF, y_pred, X_valid_RF)


# loop the optuna fine-tuning over the different preprocessed datasets

## Fine-tune hyperparameters with ``optuna``

In [None]:
#study = optuna.create_study(direction = "maximize")
#study.optimize(objective, n_trials = 2000)

## Show result into a dataframe

In [None]:
#show optuna results

#study.trials_dataframe()

In [None]:
# merge the dataframe holding the hyperparameters used for the
# preprocessing with the dataframe (i.e. output) generated by optuna

## Analyze the impact of the hyperparameters on the custom metric with ``HiPlot``

In [None]:
# result_hiplot = hip.Experiment.from_dataframe(result_df)
# result_hiplot.display()