# Overall comparison
Straightforward comparison between the two baselines (random, RLP) and the two proposed techniques (expert-based and data-driven).

In [None]:
import altair as alt
# standard imports
import numpy as np
import pandas as pd

# Lift Altair's row-count limit for chart data (presumably so that larger
# DataFrames can be plotted; no chart is built in the cells shown here).
alt.data_transformers.disable_max_rows()
from pathlib import Path

# loading the data 
from scengen.data import generate_mockup_data

# preprocessing the yearly info 
from scengen.preprocessing import YearlyInfoPreprocessor

# sampling models 
from scengen.models.samplers import (RandomSampler, DailySamplerFromClusterSampler)
from scengen.models.basesamplers import (ExpertDaySelectionSampler, MetadataClusterSampler, ConsumptionClusterSampler)
from scengen.models.generators import RLPGenerator

# clustering helpers
from scengen.cluster.elbow import ElbowMethod
import scengen.cluster.metrics as dist_metrics

# clustering algorithms 
from sklearn.cluster import KMeans
from kmedoids import KMedoids

# classifiers 
from sklearn.ensemble import RandomForestClassifier

# evaluation 
from scengen.evaluation import SamplerEvaluator

## Load the mock-up data

In [None]:
# Build the mock-up dataset. The call returns four tables, unpacked here as
# yearly/daily consumption data and yearly/daily info (presumably pandas
# DataFrames, given the `_df` suffix -- confirm against scengen.data).
yearly_data_df, daily_data_df, yearly_info_df, daily_info_df = generate_mockup_data()

In [None]:
# Bare expression as the last statement of the cell: the notebook renders the
# yearly data table for a quick visual check.
yearly_data_df

# Generate folds for cross-validation

In [None]:
# Seeded generator, kept so that any random step can be made reproducible.
# NOTE(review): the generator is not consumed by the split below, so the folds
# are contiguous, unshuffled chunks of the index -- confirm this is intended.
rand_gen = np.random.default_rng(seed=12341243)

# Partition the index of the yearly data into 5 (nearly) equally sized folds.
folds = np.array_split(yearly_data_df.index, indices_or_sections=5)

# Models to compare

In [None]:
# Registry of the models to compare: display name -> configured sampler/generator.
models = {}

# Shared settings.
NB_SAMPLES = 250  # not referenced in this cell
CLUSTER_RANGE = list(range(10, 101, 5))  # candidate cluster counts: 10, 15, ..., 100

# --- Baseline 1: random sampling --------------------------------------------
models['random baseline'] = RandomSampler()

# --- Baseline 2: RLP/SLP ----------------------------------------------------
# This method needs to see the full dataset (both training and test set) in
# order to assign each profile to the correct RLP or SLP category.
models['SLP baseline'] = RLPGenerator(yearly_data_df)

# --- Proposed technique 1: expert-based -------------------------------------
# Year selection: clustering on metadata (yearly consumption and connection
# power), with the number of clusters chosen by the elbow method.
# Day selection: expert day selection.
expert_year_sampler = MetadataClusterSampler(
    clusterer=ElbowMethod(KMeans(n_clusters=1, n_init=10), cluster_range=CLUSTER_RANGE),
    info_preprocessing=YearlyInfoPreprocessor(
        columns_to_use=['yearly_consumption', 'connection_power'],
        normalized=True,
    ),
)
expert_day_sampler = ExpertDaySelectionSampler()
models['expert-based'] = DailySamplerFromClusterSampler(
    yearly_sampler=expert_year_sampler,
    daily_sampler=expert_day_sampler,
)

# --- Proposed technique 2: data-driven --------------------------------------
# Consumption clustering is used for both year and day selection.
data_driven_year_sampler = ConsumptionClusterSampler(
    classifier=RandomForestClassifier(),
    clusterer=ElbowMethod(
        KMedoids(n_clusters=1, method='fasterpam'),
        metric=dist_metrics.euc_distance_matrix_missing,
        cluster_range=CLUSTER_RANGE,
        nb_repeats=10,
    ),
    fillna=False,
)
# In the publication the day-level clusterer is also chosen with the elbow
# method, i.e. ElbowMethod(clusterer=KMeans(n_clusters=1), cluster_range=CLUSTER_RANGE).
# For the mock-up example a fixed 10 clusters is used in every case.
data_driven_day_sampler = ConsumptionClusterSampler(
    classifier=RandomForestClassifier(),
    clusterer=KMeans(n_clusters=10),
)
models['data-driven'] = DailySamplerFromClusterSampler(
    yearly_sampler=data_driven_year_sampler,
    daily_sampler=data_driven_day_sampler,
)



## Configure logging

In [None]:
import datetime
import logging

# One log file per calendar day, named DD-MM-YYYY.log, in the working directory.
log_file = f'{datetime.datetime.now():%d-%m-%Y}.log'

# Root logger: DEBUG and above, timestamped records; the file is opened in
# write mode, so an existing log with the same name is overwritten.
logging.basicConfig(
    filename=log_file,
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s: %(message)s',
)

## Run the experiments

In [None]:
%%time
# Output directory; every model gets its own `<model name>.pkl` path in here.
result_path = Path('results', 'overall_comparison')
result_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): nb_samples is hard-coded to 100 here, while NB_SAMPLES = 250 is
# defined next to the model definitions -- confirm which value is intended.
evaluator = SamplerEvaluator(
    folds, yearly_data_df, daily_data_df, yearly_info_df, daily_info_df, None, 1,
    nb_samples=100, crossval=False,
)

# Evaluate the models one by one, in the order they were registered.
per_model_scores = [
    evaluator.evaluate_and_save(model, result_path / f"{name}.pkl")
    for name, model in models.items()
]

# Put the results side by side, labelled with the model names.
energy_scores = pd.concat(per_model_scores, axis=1, keys=list(models))

## Overall energy scores

In [None]:
# Column-wise mean of the score table, i.e. one average energy score per model
# (the results were concatenated along axis 1 and keyed by model name).
energy_scores.mean(axis = 0)