# Evaluating a random sampling baseline

In [None]:
from pathlib import Path
import pandas as pd 
import numpy as np
import altair as alt
from dask.distributed import Client

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Read the data

In [None]:
from energyclustering.data.fluvius.data import read_data_pickle
from energyclustering.data.weather.data import read_weather_data

In [None]:
from ast import literal_eval

In [None]:
def _with_string_index(frame):
    """Return *frame* with every (flattened) index label replaced by its ``str`` form."""
    string_labels = frame.index.to_flat_index().map(str)
    return frame.set_axis(string_labels, axis=0)


# Load the info and data frames and give both of them plain string row labels.
info_df, data_df = map(_with_string_index, read_data_pickle())

In [None]:
# Build a reproducible 3-fold split of the row labels of data_df:
# shuffle the labels with a fixed seed, then cut them into three parts
# (np.array_split allows the parts to differ in length).
generator = np.random.default_rng(1)
# Work on a copy because the shuffle below happens in place.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
folds = np.array_split(shuffled, 3)

In [None]:
# Report the in-memory size of both raw frames in megabytes (10**6 bytes).
# NOTE(review): memory_usage() is shallow by default, so object-dtype
# contents (e.g. the string index labels) are not fully counted.
data_df_mb = data_df.memory_usage().sum() / 1000 / 1000
info_df_mb = info_df.memory_usage().sum() / 1000 / 1000
print(f"data_df = {data_df_mb} MB")
print(f"info_df = {info_df_mb} MB")

In [None]:
# Weather columns that are dropped before the data is used further on.
unused_weather_columns = [
    'moon_illumination',
    'moonrise',
    'moonset',
    'sunrise',
    'winddirDegree',
    'location',
    'DewPointC',
    'sunset',
]

# Read the 'aalst' weather data, drop the unused columns and index by timestamp.
raw_weather = read_weather_data('aalst')
weather_data = raw_weather.drop(columns=unused_weather_columns).set_index('date_time')
del raw_weather  # do not keep an extra reference to the unfiltered frame
weather_data.head()

In [None]:
# Report the (shallow) in-memory size of the weather frame in megabytes.
weather_df_mb = weather_data.memory_usage().sum() / 1000 / 1000
print(f"weather_df = {weather_df_mb} MB")

## Preprocess the data

In [None]:
import energyclustering.sampling.preprocessing as pre

In [None]:
%%time
# Convert data_df with the project's yearly-profile -> daily helper;
# the %%time cell magic reports how long the conversion takes.
# NOTE(review): presumably the result has one row per (profile, day) - confirm
# against energyclustering.sampling.preprocessing.
daily_data_df = pre.yearly_profile_df_to_daily_df(data_df)

In [None]:
%%time
# Build the daily metadata frame from info_df and the weather data;
# the %%time cell magic reports how long this takes.
# NOTE(review): presumably indexed like daily_data_df (the NaN filter further
# down applies the same boolean mask to both frames) - confirm.
daily_info_df = pre.to_daily_metadata_df(info_df, weather_data)

### Remove all NaN days from data and info df

In [None]:
# Flag every row of the daily data that holds at least one missing value,
# then keep only the fully observed rows in both the data and the info frame.
contains_nan_day = daily_data_df.isna().any(axis=1)
is_complete_day = ~contains_nan_day
daily_data_df = daily_data_df.loc[is_complete_day]
daily_info_df = daily_info_df.loc[is_complete_day]

In [None]:
# Report the (shallow) in-memory size of both daily frames in megabytes.
daily_info_df_mb = daily_info_df.memory_usage().sum() / 1000 / 1000
daily_data_df_mb = daily_data_df.memory_usage().sum() / 1000 / 1000
print(f"daily_info_df = {daily_info_df_mb} MB")
print(f"daily_data_df = {daily_data_df_mb} MB")

## Delete data that is no longer needed to free memory

In [None]:
# The raw info frame and the weather frame have been folded into the daily
# frames; drop the names so the objects can be garbage collected.
del info_df, weather_data

## Sampler

In [None]:
from energyclustering.sampling.samplers import RandomSamplerBaseline

## Evaluator

In [None]:
from energyclustering.sampling.evaluation import DaskEnergyScoreEvaluator, LocalEnergyScoreEvaluator, train_test_sets_from_folds
from pathlib import Path

In [None]:
# Directory (inside the current working directory) that receives the pickled
# evaluation results. Create it up front so that the long-running evaluation
# does not fail at save time merely because the folder is missing;
# exist_ok=True makes re-running this cell harmless.
result_path = Path().absolute() / 'results'
result_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Evaluate the random baseline sampler for several sample counts. Every
# configuration is repeated ten times and each run is saved to its own
# 'random_<count>_repeat_<i>.pkl' file inside result_path.
sample_counts = (10, 50, 100, 200)

with Client(local_directory='/cw/dtailocal/', n_workers=30, threads_per_worker=1) as client:
    evaluator = DaskEnergyScoreEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 1000)
    for repeat in range(10):
        for nb_samples in sample_counts:
            sampler = RandomSamplerBaseline(nb_samples)
            output_file = result_path / f'random_{nb_samples}_repeat_{repeat}.pkl'
            evaluator.evaluate_and_save(sampler, output_file)
  

## Show results


In [None]:
# The evaluation cell writes one file per (sample count, repeat) named
# 'random_<count>_repeat_<i>.pkl' for counts 10/50/100/200 and repeats 0-9.
# Load exactly those files (the previous 'random_<count>.pkl' names, and the
# counts 250/500/1000, are never produced by this notebook).
evaluated_sample_counts = [10, 50, 100, 200]
nb_repeats = 10


def _load_repeats(nb_samples):
    """Load all repeats for one sample count and stack them into one Series.

    NOTE(review): assumes each pickle holds a pandas Series (the original code
    renamed the loaded object with a scalar and concatenated along columns).
    The repeat number becomes an extra outer index level called 'repeat'.
    """
    per_repeat = [
        pd.read_pickle(result_path / f"random_{nb_samples}_repeat_{repeat}.pkl")
        for repeat in range(nb_repeats)
    ]
    stacked = pd.concat(per_repeat, keys=range(nb_repeats), names=['repeat'])
    return stacked.rename(nb_samples)


# One column per sample count; rows are the scores of all repeats.
result_df = pd.concat([_load_repeats(nb_samples) for nb_samples in evaluated_sample_counts], axis=1)
result_df

In [None]:
# Summarise every sample-count column by its mean and standard deviation,
# one row per sample count, and add a one-standard-deviation band around the mean.
stats_df = (
    result_df
    .agg(['mean', 'std'], axis=0)
    .T
    .rename_axis('nb_samples', axis=0)
    .reset_index()
)
plot_df = stats_df.assign(
    lower=stats_df['mean'] - stats_df['std'],
    higher=stats_df['mean'] + stats_df['std'],
)
plot_df

In [None]:
# Line chart of the mean energy score against the number of samples; the
# y-axis is not forced to start at zero so differences remain visible.
mean_score_axis = alt.Y('mean:Q', title='mean energy score', scale=alt.Scale(zero=False))
line = (
    alt.Chart(plot_df, title='energy score of random sampling with limited samples')
    .mark_line()
    .encode(x='nb_samples:Q', y=mean_score_axis)
)
line