In [None]:
%matplotlib inline

In [None]:
# standard imports
import numpy as np 
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
# alt.renderers.enable('png')
from pathlib import Path

# loading the data 
from energyclustering.sampling.preprocessing import DataPreprocessor

# preprocessing the yearly info 
from energyclustering.sampling.preprocessing.info import YearlyInfoPreprocessor

# executing the evaluation in parallel
from dask.distributed import Client

# sampling models 
from energyclustering.sampling.day_of_year_samplers import RandomBaselineSampler # random day selection baseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler # to combine daily and yearly sampler
from energyclustering.sampling.day_of_year_samplers import GenerateSampleDecorator
from energyclustering.sampling.samplers import EnergyvilleDaySelectionBaseline
from energyclustering.sampling.samplers import MetadataSampler
from energyclustering.sampling.samplers import ConsumptionDataSampler


# elbow method to use for clustering 
from energyclustering.clustering.elbow import ElbowMethod
from energyclustering.clustering.preclustering import PreClusteringClusterer # two stage clustering procedure

# clustering algorithms 
from sklearn.cluster import KMeans, MiniBatchKMeans
from kmedoids import KMedoids
from energyclustering.clustering.kmedoids import CustomKMedoids

# cluster metrics 
import energyclustering.clustering.metrics as dist_metrics

# classifiers 
from sklearn.ensemble import RandomForestClassifier 

# evaluation 
from energyclustering.sampling.evaluation import SamplerEvaluator
from energyclustering.sampling.evaluation.energy_score import calculate_energy_score

In [None]:
%load_ext autoreload
%autoreload 2

# The private data

In [None]:
# Configure the preprocessing pipeline first, then pull the four resulting frames out of it.
preprocessor = (
    DataPreprocessor()
    .preprocess_info_df('paper')
    .preprocess_weather_df('paper')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor = None)
    # .subsample_years(1000)  # for testing only!
)
daily_data_df, private_data_df, daily_info_df, weather_df = preprocessor.get_data()

# The public data

In [None]:
from energyclustering.data.public.data import get_data_reading_preprocessed

# Only the 'Consumption' column of the preprocessed public readings is kept (as a one-column frame).
data_df = get_data_reading_preprocessed().loc[:, ['Consumption']]

# Take info from private data df 

In [None]:
# Missing private readings are replaced by 0.
private_df = private_data_df.fillna(0)

# Move the 'datetime' level of the public consumption to the columns, align those columns
# to the private frame and zero-fill whatever is missing after the alignment.
public_df = (
    data_df['Consumption']
    .unstack('datetime')
    .reindex(columns = private_df.columns)
    .fillna(0)
)
assert len(public_df.columns) == len(private_df.columns)

In [None]:
from sklearn.neighbors import NearestNeighbors

# For every public profile, look up the single closest private profile
# (`fit` returns the estimator itself, so it can be chained).
model = NearestNeighbors(n_neighbors = 1).fit(private_df)
neigh_dist, neigh_ind = model.kneighbors(public_df)

## The public dataset is a subset of the private dataset, although the nearest-neighbour distances are quite high
The distances are non-zero because we processed the datasets a bit to remove the weird peaks :)

In [None]:
# Histogram of the distance from each public profile to its nearest private profile.
pd.Series(neigh_dist[:,0]).hist()

In [None]:
IDX = 2  # positional index of the public profile to inspect


def plot_serie(serie, height = 200, width = 2000):
    """Draw a pandas Series as an Altair line chart (index on x, values on y).

    Parameters
    ----------
    serie : pd.Series
        Series to plot; its index becomes the x-axis and its values the y-axis.
    height, width : int
        Chart size; the defaults are the values that used to be hard-coded.

    Returns
    -------
    alt.Chart
        Line chart with the encodings x = 'index' and y = 'value'.
    """
    # `reset_index()` only produces a column called 'index' when the index is unnamed.
    # Naming the axis explicitly keeps the x-encoding valid for named indexes as well.
    frame = serie.rename_axis('index').to_frame('value').reset_index()
    return alt.Chart(frame, height = height, width = width).mark_line().encode(
        x = 'index',
        y = 'value'
    )


# Public profile on top, its closest private match below, on a shared y-scale.
(
    plot_serie(public_df.iloc[IDX]).properties(title = 'public profile')
    & plot_serie(private_df.iloc[neigh_ind[IDX,0]]).properties(title = 'closest match in private data')
).resolve_scale(y = 'shared')

# Use the private profiles and private data that are in the public dataset 

In [None]:
# Every matched private row is kept once, even if several public profiles point to it.
matched_positions = np.unique(neigh_ind[:,0])
data_df = private_data_df.iloc[matched_positions]

# Restrict the daily frames to the same index labels.
kept_labels = data_df.index
daily_info_df = daily_info_df.loc[kept_labels]
daily_data_df = daily_data_df.loc[kept_labels]


In [None]:
# Sanity check: sizes of the three frames after restricting them to the matched profiles.
print(f"{data_df.shape=}, {daily_info_df.shape=}, {daily_data_df.shape=}")

In [None]:
# Shape of the selected private data (also part of the printout in the previous cell).
data_df.shape

In [None]:
# generate folds for cross validation 
# A fixed seed makes the shuffle (and therefore the folds) reproducible.
generator = np.random.default_rng(1)
# Shuffle a copy of the index labels so that data_df itself is left untouched.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
# Split the shuffled labels into 5 (nearly) equally sized folds.
folds = np.array_split(shuffled, 5)

In [None]:
# The last of the five folds is held out for testing; the other four form the training set.
*train_folds, test_set = folds
train_set = np.concatenate(train_folds)
print(f"{train_set.shape=}, {test_set.shape=}")

In [None]:
# One row of household info per level-0 index label.
# De-duplicate on the index rather than on the values: `drop_duplicates()` ignores the index,
# so two households with identical info would collapse into a single row and
# `temp_df.loc[test_set]` would then raise a KeyError for the one that was dropped.
temp_df = (
    daily_info_df.loc[:, 'household_info']
    .reset_index(level = 1, drop = True)
    .loc[lambda df: ~df.index.duplicated(keep = 'first')]
)
median_consumption = temp_df.yearly_consumption.median()

# Test-set entry whose yearly consumption is closest to the overall median
# (`idxmin` returns the first label at which the minimum occurs).
average_household = (temp_df.loc[test_set].yearly_consumption - median_consumption).abs().idxmin()

# Positional offset of the row to pick among that household's rows in daily_info_df.
# NOTE(review): presumably one row per day, i.e. the 32nd available day - confirm.
AVERAGE_PROFILE_ROW = 31
average_profile = daily_info_df.loc[[average_household]].index[AVERAGE_PROFILE_ROW]
average_profile

In [None]:
# Household info of the test set, ordered by ascending yearly consumption.
temp_df.loc[test_set].sort_values('yearly_consumption')

In [None]:
# Take the last test-set entry when ordered by yearly consumption ...
ranked_test_households = temp_df.loc[test_set].sort_values('yearly_consumption')
high_household = ranked_test_households.index[-1]

# ... and pick the row at position 29 among its rows in daily_info_df.
high_profile = daily_info_df.loc[[high_household]].index[29]
high_profile

In [None]:
# Distinct index labels of the selected private data.
data_df.index.unique()

# Model

In [None]:
NB_SAMPLES = 100
CLUSTER_RANGE = list(range(5, 41, 5))


def _with_sample_generation(yearly_sampler, daily_sampler):
    """Combine a yearly and a daily sampler and wrap the pair with n_samples = NB_SAMPLES."""
    return GenerateSampleDecorator(
        sampler = DailySamplerFromClusterSampler(
            yearly_sampler = yearly_sampler,
            daily_sampler = daily_sampler,
        ),
        n_samples = NB_SAMPLES,
    )


models = {
    # random baseline
    'random baseline': RandomBaselineSampler(NB_SAMPLES),

    # rule based approach: metadata clustering per year, expert day selection per day
    'expert based': _with_sample_generation(
        yearly_sampler = MetadataSampler(
            clusterer = ElbowMethod(KMeans(n_clusters = 1, n_init = 10), cluster_range = CLUSTER_RANGE),
            info_preprocessing = YearlyInfoPreprocessor(
                columns_to_use = ['yearly_consumption', 'connection_power'],
                normalized = True,
            ),
        ),
        daily_sampler = EnergyvilleDaySelectionBaseline(),
    ),

    # consumption clustering based approach, both on the yearly and on the daily level
    'data driven': _with_sample_generation(
        yearly_sampler = ConsumptionDataSampler(
            classifier = RandomForestClassifier(),
            clusterer = ElbowMethod(
                KMedoids(n_clusters = 1, method = 'fasterpam'),
                metric = dist_metrics.euc_distance_matrix_missing,
                cluster_range = CLUSTER_RANGE,
                nb_repeats = 10,
            ),
            fillna = False,
        ),
        daily_sampler = ConsumptionDataSampler(
            classifier = RandomForestClassifier(),
            clusterer = ElbowMethod(
                clusterer = KMeans(n_clusters = 1),
                cluster_range = CLUSTER_RANGE,
            ),
        ),
    ),
}

# models['data driven fixed'] = (
#      GenerateSampleDecorator(
#         sampler = DailySamplerFromClusterSampler(
#             yearly_sampler = ConsumptionDataSampler(
#                 classifier = RandomForestClassifier(),
#                 clusterer = CustomKMedoids(n_clusters = 50, metric = dist_metrics.euc_distance_matrix_missing), 
# #                 clusterer = ElbowMethod(KMeans(n_clusters = 1, n_init = 1), metric = None, cluster_range = CLUSTER_RANGE, nb_repeats = 1), 
# #                 clusterer = KMeans(n_clusters = 5, n_init = 10), 
#                 fillna = False,
# #                 fillna = True,
#             ), 
#             daily_sampler = ConsumptionDataSampler(
#                 classifier = RandomForestClassifier(),
#                 clusterer = KMeans(n_clusters = 25)
# #                     show_progress = True, 
# #                 clusterer = KMeans(n_clusters = 30, n_init = 1), 
#             ), 
# #             show_progress= True,
#         ), 
#         n_samples = NB_SAMPLES,
#     )
# )





In [None]:
from dask_util import get_dask_cluster

# Cluster used for the parallel evaluation: no pinac machines, himec machines 8, 1 and 2.
cluster = get_dask_cluster(pinac_numbers = [], himec_numbers = [8, 1, 2])

In [None]:
%%time
# Evaluate every model on the dask cluster and collect the energy scores per model.
# The try/finally guarantees that the cluster is closed even when an evaluation fails;
# previously an exception skipped `cluster.close()` and left the cluster running.
try:
    with Client(cluster) as client:
        all_energy_scores = []
        for model_name, model in models.items():
            # NOTE(review): the positional 100 presumably has to match NB_SAMPLES - confirm.
            evaluator = SamplerEvaluator(folds, daily_data_df, daily_info_df, data_df, client, 100, crossval = False)
            energy_scores = evaluator.evaluate(model)
            all_energy_scores.append(energy_scores)
finally:
    cluster.close()

In [None]:
# Combine the per-model results column-wise, keyed by model name.
# This cell rebinds `all_energy_scores` from a list to the concatenated frame, so the guard
# makes a second execution a no-op instead of a failing `pd.concat` on that frame.
if isinstance(all_energy_scores, list):
    all_energy_scores = pd.concat(all_energy_scores, axis = 1, keys = models.keys())

In [None]:
# Row-wise average over all columns, ordered from the lowest to the highest mean score.
mean_energy_scores = all_energy_scores.mean(axis = 'columns').sort_values()
mean_energy_scores

In [None]:
# Index labels found at 25% and 75% of the ascending mean-score ranking.
nb_ranked = mean_energy_scores.shape[0]
well_modelled_profile, bad_modelled_profile = (
    mean_energy_scores.index[int(fraction * nb_ranked)] for fraction in (0.25, 0.75)
)
well_modelled_profile, bad_modelled_profile

In [None]:
# Info row of the selected 'average' profile.
daily_info_df.loc[average_profile]

In [None]:
# Rows of the 'average' household with dayOfWeek == 0 that fall in months 1-3.
similar_days = daily_info_df.loc[[average_profile[0]]].pipe(
    lambda x: x[(x[('day_info', 'dayOfWeek')] == 0) & (x[('day_info', 'month')].isin([1, 2, 3]))]
)

# Reshape to long format (meter, timestamp, value) inline: the `transform` helper is only
# defined in a later cell, so calling it here fails with a NameError on a fresh kernel.
plot_df = (
    daily_data_df.loc[similar_days.index]
    .pipe(lambda df: df.set_axis([str(label) for label in df.index], axis = 0))
    .stack()
    .to_frame('value')
    .reset_index()
    .set_axis(['meter', 'timestamp', 'value'], axis = 1)
    .assign(meter = lambda x: x.meter.astype('str'))
)
plot_df

In [None]:
# One semi-transparent blue line per 'meter' value, with the x-axis formatted as time of day.
time_of_day = alt.X('timestamp:T', axis = alt.Axis(format = '%H:%M'))
variability_chart = (
    alt.Chart(plot_df)
    .mark_line(strokeWidth = 1, opacity = 0.5, color = 'blue')
    .encode(x = time_of_day, y = 'value', detail = 'meter')
)
variability_chart

In [None]:

def transform(data):
    """Reshape a wide frame into a long frame with columns meter, timestamp and value.

    Parameters
    ----------
    data : pd.DataFrame
        Wide frame; every row becomes one 'meter' (the string form of its index label)
        and every column becomes one 'timestamp'.

    Returns
    -------
    pd.DataFrame
        Long frame with the columns ['meter', 'timestamp', 'value'].

    Notes
    -----
    The input is left untouched. The previous version assigned to `data.index`,
    silently replacing the caller's index with strings.
    """
    # set_axis returns a new frame, so the caller's index is not modified.
    relabelled = data.set_axis([str(label) for label in data.index], axis = 0)
    plot_df = relabelled.stack().to_frame('value').reset_index().set_axis(['meter', 'timestamp', 'value'], axis = 1)
    return plot_df


In [None]:
# Display names used in chart titles, keyed by the names used in `models`.
model_translation = dict([
    ('random baseline', 'Random sampling'),
    ('expert based', 'Expert-based'),
    ('data driven', 'Data-driven'),
])

In [None]:
def make_plot_quantile(models, quantile): 
    """Plot the profile found at a given fraction of the ascending mean-energy-score ranking.

    Parameters
    ----------
    models : dict
        Passed through to `make_plot`.
    quantile : float
        Fraction of the ranking in `mean_energy_scores`, e.g. 0.25.
        A value of 1.0 selects the last profile.
    """
    nb_ranked = mean_energy_scores.shape[0]
    # Clamp to the last position: int(1.0 * nb_ranked) would index one past the end.
    position = min(int(quantile * nb_ranked), nb_ranked - 1)
    profile = mean_energy_scores.index[position]
    return make_plot(models, profile)
def make_plot_index(models,index): 
    """Plot the profile at position `index` of the `mean_energy_scores` ranking."""
    return make_plot(models, mean_energy_scores.index[index])

def make_plot(models, profile, n_subset = 5, random_state = 0, show_energy_score = False): 
    """Compare, per model, the sampled daily profiles with the real profile.

    For every model two vertically stacked charts are built: on top all sampled
    profiles as thin gray lines with the real profile in blue, below a small
    colored subset of the sampled profiles. The per-model columns are
    concatenated horizontally with a shared y-scale.

    Parameters
    ----------
    models : dict
        Maps a model name to a sampler exposing `get_sampling_probabilities_daily`.
    profile :
        Index label of the row in `daily_data_df` / `daily_info_df` to plot.
    n_subset : int, default 5
        Number of sampled profiles in the colored subset chart. Capped at the
        number of available samples, so fewer than `n_subset` samples no longer fail.
    random_state : default 0
        Forwarded to `DataFrame.sample` so the subset (and the figure) is reproducible.
        Pass None for a different subset on every call.
    show_energy_score : bool, default False
        When True, the energy score of the samples is computed and appended to the title.

    Returns
    -------
    Altair chart
        Horizontal concatenation of the per-model charts.
    """
    real_data = daily_data_df.loc[[profile]]
    real_info = daily_info_df.loc[[profile]]
    truth = real_data.to_numpy()[0,:]
    # Hand copies to `transform` so this function does not depend on whether it mutates its input.
    plot_df_real = transform(real_data.copy())

    def time_axis():
        return alt.X('timestamp',  title = None,  axis=alt.Axis(format='%H:%M'))

    def load_axis():
        return alt.Y('value', title = 'Load (in kWh)')

    charts = []
    for model_name, model in models.items(): 
        predicted_data = model.get_sampling_probabilities_daily(real_info)[0]
        predicted_scenarios = daily_data_df.loc[predicted_data.index]

        subset_size = min(n_subset, len(predicted_scenarios))
        plot_df_pred_subset = transform(predicted_scenarios.sample(subset_size, random_state = random_state))
        plot_df_pred = transform(predicted_scenarios.copy())

        # Fall back to the raw model name for models without a display name.
        title = str(model_translation.get(model_name, model_name))
        if show_energy_score:
            samples = predicted_scenarios.to_numpy()
            # Uniform weights over the samples actually drawn, not over the global NB_SAMPLES.
            probs = np.full((len(samples),), 1/len(samples))
            calculated_energy_score = calculate_energy_score(probs, samples, truth)
            title = f"{title} (ES = {calculated_energy_score:.3f})"

        predicted_chart = alt.Chart(plot_df_pred).mark_line(opacity = 0.3, strokeWidth = 0.5,  color = 'gray').encode(
            x = time_axis(),
            y = load_axis(),
            detail =  'meter',
        )
        real_chart = alt.Chart(plot_df_real).mark_line(strokeWidth = 1, color = 'blue').encode(
            x = time_axis(),
            y = load_axis(),
        )
        subset_chart = alt.Chart(plot_df_pred_subset).mark_line().encode(
            x = time_axis(),
            y = load_axis(),
            color =  alt.Color('meter', legend = None, scale = alt.Scale(scheme = 'tableau10' )),
        )
        charts.append(( (predicted_chart + real_chart).properties(title = title) & subset_chart).resolve_scale(y='shared').resolve_axis(y = 'shared'))
    return alt.hconcat(*charts).resolve_scale(y = 'shared', color = 'independent').resolve_axis(y = 'shared')


In [None]:
from chart_util import big_chart

In [None]:
# Model comparison for the 'average' profile selected earlier.
big_chart(make_plot(models, average_profile), fontsize = 20)

In [None]:
# Model comparison for the profile selected from the highest-ranked test-set household.
big_chart(make_plot(models, high_profile), fontsize = 20)

In [None]:
# Info row of the selected 'average' profile.
daily_info_df.loc[average_profile]

In [None]:
# Rows of the 'average' household with dayOfWeek == 0 that fall in months 1-3.
average_household_days = daily_info_df.loc[[average_profile[0]]]
keep_row = (
    (average_household_days[('day_info', 'dayOfWeek')] == 0)
    & (average_household_days[('day_info', 'month')].isin([1,2,3]))
)
selected_days = average_household_days[keep_row]

# Long format for plotting, with the meter label as a string.
plot_df = transform(daily_data_df.loc[selected_days.index]).assign(meter = lambda x: x.meter.astype('str'))
plot_df

In [None]:
# One semi-transparent blue line per 'meter' value, with the x-axis formatted as time of day.
time_of_day = alt.X('timestamp:T', axis = alt.Axis(format = '%H:%M'))
variability_chart = (
    alt.Chart(plot_df)
    .mark_line(strokeWidth = 1, opacity = 0.5, color = 'blue')
    .encode(x = time_of_day, y = 'value', detail = 'meter')
)
variability_chart