In [None]:
from pathlib import Path
from energyclustering.webapp.resultparser import COBRASResult, ResultInspector
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.preprocessing import OrdinalEncoder
from energyclustering.clustering.similarity import *
import pandas as pd
from dask.distributed import Client
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
# Lift Altair's default cap on the number of rows a chart dataset may contain.
alt.data_transformers.disable_max_rows()
from energyclustering.sampling.sampler import Sampler
from energyclustering.sampling.metrics import calculate_energy_score_per_day, calculate_energy_score_for_day

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# Name of the distance-matrix entry used below and the directory it lives in.
# NOTE(review): absolute, site-specific path; consider a configurable data directory.
WASSER = 'full_distance_matrix_wasserstein'
directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')

# Fail fast when a required entry is missing from `directory`.
assert not [name for name in [WASSER] if not (directory / name).exists()]

# Prepare the data

## The COBRAS result

In [None]:
# Load the stored COBRAS result 'result_20211124_koen'. The second argument is the path
# directory/WASSER (presumably the distance matrix the run was based on; confirm in COBRASResult).
cobras_result = COBRASResult('result_20211124_koen', directory/WASSER)

In [None]:
# Use the last entry of cobras_result.clusterings as the target labelling
# (presumably the final clustering of the run; TODO confirm the ordering).
clustering_target = cobras_result.clusterings[-1]
# Report the number of labelled instances and the number of distinct cluster labels.
print(f"#instances={clustering_target.shape[0]}")
print(f"#clusters={len(np.unique(clustering_target))}")

## The household info used for sampling

In [None]:
# data_df is summed row-wise below to obtain the yearly consumption;
# info_df supplies the descriptive columns (e.g. '#family_members', 'PV') selected further down.
data_df = cobras_result.data_df
info_df = cobras_result.info_df

In [None]:
# Row-wise total over all columns of data_df, i.e. one value per row/profile.
total_yearly_consumption = data_df.sum(axis='columns')
total_yearly_consumption.head()

In [None]:
# Number of missing values in each column of info_df.
info_df.isna().sum(axis='index')

In [None]:
# Build the feature table: household info plus yearly consumption, restricted to
# columns that will plausibly be available, with missing values replaced by -1.
info_subset = (
    cobras_result.info_df
    # add yearly consumption (aligned on the index)
    .assign(yearly_consumption=total_yearly_consumption)
    # only retain columns that will plausibly be available
    .loc[:, [
        '#family_members',
        'connection_power',
        'consumer_type',
        'PV',
        'PV_power',
        'yearly_consumption',
        'heatpump',
    ]]
    # quick fix; better preprocessing later
    .fillna(-1)
)


In [None]:
# Sanity check: after the fillna(-1) above every column should report zero missing values.
info_subset.isna().sum(axis = 0)

## Encode the data

In [None]:
# Columns that are run through the ordinal encoder.
ORDINALS = ['consumer_type', 'PV', 'PV_power', 'heatpump']

# Values are cast to str before encoding (after fillna(-1) a column may mix numbers and strings).
# NOTE(review): this overwrites info_subset in place, so re-running the cell re-encodes
# already-encoded values; re-run the cell that builds info_subset first.
ordinal_as_text = info_subset[ORDINALS].astype('str')
info_subset[ORDINALS] = OrdinalEncoder().fit_transform(ordinal_as_text)

# Inputs for the sampler: feature matrix, consumption profiles (NaN -> 0) and cluster labels.
household_info = info_subset.to_numpy()
consumption_data = data_df.fillna(0)
y = clustering_target

### Train/test split

In [None]:

from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split with a fixed seed; the three inputs are split with the same row assignment.
(
    household_info_train, household_info_test,
    consumption_data_train, consumption_data_test,
    clustering_train, clustering_test,
) = train_test_split(
    household_info,
    consumption_data,
    y.astype('int'),
    train_size=0.7,
    random_state=123123,
)

## Decision Tree

In [None]:

# Sampler backed by a depth-limited decision tree with cost-complexity pruning (ccp_alpha).
tree_classifier = DecisionTreeClassifier(max_depth=5, ccp_alpha=0.005)
sampler = Sampler(tree_classifier)
sampler.fit(household_info_train, consumption_data_train, clustering_train)

### Evaluate

In [None]:
# Debug aid: uncomment to evaluate on a handful of test samples only.
# samples_to_use = [0,1,2,3,4,5]
# consumption_data_test = consumption_data_test.iloc[samples_to_use]
# household_info_test = household_info_test[samples_to_use]
# clustering_test = clustering_test[samples_to_use]

# Baseline weights: one row per test profile, each training profile weighted 1/n_train.
n_train_profiles = consumption_data_train.shape[0]
uniform_sample_probs = np.full(
    (consumption_data_test.shape[0], n_train_profiles),
    1 / n_train_profiles,
)

# Evaluation name -> callable taking a dask client. The name doubles as the stem of
# the pickle file the result is cached in.
# NOTE: the lambdas look up `sampler` when called, not when defined.
different_evaluations = {
    'based_on_clustering_prob': lambda client: sampler.evaluate_sampling(household_info_test, consumption_data_test, client),
    'random': lambda client: calculate_energy_score_per_day(uniform_sample_probs, consumption_data_train, consumption_data_test, client),
    'based_on_clustering_det': lambda client: sampler.evaluate_deterministic_sampling(household_info_test, consumption_data_test, client),
    'based_on_truth_clustering_det': lambda client: sampler.evaluate_deterministic_sampling_w_ground_truth(clustering_test, consumption_data_test, client),
}

In [None]:
# Cluster label predicted by the fitted classifier for every test household.
sampler.classifier.predict(household_info_test)

In [None]:
# Labels held out by the train/test split, for comparison with the predictions.
clustering_test

In [None]:
# Scratch check: with axis=None, np.sum reduces over all elements and returns a scalar.
np.sum(np.ones(3), axis = None)

In [None]:
%%time
from tqdm.notebook import tqdm
from dask.distributed import Client
with Client('pinac31.cs.kuleuven.be:8786') as client:
    client.restart()
    for name, f in tqdm(different_evaluations.items(), total = len(different_evaluations)): 
        if Path(f"{name}.pkl").exists():
            continue
        df = f(client)
        df.to_pickle(f"{name}.pkl")

In [None]:
# Summarise each cached evaluation as one number: the mean of its column means.
# Collect into a dict and build the Series once with an explicit float dtype. A bare
# pd.Series() is deprecated and object-typed, and growing it label by label is slow.
mean_scores = {}
for name in different_evaluations.keys():
    df = pd.read_pickle(f"{name}.pkl")
    mean_scores[name] = df.mean().mean()
result_series = pd.Series(mean_scores, dtype=float)

In [None]:
# Show the per-evaluation means as a one-column table.
result_series.to_frame('daily energy score')

In [None]:
# Cached result of the 'based_on_clustering_det' evaluation (written by the evaluation loop).
df1 = pd.read_pickle('based_on_clustering_det.pkl')
df1

In [None]:
# Cached result of the 'based_on_truth_clustering_det' evaluation.
df2 = pd.read_pickle('based_on_truth_clustering_det.pkl')
# Grand total of the element-wise difference between the two evaluations.
df1.sub(df2).sum().sum()

In [None]:
# NOTE(review): `df` is whatever the last loop iteration above left behind (the last
# pickle that was read or computed); consider displaying an explicitly named frame.
df

In [None]:
# NOTE(review): `energy_score_per_day` is not assigned in any visible cell, so this
# presumably raises NameError on a fresh kernel; verify or remove.
energy_score_per_day

In [None]:
# NOTE(review): 'energy_score_random.pkl' is only written by a cell further down, and no
# visible cell writes 'energy_score.pkl', so this cell relies on files from an earlier run.
energy_score = pd.read_pickle('energy_score.pkl')
random_energy_score = pd.read_pickle('energy_score_random.pkl')
# Parse the index as datetimes so it can be plotted on a temporal axis.
energy_score.index = pd.to_datetime(energy_score.index)
random_energy_score.index = pd.to_datetime(random_energy_score.index)

chart = alt.Chart(energy_score.reset_index(), width = 500).mark_bar().encode(
    x = alt.X('index:T', title = 'day'),
    y = alt.Y('energy_score:Q', title = 'energy score'),
)
# Side by side: the left panel shows energy_score, the right panel the random reference.
# Titles tell the panels apart; a shared y scale makes the bar heights comparable.
(
    chart.properties(title = 'Energy score per day')
    | chart.properties(data = random_energy_score.reset_index(), title = 'Energy score per day (random reference)')
).resolve_scale(y = 'shared')

In [None]:
# Toy check of Sampler._calculate_sampling_per_training_instance: five training instances
# in two clusters (sizes 2 and 3) and two rows of per-cluster probabilities.
# Presumably each row's cluster probability is spread over that cluster's training
# instances; TODO confirm against the Sampler implementation.
clustering = np.array([0,0, 1, 1,1])
predicted_probs = np.array([[0,1],[0.5,0.5]])
Sampler._calculate_sampling_per_training_instance(predicted_probs, clustering)

In [None]:
# Scratch matrix for the indexing experiment below: the 3x3 outer product of [1, 2, 3] with itself.
base_values = np.array([1, 2, 3])
A = np.outer(base_values, base_values)
A

In [None]:
# Scratch check of NumPy integer-array indexing: pairing the same index list on both axes
# picks the elements (1, 1) and (2, 2), not the 2x2 sub-block (np.ix_ would give that).
selection = [1,2]
A[selection, selection]

In [None]:
# Inspect the frame loaded from 'energy_score_random.pkl'.
random_energy_score

In [None]:
# Inspect the frame loaded from 'energy_score.pkl'.
energy_score

#### Reference baseline: sample uniformly at random

In [None]:
from energyclustering.sampling.metrics import calculate_energy_score_per_day

In [None]:

    # NOTE(review): this line is indented as if it once lived inside a block (e.g. a `with`),
    # and no visible cell assigns `energy_score_per_day_random`; the computation seems to have been removed.
    # Stores the series as a one-column frame named 'energy_score' in 'energy_score_random.pkl'.
    energy_score_per_day_random.to_frame('energy_score').to_pickle('energy_score_random.pkl')

In [None]:
# NOTE(review): depends on `energy_score_per_day_random`, which no visible cell assigns.
energy_score_per_day_random

In [None]:
# Confusion matrix reported by the sampler; at this point `sampler` wraps the decision tree.
sampler.confusion_matrix()

## Random Forest

In [None]:
from energyclustering.sampling.sampler import Sampler
# Same pipeline as the decision-tree sampler, but with a random forest as the classifier.
# random_state is fixed so the forest, and everything derived from it, is reproducible.
# NOTE: this rebinds `sampler`, which the lambdas in `different_evaluations` look up when called.
sampler = Sampler( RandomForestClassifier(max_depth=6, ccp_alpha = 0.003, random_state = 123123))
# Fit on the training split with the same three-argument call used for the decision tree
# (the previous `sampler.fit(X,y)` referred to an `X` that no visible cell defines).
sampler.fit(household_info_train, consumption_data_train, clustering_train)

### Evaluate

**TODO: interpret these metrics properly.**

In [None]:
# NOTE(review): the cells above call evaluate_sampling(...) / evaluate_deterministic_sampling(...)
# with explicit data and a client; confirm that a no-argument evaluate() still exists on Sampler.
sampler.evaluate()

In [None]:
# Confusion matrix reported by the sampler defined in this section.
sampler.confusion_matrix()

In [None]:
# Presumably the confusion matrix built from predicted probabilities instead of hard labels; TODO confirm.
sampler.probabilistic_confusion_matrix()

In [None]:
# Keep the result of cluster_sizes_chart() for the ratio computed in the next cell, and display it.
cluster_sizes = sampler.cluster_sizes_chart()
cluster_sizes

In [None]:
# Share of the total accounted for by the entries labelled 2, 3 and 5.
# NOTE(review): this uses .loc, so cluster_sizes_chart() must return a pandas object
# despite its name; confirm.
cluster_sizes.loc[[2,3,5]].sum()/cluster_sizes.sum()