In [None]:
from pathlib import Path
from energyclustering.webapp.resultparser import COBRASResult, ResultInspector
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.preprocessing import OrdinalEncoder
from energyclustering.clustering.similarity import *
from energyclustering.sampling.sampler import Sampler
import pandas as pd
from dask.distributed import Client
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
# Lift Altair's row limit on chart data so that larger frames can be plotted.
alt.data_transformers.disable_max_rows()


In [None]:
# Load the autoreload extension and use mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Name of the Wasserstein distance-matrix entry expected inside `directory`.
WASSER = 'full_distance_matrix_wasserstein'
# NOTE(review): absolute, site-specific path; the notebook only runs on a
# machine where this location is mounted.
directory = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/distance_matrices/')

# Fail fast and name the missing entries, instead of a bare AssertionError
# that gives no hint about which path could not be found.
missing_matrices = [name for name in [WASSER] if not (directory / name).exists()]
assert not missing_matrices, f"Missing distance matrices in {directory}: {missing_matrices}"

# Prepare the data

## The COBRAS result

In [None]:
# Load the stored COBRAS result 'result_20211124_koen', passing the location
# of the Wasserstein distance matrix as the second argument.
wasserstein_matrix_path = directory / WASSER
cobras_result = COBRASResult('result_20211124_koen', wasserstein_matrix_path)

In [None]:
# The last clustering stored in the COBRAS result is the label vector the
# classifiers below are trained to predict.
clustering_target = cobras_result.clusterings[-1]

n_instances = clustering_target.shape[0]
print(f"#instances={n_instances}")

# Number of distinct cluster labels in the target.
n_clusters = np.unique(clustering_target).size
print(f"#clusters={n_clusters}")

## The info used to sample 

In [None]:
# Short aliases for the two frames carried by the COBRAS result.
data_df, info_df = cobras_result.data_df, cobras_result.info_df

In [None]:
# Sum every row of data_df across its columns.
# NOTE(review): presumably one row per profile covering a full year (hence
# the name); confirm against COBRASResult.data_df.
total_yearly_consumption = data_df.sum(axis="columns")
total_yearly_consumption.head()

In [None]:
# Number of missing values in each column of info_df.
info_df.isnull().sum(axis="index")

In [None]:
# Only retain columns that will plausibly be available at sampling time.
FEATURE_COLUMNS = [
    '#family_members',
    'connection_power',
    'consumer_type',
    'PV',
    'PV_power',
    'yearly_consumption',
    'heatpump',
]

# Attach the yearly consumption as an extra column before selecting features.
info_with_consumption = cobras_result.info_df.assign(
    yearly_consumption=total_yearly_consumption
)

# Quick fix: missing values become the sentinel -1 (better preprocessing later).
info_subset = info_with_consumption.loc[:, FEATURE_COLUMNS].fillna(-1)


In [None]:
# Sanity check: per-column count of missing values left after fillna(-1).
info_subset.isnull().sum(axis="index")

## Encode the data

In [None]:
# Columns that are converted to strings and mapped to ordinal codes.
ORDINALS = ['consumer_type', 'PV', 'PV_power', 'heatpump']

# Encode into a fresh frame instead of overwriting `info_subset` in place.
# With the in-place version, re-running this cell re-encoded the float codes
# produced by the previous run (via astype('str')), so the cell was not
# idempotent. `info_subset` now keeps its un-encoded values.
# NOTE(review): 'PV_power' is encoded through its string representation, so
# the resulting codes presumably follow string order rather than numeric
# order; confirm that this is intended.
info_encoded = info_subset.copy()
info_encoded[ORDINALS] = OrdinalEncoder().fit_transform(info_encoded[ORDINALS].astype('str'))

# Feature matrix and target labels handed to the samplers below.
X = info_encoded.to_numpy()
y = clustering_target

## Decision Tree

In [None]:
# Wrap a shallow (max_depth=5), cost-complexity-pruned decision tree in the
# project's Sampler and fit it on the encoded features.
# random_state is pinned so that Restart & Run All reproduces the same tree.
# NOTE(review): any randomness inside Sampler.fit itself (e.g. an internal
# train/test split) is not controlled here; check Sampler.
sampler = Sampler(DecisionTreeClassifier(max_depth=5, ccp_alpha=0.005, random_state=0))
sampler.fit(X, y)

### Evaluate

**Interpret these metrics properly**

In [None]:
# Show the evaluation output of the sampler fitted above; what is reported
# is defined by Sampler.evaluate.
sampler.evaluate()

In [None]:
# Show the confusion matrix produced by the fitted sampler.
sampler.confusion_matrix()

## Random Forest

In [None]:
# Same pipeline with a random forest (max_depth=6, cost-complexity pruning).
# This rebinds `sampler`, so the cells below inspect the forest model.
# random_state is pinned because forest training is randomised; without it
# every run of the notebook gives different metrics.
# NOTE(review): any randomness inside Sampler.fit itself is not controlled
# here; check Sampler.
sampler = Sampler(RandomForestClassifier(max_depth=6, ccp_alpha=0.003, random_state=0))
sampler.fit(X, y)

### Evaluate

**Interpret these metrics properly**

In [None]:
# Show the evaluation output of the sampler fitted above; what is reported
# is defined by Sampler.evaluate.
sampler.evaluate()

In [None]:
# Show the confusion matrix produced by the fitted sampler.
sampler.confusion_matrix()

In [None]:
# Show the sampler's probabilistic confusion matrix.
# NOTE(review): presumably built from predicted class probabilities rather
# than hard labels; confirm in Sampler.
sampler.probabilistic_confusion_matrix()

In [None]:
# Keep the result of cluster_sizes_chart() for the follow-up computation.
# NOTE(review): despite the `_chart` suffix, the next cell indexes this
# object with `.loc` and sums it, so it is presumably a pandas object
# indexed by cluster label; confirm in Sampler.
cluster_sizes = sampler.cluster_sizes_chart()
cluster_sizes

In [None]:
# Share of the total accounted for by the rows labelled 2, 3 and 5.
# NOTE(review): these labels are hard-coded; document why they were picked.
selected_clusters = [2, 3, 5]
selected_total = cluster_sizes.loc[selected_clusters].sum()
selected_total / cluster_sizes.sum()