In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from energyclustering.sampling.inspection.consumptionclustering import ConsumptionClusteringInspector
from dask.distributed import Client
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
import matplotlib.pyplot as plt
import seaborn as sns
from energyclustering.sampling.samplers import ConsumptionDataSampler, MetadataSampler, EnergyvilleDaySelectionBaseline, RandomSamplerBaseline
from energyclustering.sampling.day_of_year_samplers import DailySamplerFromClusterSampler, GenerateSampleDecorator
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from pathlib import Path
from energyclustering.sampling.evaluation.evaluation import SamplerEvaluator

In [None]:
from energyclustering.sampling.inspection.classificationinspection import ClassificationInspection

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
%matplotlib inline
# %config InlineBackend.figure_formats = ['svg']

In [None]:
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Configure the preprocessing pipeline first, then pull the four frames out of it.
preprocessor = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # no subsampling this time
    .subsample_days(None)
    # for testing only!
    .subsample_years(1000)
)
daily_data_df, data_df, daily_info_df, weather_df = preprocessor.get_data()

# Household info is repeated for every date; drop the 'date' level and keep
# only the first row for each remaining index entry.
household_info_per_day = daily_info_df.loc[:, 'household_info'].droplevel('date')
household_info = household_info_per_day[~household_info_per_day.index.duplicated(keep = 'first')]

In [None]:
def inspect(daily_clusterer, daily_data_to_use, min_cluster_size = 10, **tree_params):
    """Cluster the given daily data and explain the clusters with a decision tree.

    Builds a ``ClassificationInspection`` from the clusterer, a
    ``DecisionTreeClassifier`` and the day info of the selected rows, fits it,
    and then shows the clustering line plot, the training cluster sizes, the
    classification performance and the fitted tree.

    Parameters
    ----------
    daily_clusterer
        Clusterer handed to ``ClassificationInspection`` as-is.
    daily_data_to_use
        Daily data to inspect; its index is used both to look up the day info
        and as the two index arguments of ``ClassificationInspection``.
    min_cluster_size
        Accepted for call compatibility but not read anywhere in this function.
    **tree_params
        Forwarded unchanged to ``DecisionTreeClassifier``.
    """
    data_index = daily_data_to_use.index
    # Day-level features of exactly the rows that are being inspected.
    day_info = daily_info_df.loc[data_index.unique(), 'day_info']
    tree = DecisionTreeClassifier(**tree_params)

    # The same index is passed twice, as in the original call.
    inspector = ClassificationInspection(
        daily_clusterer,
        tree,
        daily_data_to_use,
        day_info,
        data_index,
        data_index,
    ).fit_model()

    inspector.plot_clustering_line(sample = None)
    display(inspector.training_cluster_size_df().T)
    display(inspector.classification_performance())
    inspector.plot_tree()
    
    

In [None]:
data_df.loc[profile_to_use].isna().sum()

In [None]:
daily_data_df.loc[profile_to_use].shape

In [None]:
daily_data_df.loc[profile_to_use].isna().any(axis = 1).sum()

In [None]:
plot_df.tail()

In [None]:
plot_df = data_df.loc[profile_to_use].to_frame('value').reset_index()
alt.Chart(plot_df, width = 1000).mark_line().encode(
    x = 'index', 
    y = 'value'
    ).interactive()

In [None]:
from datetime import datetime

# Day of the year to convert, zero-padded to three digits.
# The original called rjust() and discarded the result (strings are immutable),
# and its target width would have produced six characters, which %j rejects.
day_num = str(230).zfill(3)
year = str(2016)

# Parse "<year>-<day of year>" and render it as dd-mm-YYYY.
res = datetime.strptime(year + "-" + day_num, "%Y-%j").strftime("%d-%m-%Y")
res

In [None]:
# Profile to inspect in this section.
IDX = 100

all_profiles = data_df.index
profile_to_use = all_profiles[IDX]

# KMeans is seeded so the clustering, and therefore the tree fitted on it, is
# reproducible between runs; the seed matches the one used for the other
# KMeans inspection in this notebook.
inspect(KMeans(10, random_state = 10), daily_data_df.loc[[profile_to_use]], min_cluster_size = 0, max_depth = 4, min_samples_leaf = 10)

In [None]:
# Full series of the selected profile as a two-column frame ('index', 'value').
profile_series = data_df.loc[profile_to_use]
plot_df = profile_series.to_frame('value').reset_index()

# Interactive line chart; only the x axis is bound to zoom/pan.
profile_chart = alt.Chart(plot_df, width = 1000).mark_line().encode(x = 'index', y = 'value')
profile_chart.interactive(bind_y = False)

In [None]:
# Switch to another profile for the next inspection.
IDX = 900

all_profiles = data_df.index
profile_to_use = all_profiles[IDX]

# Seeded 15-cluster KMeans explained by a shallow decision tree.
daily_clusterer = KMeans(15, random_state = 10)
profile_daily_data = daily_data_df.loc[[profile_to_use]]
inspect(
    daily_clusterer,
    profile_daily_data,
    min_cluster_size = 0,
    max_depth = 3,
    min_samples_leaf = 10,
    min_impurity_decrease = 0,
)

In [None]:
# Long-format frame (one row per meterID/date/time) of the selected profile,
# joined with the day info and restricted to days with sunHour <= 8.95 and
# tempC > 10.5.
# NOTE(review): the thresholds presumably mirror splits of the tree shown above - confirm.
plot_df = (
    daily_data_df.loc[[profile_to_use]]
    .stack()
    .rename_axis(['meterID', 'date', 'time'], axis = 0)
    .to_frame('value')
    .join(daily_info_df.loc[:, 'day_info'], how = 'left')
    .reset_index()
    .astype({'maxtempC': 'int', 'sunHour':'float', 'tempC':'float'})
    .query('sunHour <= 8.95 and tempC > 10.5')
)

# One thin line per date (built but not displayed by this cell).
line = alt.Chart(plot_df).mark_line(size = 0.2).encode(x = 'time', y = 'value', color = 'date:N')

# Distribution of the values per time of day; a row facet on 'is_weekend'
# was left disabled in both charts.
box = alt.Chart(plot_df).mark_boxplot().encode(x = 'time', y = 'value')
box

In [None]:
# Long-format frame (one row per meterID/date/time) of the selected profile,
# joined with the day info and restricted to days with sunHour > 8.95.
# NOTE(review): the threshold presumably mirrors a split of the tree shown above - confirm.
plot_df = (
    daily_data_df.loc[[profile_to_use]]
    .stack()
    .rename_axis(['meterID', 'date', 'time'], axis = 0)
    .to_frame('value')
    .join(daily_info_df.loc[:, 'day_info'], how = 'left')
    .reset_index()
    .astype({'maxtempC': 'int', 'sunHour':'float'})
    .query('sunHour > 8.95')
)

# One thin line per date (built but not displayed by this cell).
line = alt.Chart(plot_df).mark_line(size = 0.2).encode(x = 'time', y = 'value', color = 'date:N')

# Distribution of the values per time of day; a row facet on 'is_weekend'
# was left disabled in both charts.
box = alt.Chart(plot_df).mark_boxplot().encode(x = 'time', y = 'value')
box

In [None]:
# Display the most recently built `box` chart again.
box

In [None]:
# Line chart of the current plot_df: one line per date, with one chart row
# per value of 'is_weekend'.
faceted_lines = (
    alt.Chart(plot_df)
    .mark_line()
    .encode(x = 'time', y = 'value', color = 'date:N', row = 'is_weekend')
)
faceted_lines