## Training/test split of the data

### Read the data

In [None]:
from energyclustering.data.fluvius import read_data_pickle
# Load the two frames returned by read_data_pickle, asking it to include
# incomplete profiles and to process errors (the exact meaning of both flags
# is defined in energyclustering.data.fluvius).
info_df, data_df = read_data_pickle(
    include_incomplete_profiles=True,
    process_errors=True,
)
# Give the column axis of data_df the name 'timestamp'.
data_df = data_df.rename_axis('timestamp', axis='columns')

In [None]:
# Evaluate data_df; the trailing semicolon keeps the notebook from rendering it.
data_df;

### Split the meterIDs into three groups

In [None]:
# Collect the distinct values of the first index level of data_df as a NumPy
# array; these are the IDs that get divided over the folds.
first_level = data_df.index.get_level_values(0)
meterIDs = first_level.unique().to_numpy()
meterIDs

In [None]:
from numpy.random import default_rng
import numpy as np
import pandas as pd
import altair as alt

In [None]:
# Starting value of the random seed; the cell below increments it before use.
SEED = 0

In [None]:
# Advance the seed and build a fresh generator from it. Every re-run of this
# cell moves on to the next seed, so the shuffle below changes with it; the
# printed value records which seed is in use.
SEED = SEED + 1
generator = default_rng(seed=SEED)
print(SEED)

In [None]:
# Shuffle a copy of the meter IDs (meterIDs itself stays untouched) with the
# seeded generator and cut the result into three folds.
shuffled = meterIDs.copy()
generator.shuffle(shuffled)
# array_split, unlike split, does not raise when the number of IDs is not a
# multiple of three; the fold sizes then differ by at most one. When the count
# divides evenly the folds are identical to what split would return.
folds = np.array_split(shuffled, 3)

In [None]:
from pathlib import Path
from datetime import date
# Store the folds under a date-stamped file name. A file that was already
# written today is never overwritten.
store_path = Path("/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/folds")
filename = f'{date.today().strftime("%Y-%m-%d")}_folds.npy'
fold_file = store_path / filename
if fold_file.exists():
    # Say so explicitly: the folds in memory may come from a different seed
    # than the ones already on disk.
    print(f'{fold_file} already exists, the current folds were NOT saved')
else:
    if len({len(fold) for fold in folds}) == 1:
        # Equal-sized folds stack into a regular 2-D array, which is the
        # conversion np.save would apply to the list implicitly.
        folds_to_store = np.asarray(folds)
    else:
        # Folds of different sizes cannot form a rectangular array; store them
        # as an object array (loading it back requires allow_pickle=True).
        folds_to_store = np.array(folds, dtype=object)
    np.save(fold_file, folds_to_store)

### Check each fold

In [None]:
# For every fold, select the rows of info_df that are labelled with that
# fold's meter IDs; the resulting frames are kept in fold order.
fold_dfs = [info_df.loc[fold, :] for fold in folds]

#### C1: number of years should be balanced

In [None]:
# C1: count the rows of every fold frame and show the counts as a bar chart,
# one bar per fold; the raw counts are the cell's output.
lengths = [len(fold_df) for fold_df in fold_dfs]
length_table = pd.DataFrame(lengths, columns=['#profiles']).reset_index()
length_chart = alt.Chart(length_table).mark_bar().encode(
    x=alt.X('index:N', title='fold'),
    y=alt.Y('#profiles', title='#years'),
)
length_chart.display()
lengths

#### C2: PV presence should be balanced

In [None]:
# Show the distinct values in the PV column of the first fold. `unique` has
# to be called: without the parentheses the cell only shows the bound method.
fold_dfs[0].PV.unique()

In [None]:
# Per fold, sum the PV column after replacing missing values with False and
# casting to int, then plot the totals as one bar per fold.
PVs = []
for fold_df in fold_dfs:
    pv_flags = fold_df.PV.fillna(False).astype('int')
    PVs.append(pv_flags.sum())
pv_table = pd.DataFrame(PVs, columns=['#PVs']).reset_index()
alt.Chart(pv_table).mark_bar().encode(
    x=alt.X('index:N', title='fold'),
    y=alt.Y('#PVs', title='years with PV panels'),
)

In [None]:
# Show the per-fold PV totals computed in the cell above.
PVs

#### C2: heat pump presence should be balanced

In [None]:
# Same check for the heatpump column: per fold, sum the column after replacing
# missing values with False and casting to int, then plot one bar per fold.
# NOTE: the totals are stored under the name PVs (and the '#PVs' column), so
# this overwrites the PV totals computed earlier in the notebook.
PVs = [
    fold_df.heatpump.fillna(False).astype('int').sum()
    for fold_df in fold_dfs
]
heatpump_table = pd.DataFrame(PVs, columns=['#PVs']).reset_index()
alt.Chart(heatpump_table).mark_bar().encode(
    x=alt.X('index:N', title='fold'),
    y=alt.Y('#PVs', title='years with heatpump'),
)

In [None]:
# Show the per-fold heatpump totals (PVs was reassigned in the cell above).
PVs


#### C3: check other conditions

In [None]:
# Take the column labels from the first fold frame; the loop in the next cell
# walks over them.
columns = fold_dfs[0].columns
columns

In [None]:
# For every attribute column, chart how often each value occurs in each fold
# (one chart row per fold) so the folds can be compared visually.
# The keys follow the actual number of folds instead of a hard-coded three.
fold_keys = [f'fold{i}' for i in range(len(fold_dfs))]
for column in columns:
    # dropna=False keeps missing values as a category of their own.
    all_value_counts = [
        fold_df[column].value_counts(dropna=False) for fold_df in fold_dfs
    ]
    # Stack the per-fold counts into one long frame indexed by (fold, value).
    value_count_df = (
        pd.concat(all_value_counts, keys=fold_keys)
        .rename_axis(('fold', 'value'), axis=0)
        .to_frame('count')
    )
    chart = alt.Chart(
        value_count_df.reset_index(), title=column, width=800, height=200
    ).mark_bar().encode(
        x=alt.X('value:N', title='attribute value'),
        y=alt.Y('count:Q', title='count'),
        row='fold',
    ).configure_title(fontSize=20, align='center')
    display(chart)