In [None]:
import numpy as np 

In [None]:
from energyclustering.sampling.preprocessing import DataPreprocessor
from dask.distributed import Client
import pandas as pd
import altair as alt

In [None]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# The data

In [None]:
# Configure the preprocessing pipeline step by step, then materialise the data.
# NOTE(review): 'baseline' presumably selects the default info/weather
# preprocessing variant — confirm against DataPreprocessor.
preprocessor = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    # Alternative subsampling that is currently disabled:
    #   .subsample_days(week_reduction_factor = 5)
    # For testing only: work on a subsample to keep the notebook fast.
    .subsample_years(500)
)

# get_data() yields four objects: the daily consumption data, the non-daily
# data, the per-day info and the weather data.
daily_data_df, data_df, daily_info_df, weather_df = preprocessor.get_data()
daily_data_df.shape

In [None]:
# Preview the household-level attributes, without the consumer type and
# without the 'date' index level, de-duplicated.
# NOTE(review): head() runs before drop_duplicates(), so only the first five
# rows are ever considered — confirm this is the intended preview.
(
    daily_info_df
    .loc[:, 'household_info']
    .head()
    .drop(columns='consumer_type')
    .droplevel('date')
    .drop_duplicates()
)

In [None]:
# Keep only the 'day_info' group of columns and list which attributes it holds.
day_info_df = daily_info_df['day_info']
day_info_df.columns

# Folds

In [None]:
# Variant of the day info without the listed calendar columns; used further
# down to compare feature importances with and without them.
calendar_columns = ['day_of_week', 'iso_day', 'month', 'day']
reduced_day_info_df = day_info_df.drop(columns=calendar_columns)

In [None]:
# Split the index of data_df into 3 random folds.
# A seeded generator keeps the fold assignment identical across runs.
generator = np.random.default_rng(seed=1)
# Shuffle a copy so the index of data_df itself is left untouched.
shuffled = data_df.index.to_numpy(copy=True)
generator.shuffle(shuffled)
# array_split (unlike split) tolerates a length that is not divisible by 3.
folds = np.array_split(shuffled, indices_or_sections=3)


[ ]:
# Check which attributes of the data are most informative for consumption 
## Cluster the days

In [None]:
%%time
# just k-means for simplicity and speed
from sklearn.cluster import KMeans
clusterer = KMeans(300)
clusterer.fit(daily_data_df)
cluster_labels = clusterer.labels_

## Learn a classifier that predicts the cluster label from the day_info

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest that predicts the cluster label from all day_info columns.
# random_state is fixed because bootstrapping and feature sub-sampling are
# random; without a seed the importances differ on every run.
classifier = RandomForestClassifier(n_estimators=500, random_state=1)
classifier.fit(day_info_df, cluster_labels)
# Importances are ordered like the columns of day_info_df.
feature_importances = classifier.feature_importances_

In [None]:
# Fit the same kind of random forest on the reduced day info.
# random_state is fixed because bootstrapping and feature sub-sampling are
# random; without a seed the importances differ on every run.
# Note: this rebinds `classifier`, replacing any model stored under that name.
classifier = RandomForestClassifier(n_estimators=500, random_state=1)
classifier.fit(reduced_day_info_df, cluster_labels)
# Importances are ordered like the columns of reduced_day_info_df.
reduced_feature_importances = classifier.feature_importances_

## Check the feature importances

In [None]:
# Wrap each importance vector in a single-column frame indexed by the feature
# names it was trained on, ready for plotting.
importance_df = pd.DataFrame(
    {'feature_importance': feature_importances},
    index=day_info_df.columns,
)
reduced_importance_df = pd.DataFrame(
    {'feature_importance': reduced_feature_importances},
    index=reduced_day_info_df.columns,
)

In [None]:
# Data-less bar-chart template shared by both panels: importance on x, feature
# name on y. sort=None keeps the features in the order of the data.
chart = (
    alt.Chart()
    .mark_bar()
    .encode(
        x='feature_importance:Q',
        y=alt.Y('index:N', sort=None),
    )
)

# reset_index() turns the feature names into the column read by the y encoding.
all_features_panel = chart.properties(data=importance_df.reset_index())
reduced_features_panel = chart.properties(data=reduced_importance_df.reset_index())

# Side by side: all day_info features (left) vs. the reduced set (right).
alt.hconcat(all_features_panel, reduced_features_panel)