# First attempt at working with the sampling code I wrote

In [None]:
from pathlib import Path
import pandas as pd 
import numpy as np
import altair as alt

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from energyclustering.data.fluvius.data import read_data_pickle
from energyclustering.data.weather.data import read_weather_data
from sklearn.model_selection import train_test_split

In [None]:
# Number of profiles drawn from the full data set for this experiment; it is
# also part of the file name of the cached Wasserstein distance matrix.
NB_PROFILES = 2000 

In [None]:
# Load the full info and data frames. Later cells look up rows of
# full_info_df with the index of the data frame, so both are expected to
# share the same index.
full_info_df, full_data_df = read_data_pickle()

### Subsample the full data

In [None]:
# Draw a reproducible subsample of NB_PROFILES profiles from the full data.
data_df = full_data_df.sample(NB_PROFILES, random_state=123)

# Seed the train/test split as well. The Wasserstein distance matrix computed
# from data_train_df is cached on disk under a name that only depends on
# NB_PROFILES, so an unseeded split would make a later run load a cached
# matrix that belongs to a different set of training profiles.
data_train_df, data_test_df = train_test_split(data_df, test_size=0.3, random_state=123)

# Select the info rows that belong to each split via the shared index.
info_train_df = full_info_df.loc[data_train_df.index]
info_test_df = full_info_df.loc[data_test_df.index]

### Preprocess the metadata

In [None]:
from energyclustering.sampling.preprocessing import preprocess_info_df_for_sampling_classification

In [None]:
# Peek at the training metadata before it is preprocessed.
info_train_df.head()

In [None]:
# Preprocess the train and test metadata separately; each call receives the
# info frame together with the consumption data frame of the same split.
# NOTE(review): if the helper fits encoders/scalers on its input, preprocessing
# the test split independently may produce features that are inconsistent
# with the training split - confirm against the helper's implementation.
info_train_df_pre = preprocess_info_df_for_sampling_classification(info_train_df, data_train_df)
info_test_df_pre = preprocess_info_df_for_sampling_classification(info_test_df, data_test_df)
info_train_df_pre.head()

### Load the weather data

In [None]:
# Read the weather data for 'aalst' and remove the listed columns.
# weather_data is only inspected here; none of the cells visible in this
# notebook use it afterwards.
weather_data = read_weather_data('aalst').drop(
    columns=[
        'moon_illumination',
        'moonrise',
        'moonset',
        'sunrise',
        'winddirDegree',
        'location',
        'DewPointC',
    ]
)
weather_data.head()

# Try random sampling

In [None]:
from energyclustering.sampling.samplers import RandomSampler
from energyclustering.sampling.evaluation_metrics import calculate_energy_score_matrix

In [None]:
# Baseline: fit the RandomSampler on the training split. The last two fit
# arguments are None here; the MetadataSampler call further down passes a
# clustering and its centroids in those positions.
sampler = RandomSampler()
sampler.fit(info_train_df, data_train_df, None, None)
# Sampling probabilities for the test metadata.
yearly_sampling_probs = sampler.get_sampling_probabilities(info_test_df)

In [None]:
# Compute the energy scores from the sampling probabilities and the train/test
# consumption data; missing values in both frames are replaced by 0 first.
yearly_energy_scores_per_profile = calculate_energy_score_matrix(yearly_sampling_probs.to_numpy(), data_train_df.fillna(0), data_test_df.fillna(0))
# Sum the per-profile scores into a single score for the random baseline.
yearly_energy_score_random = yearly_energy_scores_per_profile.sum()
print(f"random samplings energy score is {yearly_energy_score_random:.2f}")

# Try sampling from clustering based on metadata

In [None]:
from energyclustering.sampling.samplers import MetadataSampler

### Cluster the preprocessed training metadata (info_train_df_pre) with k-means

In [None]:

from sklearn.cluster import KMeans

In [None]:
# Fit a seeded 5-cluster k-means on the preprocessed training metadata and
# keep both the per-profile labels and the cluster centres for the sampler.
clusterer = KMeans(n_clusters=5, random_state=123).fit(info_train_df_pre)
clustering, centroids = clusterer.labels_, clusterer.cluster_centers_

### Fit the sampler

In [None]:
# Fit the MetadataSampler with the k-means labels and centroids.
# NOTE(review): the clustering was fitted on info_train_df_pre and the
# probabilities below are requested for info_test_df_pre, but fit receives the
# raw info_train_df - confirm whether MetadataSampler.fit expects the
# preprocessed frame instead.
sampler = MetadataSampler()
sampler.fit(info_train_df, data_train_df, clustering, centroids)
yearly_sampling_probs = sampler.get_sampling_probabilities(info_test_df_pre)

### Evaluate

In [None]:
# Same evaluation as for the random baseline: energy scores from the sampling
# probabilities, with missing consumption values replaced by 0.
yearly_energy_scores_per_profile = calculate_energy_score_matrix(yearly_sampling_probs.to_numpy(), data_train_df.fillna(0), data_test_df.fillna(0))
# Sum the per-profile scores into a single score for the metadata approach.
yearly_energy_score_metadata = yearly_energy_scores_per_profile.sum()
print(f"clustering based on metadata samplings energy score is {yearly_energy_score_metadata:.2f}")

# Try sampling from the clustering based on consumption data

### First calculate the Wasserstein distance matrix

In [None]:
from energyclustering.clustering.similarity.distmatrix import calculate_full_distance_matrix
from energyclustering.clustering.similarity.wasserstein import WassersteinDistanceMeasure


In [None]:
from dask.distributed import Client, LocalCluster

# The pairwise Wasserstein distance matrix of the training profiles is cached
# on disk, keyed on NB_PROFILES.
# NOTE(review): the cache key does not encode which profiles ended up in
# data_train_df; delete the cached file whenever the subsample or the
# train/test split changes, otherwise a stale matrix is loaded.
filename = f'cache/cached_wasserstein_{NB_PROFILES}.pkl'
if not Path(filename).exists():
    # Make sure the cache directory exists before starting the computation,
    # so the result is not lost on a failing write afterwards.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    # Only start the 30-worker dask cluster when the matrix really has to be
    # computed, and shut cluster and client down again once it is done.
    with LocalCluster(n_workers=30, threads_per_worker=1, local_directory='/cw/dtailocal/jonass') as cluster:
        with Client(cluster) as client:
            distance_matrix = calculate_full_distance_matrix(
                data_train_df,
                WassersteinDistanceMeasure(),
                client,
                n_blocks=data_train_df.shape[0],
            )
    distance_matrix.to_pickle(filename)
else:
    # The pickle is a locally produced cache file; never point this at
    # untrusted data, since unpickling can execute arbitrary code.
    distance_matrix = pd.read_pickle(filename)

### Cluster based on the Wasserstein distance matrix

In [None]:
from sklearn_extra.cluster import KMedoids

# Seeded k-medoids with 20 clusters; metric='precomputed' means the input is
# used directly as the pairwise distance matrix.
clusterer = KMedoids(n_clusters=20, metric='precomputed', random_state=123)
clusterer.fit(distance_matrix)
labels = clusterer.labels_

### Fit the sampler

In [None]:
from energyclustering.sampling.samplers import ConsumptionDataSampler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit the ConsumptionDataSampler, configured with a decision tree of depth at
# most 5 and a fixed seed, on the preprocessed training metadata, the training
# consumption data and the k-medoids labels.
# NOTE(review): presumably the tree learns to predict the consumption cluster
# from metadata - confirm against the sampler's implementation.
sampler = ConsumptionDataSampler(DecisionTreeClassifier(max_depth = 5), seed = 123)
sampler.fit(info_train_df_pre, data_train_df, labels)
yearly_sampling_probs = sampler.get_sampling_probabilities(info_test_df_pre)
yearly_sampling_probs

### Evaluate

In [None]:
# Same evaluation as for the other approaches: energy scores from the sampling
# probabilities, with missing consumption values replaced by 0.
yearly_energy_scores_per_profile = calculate_energy_score_matrix(yearly_sampling_probs.to_numpy(), data_train_df.fillna(0), data_test_df.fillna(0))
# Sum the per-profile scores into a single score for the consumption approach.
yearly_energy_score_consumption_data = yearly_energy_scores_per_profile.sum()
print(f"clustering based on consumption data energy score is {yearly_energy_score_consumption_data:.2f}")

# Overview of all three approaches

In [None]:
# Collect the summed energy score of each approach in one table, one row per
# approach.
result_df = pd.DataFrame(
    {
        'energy_score': [
            yearly_energy_score_random,
            yearly_energy_score_metadata,
            yearly_energy_score_consumption_data,
        ]
    },
    index=['random', 'metadata', 'consumption_data'],
)
result_df

In [None]:
# Number of training profiles assigned to each k-medoids cluster label.
pd.Series(labels).value_counts()