# Some dataset statistics 

In [None]:
# standard imports
import numpy as np 
import pandas as pd
import altair as alt
from pathlib import Path
import itertools

# loading the data 
from energyclustering.sampling.preprocessing import DataPreprocessor


In [None]:
def big_chart(chart, fontsize = 20):
    """Return ``chart`` restyled with large fonts and a minimal frame.

    The same ``fontsize`` is applied to axis labels, axis titles, the chart
    title and the legend (title and labels). Axis grid lines are switched
    off and the view stroke width is set to 0.

    Args:
        chart: Object exposing ``configure_axis``, ``configure_title``,
            ``configure_legend`` and ``configure_view`` (this notebook
            passes Altair charts).
        fontsize: Font size used for every text element; defaults to 20.

    Returns:
        Whatever the final ``configure_view`` call returns.
    """
    axis_options = {
        "grid": False,
        "labelFontSize": fontsize,
        "titleFontSize": fontsize,
    }
    legend_options = {
        "titleFontSize": fontsize,
        "labelFontSize": fontsize,
    }

    # Each configure_* call is applied to the result of the previous one.
    styled = chart.configure_axis(**axis_options)
    styled = styled.configure_title(fontSize=fontsize)
    styled = styled.configure_legend(**legend_options)
    return styled.configure_view(strokeWidth=0)

# The dataset

In [None]:
# Run the DataPreprocessor pipeline and unpack the four frames it returns.
# NOTE(review): the meaning of 'baseline', of the `True` flag passed to
# drop_days_with_nan, and of week_reduction_factor=None is defined in
# energyclustering.sampling.preprocessing -- confirm there before changing.
daily_data_df, data_df, daily_info_df, weather_df = (
    DataPreprocessor()
    .preprocess_info_df('baseline')
    .preprocess_weather_df('baseline')
    .drop_days_with_nan(True)
    .subsample_days(week_reduction_factor = None)
    # for testing only! Uncomment the next step to work on a smaller sample.
    # .subsample_years(1000)
    .get_data()
)
# Last expression of the cell: shows the shape of daily_data_df as output.
daily_data_df.shape

# Number of years

In [None]:
# Every row of data_df is counted as one year.
print(f"There are {len(data_df)} years ")

# Distribution of years 

In [None]:
# Count how many unique first-level index entries belong to each year.
# The year is the second comma-separated field of the entry with its final
# character removed.
# NOTE(review): presumably entries are strings shaped like
# "('<meterID>', <year>)" -- confirm the exact format.
all_years = (
    pd.Series(
        [
            entry.split(',')[1][:-1]
            for entry in daily_info_df.index.get_level_values(0).unique()
        ]
    )
    .value_counts()
    .to_frame('count')
    .rename_axis('year', axis=0)
    .reset_index()
)
# The trailing semicolon keeps the notebook from echoing the frame.
all_years;

In [None]:
# Bar chart of the per-year counts in all_years, restyled with big_chart.
big_chart(
    alt.Chart(all_years, width=500)
    .mark_bar()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('count', title='#Time Series'),
    )
)

# Number of years per profile

In [None]:
# Split every first-level index entry into its year and meter-ID parts.
# NOTE(review): the slicing presumably expects strings shaped like
# "('<meterID>', <year>)" -- confirm the exact format.
years = [
    entry.split(',')[1][:-1]
    for entry in daily_info_df.index.get_level_values(0)
]
meterID = [
    entry.split(',')[0][2:-1]
    for entry in daily_info_df.index.get_level_values(0)
]
df = pd.DataFrame(columns=['year', 'meterID']).assign(year=years, meterID=meterID)

# Drop repeated (year, meterID) pairs, count the remaining years per meter,
# then tally how many meters share each count.
nb_of_years_per_profile = (
    df.drop_duplicates()
    .groupby('meterID')
    .count()
    .value_counts()
)
nb_of_years_per_profile

In [None]:
# Number of distinct values in meterID.
print(f"There are {len(np.unique(meterID))} unique measured consumers")

In [None]:
# Display the full meterID list (one entry per row of daily_info_df's index).
meterID

# Available metadata

In [None]:
# Display everything stored under the 'household_info' column label.
daily_info_df.loc[:, 'household_info']

In [None]:
# Print the column names found under 'day_info' as one comma-separated
# line, with every underscore preceded by a backslash.
# The replacement is a raw string: the plain literal '\_' is an invalid
# escape sequence, which newer Python versions warn about.
print(", ".join(daily_info_df.loc[:, 'day_info'].columns).replace('_', r'\_'))

# Percentage of missing data

In [None]:
# Fraction of missing values: average the NaN mask per column, then average
# those column fractions. The '2016-02-29' column is excluded first.
number = (
    data_df
    .drop(columns='2016-02-29')
    .isna()
    .mean()
    .mean()
)
print(f"{number:.2%} of the load measurements are missing")

In [None]:
# Share of rows whose fraction of missing values is below 0.004
# (the '2016-02-29' column is excluded first).
(
    data_df
    .drop(columns='2016-02-29')
    .isna()
    .mean(axis=1)
    .lt(0.004)
    .mean()
)