# Look at some profiles
This notebook does a few things:  
- look at some random profiles
- look at the zero measurements
- look at the monthly trend
- look at the weekly trend
- look at the daily trend 

Conclusions: 
- Zeros seem to be due to disabled meters or weird profiles 
- We'll probably have to remove outliers

#### Imports

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import datetime
import tqdm
import pyxlsb
# Lift Altair's default limit on the number of rows a chart dataset may have
# (presumably needed because the profiles plotted below exceed that limit).
alt.data_transformers.disable_max_rows()
READ_DATA = False # guard against reading the data twice: the load cell only reads while this is False and then sets it to True

#### Load the data

In [None]:
# PATH to the profile directory in the fluvius data
DATA_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/Data-2020-11/FluviusData/profiles')
# PATH to where the preprocessed files should appear

PREPROCESSED_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/infrax')
# Read the preprocessed data/info CSVs only while READ_DATA is False; the flag
# is set to True afterwards so re-running this cell skips the (slow) read.
if not READ_DATA:
    data_path = PREPROCESSED_PATH/'M_resid_data.csv'
    info_path = PREPROCESSED_PATH/'M_resid_info.csv'
    # In both files the first CSV column is used as the index.
    data_df = pd.read_csv(data_path, index_col = [0], parse_dates=True)
    info_df = pd.read_csv(info_path, index_col = [0], parse_dates = True)
    READ_DATA = True

In [None]:
# Append the 'Jaar' column as an extra index level, so a row is addressed by
# (original index, Jaar) -- presumably (EAN, year), cf. the .loc[(ean, year)]
# lookup in inspect_profile_on_ean_year. Sort the resulting index afterwards.
data_df = data_df.set_index('Jaar', append=True)
data_df = data_df.sort_index()

In [None]:
# Turn the column labels into datetimes; later cells rely on this to filter on
# month/day and to resample along the columns.
data_df.columns = pd.to_datetime(data_df.columns)

In [None]:
# Preview the first rows after the index and column changes made above.
data_df.head()

#### Some plotting code 
In case you did not know already: Altair is amazing.

In [None]:
def inspect_profile_on_index(idx):
    """Plot the profile stored at row position ``idx`` of ``data_df``.

    The row is picked positionally (``iloc``), reshaped into a two-column
    frame (``time``, ``value``) and drawn as a line chart. The chart is
    interactive along the x axis only (``bind_y=False``).

    Returns the Altair chart, 2000 px wide and 300 px high.
    """
    profile_frame = data_df.iloc[idx].to_frame().reset_index()
    profile_frame.columns = ['time', 'value']

    line_chart = alt.Chart(profile_frame).mark_line().encode(x='time:T', y='value')
    line_chart = line_chart.interactive(bind_y=False)
    return line_chart.properties(width=2000, height=300)


def inspect_profile_on_ean_year(ean, year):
    """Plot the profile found under index key ``(ean, year)`` in ``data_df``.

    The selected row is reshaped into a two-column frame (``time``,
    ``value``) and drawn as a titled line chart. The chart is interactive
    along the x axis only (``bind_y=False``).

    Returns the Altair chart, 2000 px wide and 300 px high.
    """
    profile_frame = data_df.loc[(ean,year)].to_frame().reset_index()
    profile_frame.columns = ['time', 'value']

    chart_title = f'Profile EAN {ean} year {year}'
    line_chart = alt.Chart(profile_frame, title=chart_title).mark_line().encode(
        x='time:T',
        y=alt.Y('value'),
    )
    line_chart = line_chart.interactive(bind_y=False)
    return line_chart.properties(width=2000, height=300)

In [None]:
def show_clustering(df, cluster_df, x_axis = 'months'):
    """Draw, per cluster, the min-max band and the mean of the consumption.

    ``df`` is stacked into long format (one ``total_consumption`` value per
    row/column pair) and joined with ``cluster_df``, which must provide a
    ``cluster_label`` column on a matching index. ``x_axis`` is the name of
    the column of the resulting frame to put on the x axis; the callers in
    this notebook pass the name given to ``df``'s column index.

    The first rows of the plotted frame are printed as a sanity check.

    Returns a layered Altair chart (600 x 400): a semi-transparent area
    between the minimum and maximum, plus a line for the mean, both coloured
    by cluster label.
    """
    long_df = df.stack().to_frame('total_consumption')
    plot_df = long_df.join(cluster_df).reset_index()
    print(plot_df.head())

    x_encoding = f'{x_axis}:O'

    # Band spanning the min and max consumption of each cluster.
    spread_band = alt.Chart(plot_df).mark_area(opacity=0.3).encode(
        x=x_encoding,
        y=alt.Y('min(total_consumption):Q', stack=None),
        y2=alt.Y2('max(total_consumption):Q'),
        color=alt.Color('cluster_label:N', scale=alt.Scale(scheme='set1')),
    )
    # Mean consumption of each cluster.
    mean_line = alt.Chart(plot_df).mark_line().encode(
        x=x_encoding,
        y='mean(total_consumption):Q',
        color=alt.Color('cluster_label:N'),
    )
    return alt.layer(spread_band, mean_line).properties(width=600, height=400)

# Plot some random profiles

In [None]:
# Plot the profile at row position 6 of data_df (a position, not an EAN).
inspect_profile_on_index(6)

## Look for NaNs

In [None]:
# Take the profile at row position 5; its index holds the measurement timestamps.
data = data_df.iloc[5]
# Show all rows instead of pandas' truncated output. This is a global option,
# so it stays in effect for the cells that follow.
pd.set_option("display.max_rows", None)
# List every measurement of this profile that falls on February 28.
data.loc[(data.index.month == 2) & (data.index.day == 28)].to_frame()

# Look for zero values/intervals

In [None]:
def multi_index_to_column(df):
    """Return a copy of ``df`` with its two-level index flattened into a column.

    Every index entry ``(a, b)`` becomes the string ``'<a>year<b>'``. These
    strings end up in a new first column called ``profile_year`` and the
    result gets a fresh default integer index. ``df`` itself is not modified.
    """
    flattened = df.copy()
    flattened.index = [
        f"{first!s}year{second!s}" for first, second in flattened.index.values
    ]
    return flattened.rename_axis('profile_year').reset_index()

In [None]:
# For every profile (row), count how many measurements are exactly 0.
zeros_per_profile = (data_df == 0).sum(axis = 1)
# Bar chart with the number of zeros on the x axis and, on the y axis, the
# number of profiles that have that many zeros.
alt.Chart(zeros_per_profile.to_frame('nb_of_zeros').reset_index(drop=True), title = 'Histogram of number of zeros per profile').mark_bar().encode(
    x = 'nb_of_zeros:N', 
    y = 'count()'
)

## Look at profiles with a lot of zeros

In [None]:
# Keep the profiles with more than 10000 zero measurements and list their
# index entries so they can be looked up one by one.
data_with_zeros = data_df[zeros_per_profile>10000]
data_with_zeros.index.to_frame()

In [None]:
# Plot the profile stored under index key (155, 2014).
inspect_profile_on_ean_year(155, 2014)

So this simply seems to be a weird profile. Let's check whether we also have data from 2015 for the same profile.

In [None]:
# Same EAN as above (155), but for 2015.
inspect_profile_on_ean_year(155,2015)

Still some zero measurements but less!

In [None]:
# Plot the profile stored under index key (161, 2014).
inspect_profile_on_ean_year(161, 2014)

Did this profile simply have its meter disabled? Also noteworthy: a small negative and positive bump around Fri 26 Feb.

## Some profiles with a moderate amount of zeros

In [None]:
# Keep the profiles with strictly more than 50 and strictly fewer than 70 zero
# measurements and list their index entries.
data_with_zeros = data_df[(zeros_per_profile<70)&(zeros_per_profile>50)]
data_with_zeros.index.to_frame()

In [None]:
# Plot the profile stored under index key (66, 2014).
inspect_profile_on_ean_year(66,2014)

The profile is mostly fine, but again it just seems like the house is not occupied.

## Check monthly trends

In [None]:
# Sum the measurements per month along the time axis (the columns).
monthly_data_df = data_df.resample('1M',axis = 1).sum()
# Name the column index: show_clustering stacks the frame, so this name becomes
# the column that is passed as x_axis.
monthly_data_df.columns.name ='months'
monthly_data_df.head()

In [None]:
from sklearn.cluster import KMeans
# Fixed random_state so that re-running the cell gives the same clustering.
SEED = 1131345
# Cluster the profiles into 10 groups based on their monthly totals.
clusterer = KMeans(n_clusters=10, random_state = SEED)
clusterer.fit(monthly_data_df.values)
labels = clusterer.labels_
# One label per profile, indexed like monthly_data_df so show_clustering can
# join the labels onto the stacked data.
cluster_df = pd.DataFrame(labels, index = monthly_data_df.index, columns = ['cluster_label'])
show_clustering(monthly_data_df, cluster_df, x_axis = 'months')

This looks really clean! We see different levels of consumption, we see some nice curves of electrical heating (probably) and we see profiles that show the inverse trend!  

This coarse grained clustering is already quite cool! 

## Check weekly trends

In [None]:
# Sum the measurements per week along the time axis (the columns).
# NOTE(review): the variable name is reused from the monthly cell; from here on
# monthly_data_df holds WEEKLY sums. Re-running the monthly clustering cell
# after this one would therefore operate on weekly data.
monthly_data_df = data_df.resample('1W',axis = 1).sum()
# Name the column index: show_clustering stacks the frame, so this name becomes
# the column that is passed as x_axis.
monthly_data_df.columns.name ='weeks'
monthly_data_df.head()

In [None]:
from sklearn.cluster import KMeans
# Fixed random_state so that re-running the cell gives the same clustering.
SEED = 1131345
# Cluster the profiles into 10 groups based on their weekly totals.
# NOTE(review): monthly_data_df contains the weekly sums at this point (the
# name was reused by the weekly resample cell).
clusterer = KMeans(n_clusters=10, random_state = SEED)
clusterer.fit(monthly_data_df.values)
labels = clusterer.labels_
# One label per profile, indexed like the clustered frame so show_clustering
# can join the labels onto the stacked data.
cluster_df = pd.DataFrame(labels, index = monthly_data_df.index, columns = ['cluster_label'])
show_clustering(monthly_data_df, cluster_df, x_axis = 'weeks')

Does not seem to tell us a lot more than the previous plot (on a monthly level)  
Except that we'll probably need to remove outliers to get clean clusters


## Check days 
For a single profile, check the daily time series

In [None]:
# Reshape a single profile into a (date x time-of-day) table, so that every
# row is the time series of one day.
profile_to_use = 1  # row position in data_df of the profile to inspect
profile = data_df.iloc[profile_to_use].to_frame().reset_index()
profile.columns = 'timestamp', 'value'
# Split each timestamp into its time-of-day and its calendar date.
profile['time'] = profile['timestamp'].dt.time
profile['date'] = profile['timestamp'].dt.date
# One row per date, one column per time of day. pivot_table averages (its
# default aggregation) any values that share the same date and time.
daily_ts_df = pd.pivot_table(profile, index='date', columns='time', values='value')
daily_ts_df.index = pd.to_datetime(daily_ts_df.index)
# Use plain strings as column labels instead of datetime.time objects.
daily_ts_df.columns = [str(time_of_day) for time_of_day in daily_ts_df.columns]
# Name the column index: show_clustering stacks the frame, so this name becomes
# the column that is passed as x_axis.
daily_ts_df.columns.name = 'time'
daily_ts_df.head()

In [None]:
# Fixed random_state so that re-running the cell gives the same clustering.
SEED = 1131345
# Cluster the days of the selected profile into 10 groups. KMeans is imported
# in the earlier clustering cells.
# NOTE(review): a day missing one of the time slots would leave a NaN in
# daily_ts_df -- confirm the data has none, since KMeans does not accept NaN.
clusterer = KMeans(n_clusters=10, random_state = SEED)
clusterer.fit(daily_ts_df.values)
labels = clusterer.labels_
# One label per day, indexed like daily_ts_df so show_clustering can join the
# labels onto the stacked data.
cluster_df = pd.DataFrame(labels, index = daily_ts_df.index, columns = ['cluster_label'])
show_clustering(daily_ts_df, cluster_df, x_axis = 'time')

Also looks really really clean (although the clustering with kmeans is not a good idea)