# Explore the Eandis 2017 data
in the folder "**_DATA Eandis 20170712 VREG study complete_**"

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import datetime
import tqdm
import matplotlib.pyplot as plt

In [None]:
# Folder that holds the preprocessed Eandis 2017 CSV files, and the files read from it.
PREPROCESSED_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/eandis2017')
info_path = PREPROCESSED_PATH.joinpath('info.csv')
data_path = PREPROCESSED_PATH.joinpath('data.csv')
data_processed_path = PREPROCESSED_PATH.joinpath('data_processed.csv')

# info.csv: the first CSV column becomes the index.
info_df = pd.read_csv(info_path, index_col=[0])
print('info_df loaded.')

# data.csv: timestamps are parsed with an explicit format,
# because letting pandas infer the format is too slow.
data_df = pd.read_csv(data_path, index_col=[0])
data_df['timestamp'] = pd.to_datetime(data_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
print('data_df loaded.')

# data_processed.csv: the first two CSV columns form the index. They are
# temporarily turned back into ordinary columns so that 'timestamp' can be
# parsed, after which the ('meter ID', 'timestamp') index is rebuilt.
data_processed_df = pd.read_csv(data_processed_path, index_col=[0, 1]).reset_index()
data_processed_df['timestamp'] = pd.to_datetime(data_processed_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
data_processed_df = data_processed_df.set_index(['meter ID', 'timestamp'])
data_processed_df.columns.name = 'measurement type'
print('data_processed_df loaded.')

## Calculate stats on the problems with the data

In [None]:
def no_consecutive_values(x, val):
    """Return the lengths of all runs of consecutive ``val`` entries in ``x``.

    Parameters
    ----------
    x : array-like
        One-dimensional sequence (list, NumPy array or pandas Series).
    val : scalar
        Value whose runs are measured. A missing value (e.g. ``np.nan``)
        matches the missing entries of ``x``.

    Returns
    -------
    numpy.ndarray
        Run lengths in order of appearance; empty when ``val`` never occurs.
    """
    # Convert first: on a plain list ``x == val`` is a single bool instead of
    # an element-wise mask, so every run would silently be missed.
    values = np.asarray(x)
    # NaN never compares equal to itself, so missing values need isna().
    hits = pd.isna(values) if pd.isna(val) else values == val
    # Pad with False so that runs touching either end still yield two edges.
    padded = np.r_[False, hits, False]
    edges = np.flatnonzero(padded[:-1] != padded[1:])
    # Edges alternate start, end, start, end, ...; each difference is a length.
    return edges[1::2] - edges[::2]

def check_data(data_dfx, data_processed_dfx):
    """Print and plot data-quality statistics: duplicates, missing values, zeros.

    Parameters
    ----------
    data_dfx : pandas.DataFrame
        Long-format readings. The columns 'meter ID', 'timestamp',
        'measurement type' and 'measurement' are read.
    data_processed_dfx : pandas.DataFrame
        One column per measurement type; must contain an 'offtake' column.
        Per-meter statistics group on index level 0 (presumably the meter ID
        -- confirm against the caller).

    Returns
    -------
    None
        Results are reported through ``print``, ``display`` and a matplotlib
        histogram.
    """
    key_columns = ['meter ID', 'timestamp']

    def longest_run(column, val):
        # A column that never contains `val` has no runs at all; report 0
        # instead of letting max() fail on an empty array.
        run_lengths = no_consecutive_values(column, val)
        return run_lengths.max() if run_lengths.size else 0

    # Duplicates:
    data_df_counts = pd.pivot_table(data_dfx, index=key_columns, columns='measurement type',
                                    values='measurement', aggfunc='count')
    is_duplicated = data_df_counts > 1
    print(f'Number of duplicate {data_df_counts.columns.to_list()} readings for the same meter ID and timestamp:'
          f' {is_duplicated.sum().values}')
    print('\nDuplicate readings with their number of occurrences:')
    # NOTE(review): only keys that are duplicated for *every* measurement type
    # are inspected below; keys duplicated for just one type are counted above
    # but not listed here. Confirm this is intended.
    duplicated_everywhere = is_duplicated.all(axis=1)
    display(data_df_counts[duplicated_everywhere])
    duplicate_keys = data_df_counts.index[duplicated_everywhere.values]
    in_duplicates = data_dfx.set_index(key_columns).index.isin(duplicate_keys)
    data_df_duplicates = data_dfx[in_duplicates].sort_values(key_columns)
    # A duplicate is "consistent" when all its copies carry the same measurement.
    data_df_duplicates_same = (data_df_duplicates
                               .groupby(key_columns + ['measurement type'])['measurement']
                               .nunique().eq(1))
    x1 = len(data_df_duplicates_same)
    x2 = data_df_duplicates_same.sum()
    x3 = x1 - x2
    if x1:
        print(f'Out of {x1} duplicates, {x2} ({x2/x1*100}%) are consistent and {x3} are not.')
    else:
        # Without duplicates the percentage would be 0/0 (printed as nan%).
        print('Out of 0 duplicates, 0 are consistent and 0 are not.')

    # Missing values:
    print('\nNumber of missing values (consumption is considered as missing when both injection and offtake are missing):')
    display(data_processed_dfx.isna().sum())
    print('Maximum number of missing values for a meter:')
    display(data_processed_dfx.isna().groupby(level=0).sum().max())
    print('Maximum number of consecutive missing values:')
    # NOTE(review): runs are measured over each whole column, so a run may
    # continue across the boundary between two meters -- confirm this is OK.
    print(data_processed_dfx.apply(lambda o: longest_run(o, np.nan)))

    # Zeros:
    # The value printed is a percentage (mean * 100), so label it as such.
    print('\nPercentage of zeros:')
    print(data_processed_dfx.eq(0).mean()*100)
    print('\nMaximum number of consecutive zeros')
    print(data_processed_dfx.apply(lambda o: longest_run(o, 0)))
    plt.figure(figsize=(8,8))
    # 4*24 samples per day, i.e. this assumes 15-minute readings -- TODO confirm.
    plt.hist(no_consecutive_values(data_processed_dfx['offtake'], 0)/(4*24), bins=200, log=True)
    plt.xlabel('duration of zeros (days)')
    plt.ylabel('count')
    plt.title('histogram of duration of zero-offtake periods')

# Subset restricted to the first three meter IDs, handy for a quick trial run.
meter_IDs_data_small = data_df['meter ID'].unique()[:3]
data_df_small = data_df.loc[data_df['meter ID'].isin(meter_IDs_data_small)]
data_processed_df_small = data_processed_df.loc[
    data_processed_df.index.get_level_values(0).isin(meter_IDs_data_small)
]

# To run the checks on the subset only, use this line instead:
#   check_data(data_df_small, data_processed_df_small); print(' ↑ for a small part of the data!!!')
check_data(data_df, data_processed_df)
print('\n ↑ for the whole dataset\n')