# Explore the Eandis AMR data
in the folder "**_data eandis 20180822 AMR_**"

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import datetime
import tqdm
import matplotlib.pyplot as plt

In [None]:
# Directory holding the preprocessed Eandis AMR csv files.
PREPROCESSED_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/eandis_AMR')

# info.csv is loaded as-is (presumably per-meter metadata -- TODO confirm).
AMR_df = pd.read_csv(PREPROCESSED_PATH / 'info.csv')

# data.csv: one row per EAN; the column axis is labelled 'datetime'.
yearly_profiles = pd.read_csv(PREPROCESSED_PATH / 'data.csv').set_index('EAN')
yearly_profiles.columns.name = 'datetime'

# data_raw.csv: the raw readings, with the columns renamed positionally.
# NOTE(review): the duplicate check further down groups data_df by an 'EAN'
# column, which these two names do not provide -- confirm the layout of
# data_raw.csv.
data_df = pd.read_csv(PREPROCESSED_PATH / 'data_raw.csv')
data_df.columns = ['timestamp', 'value']

## Calculate stats on the problems with the data

In [None]:
# Let pandas print up to 100 rows and 100 columns before truncating the output.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 100)

def no_consecutive_values(x, val):
    """Return the length of every run of consecutive ``val`` entries in ``x``.

    The "no" in the name is short for "number of".

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of values (list, NumPy array or pandas
        Series).
    val : scalar
        The value whose runs are measured. NaN is matched with ``np.isnan``
        because NaN never compares equal to itself.

    Returns
    -------
    numpy.ndarray
        One integer per run, in order of appearance. The array is empty when
        ``val`` does not occur in ``x``, so callers must not call ``.max()``
        on it unconditionally.
    """
    # Convert first: ``some_list == val`` compares the whole list to the
    # scalar and yields a single bool instead of an element-wise mask.
    x = np.asarray(x)
    if np.isnan(val):
        hits = np.isnan(x)
    else:
        hits = x == val
    # Pad with False on both sides so that runs touching either end of the
    # array still produce both a start edge and an end edge.
    padded = np.r_[False, hits, False]
    # Positions where the mask flips; they alternate start, end, start, end...
    edges = np.flatnonzero(padded[:-1] != padded[1:])
    return edges[1::2] - edges[::2]

# Duplicates: count the readings per (EAN, timestamp) pair; a count above one
# means the same meter reported the same timestamp more than once.
data_df_counts = pd.pivot_table(data_df, index=['EAN', 'timestamp'], values='value', aggfunc='count')
_has_duplicates = (data_df_counts > 1).all(axis=1)
print(f'Number of duplicate readings for the same EAN and timestamp: {(data_df_counts > 1).sum().values[0]}')
print('\nDuplicate readings with their number of occurrences:')
# NOTE(review): `display` is not imported here; presumably the notebook
# (IPython) provides it.
display(data_df_counts[_has_duplicates])

# Select the raw rows whose (EAN, timestamp) key is one of the duplicated keys.
_duplicated_keys = data_df_counts.index[_has_duplicates]
_row_keys = data_df.reset_index().set_index(['EAN', 'timestamp']).index
data_df_duplicates = data_df[_row_keys.isin(_duplicated_keys)].sort_values(['EAN', 'timestamp'])

# A duplicate is "consistent" when all its copies carry a single distinct value.
data_df_duplicates_same = data_df_duplicates.groupby(['EAN', 'timestamp'])['value'].nunique().eq(1)
x1 = len(data_df_duplicates_same)
x2 = np.sum(data_df_duplicates_same)
x3 = np.sum(~data_df_duplicates_same)
print(f'Out of {x1} duplicates, {x2} ({x2/x1*100}%) are consistent and {x3} are not.')

# Missing values:
yearly_profiles.columns = pd.to_datetime(yearly_profiles.columns)
# Put every profile on a regular 15-minute grid; a slot without any reading
# becomes NaN. The frame is transposed around the resample because
# `resample(..., axis=1)` is deprecated in recent pandas versions.
yearly_profiles = yearly_profiles.T.resample('15min').mean().T
_missing_mask = yearly_profiles.isna()
print(f'Number of missing values: {_missing_mask.sum().sum()}')
print(f'Maximum number of missing values for a meter: {_missing_mask.sum(axis=1).max()}')

# Lengths of all NaN runs over all meters, computed once. A meter without any
# gap contributes an empty array, so the statistics are taken over the
# concatenation instead of calling `.max()` per meter (which raises on an
# empty array).
_missing_runs = [no_consecutive_values(row, np.nan) for row in yearly_profiles.values]
_missing_runs = np.concatenate(_missing_runs) if _missing_runs else np.array([], dtype=int)
print(f'Maximum number of consecutive missing values: {_missing_runs.max() if _missing_runs.size else 0}')
print(f'Average number of consecutive missing values: {_missing_runs.mean() if _missing_runs.size else np.nan}')


# Zeros:
print(f'Number of zeros: {yearly_profiles.eq(0).sum().sum()}')

# Lengths of all runs of zero readings over all meters, computed once and
# reused for both statistics and the histogram. A meter without any zero
# contributes an empty array, so the statistics are taken over the
# concatenation instead of calling `.max()` per meter (which raises on an
# empty array).
_zero_runs = [no_consecutive_values(row, 0) for row in yearly_profiles.values]
_zero_runs = np.concatenate(_zero_runs) if _zero_runs else np.array([], dtype=int)
print(f'Maximum number of consecutive zeros: {_zero_runs.max() if _zero_runs.size else 0}')
print(f'Average number of consecutive zeros: {_zero_runs.mean() if _zero_runs.size else np.nan}')

plt.figure(figsize=(8,8))
# 4 * 24 quarter-hour readings per day converts run lengths to days.
plt.hist(_zero_runs / (4 * 24), bins=200, log=True)
plt.xlabel('duration of zeros (days)')
plt.ylabel('count')
plt.title('histogram of duration of zero-offtake periods')

# yearly_profiles_flat = yearly_profiles.unstack().reset_index().sort_values('EAN').set_index(['EAN', 'datetime'])
# yearly_profiles_flat.columns.names = ['value']

