# Dataset statistics
Calculate some statistics about the problems in the dataset (missing values and zero readings) and try to figure out details about these problems that we can use to solve them.
## Imports and set-up

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import datetime
import tqdm
# Disable Altair's max-rows check so larger data frames can be passed to charts.
alt.data_transformers.disable_max_rows()

In [None]:
# Folder that holds the preprocessed, combined dataset files.
PRE_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/combined')
info_path, data_path = (PRE_PATH / file_name for file_name in ('info.csv', 'data.csv'))
# Fail early when one of the two input files is missing.
assert all(path.exists() for path in (info_path, data_path)), 'These paths should exist'

## Read the data

In [None]:
info_df = pd.read_csv(info_path, index_col=[0, 1])
# Positions of the rows whose data source is not 'EandisVREG', shifted by one
# (presumably so that they skip past the header line when used as `skiprows`
# for the data file -- TODO confirm).
is_other_source = (info_df['data_source'] != 'EandisVREG').to_numpy()
row_positions = np.arange(len(info_df))
idxs = row_positions[is_other_source] + 1

In [None]:
# Read the data file, skipping the file lines listed in `idxs`.
data_df = pd.read_csv(data_path, index_col=[0, 1], skiprows=idxs)
# Parse the column labels as timestamps and name the column axis accordingly.
data_df.columns = pd.to_datetime(data_df.columns).rename('timestamp')

In [None]:
# Preview the first rows of the info table.
info_df.head()

In [None]:
# Preview the first rows of the data table.
data_df.head()

In [None]:
import calendar

# Split the years that occur in the index into leap and non-leap years.
# Using `calendar.isleap` instead of a hard-coded list keeps the split correct
# for any year present in the data (e.g. 2008 or 2020).
years = list(info_df.index.levels[1])
leap_years = [year for year in years if calendar.isleap(year)]
non_leap_years = [year for year in years if not calendar.isleap(year)]
print(f'leap years = {leap_years}')
print(f'non leap years = {non_leap_years}')

## Check the NaNs

In [None]:
idx = pd.IndexSlice
# Number of NaN values per row (profile).
nb_of_nas = data_df.isna().sum(axis='columns')
# Rows of non-leap years get 96 subtracted -- presumably the 96 quarter-hour
# slots of 29 February that such a year cannot fill (TODO confirm).
non_leap_rows = idx[:, non_leap_years]
nb_of_nas.loc[non_leap_rows] -= 96
# Keep only the profiles that still have NaNs after the correction.
nb_of_nas = nb_of_nas.loc[nb_of_nas.gt(0)]
print(f'there are {len(nb_of_nas)} profiles with NaN values')
print(f'the average number of NaNs in each profile with NaNs is {nb_of_nas.mean()}')

## Check the zeros 

In [None]:
# Number of exact zeros per row (profile); keep only the rows that have any.
zero_counts = data_df.eq(0).sum(axis='columns')
nb_of_zeros = zero_counts[zero_counts > 0]
print(f'there are {len(nb_of_zeros)} profiles with zero values')
print(f'the average number of zeros in each these profiles is {nb_of_zeros.mean()}')
# Binned bar chart of the zero counts.
zero_count_table = nb_of_zeros.to_frame('zeros').reset_index()
alt.Chart(zero_count_table).mark_bar().encode(
    x=alt.X('zeros:O', bin=True),
    y=alt.Y('count()'),
)

## Look at single value zeros

In [None]:
# Index entries of the profiles that contain exactly one zero.
profiles = nb_of_zeros[nb_of_zeros == 1].index
profiles

### Look at the time of the zero

In [None]:
# Long-format boolean series over the selected profiles: True where the value is zero.
is_zero = data_df.loc[profiles].eq(0).stack()
# Keep only the entries that are zero.
zero_timestamps = is_zero[is_zero]
# Turn the index levels into columns and drop the (always True) value column.
zero_times = zero_timestamps.reset_index(name='value').drop(columns=['value'])
alt.Chart(zero_times).mark_bar().encode(
    x='timestamp:N',
    y='count()',
)

### Look at some profiles (the zero is centered in the middle)

In [None]:
def show_single_zero_plot(profile, margin = '5D'):
    """Plot the part of a profile that surrounds its first zero value.

    profile: Series of values; its index is used for the time axis and is
        expected to be named 'timestamp' (the chart encodes that column).
    margin: anything `pd.Timedelta` accepts; how far the plotted window
        extends on each side of the zero.

    Returns an Altair line chart titled with the location of the zero.
    Raises IndexError when the profile contains no zero.
    """
    # First index label at which the profile is exactly zero.
    zero_location = profile.index[profile == 0][0]
    # Use the `margin` argument for the window size; the previous code read an
    # undefined global (`PLOT_SIZE`) and ignored this parameter.
    half_width = pd.Timedelta(margin)
    window_start, window_end = zero_location - half_width, zero_location + half_width
    profile_subset = profile.loc[window_start:window_end].to_frame('value')
    profile_subset['is_zero'] = profile_subset.value == 0
    return alt.Chart(profile_subset.reset_index(), width = 1600, title = str(zero_location)).mark_line().encode(
        x = 'timestamp:T',
        y = 'value:Q'
    )

In [None]:
# Plot the first profile from `profiles`.
show_single_zero_plot(data_df.loc[profiles[0]])

## Look at all zeros jointly

In [None]:
year_values = data_df.index.levels[1]
year_values
year_to_investigate = year_values[0]
# NOTE: the selection below is hard-coded to 2016; `year_to_investigate` is not
# used, and the `14` in the name `data14_df` does not match the selected year.
data14_df = data_df.loc[idx[:, 2016], :]

# Restrict to the profiles that contain at least one zero value.
has_zero = data14_df.eq(0).any(axis='columns')
zero_profiles = data14_df.index[has_zero]
data14_df = data14_df.loc[zero_profiles]
data14_df

In [None]:
def zero_runs(a):
    """Find the runs of consecutive zeros in a one-dimensional sequence.

    a: 1-D array-like of numbers (NumPy array, pandas Series, list, ...).

    Returns an integer array of shape (n_runs, 2); each row is
    [start, end) in positions of `a`, i.e. `end` is one past the last zero
    of the run. The result has shape (0, 2) when there are no zeros.
    """
    # Convert first so that pandas objects and lists are handled the same way
    # (calling `.view` on the result of `np.equal` fails for non-ndarray input).
    is_zero = np.asarray(a) == 0
    # Pad both ends with a non-zero marker so runs touching the edges are closed.
    padded = np.concatenate(([False], is_zero, [False])).astype(np.int8)
    # A run starts or ends wherever the padded indicator changes value.
    boundaries = np.flatnonzero(np.diff(padded))
    return boundaries.reshape(-1, 2)

def zero_run_df(data_df):
    """Collect the zero runs of every row of `data_df` in one table.

    data_df: DataFrame with one profile per row.

    Returns a DataFrame with one row per zero run and the columns
    'profile' (the index label of the row the run belongs to), 'start' and
    'end' (as returned by `zero_runs`). An empty table with these columns is
    returned when `data_df` has no rows.
    """
    frames = []
    for key, row in data_df.iterrows():
        runs = zero_runs(row)
        # Repeat the row label once per run so each run remembers its profile.
        frames.append(pd.DataFrame(runs, columns = ['start', 'end'], index = [key]*len(runs)))
    if not frames:
        # `pd.concat` raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns = ['profile', 'start', 'end'])
    # NOTE: 'start'/'end' are positions, not timestamps; index the columns of
    # the input frame with them when timestamps are needed.
    return pd.concat(frames, axis = 0).reset_index().rename(columns = {'index':'profile'})


# Build the table of zero runs for the selected profiles and display it.
zero_periods = zero_run_df(data14_df)
zero_periods

In [None]:
# Count how many rows of `zero_periods` share each (start, end) pair.
df = (
    zero_periods
    .groupby(['start', 'end'])
    .count()
    .rename(columns={'profile': 'count'})
    .reset_index()
)
df

In [None]:
# TODO: flexible matching of zero periods is not implemented yet.
#def flexible_matching(df): 
# Placeholder value: the original assignment had no right-hand side, which is
# a syntax error.
representatives = None

In [None]:
# 'start' and 'end' hold integer column positions (end is one past the last
# zero), so look up the matching timestamps before formatting; the `.dt`
# accessor only works on datetime values.
period_format = "%d/%m/%y %H:%M"
column_times = data14_df.columns
first_zero_time = pd.Series(
    column_times[zero_periods['start'].to_numpy(dtype=int)], index=zero_periods.index
)
last_zero_time = pd.Series(
    column_times[zero_periods['end'].to_numpy(dtype=int) - 1], index=zero_periods.index
)
zero_periods['period'] = first_zero_time.dt.strftime(period_format) + '-' + last_zero_time.dt.strftime(period_format)
zero_periods

In [None]:
# Frequency of every period string, keeping only those seen more than 100 times.
period_counts = zero_periods['period'].value_counts().to_frame('count')
periods_with_count = period_counts.loc[period_counts['count'] > 100]
periods_with_count

In [None]:
# Name the index explicitly before turning it into a column: since pandas 2.0
# `value_counts` names the result index after the counted column, so a bare
# `reset_index()` would no longer produce the 'index' column encoded below.
alt.Chart(periods_with_count.rename_axis('index').reset_index()).mark_bar().encode(
    x = 'index:N', 
    y = 'count:Q'
)