# Compare data issues between the private data and the public data

In [None]:
from pathlib import Path
import pandas as pd
from energyclustering.data.public.data import get_data_reading_preprocessed
import numpy as np
import altair as alt
# Lift Altair's default row limit so that large data frames can be charted.
alt.data_transformers.disable_max_rows()

In [None]:
# Directory holding the combined, preprocessed private data, resolved three
# levels above the current working directory.
data_dir = Path().absolute().parent.parent.parent.joinpath(
    'energyclustering/data/fluvius/data/profile-clustering/new_preprocessed/combined'
)
# Stop right away with a readable message when the directory cannot be found.
assert data_dir.exists(), f"{data_dir} does not exist"

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads modules automatically
# before each cell is executed, so edits to imported code are picked up.
%load_ext autoreload
%autoreload 2

# The meterIDs are NOT the same

In [None]:
# Only the info table is loaded here, keyed by (meterID, year); the readings
# in data.csv are read further down.
info_df = pd.read_csv(data_dir/'info.csv').set_index(['meterID', 'year'])
info_df

In [None]:
# IDs to look up in the public data; only the first entry is used in the
# cells visible in this section.
IDS_OF_INTEREST = ['/5JOm8vz1sRZTA', '+EpBeN+/Wl7Osw']

# Read the public data

In [None]:
# Load the public data set and select the rows labelled with the first ID of
# interest.
public_data_df = get_data_reading_preprocessed()
# The label is wrapped in a list so that .loc returns a DataFrame.
first_profile = public_data_df.loc[[IDS_OF_INTEREST[0]]]
# Show which columns are available once the index is flattened.
first_profile.reset_index().columns

In [None]:
# Reshape to wide form: one row per iID, one column per datetime, with the
# Consumption values as cells (pivot_table averages duplicate entries by default).
first_profile = (
    first_profile
    .reset_index()
    .pivot_table(index='iID', columns='datetime', values='Consumption')
)
first_profile

In [None]:
# Inspect the column labels of the public profile.
first_profile.columns

# Read the private data 

In [None]:
%%time
# Read the private readings and the info table; %%time reports how long the
# cell takes to run.
data_df = pd.read_csv(data_dir/'data.csv')
# NOTE(review): this rebinds info_df to the raw CSV contents, without the
# (meterID, year) index that was set when info.csv was first read - confirm
# that this is intended.
info_df = pd.read_csv(data_dir/'info.csv')

In [None]:
# Key the readings by (meterID, year); set_index removes both columns from the
# frame by default, leaving only the reading columns.
data_df = data_df.set_index(['meterID', 'year'])
data_df

In [None]:
# Convert the column labels (read from the CSV header) into timestamps.
# NOTE(review): no explicit format is passed, so the day/month order depends on
# pandas' inference - confirm it matches how the header was written.
data_df.columns = pd.to_datetime(data_df.columns)
data_df

In [None]:
# Align the public profile with the private data's timestamps (the public
# data is missing an hour); timestamps without a public value become NaN.
first_profile = first_profile.reindex(columns=data_df.columns)
# Plain array copy of the aligned profile, used for the distance computation.
first_profile_array = first_profile.to_numpy()
first_profile

In [None]:
# Scratch check of Timestamp.replace: it returns a new timestamp with the
# given fields overridden and leaves test_date itself unchanged.
test_date = pd.to_datetime('10/01/2020')
test_date.replace(month = 10, day = 10)

In [None]:
def correct_data(date):
    """Return ``date`` with day and month exchanged when that is possible.

    The exchange is only applied when the day is at most 12, i.e. when the
    day can also be read as a month; any other date is returned unchanged.
    All remaining fields (year, time of day) are kept.
    """
    day_fits_as_month = date.day <= 12
    return date.replace(day=date.month, month=date.day) if day_fits_as_month else date

In [None]:
# Build a copy of the public profile in which every timestamp has been passed
# through correct_data (day and month exchanged where the day is at most 12).
first_profile_corrected = (
    first_profile.iloc[0]
    # Name the values and the time axis explicitly instead of renaming by a
    # hard-coded meter ID, so the cell keeps producing a `value` column when a
    # different profile is selected.
    .rename('value')
    .rename_axis('original_time')
    .reset_index()
    .assign(
        new_time = lambda x: x.original_time.apply(correct_data)
    )
    .drop('original_time', axis = 1)
    .set_index('new_time')
)
# Drop the index name so the index looks like the unnamed timestamp columns.
first_profile_corrected.index.name = None
first_profile_corrected

In [None]:
def distance_to_profile(x):
    """Return the Euclidean distance between ``x`` and ``first_profile_array``.

    Positions where the difference is NaN (a value is missing in ``x`` or in
    the module-level ``first_profile_array``) are left out of the norm.

    Parameters
    ----------
    x : array-like
        One row of readings, aligned position by position with
        ``first_profile_array``.

    Returns
    -------
    float
        Norm of the non-NaN differences, or ``np.inf`` when there is no
        position at which both inputs have a value.
    """
    difference = x - first_profile_array
    valid = difference[~np.isnan(difference)]
    # The norm of an empty array is 0, which would make a row without any
    # overlapping samples look like a perfect match; rank it last instead.
    if valid.size == 0:
        return np.inf
    return np.linalg.norm(valid)

In [None]:
# Distance from every private (meterID, year) row to the public profile.
# raw = True hands each row to the function as a NumPy array instead of a Series.
distances = data_df.apply(distance_to_profile , axis = 1, raw = True)
distances

In [None]:
# Take the private row with the smallest distance and drop its year level.
closest_match = data_df.loc[[distances.idxmin()]].droplevel('year')
# Relabel that row and add the public profile underneath it for comparison.
closest_match.index = ['best_match']
closest_match.loc['public_profile', :] = first_profile.iloc[0]
closest_match

In [None]:
# Public profile as a Series indexed by the aligned timestamps.
first_profile.iloc[0]

In [None]:
# Same values, indexed by the timestamps returned by correct_data.
first_profile_corrected.value

In [None]:
# Add the corrected public profile as a third row; pandas aligns the series'
# timestamps with the frame's columns during the assignment.
closest_match.loc['corrected_public_profile', :] = first_profile_corrected['value']
closest_match

In [None]:
# Long format for plotting: one row per (profile, timestamp) pair, restricted
# to timestamps in May.
plot_df = (
    closest_match
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'profile', 'level_1': 'timestamp', 0: 'consumption'})
    .loc[lambda frame: frame['timestamp'].dt.month == 5]
)
plot_df

In [None]:
# One line chart per profile, stacked in rows and coloured by profile, so the
# three series can be compared along the same time axis.
(
    alt.Chart(plot_df, width=2000)
    .mark_line()
    .encode(
        x=alt.X("timestamp:T"),
        y=alt.Y('consumption'),
        row=alt.Row('profile'),
        color=alt.Color('profile'),
    )
)