# Exploration of data eandis AMR

In [None]:
import altair as alt #Altair is a declarative statistical visualization library for Python.
import numpy as np
import pandas as pd
from pathlib import Path #Offers classes representing filesystem paths with semantics appropriate for different OS.
import datetime
from tqdm import tqdm #Instantly make your loops show a smart progress meter. How? wrap any iterable with tqdm(it).
import matplotlib.pyplot as plt

# Configuration

In [None]:
# # Path to the AMR folder (Jonas)
# DATA_PATH = Path("/cw/dtaiproj/ml/2020-FLAIR-VITO/Data-2020-11/FluviusData/profiles/data eandis 20180822 AMR") 
# Path to the AMR folder (Lola)
# NOTE(review): absolute, user-specific path; each user has to edit this line (or
# swap in the commented alternative above) before the notebook can run.
DATA_PATH = Path("/Users/lolabotman/PycharmProjects/FluviusFullData/profiles/data eandis 20180822 AMR") 

# folder with all the .txt reading files (several files can belong to one EAN)
data_dir = DATA_PATH / 'Kwartuurwaarden AMR'
# Excel workbook holding the master data, indexed by EAN once parsed
master_file_path = DATA_PATH/ 'E7856 Master data.xlsx'

# # location to store preprocessed files (Jonas)
# PREPROCESSED_PATH = Path("/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/preprocessed/eandis_AMR")
# location to store preprocessed files (Lola)
PREPROCESSED_PATH = Path("/Users/lolabotman/PycharmProjects/FluviusFullData/profiles/preprocessed/eandis_AMR")


# intermediate results: the parsing loop writes one CSV per EAN into this folder
PREPROCESSED_PER_ID = PREPROCESSED_PATH / 'per_id'
# create the folder and any missing parents; an already existing folder is not an error
PREPROCESSED_PER_ID.mkdir(mode = 0o770, parents= True, exist_ok=True)

# Parse the txt's into something workable

### Master table
The master table is needed to parse the data; it is converted into something more readable later on.

In [None]:
# Open the workbook in a context manager so the file handle is released as soon
# as the sheet has been parsed (the original left the ExcelFile open).
with pd.ExcelFile(master_file_path) as xls:
    sheets = xls.sheet_names
    print(f"available sheets: {sheets}")
    print(f"reading {sheets[0]}")
    print()
    # Only the first sheet is read; rows are keyed by the 'EAN' column.
    master_df = xls.parse(sheet_name = 0).set_index('EAN')
master_df.head()

### Make new master table and parse txt's

In [None]:
# renaming master df from previous section
master_table_df = master_df
# empty metadata table (one row per EAN); it is filled in row by row in the loop below
master_df_entry = pd.DataFrame(index = master_table_df.index, columns = ['ID', 'yearly_offtake', 'yearly_injection', 'yearly_net_offtake', 'PV', 'PV_power', 'connection_power', 'residential', 'ex_night', 'HP', 'startDate', 'endDate', 'data_source', 'ean'])
# all the .txt files in the data folder
data_files = list(data_dir.glob('*.txt'))
# files that have not been matched to any EAN yet; whatever is left at the end was never parsed
unread_data_files = set(data_files)

# loop through all EANs from the master table
for ean_to_check in tqdm(master_df.index):

    # all files whose name (without the .txt suffix) contains this EAN
    ean_data_files = [f  for f in data_files if str(ean_to_check) in f.stem]
    unread_data_files.difference_update(ean_data_files)

    new_reading_df = pd.DataFrame(columns=['ID', 'offtake', 'injection'])
    offtake = []
    injection = []
    date_index_off = []
    date_index_inj = []

    # parse every file of this EAN and collect its offtake / injection readings
    for ean_file in ean_data_files:
        data = pd.read_csv(ean_file, sep=';', skiprows=[0,1,2,3,4,5,6,7,8], decimal=',', header=None, skipfooter=2, engine='python')

        # columns 9 .. 104 hold the (up to) 96 readings of a row
        for c in range(9,9+96):
            data[c] = pd.to_numeric(data[c])

        # columns 0 and 1 hold the start and end timestamp of a row
        data[0] = pd.to_datetime(data[0], format='%d%m%Y %H:%M')
        data[1] = pd.to_datetime(data[1], format='%d%m%Y %H:%M')

        # offtake rows
        offtake_rows = data[(data[6]=='E12-E17') & (data[7]=='KWT')]
        for ind in offtake_rows.index:
            # build the 15-minute timestamps once per row (end timestamp excluded)
            row_timestamps = pd.date_range(offtake_rows.loc[ind,0], offtake_rows.loc[ind,1], freq = '15min', closed='left').values
            nr_data_points = len(row_timestamps)
            date_index_off.extend(row_timestamps)
            # .loc label slices are inclusive, hence the -1
            offtake.extend(offtake_rows.loc[ind,9:9+nr_data_points-1].values)

        # injection rows
        injection_rows = data[(data[6]=='E12-E18') & (data[7]=='KWT')]
        for ind in injection_rows.index:
            row_timestamps = pd.date_range(injection_rows.loc[ind,0], injection_rows.loc[ind,1], freq = '15min', closed='left').values
            nr_data_points = len(row_timestamps)
            date_index_inj.extend(row_timestamps)
            injection.extend(injection_rows.loc[ind,9:9+nr_data_points-1].values)

    # when one of the two series is absent it is filled with zeros of the other's length
    if len(offtake) ==0:
        offtake = np.zeros(len(date_index_inj))
    if len(injection) ==0:
        injection = np.zeros(len(date_index_off))

    # values in df should be in kWh, source data is in kW (quarter-hourly => divide by 4).
    # FIX: injection is now converted as well; it was previously stored in kW while
    # offtake was in kWh, although both are later combined as offtake - injection.
    new_reading_df['offtake'] = np.array(offtake)/4
    new_reading_df['injection'] = np.array(injection)/4
    new_reading_df['ID'] = ean_to_check

    if len(date_index_off)>0:
        new_reading_df.index = date_index_off
    else:
        new_reading_df.index = date_index_inj

    # add info to the new master table
    # totals are scaled to a full year (365 days of 96 quarter-hours); this may be off
    # because consumption differs from month to month
    total_yearly_consumption = new_reading_df['offtake'].sum()/len(new_reading_df)*(365*96)
    total_yearly_injection = new_reading_df['injection'].sum()/len(new_reading_df)*(365*96)
    # NOTE(review): "net" offtake is set equal to the gross offtake (injection is not
    # subtracted) - confirm whether this is intended.
    yearly_net_consumption = total_yearly_consumption

    master_df_entry.loc[ean_to_check,'ID'] = str(ean_to_check)
    master_df_entry.loc[ean_to_check,'yearly_offtake'] = total_yearly_consumption
    master_df_entry.loc[ean_to_check,'yearly_injection'] = total_yearly_injection
    master_df_entry.loc[ean_to_check,'yearly_net_offtake'] = yearly_net_consumption
    master_df_entry.loc[ean_to_check,'PV'] = master_table_df.loc[ean_to_check,'DCP']
    master_df_entry.loc[ean_to_check,'PV_power'] = master_table_df.loc[ean_to_check,'INSTALLED_POWER_DCP']
    master_df_entry.loc[ean_to_check,'connection_power'] = master_table_df.loc[ean_to_check,'PHY_CON_CAP']
    master_df_entry.loc[ean_to_check,'residential'] = 0
    master_df_entry.loc[ean_to_check,'ex_night'] = 0
    master_df_entry.loc[ean_to_check,'HP'] = 0
    master_df_entry.loc[ean_to_check,'startDate'] = new_reading_df.index.min()
    master_df_entry.loc[ean_to_check,'endDate'] = new_reading_df.index.max()
    master_df_entry.loc[ean_to_check,'data_source'] = 'EandisAMR'
    master_df_entry.loc[ean_to_check,'ean'] = str(ean_to_check)

    # The per-EAN CSV is only written when it does not exist yet.
    # NOTE(review): CSVs written by an earlier run are NOT refreshed; delete the
    # 'per_id' folder to regenerate them (e.g. to pick up the injection unit fix).
    per_id_file = PREPROCESSED_PER_ID / (str(ean_to_check)+".csv")
    if not per_id_file.exists():
        new_reading_df.to_csv(per_id_file)

print(f'there are {len(unread_data_files)} unread files')

#### yearly total 
NB : if we just make the total (in kWh), we sum up more than a year of data

In [None]:
# just give easier names
# NOTE: from here on `master_df` and `master_table_df` are the same object (the table
# read from the Excel file) and `new_master_df` is the table built in the parsing loop.
new_master_df = master_df_entry
master_df = master_table_df

### Make one big dataframe of all the data

In [None]:
# Read only the per-EAN CSV files (iterdir() would also pick up stray files such as
# .DS_Store); sorted() makes the read order deterministic.
dfs = [pd.read_csv(file) for file in sorted(PREPROCESSED_PER_ID.glob('*.csv'))]
raw_data_df = pd.concat(dfs, axis = 0).rename(columns = {'Unnamed: 0': 'timestamp', 'ID':'meterID'}).reset_index(drop = True)
raw_data_df['timestamp'] = pd.to_datetime(raw_data_df.timestamp)
raw_data_df.set_index(['meterID', 'timestamp'], inplace = True)
# sort_index() returns a new frame: assign it back, otherwise raw_data_df stays
# unsorted and the time-range slices used further down are unreliable.
raw_data_df = raw_data_df.sort_index()
raw_data_df

In [None]:
raw_data_df.reset_index().dtypes

In [None]:
def color_extra_hour(ind):
    """Return the CSS used to highlight a timestamp of the 'extra' October hour.

    Timestamps strictly between 2017-10-30 01:45 and 2017-10-30 03:00 get a
    yellow background; every other timestamp gets an empty background colour.

    NOTE(review): in 2017 the clocks went back on Sunday 2017-10-29, not on
    2017-10-30 - confirm the date used here and in the cells that slice
    '2017-10-30'.
    """
    window_start = pd.to_datetime('2017-10-30 01:45:00')
    window_end = pd.to_datetime('2017-10-30 03:00:00')
    highlight = 'yellow' if window_start < ind < window_end else ''
    return 'background-color: %s' % highlight

def color_missing_hour(ind):
    """Return the CSS used to highlight a timestamp of the 'missing' March hour.

    Timestamps strictly between 2017-03-26 01:45 and 2017-03-26 03:00 get a
    green background; every other timestamp gets an empty background colour.
    """
    window_start = pd.to_datetime('2017-03-26 01:45:00')
    window_end = pd.to_datetime('2017-03-26 03:00:00')
    highlight = 'green' if window_start < ind < window_end else ''
    return 'background-color: %s' % highlight

### Check for duplicate/missing measurements due to summer-winter time
- In <b> March </b> clocks go forward (at 2 am, we say it is 3 am; in 2017 this happened on Sunday 26 March), causing <b> 4 missing values </b> corresponding to that one hour (which may have been interpolated somehow).
- In <b> October </b> clocks go back (at 3 am, we say it is 2 am; in 2017 this happened on Sunday 29 October), so there should be <b> 4 duplicate measurements </b> (which may have already been averaged, or the duplicates may have been omitted).

> NB. Only a few tests are done here; the more exhaustive work is at the end of the notebook.

In [None]:
pd.set_option("display.max_rows", None)

### LOOKING FOR DUPLICATES (OCTOBER)
Conclusion : There are no duplicates 

In [None]:
# Count how often each timestamp occurs per meter; anything counted more than
# once is a duplicated measurement.
timestamps_per_meter = raw_data_df.reset_index().groupby('meterID')['timestamp']
occurence_count = timestamps_per_meter.value_counts()
duplicate_values = occurence_count[occurence_count.gt(1)]
duplicate_values

In [None]:
# Inspect the first smart meter of the index: first and last rows in time order
sm_id = raw_data_df.index.get_level_values(0)[0]
first_meter_sorted = raw_data_df.loc[[sm_id],:].sort_index()
print(f'HEAD : {first_meter_sorted.head()}\n\n')
print(f'TAIL : {first_meter_sorted.tail()}')

# this profile happens to start in September 2017 and run until January 2018

In [None]:
# Visual check for the first smart meter: there do not seem to be duplicated
# timestamps in October.
# NOTE(review): in 2017 the clocks went back on Sunday 2017-10-29; this window (and
# color_extra_hour) looks at 2017-10-30 - confirm which day should be inspected.
first_meter_id = raw_data_df.index.get_level_values(0)[0]
first_meter_df = raw_data_df.loc[first_meter_id]
all_timestamps = first_meter_df['2017-10-30 1:00':'2017-10-30 4:00']
all_timestamps.reset_index().style.applymap(color_extra_hour, subset=['timestamp'])

In [None]:
#pick another smart meter to look at 

sm_id_2 = 541448810000108927
#print(f'HEAD : {raw_data_df.loc[[sm_id_2],:].sort_index().head()}\n\n')
#print(f'TAIL : {raw_data_df.loc[[sm_id_2],:].sort_index().tail()}')

In [None]:
# Same visual check for the second smart meter.
# NOTE(review): in 2017 the clocks went back on Sunday 2017-10-29; this window (and
# color_extra_hour) looks at 2017-10-30 - confirm which day should be inspected.
second_meter_df = raw_data_df.loc[sm_id_2]
all_timestamps = second_meter_df['2017-10-30 1:00':'2017-10-30 4:00']
all_timestamps.reset_index().style.applymap(color_extra_hour, subset=['timestamp'])

### LOOKING FOR MISSING TIME STAMPS (MARCH)
So there does not seem to be any correction for winter/summer time in the data

In [None]:
#First smart meter : nothing can be observed in march (no data at all)
all_timestamps = raw_data_df.loc[sm_id]['2017-03-26 1:00':'2017-03-26 4:00']
all_timestamps

In [None]:
# Second smart meter: it has data in March, and no timestamps are missing
# around the hour that is skipped when the clocks go forward.
second_meter_df = raw_data_df.loc[sm_id_2]
all_timestamps = second_meter_df['2017-03-26 01:00':'2017-03-26 04:00']
all_timestamps.reset_index().style.applymap(color_missing_hour, subset=['timestamp'])

#### Let's check if the potentially missing hour due to DST has been interpolated 
We want to find out if some techniques have been applied to the data in order to stay coherent with the other datasets. It could be that these profiles are in UTC. This is what we try to check 

In [None]:
# drop injections
temp_off_val = all_timestamps.drop(columns=['injection'])

# copy the offtake and blank out the hour that would originally have been missing
temp_off_val['offtake_with_nan'] = temp_off_val.offtake
# .loc is the label-based indexer for slices; .at is meant for a single scalar cell
# and only handled a slice through an internal fallback.
temp_off_val.loc[pd.to_datetime('2017-03-26 02:00:00'):pd.to_datetime('2017-03-26 02:45:00'),'offtake_with_nan']=np.nan
temp_off_val.reset_index().style.applymap(color_missing_hour, subset=['timestamp'])

In [None]:
plt.plot(temp_off_val.offtake)
plt.plot(temp_off_val.offtake_with_nan)

In [None]:
# Fill the blanked-out hour again with an order-3 spline interpolation
interp_method = 'spline'
temp_off_val_interp = temp_off_val['offtake_with_nan'].interpolate(method=interp_method, order=3)
temp_off_val = temp_off_val.assign(filled_nan=temp_off_val_interp)
temp_off_val.reset_index().style.applymap(color_missing_hour, subset=['timestamp'])

In [None]:
plt.plot(temp_off_val.offtake)
plt.plot(temp_off_val.filled_nan)

In [None]:
# Compare the re-interpolated values with the original ones: the hypothesis holds
# only if every difference is exactly zero.
# NOTE(review): exact float equality is a very strict criterion - values that were
# interpolated and then rounded in the source data would not pass it.
diff_original_interp = (temp_off_val['offtake'] - temp_off_val['filled_nan']).to_frame('diff')
equal_interp = diff_original_interp['diff'].eq(0).all()

if equal_interp:
    print(f'The interpolated potential missing hour gives the same result as the given data : the hypothesis is true, the data has been interpolated using the method "{interp_method}"')
else:
    print(f'The interpolated potential missing hour gives a different result than the given data : the hypothesis is denied OR the interpolation method is not "{interp_method}"')


### Check if the hypothesis can be confirmed for all of the smartmeters

In [None]:
def check_interp_oct(sm_id, interp_method):
    """Test whether the March 'missing hour' of one meter looks interpolated.

    Despite its name, the function inspects the hour 02:00-02:45 of 2017-03-26
    (the spring clock change), not October. The offtake of that hour is blanked
    out, re-created with ``Series.interpolate(method=interp_method)`` from the
    surrounding values, and compared with the original offtake.

    Parameters
    ----------
    sm_id : label of the meter in the first index level of the global ``raw_data_df``
    interp_method : interpolation method name passed to ``Series.interpolate``

    Returns
    -------
    sample : DataFrame with columns 'offtake', 'offtake_with_nan', 'interp', 'diff'
    equal_interp : True when every difference is exactly zero, False otherwise
    """
    #potential 'missing hour' timestamps due to DST
    start_missing_hour = pd.to_datetime('2017-03-26 02:00:00')
    end_missing_hour = pd.to_datetime('2017-03-26 02:45:00')

    #time samples before (4 quarter-hours) and after (5 quarter-hours) the 'missing hour'
    start_sample = start_missing_hour - 4*pd.DateOffset(minutes=15)
    end_sample = end_missing_hour + 5*pd.DateOffset(minutes=15)

    #select the sample of time in the dataframe
    sample = pd.DataFrame(raw_data_df.loc[sm_id][start_sample:end_sample].offtake)

    #duplicate offtake column and replace 'missing hour' with nans
    sample['offtake_with_nan'] = sample.offtake
    # .loc is the label-based indexer for slices; .at is meant for a single scalar cell
    sample.loc[start_missing_hour:end_missing_hour,'offtake_with_nan']=np.nan

    #interpolate
    sample['interp'] = sample.offtake_with_nan.interpolate(method=interp_method).values

    #check if values are the same by computing the difference
    # NOTE(review): exact float equality; rounded source values would never match.
    sample['diff'] = sample['offtake']-sample['interp']
    equal_interp = (sample['diff'] == 0).all()
        #If equal_interp = True : all differences are zero --> hypothesis confirmed. This 'missing hour' has been interpolated
        #If equal_interp = False --> hypothesis denied. This 'missing hour' has not been interpolated with the chosen method

    return sample, equal_interp

In [None]:
equal_interp = []
method_interp_x = 'cubic'

# run the check for every meter that has a reading at the start of the candidate hour
for sm_id in raw_data_df.index.levels[0]:
    idx = raw_data_df.loc[sm_id].index
    if pd.to_datetime('2017-03-26 02:00:00') in idx:
        sample_x, equal_interp_x = check_interp_oct(sm_id,method_interp_x)
        equal_interp.append(equal_interp_x)

check_all = all(equal_interp)

# FIX: the messages now report the method that was actually tested (method_interp_x);
# they used to print `interp_method`, a leftover from an earlier cell.
if not equal_interp:
    # all([]) is True, so an empty list must not be reported as a confirmation
    print('No smart meter has data for 2017-03-26 02:00:00 : the hypothesis could not be tested')
elif check_all:
    print(f'✓ The interpolated potential missing hour gives the same result as the given data for all the smart meters :\n --> The hypothesis is true, the data has been interpolated using the method "{method_interp_x}"')
else:
    print(f'✘ The interpolated potential missing hour gives a different result than the given data for one of the smartmeters :\n --> The hypothesis is denied OR the interpolation method is not "{method_interp_x}"')


In [None]:
sample_x

In [None]:
pd.set_option("display.max_rows", 15)

### Make into a nice pivot_table

*Note: there are no NaN values here (see the check below), so the calculation of consumption is correct!*

In [None]:
raw_data_df[raw_data_df.offtake.isna() | raw_data_df.injection.isna()]

In [None]:
# Net consumption (offtake - injection) in long format, then pivoted to one row
# per meter and one column per timestamp.
consumption_long = (
    raw_data_df.reset_index()
    .assign(consumption=lambda df: df.offtake - df.injection)
    .drop(columns = ['offtake', 'injection'])
)
data_df = pd.pivot_table(consumption_long, index = 'meterID', columns = 'timestamp', values = 'consumption')
data_df.head()

In [None]:
data_df.loc[[541448860013438368],:]

### take subset of 2017

In [None]:
# keep only the timestamp columns that fall in 2017
is_2017_column = pd.to_datetime(data_df.columns).year == 2017
data_2017_df = data_df.loc[:, is_2017_column].copy()
data_2017_df

#### Remove profiles with missing values in 2017 and profiles with injection

In [None]:
# meters with at least one missing value in 2017 ...
has_missing_2017 = data_2017_df.isna().any(axis = 1)
eans_to_drop = list(data_2017_df.index[has_missing_2017])
# ... plus every meter with at least one strictly positive injection reading
injecting_rows = raw_data_df.injection > 0
eans_with_injection = list(raw_data_df.index.get_level_values(0)[injecting_rows].unique())
eans_to_drop.extend(eans_with_injection)
print(f"dropping eans: {eans_to_drop}")

In [None]:

# Drop the flagged meters and append the year (2017) as a second index level.
# NOTE: this cell rebinds data_2017_df, so it is not meant to be run twice in a row.
keep_meter = ~data_2017_df.index.isin(eans_to_drop)
data_2017_df = (
    data_2017_df.loc[keep_meter]
    .assign(year=2017)
    .set_index('year', append=True)
)
data_2017_df.head()

### Clean-up the master table

In [None]:
# Keep only the metadata columns of interest, drop the flagged meters, rename the
# index to 'meterID', cast the flag columns to nullable booleans and append the
# year (2017) as a second index level.
master_subset = new_master_df.drop(columns = ['ID', 'ean','startDate', 'endDate', 'HP', 'yearly_offtake', 'yearly_injection', 'yearly_net_offtake'])
keep_meter = ~master_subset.index.get_level_values(0).isin(eans_to_drop)
new_master_2_df = (
    master_subset[keep_meter]
    .rename_axis(index = {'EAN':'meterID'})
    .astype({'PV':'boolean', 'residential': 'boolean', 'ex_night':'boolean'})
    .assign(year=2017)
    .set_index('year', append=True)
)
new_master_2_df.head()

#### At this time `residential` and `ex_night` are false for all profiles, so drop these columns too

In [None]:
new_master_2_df.drop(columns = ['residential', 'ex_night'], inplace = True)

In [None]:
new_master_2_df['consumer_type'] = 'professional'

In [None]:
new_master_2_df.head()

## Save it all to disk

In [None]:
# new_master_2_df.to_csv(PREPROCESSED_PATH/ 'clean_info.csv') #only relevant meta data for AMR profiles
# new_master_df.to_csv(PREPROCESSED_PATH/'full_info.csv') #all relevant meta data for all profiles (when everything is merged)
# data_df.to_csv(PREPROCESSED_PATH/'data.csv') #all consumption values (offtake-inj) for the entire time inluding january 2018
# data_2017_df.to_csv(PREPROCESSED_PATH/'clean_data.csv') #cleaned consumption values (offtake-inj) for 2017
# raw_data_df.to_csv(PREPROCESSED_PATH/'raw_data.csv') #all 'raw data' from .txt file (with additinal month of january 2018) including injection and offtae values

# CHECK FOR DAYLIGHT SAVING TIME ARTEFACT 

## Data to be checked

In [None]:
data_2017_df

In [None]:
#work on a copy 
pivot_table_or = data_2017_df.copy()

In [None]:
list(pivot_table_or.index.levels[1])

All 'year' index values are 2017, so we drop that level to avoid having to handle it.

In [None]:
pivot_table = pivot_table_or.droplevel(level=1)
pivot_table

In [None]:
pivot_table.columns = pd.to_datetime(pivot_table.columns)
pivot_table.columns

In [None]:
sm_ids = list(pivot_table.index)
sm_ids[0]

In [None]:
#Define potentially originally missing hour
start_missing_hour = pd.to_datetime('2017-03-26 02:00:00')
end_missing_hour = pd.to_datetime('2017-03-26 02:45:00')

index_start_missingh,index_end_missingh =  pivot_table.columns.get_indexer([start_missing_hour, end_missing_hour])

# get_indexer returns -1 for a label that is not present; fail loudly instead of
# letting -1 be used as a (wrong) column position in the cells below
if index_start_missingh < 0 or index_end_missingh < 0:
    raise KeyError(f'{start_missing_hour} and/or {end_missing_hour} not found in the pivot table columns')

print(f'Start missing hour : {start_missing_hour}, ind : {index_start_missingh}\nEnd missing hour   : {end_missing_hour}, ind :{index_end_missingh}')

In [None]:
#Define time samples before and after the 'missing hour' (arbitrarily chosen: one hour before and one hour after)
#subset of the full df
start_sample = start_missing_hour - 4*pd.DateOffset(minutes=15)
end_sample = end_missing_hour + 4*pd.DateOffset(minutes=15)

index_start_sample,index_end_sample =  pivot_table.columns.get_indexer([start_sample, end_sample])

# get_indexer returns -1 for a label that is not present; fail loudly instead of
# letting -1 be used as a (wrong) column position in the cells below
if index_start_sample < 0 or index_end_sample < 0:
    raise KeyError(f'{start_sample} and/or {end_sample} not found in the pivot table columns')

In [None]:
original_sample = pivot_table.iloc[:,index_start_sample:index_end_sample+1].copy()
original_sample

In [None]:
#replace missing hour with nan (to be interpolated) in the pivot table - working on a copy
pivot_copy = pivot_table.copy()
# .loc is the label-based indexer for slices; .at is meant for a single scalar cell
pivot_copy.loc[:,start_missing_hour:end_missing_hour]=np.nan
nan_pivot_table = pivot_copy.copy()
nan_pivot_table

In [None]:
#nan sample (to compare with original sample )
nan_sample = nan_pivot_table.iloc[:,index_start_sample:index_end_sample+1].copy()
nan_sample

In [None]:
#HYPOTHESIS : missing hour = (h-1 + h+1)/2

# nan_sample holds 12 quarter-hours per meter:
# positions 0-3 = hour before, 4-7 = candidate hour (NaN), 8-11 = hour after.
# Filled in one vectorised step instead of appending one row per meter
# (DataFrame.append is quadratic in a loop and was removed in pandas 2.0).
interp_sample = nan_sample.copy()
hour_before = nan_sample.iloc[:, 0:4].values
hour_after = nan_sample.iloc[:, 8:12].values
interp_sample.iloc[:, 4:8] = (hour_before + hour_after) / 2

bool_output = interp_sample == original_sample
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_sample)
print(f'remaining errors : {nb_of_false} out of {original_nb_of_false}\n')
bool_output

## it is clear that the results are not the same

In [None]:
#HYPOTHESIS : interpolation

chosen_meth = 'nearest'
#chose method from ‘linear’, ‘time’, ‘index’, ‘values’, 'pad’, ‘nearest’, 
# ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’, ‘polynomial’
# ‘krogh’, ‘piecewise_polynomial’, ‘pchip’, ‘akima’, ‘cubicspline’

# Interpolate every meter's series on its own (add `order` depending on the chosen
# method). apply(axis=1) replaces the row-by-row DataFrame.append, which is quadratic
# in a loop and was removed in pandas 2.0.
interp_sample = nan_sample.apply(lambda sm_serie: sm_serie.interpolate(method = chosen_meth, axis=0), axis=1)

bool_output = interp_sample == original_sample
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_sample)
print(f'Interp method "{chosen_meth}" yields : {nb_of_false} wrongly guessed values out of {original_nb_of_false}\n')
bool_output

# results recorded from earlier runs (wrongly guessed values out of 356):
#nearest 286/356
#linear 323/356
#pad 295/356
#slinear 326/356
#cubic 340/356
#quadratic 340/356
#spline 340/356

In [None]:
#HYPOTHESIS : missing hour = (h-1)

# nan_sample holds 12 quarter-hours per meter:
# positions 0-3 = hour before, 4-7 = candidate hour (NaN), 8-11 = hour after.
# Filled in one vectorised step instead of appending one row per meter
# (DataFrame.append is quadratic in a loop and was removed in pandas 2.0).
interp_sample = nan_sample.copy()
interp_sample.iloc[:, 4:8] = nan_sample.iloc[:, 0:4].values

bool_output = interp_sample == original_sample
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_sample)
print(f'remaining errors : {nb_of_false}  wrongly guessed values out of {original_nb_of_false}\n')
bool_output

## it is clear that the results are not the same

In [None]:
#HYPOTHESIS : missing hour = (h+1)

# nan_sample holds 12 quarter-hours per meter:
# positions 0-3 = hour before, 4-7 = candidate hour (NaN), 8-11 = hour after.
# Filled in one vectorised step instead of appending one row per meter
# (DataFrame.append is quadratic in a loop and was removed in pandas 2.0).
interp_sample = nan_sample.copy()
interp_sample.iloc[:, 4:8] = nan_sample.iloc[:, 8:12].values

bool_output = interp_sample == original_sample
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_sample)
print(f'remaining errors : {nb_of_false}  wrongly guessed values out of  {original_nb_of_false}\n')
bool_output

## it is clear that the results are not the same

In [None]:
#HYPOTHESIS : missing hour = same hour previous day

# column positions of the candidate hour in the full pivot table
missing_positions = np.arange(index_start_missingh, index_end_missingh + 1)
# Copy the values found 4*24 quarter-hours (one day) earlier, for all meters at once
# (replaces the row-by-row DataFrame.append, removed in pandas 2.0).
interp_table = nan_pivot_table.copy()
interp_table.iloc[:, missing_positions] = nan_pivot_table.iloc[:, missing_positions - (4*24)].values

bool_output = interp_table == pivot_table
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_pivot_table)
print(f'remaining errors : {nb_of_false}  wrongly guessed values out of  {original_nb_of_false}\n')
# +1 so that the last column of the sample window is shown as well
bool_output.iloc[:,index_start_sample:index_end_sample+1]

## it is clear that the results are not the same

In [None]:
#Same hour previous day 
interp_table.iloc[:,index_start_missingh-(4*24):index_end_missingh-(4*24)+1]

In [None]:
#interpolated hour 
interp_table.iloc[:,index_start_missingh:index_end_missingh+1]

In [None]:
#original hour 
pivot_table.iloc[:,index_start_missingh:index_end_missingh+1]

In [None]:
#HYPOTHESIS : missing hour = same hour next day

# column positions of the candidate hour in the full pivot table
missing_positions = np.arange(index_start_missingh, index_end_missingh + 1)
# Copy the values found 4*24 quarter-hours (one day) later, for all meters at once
# (replaces the row-by-row DataFrame.append, removed in pandas 2.0).
interp_table = nan_pivot_table.copy()
interp_table.iloc[:, missing_positions] = nan_pivot_table.iloc[:, missing_positions + (4*24)].values

bool_output = interp_table == pivot_table
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_pivot_table)
print(f'remaining errors : {nb_of_false}  wrongly guessed values out of  {original_nb_of_false}\n')
# +1 so that the last column of the sample window is shown as well
bool_output.iloc[:,index_start_sample:index_end_sample+1]

## it is clear that the results are not the same

In [None]:
#HYPOTHESIS : missing hour = same hour, same day previous week 

# column positions of the candidate hour in the full pivot table
missing_positions = np.arange(index_start_missingh, index_end_missingh + 1)
# Copy the values found 4*24*7 quarter-hours (one week) earlier, for all meters at once
# (replaces the row-by-row DataFrame.append, removed in pandas 2.0).
interp_table = nan_pivot_table.copy()
interp_table.iloc[:, missing_positions] = nan_pivot_table.iloc[:, missing_positions - (4*24*7)].values

bool_output = interp_table == pivot_table
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_pivot_table)
print(f'remaining errors : {nb_of_false}  wrongly guessed values out of  {original_nb_of_false}\n')
# +1 so that the last column of the sample window is shown as well
bool_output.iloc[:,index_start_sample:index_end_sample+1]


In [None]:
#HYPOTHESIS : missing hour = same hour, same day next week 

# column positions of the candidate hour in the full pivot table
missing_positions = np.arange(index_start_missingh, index_end_missingh + 1)
# Copy the values found 4*24*7 quarter-hours (one week) later, for all meters at once
# (replaces the row-by-row DataFrame.append, removed in pandas 2.0).
interp_table = nan_pivot_table.copy()
interp_table.iloc[:, missing_positions] = nan_pivot_table.iloc[:, missing_positions + (4*24*7)].values

bool_output = interp_table == pivot_table
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_pivot_table)
print(f'remaining errors : {nb_of_false}  wrongly guessed values out of  {original_nb_of_false}\n')
# +1 so that the last column of the sample window is shown as well
bool_output.iloc[:,index_start_sample:index_end_sample+1]

In [None]:
#HYPOTHESIS : missing hour = average of the same hour, same day previous and next week 

# column positions of the candidate hour in the full pivot table
missing_positions = np.arange(index_start_missingh, index_end_missingh + 1)
# Average the values found one week (4*24*7 quarter-hours) earlier and later, for all
# meters at once (replaces the row-by-row DataFrame.append, removed in pandas 2.0).
previous_week = nan_pivot_table.iloc[:, missing_positions - (4*24*7)].values
next_week = nan_pivot_table.iloc[:, missing_positions + (4*24*7)].values
interp_table = nan_pivot_table.copy()
interp_table.iloc[:, missing_positions] = (previous_week + next_week)/2

bool_output = interp_table == pivot_table
# count the mismatches directly: np.unique(...)[1][0] is the number of True values
# when there is no False at all
nb_of_false = int((~bool_output).values.sum())
# 4 candidate values per meter (previously hard-coded as 4*89)
original_nb_of_false = 4*len(nan_pivot_table)
print(f'remaining errors : {nb_of_false}  wrongly guessed values out of  {original_nb_of_false}\n')
# +1 so that the last column of the sample window is shown as well
bool_output.iloc[:,index_start_sample:index_end_sample+1]

In [None]:
#### HYPOTHESIS : The same sequence of four values is taken from somewhere else in the data.

In [None]:
def rolling_window(a, size):
    """Return a strided view of all length-`size` windows along the last axis of `a`.

    The result has shape ``a.shape[:-1] + (a.shape[-1] - size + 1, size)``:
    consecutive windows are shifted by one element. No data is copied - the
    result shares memory with `a`, so it should be treated as read-only.
    """
    n_windows = a.shape[-1] - size + 1
    window_shape = a.shape[:-1] + (n_windows, size)
    # stepping to the next window and stepping inside a window both move by one element
    window_strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=window_shape, strides=window_strides)

In [None]:
sm_1_series = pivot_table.loc[541448810000108927]
sm_1_series

In [None]:
# Take the candidate hour from the same meter as sm_1_series (selected by ID in the
# previous cell) instead of from row 0 of the pivot table, which is only the same
# meter if that ID happens to be the first row.
missing_hour_sm_id_1 = sm_1_series.iloc[index_start_missingh:index_end_missingh+1]
missing_hour_sm_id_1

In [None]:
# Compare every window of 4 consecutive values of the meter's series with the 4 values
# of the candidate hour; the comparison broadcasts, giving one boolean row per window
# and one boolean column per position inside the window.
# NOTE(review): the values are compared with exact equality.
output_roll_w = rolling_window(sm_1_series.values, 4) == missing_hour_sm_id_1.values
df_output_roll_w = pd.DataFrame(output_roll_w)
# df_output_roll_w['timestamps']=sm_1_series.index
# df_output_roll_w = df_output_roll_w.set_index('timestamps')
df_output_roll_w

In [None]:
#which row has 4 Trues
df_output_roll_w.all(axis=1)

In [None]:
#count how many rows has 4 Trues 
df_output_roll_w.all(axis=1).value_counts()

We can see that this sequence of four values also occurs somewhere else in the table (it appears twice).

In [None]:
# Print the start timestamp of every window whose 4 values all match the candidate hour
df_trues = pd.DataFrame(df_output_roll_w.all(axis=1))
# window i starts at the i-th timestamp of the series
df_trues['timetamps'] = sm_1_series.index[0:len(df_trues)]
df_trues = df_trues.set_index('timetamps')
for window_start, window_row in df_trues.iterrows():
    if window_row.values[0] == True:
        print(window_start)

Try and check it out for all the smart meters :

In [None]:
##TAKES A WHILE TO RUN >> check attached .txt file for output 

# #save all sm ids in a list 
# smartmeters_ids = list(pivot_table.index)

# for sm_id in smartmeters_ids:
    
#     sm_series = pivot_table.loc[sm_id]

#     missing_hour_sm_id = pivot_table.loc[sm_id,start_missing_hour:end_missing_hour]

#     output_roll_w = rolling_window(sm_series.values, 4) == missing_hour_sm_id.values
#     df_output_roll_w = pd.DataFrame(output_roll_w)

#     #which row has 4 Trues ?
#     all_true = df_output_roll_w.all(axis=1)

#     #count how many rows has 4 Trues 
#     all_true_count = df_output_roll_w.all(axis=1).value_counts()


#     #find index of the 4 Trues
#     df_trues = pd.DataFrame(df_output_roll_w.all(axis=1))
#     df_trues['timetamps'] = sm_1_series.index[0:len(df_trues)]
#     df_trues = df_trues.set_index('timetamps')
#     print(sm_id)
#     for index in df_trues.index:
#         if df_trues.loc[index].values[0] == True:
#             print(index)
#     print('\n')