# Preprocess the Eandis 2017 data
in the folder "**_DATA Eandis 20170712 VREG study complete_**"

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from pathlib import Path
import datetime
import tqdm
import pyxlsb #Excel 2007-2010 Binary Workbook (xlsb) parser for Python.

from zipfile import ZipFile
import zlib #this module allow data compression and decompression,

In [None]:
# Root folder of the profile data inside the Fluvius data delivery.
DATA_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/Data-2020-11/FluviusData/profiles')
# Alternative root for Lola's local checkout:
# DATA_PATH = Path('/Users/lolabotman/PycharmProjects/FluviusFullData/profiles')

# Folder holding the raw Eandis 2017 files.
PATH_EANDIS2017 = DATA_PATH.joinpath('DATA Eandis 20170712 VREG study complete/')

# Folder in which the preprocessed files are written.
PREPROCESSED_PATH = Path('/cw/dtaiproj/ml/2020-FLAIR-VITO/profile-clustering/new_preprocessed/eandis2017')
# Alternative output folder for Lola's local checkout:
# PREPROCESSED_PATH = Path('/Users/lolabotman/PycharmProjects/FluviusFullData/profiles/preprocessed/eandis2017')

# Create the output folder (and any missing parents) up front; an existing folder is fine.
PREPROCESSED_PATH.mkdir(parents=True, exist_ok=True, mode=0o770)

## Parse data

### Check if the contents of the zipped files and the other files are the same

In [None]:
# Name of the ZIP archive (inside PATH_EANDIS2017) whose contents are compared with the files on disk.
file_zip = 'VITO_VREG_EXPORT.zip'

def crc32(fileName):
    """Return the CRC-32 checksum of the file at ``fileName``.

    The file is opened in binary mode and read in 64 KiB chunks, so it never
    has to fit in memory as a whole. The result is the integer returned by
    ``zlib.crc32``; an empty file yields 0.
    """
    checksum = 0
    with open(fileName, 'rb') as stream:
        # iter() with a sentinel keeps reading until read() returns b'' (end of file).
        for chunk in iter(lambda: stream.read(65536), b''):
            checksum = zlib.crc32(chunk, checksum)
    return checksum

# Compare every file stored in the archive with the file of the same name on disk.
with ZipFile(PATH_EANDIS2017 / file_zip, 'r') as zp:
    print('Contents of the .ZIP file:\n')
    zp.printdir()
    print('')

    for fl in zp.infolist():
        # Directory entries hold no data, so there is nothing to checksum.
        if fl.is_dir():
            continue
        extracted_file = PATH_EANDIS2017 / fl.filename
        if not extracted_file.is_file():
            # Report a missing counterpart instead of aborting the whole check.
            print(f'✘ The file {fl.filename} is missing next to the .ZIP file!')
        elif crc32(extracted_file) == fl.CRC:
            print(f'✓ The file {fl.filename} is consistent.')
        else:
            print(f'✘ The file {fl.filename} is different in the .ZIP file!')

### Import data

In [None]:
# File names of the meter metadata and of the 2016 readings inside the Eandis folder.
file_info = 'MASTER_TABLE_METERS.csv'
file_data = 'READINGS_2016.csv'

# Both files are semicolon-separated and use a decimal comma.
csv_format = {'sep': ';', 'decimal': ','}

# The metadata is read with its first row as header, the readings without a header row.
info_df = pd.read_csv(PATH_EANDIS2017 / file_info, **csv_format)
data_df = pd.read_csv(PATH_EANDIS2017 / file_data, header=None, **csv_format)

In [None]:
# Preview the first rows of the meter metadata.
info_df.head()

In [None]:
# Preview the first rows of the raw readings (columns are still unnamed at this point).
data_df.head()

### Format of the dates

In [None]:
# Look at one raw value of column 3 (named 'timestamp' further down) to determine the date format.
data_df.iloc[1250,3]

So the dates are in day/month/year format

### Process data

In [None]:
# Turn the Dutch yes/no flags in the meter metadata into booleans.
info_df = info_df.replace({'Ja': True, 'Nee': False})

# Translate the Dutch labels in the readings to English ...
dutch_to_english = {'Elektriciteit': 'electricity', 'Injectie': 'injection', 'Afname': 'offtake'}
data_df = data_df.replace(dutch_to_english)
# ... and give the columns descriptive names.
data_df.columns = ['type', 'meterID', 'measurement type', 'timestamp', 'unit', 'measurement', 'status']

# Parse the timestamps with an explicit day/month/year format; parsing is too slow when
# the format is not specified.
# (pyxlsb.convert_date was tried on this column and does not work.)
data_df['timestamp'] = pd.to_datetime(data_df['timestamp'], format='%d/%m/%Y %H:%M:%S')

### Remove redundant parts

In [None]:
# 'type', 'status' and 'unit' each hold one constant value, so they carry no information.
data_df = data_df.drop(columns=['type', 'status', 'unit'])

In [None]:
# Preview the readings after translation, renaming and timestamp parsing.
data_df.head()

In [None]:
# Display the readings frame itself (pandas abbreviates the output of a large frame).
data_df

## Summer/Winter time

### 30 October: an hour passes twice

In [None]:
# Keep only the offtake readings.
temp_df = data_df.loc[data_df['measurement type'].eq('offtake')]

# Per meter, count how often each timestamp occurs; a count above 1 means a duplicated reading.
occurence_count = temp_df.groupby('meterID')['timestamp'].value_counts()
duplicate_values = occurence_count.loc[occurence_count.gt(1)]

# Reduce to the distinct (timestamp, count) pairs: one row per duplicated timestamp,
# no matter how many meters share it.
(
    duplicate_values
    .rename('count')
    .reset_index()
    .drop_duplicates(subset=['timestamp', 'count'])
)

In [None]:
# Visual check of the duplicated offtake readings of one example meter
# (input for the duplicate handling in the next step).
example_meter_rows = data_df.loc[data_df.meterID == 'Sl2clpW0mYpJ3w']
example_offtake = example_meter_rows.set_index(['measurement type', 'timestamp']).loc['offtake']
visualize_dup = example_offtake.loc['2016-10-30 2:00':'2016-10-30 3:00']
visualize_dup.sort_index(ascending=True)

We can see that, for all smart meters, the timestamps 2:00, 2:15, 2:30 and 2:45 occur twice on 30 October 2016. This is due to the switch from summer to winter time: at 3:00 the clock is set back to 2:00.

### 27 March an hour is skipped

In [None]:
# Offtake readings of a single smart meter (the first one in the data), indexed by timestamp.
first_meter_id = data_df.meterID.iloc[0]
is_first_meter = data_df.meterID == first_meter_id
is_offtake = data_df['measurement type'] == 'offtake'
temp_df = data_df[is_first_meter & is_offtake].set_index('timestamp')

# Show the readings between 1:00 and 4:00 on 27 March.
temp_df.loc['2016-3-27 1:00': '2016-3-27 4:00']

We can see that there are no rows for the timestamps 2:00, 2:15, 2:30 and 2:45. This is due to the switch from winter to summer time: at 2:00 the clock is set forward to 3:00.

### Handle duplicate measurements and calculate consumption

In [None]:
# Pivot the long readings table into one row per (meterID, timestamp) with one column per
# measurement type. Duplicate readings for the same meter and timestamp are averaged;
# timestamps without any reading are still missing afterwards.
# An earlier version summed the duplicates (aggfunc=np.sum).
# NOTE(review): confirm that averaging rather than summing is the intended treatment.
# The string 'mean' selects the pandas mean aggregation; passing np.mean does the same today
# but triggers a FutureWarning in recent pandas versions.
data_processed_df = pd.pivot_table(
    data_df,
    index=['meterID', 'timestamp'],
    columns='measurement type',
    values='measurement',
    aggfunc='mean',
)



In [None]:
# Inspect the pivoted values of one example meter around the skipped hour on 27 March.
data_processed_df.loc['Sl2clpW0mYpJ3w'].loc['2016-3-27 1:00':'2016-3-27 4:00']

In [None]:
# Inspect the pivoted values of the same example meter around the repeated hour on 30 October.
data_processed_df.loc['Sl2clpW0mYpJ3w'].loc['2016-10-30 2:00':'2016-10-30 3:00']

In [None]:
# Net consumption = offtake - injection. A missing side counts as 0, because NaN - x = NaN.
data_processed_df['consumption'] = (
    data_processed_df['offtake'].fillna(0) - data_processed_df['injection'].fillna(0)
)

# When both offtake and injection are missing there is no measurement at all, so consumption
# must be NaN. Assign through a single .loc call on the frame: chained indexing
# (df['consumption'].loc[mask] = ...) may write to a temporary copy instead of the frame.
no_measurement = data_processed_df['offtake'].isna() & data_processed_df['injection'].isna()
data_processed_df.loc[no_measurement, 'consumption'] = np.nan

# Add NaN rows for missing timestamps: every meter gets every quarter hour of 2016
# (the [:-1] drops the end point 2017-01-01 00:00).
full_year = pd.date_range('2016-01-01', '2017-01-01', freq='15min')[:-1]
full_index = pd.MultiIndex.from_product(
    [data_processed_df.index.levels[0], full_year], names=['meterID', 'timestamp']
)
data_processed_df = data_processed_df.reindex(full_index).sort_index()
data_processed_df.head()

In [None]:
# Visual inspection of the hours around the skipped hour on 27 March after adding NaN rows for missing timestamps.
data_processed_df.loc['Sl2clpW0mYpJ3w'].loc['2016-3-27 1:00':'2016-3-27 4:00']

In [None]:
# Visual inspection of the hours around the repeated hour on 30 October after averaging the duplicates.
data_processed_df.loc['Sl2clpW0mYpJ3w'].loc['2016-10-30 2:00':'2016-10-30 3:00']

### Make nice info table (consistent with the rest)

In [None]:
# Recall the metadata columns before dropping and renaming them.
info_df.head()

In [None]:
# New names for the metadata columns.
column_names = {
    'EQUIPMENT_ID': 'meterID',
    'CONTRACTED_POWER': 'connection_power',
    'TIME_OF_USE': 'tarif_type',
    'LOCAL_PROD_POWER': 'PV_power',
    'LOCAL_PROD': 'PV',
    'CATEGORY': 'category',
    'METERS_INSTALLED': 'installed_meters',
    'LOCATION_ID': 'locationID',
}

# Tariff codes mapped onto readable labels; codes with and without the 'TE' suffix share a label.
tarif_labels = {
    'THNUTHNUTE': 'single tarif',
    'THNUTHNU': 'single tarif',
    'HILOHILO': 'two tarifs',
    'HILOHILOTE': 'two tarifs',
    'EXNUEXNU': 'ex_night',
    'EXNUEXNUTE': 'ex_night',
}

# Drop the yearly totals and rename the remaining columns.
clean_info_df = (
    info_df
    .drop(columns=['YEARLY_INJECTION', 'YEARLY_CONSUMPTION', 'YEARLY_BALANCE'])
    .rename(columns=column_names)
)
clean_info_df['tarif_type'] = clean_info_df['tarif_type'].replace(tarif_labels)
clean_info_df.head()

#### There are exclusive night meters at the same location as another meter

In [None]:
# Meters with an exclusive night tariff and the locations they are installed at.
is_ex_night = clean_info_df.tarif_type == 'ex_night'
ex_night_meters = clean_info_df.loc[is_ex_night]
ex_night_meter_locations = ex_night_meters['locationID']

# All meters (of any tariff) installed at one of those locations, grouped per location.
at_ex_night_location = clean_info_df['locationID'].isin(ex_night_meter_locations)
ex_night_location_meters = (
    clean_info_df
    .loc[at_ex_night_location]
    .set_index(['locationID', 'meterID'])
    .sort_index()
)
ex_night_location_meters

#### Ignore locations with exclusive night meters for now

In [None]:
# Remove every meter at a location that has an exclusive night meter, drop the helper columns,
# tag the data source and year, and index the table by (meterID, year).
# Everything is done in one method chain that builds a new frame; mutating the boolean-filtered
# slice in place (inplace=True / column assignment) can raise SettingWithCopyWarning.
clean_info_df = (
    clean_info_df
    .loc[~clean_info_df['locationID'].isin(ex_night_meter_locations)]
    .drop(columns=['installed_meters', 'locationID'])
    .assign(data_source='EandisVREG', year=2016)
    .set_index(['meterID', 'year'])
)
clean_info_df.head()

### Make nice pivot table

In [None]:
# Long table holding only the consumption values, with meterID and timestamp as regular columns.
consumption_long = data_processed_df.drop(columns=['injection', 'offtake']).reset_index()

# Wide table: one row per meter, one column per timestamp.
clean_data_df = pd.pivot_table(
    consumption_long,
    index='meterID',
    columns='timestamp',
    values='consumption',
)

# Here as well, exclude the meters at locations with exclusive night meters:
# keep only meters that are present in the clean info table.
kept_meter_ids = clean_info_df.index.levels[0]
clean_data_df = clean_data_df.loc[clean_data_df.index.isin(kept_meter_ids)]

# Make the index consistent with the other data frames: (meterID, year).
clean_data_df['year'] = 2016
clean_data_df = clean_data_df.set_index('year', append=True)

# Make sure the column labels are datetimes.
clean_data_df.columns = pd.to_datetime(clean_data_df.columns)
clean_data_df.head()

### The timestamps of the missing hour are skipped in the clean data_df!

In [None]:
# Select the columns between 1:00 and 4:00 on 27 March to see which timestamps are present.
clean_data_df.loc[:, '2016-03-27 1:00': '2016-03-27 4:00']

### Save the data

In [None]:
# Write every intermediate and final table to the preprocessed folder as CSV.
outputs = {
    'full_info.csv': info_df,
    'raw_data.csv': data_df,
    'processed_data.csv': data_processed_df,
    'clean_info_no_night.csv': clean_info_df,
    'clean_data_no_night.csv': clean_data_df,
}
for file_name, frame in outputs.items():
    frame.to_csv(PREPROCESSED_PATH / file_name)