# Data Preprocessing and Time Series Analysis

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 100
import matplotlib.pyplot as plt
import missingno as msno

In [None]:
# Load the base dataset (a CSV expected in the working directory).
data = pd.read_csv(filepath_or_buffer='household_data_1min_singleindex.csv')

In [None]:
# Keep every column whose name contains "industrial3". The explicit copy makes
# `industrial` an independent frame, so adding a column below cannot raise
# pandas' SettingWithCopyWarning.
industrial = data.filter(like='industrial3').copy()
# Attach the timestamp. `filter` returns a DataFrame, so squeeze the single
# matching column into a Series before assigning it. If several columns
# matched, the value would stay a DataFrame and the assignment would still fail
# loudly, exactly as before.
industrial['time'] = data.filter(like='cet_cest_timestamp').squeeze(axis=1)

In [None]:
# Three columns are of no interest for this study (decided after reading the
# dataset documentation), so they are removed here.
unused_columns = [
    'DE_KN_industrial3_compressor',
    'DE_KN_industrial3_ev',
    'DE_KN_industrial3_ventilation',
]
industrial = industrial.drop(columns=unused_columns)

### Visualization of Missing Values

In [None]:
# The readings are cumulative, so a value of exactly 0 is treated as a missing
# reading and converted to NaN before plotting.
industrial = industrial.replace(0, np.nan)
# Draw the missingness matrix, then title the current axes.
msno.matrix(industrial)
plt.title(
    'Representation of Missing Values for the "Industrial 3" Building',
    size=50,
)

### Study of Grid Import

The goal of this section was to see if grid import was greater than the sum of all the other columns, since it represented the imported energy from the grid.

In [None]:
# Frame for the grid-import study: every "industrial3" column, with zeros
# turned into NaN and every row containing at least one NaN discarded.
grid_import = (
    data
    .filter(like='industrial3')
    .replace(0, np.nan)
    .dropna()
)

In [None]:
# Columns counted as energy production/import; every remaining column of
# `grid_import` is counted as consumption.
supply_columns = [
    'DE_KN_industrial3_grid_import',
    'DE_KN_industrial3_pv_facade',
    'DE_KN_industrial3_pv_roof',
]
# Row-wise totals used to compare consumption with energy imports:
#   'all'    -> sum of the consumption columns
#   'energy' -> sum of the production/import columns
components = pd.DataFrame({
    'all': grid_import.drop(columns=supply_columns).sum(axis=1),
    'energy': grid_import[supply_columns].sum(axis=1),
})

In [None]:
# The values are cumulative and do not start at the same time, so the totals
# are compared through their row-to-row change. Preview the first five rows.
components.diff().head(5)

In [None]:
# Percentage of rows where the row-to-row change in 'energy' exceeds the
# row-to-row change in 'all' (i.e. energy import greater than consumption).
# The diff is computed once and counted with the vectorized Series.sum(),
# instead of differencing twice and iterating with Python's builtin sum().
step_changes = components.diff(axis=0)
energy_exceeds_all = (step_changes['energy'] - step_changes['all']) > 0
# The first diff row is NaN and compares as False; it remains in the
# denominator, so the reported figure is unchanged.
share = energy_exceeds_all.sum() / len(components)
print(np.round(share * 100), "%")

Conclusion: grid import is almost equal to the sum of all other energy consumptions, but still falls below it for some time steps. In 76% of the data, grid import is greater than the total energy consumption.

### Descriptive Statistics

In this section we aggregate the data by hour, day, week and month. Since the values are cumulative, we look at the mean of the differences between consecutive entries.

In [None]:
# Load the cleaned dataset (its creation can be found in preprocessing.ipynb).
industrial_clean = pd.read_csv(filepath_or_buffer='industrial_components.csv')

In [None]:
# The values are cumulative, so work with the change between consecutive rows.
# 'time' is left out of the differencing and re-attached afterwards.
differences = (
    industrial_clean
    .drop(columns='time')
    .diff()
    .assign(time=industrial_clean['time'])
)
# Same data with a freshly reset index; the helper 'index' column that
# reset_index() inserts is dropped again.
index = differences.reset_index().drop(columns='index')

In [None]:
# Manually skip the first 37 (= 60 - 23) rows so that the series starts at a
# precise hour (12:00:00 here).
# NOTE(review): this presumes the first row is at minute 23 of an hour; verify
# against the 'time' column.
#
# The index is renumbered from 0 after trimming. The hourly/daily/weekly/monthly
# cells bin rows with `index_start.index // N`; with the old labels (starting at
# 37) those bins would not line up with the trimmed start and the first bin
# would be only partially filled.
index_start = index.iloc[60 - 23:].reset_index(drop=True)

In [None]:
# Preview the first five rows of the trimmed frame.
index_start.head(n=5)

### Hourly Study

In [None]:
# Aggregation spec for groupby(...).agg: take the mean of each listed column.
dic_mean = dict.fromkeys(['rooms', 'machine', 'cooling', 'appliances'], 'mean')

In [None]:
# Hourly dataset: rows whose index label gives the same quotient when divided
# by 60 (60 minutes in an hour) form one bin, and each bin is averaged column
# by column. 'hour' is a 1-based counter of the bins.
hourly = (
    index_start
    .groupby(index_start.index // 60)
    .agg(dic_mean)
    .assign(hour=lambda binned: binned.index + 1)
)

In [None]:
# Plot the first 500 hourly bins, one panel per column on a 2x2 grid with a
# shared y axis.
# NOTE(review): the labels use "kW/h"; confirm this is the intended unit for
# these per-minute differences of cumulative readings.
mpl.rcParams.update({'figure.dpi': 125})
hourly.iloc[:500].plot(
    x='hour',
    xlabel="Hour",
    ylabel="kW/h",
    ylim=[-0.2, 1.5],
    title='Mean Minutely Consumption per Hour in kW/h\nin an Industrial Building',
    subplots=True,
    layout=(2, 2),
    sharey=True,
)
plt.tight_layout()

### Daily Study

In [None]:
# Daily dataset: rows whose index label gives the same quotient when divided
# by 1440 (1440 minutes in a day) form one bin, and each bin is averaged column
# by column. 'day' is a 1-based counter of the bins.
daily = (
    index_start
    .groupby(index_start.index // 1440)
    .agg(dic_mean)
    .assign(day=lambda binned: binned.index + 1)
)

In [None]:
# Plot the first 200 daily bins, one panel per column on a 2x2 grid with a
# shared y axis.
mpl.rcParams.update({'figure.dpi': 125})
daily.iloc[:200].plot(
    x='day',
    xlabel="Day",
    ylabel="kW/h",
    ylim=[-0.2, 1.5],
    title='Mean Minutely Consumption per Day in kW/h\nin an Industrial Building',
    subplots=True,
    layout=(2, 2),
    sharey=True,
)

plt.tight_layout()

### Weekly Study

In [None]:
# Weekly dataset: rows whose index label gives the same quotient when divided
# by 10080 (10080 minutes in a week) form one bin, and each bin is averaged
# column by column. 'week' is a 1-based counter of the bins.
weekly = (
    index_start
    .groupby(index_start.index // 10080)
    .agg(dic_mean)
    .assign(week=lambda binned: binned.index + 1)
)

In [None]:
# Plot every weekly bin, one panel per column on a 2x2 grid with a shared
# y axis.
mpl.rcParams.update({'figure.dpi': 125})
weekly.plot(
    x='week',
    xlabel="Week",
    ylabel="kW/h",
    ylim=[-0.2, 1.5],
    title='Mean Minutely Consumption per Week in kW/h\nin an Industrial Building',
    subplots=True,
    layout=(2, 2),
    sharey=True,
)
plt.tight_layout()

### Monthly Study

In [None]:
# Monthly dataset: rows whose index label gives the same quotient when divided
# by 43800 (43800 minutes in a month) form one bin, and each bin is averaged
# column by column. 'month' is a 1-based counter of the bins.
monthly = (
    index_start
    .groupby(index_start.index // 43800)
    .agg(dic_mean)
    .assign(month=lambda binned: binned.index + 1)
)

In [None]:
# Plot the first 250 monthly bins, one panel per column on a 2x2 grid with a
# shared y axis.
mpl.rcParams.update({'figure.dpi': 125})
monthly.iloc[:250].plot(
    x='month',
    xlabel="Month",
    ylabel="kW/h",
    ylim=[-0.2, 1.5],
    title='Mean Minutely Consumption per Month in kW/h\nin an Industrial Building',
    subplots=True,
    layout=(2, 2),
    sharey=True,
)
plt.tight_layout()