# Feature Engineering

In [21]:
import pandas as pd
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf
import matplotlib.pyplot as plt

## Regular times

In [22]:
# Load the pickled input frame for this notebook.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project's own notebooks.
df = pd.read_pickle('../data/dropped_columns_and_weather.pkl')

In [None]:
# Show the loaded frame as the cell output, before any features are added.
df

Below are manipulations of the DateTime column provided with the data. These additional features will hopefully help a model find time-related patterns and so predict future demand at a given time.

In [24]:
# Calendar and clock components read straight off the DateTime column.
# Keys are the new column names, values the matching `.dt` attribute.
timestamp_parts = df['DateTime'].dt
for column, attribute in {
    'year': 'year',
    'quarter': 'quarter',
    'month': 'month',
    'day_of_year': 'day_of_year',
    'day_of_month': 'day',
    'day_of_week': 'dayofweek',
    'hour': 'hour',
    'minute': 'minute',
}.items():
    df[column] = getattr(timestamp_parts, attribute)

# Clock time as an 'HH:MM' string, and minutes elapsed since midnight.
df['time'] = timestamp_parts.strftime('%H:%M')
df['minute_of_day'] = df['hour'] * 60 + df['minute']

# Round-trip through a minute-resolution string so that anything finer than
# minutes is dropped from DateTime, then parse back to datetimes.
df['DateTime'] = pd.to_datetime(timestamp_parts.strftime('%Y-%m-%d %H:%M'))



Features are added holding the sum of the demand over several preceding lengths of time.
Originally I had used lags, but because the interval between readings changed in the last few months, a fixed lag represented different amounts of time in different parts of the data.

The window is closed on the 'left' so that the reading at the current time is not included in the sum over the preceding period; this avoids leaking the value being predicted into the training sets.

In [25]:

# Time-based rolling windows need DateTime as the index; it is left in place
# here and is expected to be reset by a later cell.
df.set_index('DateTime', inplace=True)

# Total demand over each look-back window. closed='left' keeps the current
# reading out of its own window.
for window in ('3min', '5min', '10min', '15min', '20min',
               '30min', '45min', '1h', '24h'):
    df[f'sum_{window}'] = df['demand'].rolling(window, closed='left').sum()

Here are more rolling statistics (means, standard deviations, minimums and maximums) which might help the model.

In [26]:

# Mean and standard deviation of demand over each look-back window.
# closed='left' keeps the current reading out of its own window.
demand = df['demand']
for window in ('30min', '1h', '24h'):
    preceding = demand.rolling(window, closed='left')
    df[f'rolling_mean_{window}'] = preceding.mean()
    df[f'rolling_std_{window}'] = preceding.std()

# Extremes are only tracked over the previous 24 hours.
preceding_day = demand.rolling('24h', closed='left')
df['rolling_min_24h'] = preceding_day.min()
df['rolling_max_24h'] = preceding_day.max()

# Turn the index back into an ordinary column.
df.reset_index(inplace=True)

In [None]:
# Preview the first rows to check the newly added feature columns.
df.head()

In [28]:
# Save the frame as it stands at this point in the notebook.
df.to_pickle('../data/fe_temp_data.pkl')

## Cyclic times

The previous times added to the data don’t capture the cyclical nature of these periods (i.e., after December (12) is January (1)). So, we use sine and cosine transformations to encode time features like month, day of the year, hour etc.

In [29]:

# Encode each periodic time feature as a (sin, cos) pair so that values at the
# two ends of a cycle sit next to each other.
#
# Each entry is (source column, cycle length); the outputs are named
# '<source>_sin' / '<source>_cos'. Previously the 'minute_of_day_*' columns
# were computed from `minute` / 60 and the 'minute_*' columns from
# `minute_of_day` / 1440, i.e. the two pairs were mislabelled; every pair is
# now derived from the column it is named after.
#
# NOTE(review): 31 and 365 are fixed approximations of the cycle length, so
# shorter months and day 366 of a leap year do not line up exactly with the
# cycle boundary.
cyclic_periods = (
    ('quarter', 4),
    ('day_of_month', 31),
    ('hour', 24),
    ('minute_of_day', 1440),
    ('minute', 60),
    ('month', 12),
    ('day_of_year', 365),
    ('day_of_week', 7),
)

for feature, period in cyclic_periods:
    angle = 2 * np.pi * df[feature] / period
    df[f'{feature}_sin'] = np.sin(angle)
    df[f'{feature}_cos'] = np.cos(angle)

# The raw integer versions are dropped once they have been encoded.
df.drop(columns=['hour', 'minute', 'day_of_week', 'month','minute_of_day','quarter','day_of_year','day_of_month'], inplace=True)



In [30]:
# Save the frame as it stands at this point in the notebook.
df.to_pickle('../data/fe_temp_cyclic_data.pkl')

Plotting the autocorrelation function of the demand shows that demand is correlated with the readings immediately before it, and that there is also a small rise between lags 80 and 100 (lag 96 would normally correspond to one day, i.e. 96 × 15-minute intervals).

In [None]:
# Autocorrelation of demand for lags up to 150; the final row is left out.
demand_history = df['demand'].iloc[:-1]
plot_acf(demand_history, lags=150)
plt.show()