# The `eensight` functionality for generating features

All feature generators return pandas DataFrames.

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [3]:
from eensight.features.generate import TrendFeatures, DatetimeFeatures, CyclicalFeatures

## Create time trend features 

### `eensight.features.generate.TrendFeatures` 

Generates time trend features.
    
    Parameters
    ----------
    feature : str, default=None
        The name of the input dataframe's column that contains datetime information.
        If None, it is assumed that the datetime information is provided by the
        input dataframe's index.
    include_bias : bool, default=False
        If True, a column of ones is added to the output.
    remainder : {'drop', 'passthrough'}, default='passthrough'
        By specifying ``remainder='passthrough'``, all the remaining columns of the
        input dataset will be automatically passed through (concatenated with the
        output of the transformer).
    replace : bool, default=False
        Specifies whether replacing an existing column with the same name is allowed
        (when `remainder=passthrough`).

We can create a dummy dataset:

In [None]:
# Seed the random generator so the injected noise -- and therefore every figure
# and metric below -- is identical after Restart Kernel -> Run All.
rng = np.random.default_rng(42)

# Hourly timestamps from 1 Jan 2018 to 31 Dec 2019. ISO 8601 date strings avoid
# the day-first / month-first ambiguity of a string such as '31/12/2019'.
data = pd.DataFrame(index=pd.date_range(start='2018-01-01', end='2019-12-31', freq='h'))
data['day'] = data.index.dayofyear

# Two deterministic seasonal components driven by the day of the year:
# x1 completes one cycle per 365 days, x2 completes two cycles (phase-shifted).
data['x1'] = 4 + 3*np.sin(data['day']/365*2*np.pi)
data['x2'] = 4 * np.sin(data['day']/365*4*np.pi+365/2)

# Target = sum of the seasonal components plus Gaussian noise (std = 0.9).
noise = rng.normal(loc=0, scale=0.9, size=len(data))
data['y'] = data['x1'] + data['x2'] + noise
data.head()

In [None]:
# Trend encoder: add the column of ones (bias) and keep the original columns
# of `data` next to the generated trend features.
enc = TrendFeatures(
    include_bias=True,
    remainder='passthrough',
)

# Fit and transform in a single step, then preview the first rows.
features = enc.fit_transform(data)
features.head()

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer ship the old names, so pick whichever one is installed
# (falling back to the default style if neither is available).
whitegrid = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

with plt.style.context(whitegrid):
    # Two stacked panels: one per generated trend feature.
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 3.54), dpi=96)

    ax1.plot(features['offset'], label='offset')
    ax2.plot(features['growth'], label='growth')
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper left')

## Add date and time features

### `eensight.features.generate.DatetimeFeatures`

Generates date and time features

    Parameters
    ----------
    feature : str, default=None
        The name of the input dataframe's column that contains datetime information.
        If None, it is assumed that the datetime information is provided by the
        input dataframe's index.
    remainder : {'drop', 'passthrough'}, default='passthrough'
        By specifying ``remainder='passthrough'``, all the remaining columns of the
        input dataset will be automatically passed through (concatenated with the
        output of the transformer).
    replace : bool, default=False
        Specifies whether replacing an existing column with the same name is allowed
        (when `remainder=passthrough`).
    subset : str or list of str (default=None)
        The names of the features to generate. If None, all features will be produced:
        'month', 'week', 'dayofyear', 'dayofweek', 'hour', 'hourofweek'.
        The last 2 features are generated only if the timestep of the input's
        `feature` (or index if `feature` is None) is smaller than `pd.Timedelta(days=1)`.

In [None]:
# With no `subset` given, every supported date/time feature is generated;
# remainder='drop' discards the original columns of `data`.
enc = DatetimeFeatures(
    remainder='drop',
)
features = enc.fit_transform(data)
features

In [None]:
# Restrict the output to two of the available date/time features.
enc = DatetimeFeatures(
    remainder='drop',
    subset=['month', 'hourofweek'],
)
features = enc.fit_transform(data)
features

## Encode cyclical (seasonal) features

### `eensight.features.generate.CyclicalFeatures`

Creates cyclical (seasonal) features as fourier terms

    Parameters
    ----------
    seasonality : str
        The name of the seasonality.
    feature : str, default=None
        The name of the input dataframe's column that contains datetime information.
        If None, it is assumed that the datetime information is provided by the
        input dataframe's index.
    period : float, default=None
        Number of days in one period.
    fourier_order : int, default=None
        Number of Fourier components to use.
    remainder : {'drop', 'passthrough'}, default='passthrough'
        By specifying ``remainder='passthrough'``, all the remaining columns of the
        input dataset will be automatically passed through (concatenated with the
        output of the transformer).
    replace : bool, default=False
        Specifies whether replacing an existing column with the same name is allowed
        (when `remainder=passthrough`).

**Note**: The encoder can provide default values for `period` and `fourier_order` if `seasonality` is one of `daily`, `weekly` or `yearly`.

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer ship the old names, so pick whichever one is installed
# (falling back to the default style if neither is available).
whitegrid = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

with plt.style.context(whitegrid):
    # Single panel showing the noisy target series.
    fig, ax = plt.subplots(figsize=(14, 3.54), dpi=96)

    data['y'].plot(ax=ax, alpha=0.5)

In [None]:
# Encode the yearly seasonality with three Fourier components; only the
# generated features are kept (remainder='drop').
enc = CyclicalFeatures(
    seasonality='yearly',
    fourier_order=3,
    remainder='drop',
)
features = enc.fit_transform(data)
features.head()

Now let's plot the new features:

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer ship the old names, so pick whichever one is installed
# (falling back to the default style if neither is available).
whitegrid = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

with plt.style.context(whitegrid):
    # One panel per generated feature. Sizing the grid from the frame itself keeps
    # the plot correct even if the number of columns is not 2 * fourier_order, and
    # squeeze=False guarantees an array of axes even for a single panel.
    n_panels = len(features.columns)
    fig, axs = plt.subplots(n_panels, figsize=(14, 7), dpi=96, squeeze=False)

    for ax, col in zip(axs.ravel(), features.columns):
        features[col].plot(ax=ax)

    # Adjust the spacing while the style context is still active.
    fig.tight_layout()

Let's also see how well this transformation works:

In [27]:
# Fit a linear model (with intercept) on the Fourier features and compute
# its in-sample predictions of the target.
regr = LinearRegression(fit_intercept=True)
regr.fit(features, data['y'])
pred = regr.predict(features)

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and newer
# releases no longer ship the old names, so pick whichever one is installed
# (falling back to the default style if neither is available).
whitegrid = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

with plt.style.context(whitegrid):
    fig, ax = plt.subplots(figsize=(14, 3.54), dpi=96)

    # Noisy target (semi-transparent) with the model predictions drawn on top.
    data['y'].plot(ax=ax, alpha=0.5)
    pd.Series(pred, index=data.index).plot(ax=ax)

The root mean squared error is very close to the standard deviation of the noise that was injected in the data (0.9):

In [None]:
# The `squared=False` argument was deprecated in scikit-learn 1.4 and removed in
# 1.6; taking the square root of the MSE yields the same RMSE on every version.
np.sqrt(mean_squared_error(data['y'], pred))