In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to the package source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load Modules

In [2]:
import sys
# Put the parent directory (relative to the kernel's working directory) on the
# module search path; presumably this is where the local `datefeatures`
# package lives -- confirm against the repository layout.
sys.path.append('..')

from datefeatures import DateComponents

import numpy as np
import pandas as pd
from randdate import randdate
from datetime import datetime

from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.feature_selection import ColumnSelector

# Example 1

In [3]:
# two columns of random dates, each produced by its own randdate call
X = np.column_stack((np.array(randdate(10)), np.array(randdate(10))))

# transform the date variables to features; only the month group is enabled
cmp = DateComponents(
    year=False,
    month=True,
    day=False,
    hour=False,
    minute=False,
    second=False,
)
cmp.fit(X)
Z = cmp.transform(X)

Z.head()

Unnamed: 0,0_na,0_eoq,0_quarter,0_eom,0_month,1_na,1_eoq,1_quarter,1_eom,1_month
0,False,0,2,0,5,False,0,3,0,9
1,False,0,3,0,9,False,0,2,0,5
2,False,0,1,0,3,False,0,3,0,9
3,False,0,3,0,9,False,0,1,0,3
4,False,0,4,0,12,False,0,1,0,1


# Example 2

In [4]:
n_samples = 100000
# three date columns, each drawn by a separate randdate call
X = np.column_stack([np.array(randdate(n_samples)) for _ in range(3)])

In [5]:
cmp = DateComponents(year=True, month=False, day=False, hour=False, minute=False, second=False)
%time Z = cmp.fit_transform(X)

CPU times: user 222 ms, sys: 11.4 ms, total: 233 ms
Wall time: 236 ms


In [6]:
cmp = DateComponents(year=False, month=False, day=False, hour=True, minute=True, second=False)
%time Z = cmp.fit_transform(X)

CPU times: user 161 ms, sys: 4.64 ms, total: 166 ms
Wall time: 165 ms


In [7]:
cmp = DateComponents(year=True, month=True, day=True, hour=True, minute=True, second=True, microsecond=True)
%time Z = cmp.fit_transform(X)

CPU times: user 968 ms, sys: 70.8 ms, total: 1.04 s
Wall time: 1.04 s


# Example 3

In [8]:
n_samples = 5

# a single column of random dates
X = np.array(randdate(n_samples)).reshape(-1, 1)

# overwrite the second entry with NaN to emulate a missing value
X[1, 0] = np.nan

## Example 3a -- without correction

In [9]:
# Switch the missing-value handling off on purpose to show the uncorrected result.
cmp = DateComponents(missing=False)

# What will happen? X contains one NaN entry (row 1) at this point.
Z = cmp.fit_transform(X)

In [10]:
# In the recorded output the numeric components come back as float64 while the
# eoq/eom flags are bool, and there is no `0_na` indicator column.
Z.dtypes

0_eoq           bool
0_quarter    float64
0_eom           bool
0_month      float64
0_dy         float64
0_dw         float64
0_week       float64
0_day        float64
dtype: object

In [11]:
# Row 1 is the entry that was replaced by NaN: its numeric components are
# empty in the recorded output.
Z.head()

Unnamed: 0,0_eoq,0_quarter,0_eom,0_month,0_dy,0_dw,0_week,0_day
0,False,1.0,False,3.0,89.0,0.0,14.0,29.0
1,False,,False,,,,,
2,False,3.0,False,8.0,241.0,1.0,35.0,29.0
3,False,2.0,False,4.0,93.0,4.0,14.0,3.0
4,False,4.0,False,12.0,362.0,2.0,52.0,28.0


## Example 3b -- with missing value correction

In [12]:
# Same data as before, now with missing-value handling switched on and the
# year group requested in addition.
cmp = DateComponents(missing=True, year=True)
Z = cmp.fit_transform(X)

In [13]:
# In the recorded output a boolean `0_na` column is present and every component
# uses a small integer dtype (int8/int16) instead of float64.
Z.dtypes

0_na          bool
0_leap        int8
0_year       int16
0_eoq         int8
0_quarter     int8
0_eom         int8
0_month       int8
0_dy         int16
0_dw          int8
0_week        int8
0_day         int8
dtype: object

In [14]:
# The missing row (row 1) is flagged with 0_na=True and all of its components
# are -1 in the recorded output.
Z.head()

Unnamed: 0,0_na,0_leap,0_year,0_eoq,0_quarter,0_eom,0_month,0_dy,0_dw,0_week,0_day
0,False,1,1976,0,1,0,3,89,0,14,29
1,True,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,False,0,1978,0,3,0,8,241,1,35,29
3,False,0,1981,0,2,0,4,93,4,14,3
4,False,0,1994,0,4,0,12,362,2,52,28


# Example 4

In [15]:
# one row, one column: a single timestamp with a non-trivial time of day
X = np.array([[datetime(2016, 1, 1, 23, 59, 58, 12345)]])

# enable only the time-of-day groups (hour, minute, second, microsecond)
cmp = DateComponents(
    year=False,
    month=False,
    day=False,
    hour=True,
    minute=True,
    second=True,
    microsecond=True,
)
Z = cmp.fit_transform(X)
Z

Unnamed: 0,0_na,0_hour,0_min,0_sec,0_ms
0,False,23,59,58,12345


# Example 5

In [16]:
# a single column of random dates
n_samples = 5
X = np.array(randdate(n_samples)).reshape(-1, 1)

# DateComponents with its default settings as the only pipeline step
pipe = Pipeline(steps=[('pre', DateComponents())])

Z = pipe.fit_transform(X)
Z

Unnamed: 0,0_na,0_eoq,0_quarter,0_eom,0_month,0_dy,0_dw,0_week,0_day
0,False,0,1,0,3,86,4,13,27
1,False,0,2,0,6,165,4,24,14
2,False,0,1,0,2,40,0,7,9
3,False,0,2,0,4,101,5,15,11
4,False,0,4,0,12,357,0,52,23


# Example 6

In [17]:
# a data frame with one random date column and one unrelated numeric column
n_samples = 5
X = (
    pd.DataFrame(data=randdate(n_samples), columns=['this_date'])
    .assign(some_numbers=np.random.randn(n_samples))
)
X

Unnamed: 0,this_date,some_numbers
0,1985-04-16 14:06:32.745629,-0.387743
1,1992-11-19 02:56:48.846815,0.723293
2,2002-11-21 09:00:36.383417,0.330115
3,2016-01-07 14:42:31.335709,-0.466094
4,2002-02-26 00:57:35.396127,-0.716016


In [18]:
# make pipeline
# The column names are passed to ColumnSelector as one-element lists.
# Writing cols=('name') would hand over a plain string, because parentheses
# without a trailing comma do not create a tuple.
pipe = Pipeline(steps=[
    # process column by column
    ('col_by_col', FeatureUnion(transformer_list=[
        # date column: select it, then expand it into date features
        ('dates', Pipeline(steps=[
            ('sel1', ColumnSelector(cols=['this_date'])),
            ('pre1', DateComponents())
        ])),
        # numeric column: selected only, no further transformation
        ('numbers', ColumnSelector(cols=['some_numbers']))
    ]))
    # do some other stuff ..
])

# FeatureUnion concatenates the outputs of both branches column-wise
Z = pipe.fit_transform(X)
Z

array([[False, 0, 2, 0, 4, 106, 1, 16, 16, -0.3877432622991901],
       [False, 0, 4, 0, 11, 324, 3, 47, 19, 0.7232931454473194],
       [False, 0, 4, 0, 11, 325, 3, 47, 21, 0.33011450757540833],
       [False, 0, 1, 0, 1, 7, 3, 1, 7, -0.46609446684241734],
       [False, 0, 1, 0, 2, 57, 1, 9, 26, -0.7160159716877422]],
      dtype=object)