In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load modules

In [2]:
import sys
sys.path.append('..')

from datefeatures import MonthCircle

import numpy as np
import pandas as pd
from randdate import randdate
from datetime import datetime

from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.feature_selection import ColumnSelector

# Example 1

In [3]:
# Two columns of 10 random dates each, stacked side by side
X = np.column_stack([np.array(randdate(10)) for _ in range(2)])

# Fit the transformer on the dates, then turn them into month features
cmp = MonthCircle()
cmp.fit(X)
Z = cmp.transform(X)

# Preview the first rows of the transformed frame
Z.head()

Unnamed: 0,0_na,0_month_sin,0_month_cos,1_na,1_month_sin,1_month_cos
0,False,0.968271,-0.249903,False,0.666717,-0.745311
1,False,-0.146113,0.989268,False,0.735904,-0.677086
2,False,-0.757918,-0.652349,False,-0.279163,-0.960244
3,False,-0.264796,-0.964305,False,-0.310488,0.950577
4,False,-0.479523,0.877529,False,-0.984764,0.173894


In [4]:
# Show the output feature names recorded on the fitted transformer.
# NOTE(review): the names listed here use '<col>_month_na', while the frame
# displayed by the previous cell labels the same column '<col>_na' — confirm
# which naming is intended.
cmp.feature_names_

['0_month_na',
 '0_month_sin',
 '0_month_cos',
 '1_month_na',
 '1_month_sin',
 '1_month_cos']

# Example 2

In [5]:
# Two columns of 10 random dates each, stacked side by side
X = np.column_stack([np.array(randdate(10)) for _ in range(2)])

# Blank out one entry (row 1, column 0) to emulate a missing value
X[1, 0] = np.nan

# Fit the transformer on the dates, then turn them into month features
cmp = MonthCircle()
cmp.fit(X)
Z = cmp.transform(X)

# Preview the first rows; the missing entry is the row flagged in `0_na`
Z.head()

Unnamed: 0,0_na,0_month_sin,0_month_cos,1_na,1_month_sin,1_month_cos
0,False,-0.654729,0.755864,False,-0.333658,0.942694
1,True,,,False,0.9361,-0.351735
2,False,0.999865,0.016429,False,-0.987212,0.159411
3,False,-0.619826,-0.784739,False,-0.882327,-0.470637
4,False,0.772964,0.634449,False,0.087073,0.996202


# Example 3

In [6]:
# Larger input for timing: three columns of random dates
n_samples = 100000
X = np.column_stack([np.array(randdate(n_samples)) for _ in range(3)])

In [7]:
# Time a combined fit + transform of X with a fresh transformer;
# %time reports the CPU and wall-clock time of the single statement.
cmp = MonthCircle()
%time Z = cmp.fit_transform(X)

CPU times: user 4.29 s, sys: 122 ms, total: 4.41 s
Wall time: 4.68 s


# Example 4

In [8]:
# One column of random dates, with row 1 blanked out as a missing value
n_samples = 5
X = np.column_stack([np.array(randdate(n_samples))])
X[1, 0] = np.nan

# Single-step pipeline; `out` asks MonthCircle for the 'sin', 'cos' and 'frac' outputs
pipe = Pipeline(steps=[('pre', MonthCircle(out=['sin', 'cos', 'frac']))])

# Fit and transform in one call, then display the result
Z = pipe.fit_transform(X)
Z

Unnamed: 0,0_na,0_month_sin,0_month_cos,0_month_frac
0,False,0.997687,-0.067979,0.260828
1,True,,,
2,False,0.987755,0.156013,0.225068
3,False,-0.203928,0.978986,0.967315
4,False,0.991991,-0.126305,0.270156


# Example 5

In [9]:
# Small demo frame: a column of random dates plus a column of Gaussian noise
n_samples = 5
X = pd.DataFrame({'this_date': randdate(n_samples)})
X['some_numbers'] = np.random.randn(n_samples)

# Display the input frame
X

Unnamed: 0,this_date,some_numbers
0,1986-05-05 04:44:35.877908,0.839019
1,1974-01-03 09:38:12.990136,0.023113
2,1974-06-18 06:07:03.338543,-1.022649
3,1993-04-07 01:46:57.748680,0.223739
4,1994-07-05 09:21:52.147709,0.467056


In [10]:
# Build the pipeline: the date column is selected and encoded by MonthCircle,
# the numeric column is selected as-is, and FeatureUnion concatenates both
# results side by side.
pipe = Pipeline(steps=[
    # process column by column
    ('col_by_col', FeatureUnion(transformer_list=[
        ('dates', Pipeline(steps=[
            # Pass the column names as explicit one-element lists:
            # ('this_date') is only a parenthesised string, not a tuple.
            ('sel1', ColumnSelector(cols=['this_date'])),
            ('pre1', MonthCircle())
        ])),
        ('numbers', ColumnSelector(cols=['some_numbers']))
    ]))
    # further processing steps would go here
])

# Fit and transform in one call; the union returns a plain array, not a frame
Z = pipe.fit_transform(X)
Z

array([[False, 0.8691469551508987, 0.4945539104606511,
        0.8390191687963799],
       [False, 0.6361017338563584, 0.7716051996875959,
        0.02311254426791141],
       [False, -0.6301175679634105, -0.7764997427841666,
        -1.022649301384859],
       [False, 0.9960276397912641, 0.08904459990276777,
        0.2237393005168318],
       [False, 0.887780995877341, 0.46026612232385355,
        0.46705556763075823]], dtype=object)

In [11]:
# Recover column labels for Z: look the fitted MonthCircle step up by name
# ('col_by_col' -> 'dates' -> 'pre1') and append the passthrough column.
colnam = list(
    dict(pipe.named_steps['col_by_col'].transformer_list)['dates']
    .named_steps['pre1']
    .feature_names_
) + ['some_numbers']
colnam

['0_month_na', '0_month_sin', '0_month_cos', 'some_numbers']

In [12]:
# Z comes back from the pipeline as an object-dtype array, so a frame built
# from it would store every column as `object`. infer_objects() converts the
# columns back to their natural bool/float dtypes.
pd.DataFrame(Z, columns=colnam).infer_objects()

Unnamed: 0,0_month_na,0_month_sin,0_month_cos,some_numbers
0,False,0.869147,0.494554,0.839019
1,False,0.636102,0.771605,0.0231125
2,False,-0.630118,-0.7765,-1.02265
3,False,0.996028,0.0890446,0.223739
4,False,0.887781,0.460266,0.467056
