## Datetime Variables

In [93]:
import pandas as pd

In [94]:
# Demo series of timestamps spaced 10 hours apart between the two dates;
# `to_series()` uses the timestamps as both the index and the values.
s = pd.date_range(
    start='2020-01-06',
    end='2020-01-14',
    freq='10h',
).to_series()

In [95]:
# Inspect the series: each timestamp appears as both the index label and the value.
s

2020-01-06 00:00:00   2020-01-06 00:00:00
2020-01-06 10:00:00   2020-01-06 10:00:00
2020-01-06 20:00:00   2020-01-06 20:00:00
2020-01-07 06:00:00   2020-01-07 06:00:00
2020-01-07 16:00:00   2020-01-07 16:00:00
2020-01-08 02:00:00   2020-01-08 02:00:00
2020-01-08 12:00:00   2020-01-08 12:00:00
2020-01-08 22:00:00   2020-01-08 22:00:00
2020-01-09 08:00:00   2020-01-09 08:00:00
2020-01-09 18:00:00   2020-01-09 18:00:00
2020-01-10 04:00:00   2020-01-10 04:00:00
2020-01-10 14:00:00   2020-01-10 14:00:00
2020-01-11 00:00:00   2020-01-11 00:00:00
2020-01-11 10:00:00   2020-01-11 10:00:00
2020-01-11 20:00:00   2020-01-11 20:00:00
2020-01-12 06:00:00   2020-01-12 06:00:00
2020-01-12 16:00:00   2020-01-12 16:00:00
2020-01-13 02:00:00   2020-01-13 02:00:00
2020-01-13 12:00:00   2020-01-13 12:00:00
2020-01-13 22:00:00   2020-01-13 22:00:00
Freq: 10h, dtype: datetime64[ns]

In [96]:
# Calendar features extracted from the timestamp series `s` through the `.dt`
# accessor. Each entry is a NumPy array with one element per timestamp in `s`.
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    # Fixed: this key used to hold `s.dt.quarter`, i.e. the quarter number rather
    # than a leap-year flag. Cast to int so it is a 0/1 flag like `is_weekend`.
    "is_leap_year": s.dt.is_leap_year.values.astype(int),
    # pandas numbers weekdays Monday=0 ... Sunday=6, so >= 5 means Saturday/Sunday.
    "is_weekend": (s.dt.dayofweek.values >= 5).astype(int),
}

## Aggregated Features

In [97]:
# Empty frame; the next cell adds one column per entry of `features`.
df = pd.DataFrame()

In [98]:
# Copy every feature array into the frame, one column per feature name.
for name, values in features.items():
    df[f'{name}'] = values

In [99]:
# Preview the first five rows of the assembled feature frame.
df.head()

Unnamed: 0,dayofweek,dayofyear,hour,is_leap_year,is_weekend
0,0,6,0,1,0
1,0,6,10,1,0
2,0,6,20,1,0
3,1,7,6,1,0
4,1,7,16,1,0


In [100]:
# Turn the row index into a regular column and label it as the customer identifier.
df = df.reset_index(drop=False).rename(columns={"index": "customer_id"})

In [102]:
# Aggregations to compute per group, written as {column: [function names]},
# the dict form accepted by `DataFrameGroupBy.agg`.
aggs = {
    'month': ['nunique', 'mean'],
    'weekofyear': ['nunique', 'mean'],
    # Fixed: 'max' was listed twice. pandas requires the function names in one
    # column's list to be unique, so the duplicate is replaced with 'mean'.
    'num1': ['sum', 'max', 'min', 'mean'],
    'customer_id': ['nunique', 'size'],
}

In [103]:
# NOTE(review): left commented out. `aggs` names 'month', 'weekofyear' and 'num1',
# none of which are columns added to `df` above, so this would presumably fail if
# enabled as-is. Add those columns (or adjust `aggs`) before uncommenting.
# agg_df = df.groupby('customer_id').agg(aggs)
# agg_df = agg_df.reset_index()

## Polynomial Features

In [106]:
import numpy as np
import pandas as pd

# Synthetic demo data: 100 rows of two uniform [0, 1) features named f_1 and f_2.
# A seeded generator replaces the unseeded global `np.random.rand`, so a fresh
# "Restart & Run All" reproduces the same values (and the same outputs below).
rng = np.random.default_rng(42)

df = pd.DataFrame(
    rng.random((100, 2)),
    columns=[f"f_{i}" for i in range(1, 3)]
)

In [107]:
from sklearn import preprocessing

In [108]:
# Degree-2 polynomial expansion:
#   interaction_only=False keeps the squared terms as well as the cross term;
#   include_bias=False leaves out the constant column.
pf = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [109]:
# Fit the transformer on `df` before `transform` is called in the next cell.
pf.fit(df)

In [112]:
# Expand `df` into its polynomial terms using the fitted transformer.
poly_feats = pf.transform(df)

In [113]:
# Wrap the transformed array in a DataFrame and give the columns generic
# names f_1 ... f_N, where N is the number of generated features.
num_feats = poly_feats.shape[-1]

df_transformed = pd.DataFrame(data=poly_feats)
df_transformed.columns = [f"f_{i}" for i in range(1, num_feats + 1)]

In [114]:
# Display the expanded frame: the two inputs plus their degree-2 terms.
df_transformed

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.770402,0.566085,0.593519,0.436113,0.320452
1,0.001911,0.830989,0.000004,0.001588,0.690543
2,0.357714,0.473234,0.127959,0.169283,0.223951
3,0.930533,0.574942,0.865891,0.535003,0.330559
4,0.760733,0.463771,0.578715,0.352806,0.215084
...,...,...,...,...,...
95,0.287770,0.222574,0.082812,0.064050,0.049539
96,0.433671,0.795334,0.188070,0.344913,0.632557
97,0.157491,0.234957,0.024804,0.037004,0.055205
98,0.086306,0.498967,0.007449,0.043064,0.248968


## Binning Features: converting numeric features to categorical features

In [116]:
# Bucket f_1 into equal-width bins at two resolutions. With labels=False,
# pd.cut returns the integer bin index instead of an interval label.
for column, n_bins in {"f_bin_10": 10, "f_bin_100": 100}.items():
    df[column] = pd.cut(df["f_1"], bins=n_bins, labels=False)
df

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.770402,0.566085,7,77
1,0.001911,0.830989,0,0
2,0.357714,0.473234,3,35
3,0.930533,0.574942,9,93
4,0.760733,0.463771,7,76
...,...,...,...,...
95,0.287770,0.222574,2,28
96,0.433671,0.795334,4,43
97,0.157491,0.234957,1,15
98,0.086306,0.498967,0,8


## Log/Exponential Transformation

## Handling Missing Values
1 - Fill with the mean/median/mode of the column

2 - Use a KNN-based method to impute

3 - Train a regression model that predicts the missing values in a column from the other columns

## Handling Outliers

## Normalization: needed for regression models; tree-based models don't need it