In [1]:
import pandas as pd

In [2]:
# Five weekly timestamps: one every 7 days between the two bounds.
s = pd.date_range(start='2020-01-06', end='2020-02-07', freq='7D').to_series()

# Calendar attributes read off the .dt accessor, keyed by attribute name.
features = {
    attr: getattr(s.dt, attr).values
    for attr in ('dayofweek', 'dayofyear', 'hour', 'is_leap_year', 'quarter')
}
# The week number is taken from the ISO calendar table rather than
# from a direct attribute.
features['weekofyear'] = s.dt.isocalendar().week.values

print(features)

{'dayofweek': array([0, 0, 0, 0, 0], dtype=int32), 'dayofyear': array([ 6, 13, 20, 27, 34], dtype=int32), 'hour': array([0, 0, 0, 0, 0], dtype=int32), 'is_leap_year': array([ True,  True,  True,  True,  True]), 'quarter': array([1, 1, 1, 1, 1], dtype=int32), 'weekofyear': <IntegerArray>
[2, 3, 4, 5, 6]
Length: 5, dtype: UInt32}


In [3]:
_dict = {
    'date': [pd.to_datetime('2016-09-01'), pd.to_datetime('2017-04-01'), pd.to_datetime('2017-08-01'), pd.to_datetime('2017-12-01'), pd.to_datetime('2017-09-01')],
    'customer_id': [146361, 180838, 157857, 159772, 80014],
    'cat1': [2, 4, 3, 5, 3],
    'cat2': [2, 1, 3, 1, 2],
    'cat3': [0, 0, 1, 1, 1],
    'num1': [-0.518679, 0.415853, -2.061687, -0.276558, -1.456827]
}

In [4]:
# Turn the column-oriented dictionary into a DataFrame.
df = pd.DataFrame.from_dict(_dict, orient='columns')
# Report the dtype pandas inferred for every column.
print(df.dtypes)

# Display the five rows in a shuffled but reproducible (seeded) order.
df.sample(n=5, random_state=42)

date           datetime64[ns]
customer_id             int64
cat1                    int64
cat2                    int64
cat3                    int64
num1                  float64
dtype: object


Unnamed: 0,date,customer_id,cat1,cat2,cat3,num1
1,2017-04-01,180838,4,1,0,0.415853
4,2017-09-01,80014,3,2,1,-1.456827
2,2017-08-01,157857,3,3,1,-2.061687
0,2016-09-01,146361,2,2,0,-0.518679
3,2017-12-01,159772,5,1,1,-0.276558


In [None]:
def generate_features(data: pd.DataFrame) -> pd.DataFrame:
    """
    Add date-derived columns to ``data`` and aggregate them per customer.

    The columns ``year``, ``weekofyear``, ``month``, ``dayofweek`` and
    ``weekend`` are written onto ``data`` itself, so the caller's frame
    gains them as a side effect.

    param data: DataFrame with a datetime ``date`` column plus
        ``customer_id`` and ``num1`` columns
    return: DataFrame with one row per ``customer_id`` and two-level
        columns of the form (source column, aggregate name)
    """
    # create a bunch of features using the date column in the input.
    # NOTE: operate on the `data` argument, not on a notebook-level global.
    dates = data['date'].dt
    data.loc[:, 'year'] = dates.year
    data.loc[:, 'weekofyear'] = dates.isocalendar().week
    data.loc[:, 'month'] = dates.month
    data.loc[:, 'dayofweek'] = dates.dayofweek
    # weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    data.loc[:, 'weekend'] = (dates.weekday >= 5).astype(int)

    # create an aggregate dictionary: column -> list of aggregations
    aggs = {
        # number of unique month / week values and their mean
        'month': ['nunique', 'mean'],
        'weekofyear': ['nunique', 'mean'],
        # sum, max, min and mean of num1
        'num1': ['sum', 'max', 'min', 'mean'],
        # row count and number of unique ids per customer; both go in
        # one list because a second assignment to the same key would
        # replace the first.
        'customer_id': ['size', 'nunique'],
    }

    # we group by customer_id and calculate the aggregates
    agg_df = data.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()

    return agg_df

In [None]:
# Build the per-customer aggregate table from the sample frame.
agg_df = generate_features(df)

# A fixed seed keeps the displayed row order stable across re-runs.
agg_df.sample(5, random_state=42)

KeyboardInterrupt: 