# Feature Generation and Timing

## Initialization

In [1]:
import numpy as np
import pandas as pd
from tailor import data

In [2]:
df = data.load_csv()

In [3]:
# because we need to have datetime category for transaction_date
df = data.transform_datatypes(df)

In [4]:
df.head()

Unnamed: 0,article_id,transaction_date,markdown_start_date,markdown_end_date,original_price,sells_price,discount,markdown,article_count,season,brand,color,stock_total,avq,Abteilung,WHG,WUG,time_on_sale,revenue
0,900001,2014-03-27,2013-10-18,2014-05-31,59.95,53.95,6.0,0.0,2,Sommer,Fimmilena,mittelbraun,1499,0.133422,Abteilung005,WHG021,WUG073,0,107.9
1,900001,2014-03-28,2013-10-18,2014-05-31,59.95,59.95,0.0,0.0,2,Sommer,Fimmilena,mittelbraun,1499,0.266845,Abteilung005,WHG021,WUG073,1,119.9
2,900001,2014-03-29,2013-10-18,2014-05-31,59.95,59.95,0.0,0.0,3,Sommer,Fimmilena,mittelbraun,1499,0.466978,Abteilung005,WHG021,WUG073,2,179.85
3,900001,2014-03-31,2013-10-18,2014-05-31,59.95,59.55,0.4,0.0,5,Sommer,Fimmilena,mittelbraun,1499,0.800534,Abteilung005,WHG021,WUG073,4,297.75
4,900001,2014-04-01,2013-10-18,2014-05-31,59.95,59.95,0.0,0.0,6,Sommer,Fimmilena,mittelbraun,1499,1.200801,Abteilung005,WHG021,WUG073,5,359.7


## Date Information Expansion and Method Comparison

### timing on big scale

In [5]:
%%time

df_group = df.groupby('article_id')

CPU times: user 3.61 ms, sys: 2.88 ms, total: 6.49 ms
Wall time: 59.8 ms


In [6]:
%%time

# expand transaction_date information
season = list()
weekday = list()
months = list()

for i in df.transaction_date:
    month = i.month
    # meteorological seasons
    if 2 < month < 6:
        season.append('spring')
    elif 5 < month < 9:
        season.append("summer")
    elif 8 < month < 12:
        season.append("fall")
    else:
        season.append("winter")

    months.append(month)
    day = i.weekday()
    weekday.append(day)

df['season_buy'] = pd.Series(season, index=df.index)
df['month'] = pd.Series(months, index=df.index)
df['weekday'] = pd.Series(weekday, index=df.index)

CPU times: user 11.2 s, sys: 240 ms, total: 11.4 s
Wall time: 11.5 s


In [7]:
%%time

# expand transaction_date information


def get_season(month):
    if 2 < month < 6:
        return 'spring'
    elif 5 < month < 9:
        return "summer"
    elif 8 < month < 12:
        return "fall"
    else:
        return "winter"


df['month'] = df['transaction_date'].apply(lambda x: x.month)
df['season_buy'] = df['month'].apply(lambda x: get_season(x))
df['weekday'] = df['transaction_date'].apply(lambda x: x.weekday())

CPU times: user 15.8 s, sys: 281 ms, total: 16 s
Wall time: 16 s


In [8]:
%%time

new_seasons = df_group.apply(lambda x : get_season(x.transaction_date.min().month))

CPU times: user 9.14 s, sys: 206 ms, total: 9.35 s
Wall time: 9.62 s


In [21]:
%%time

df['season'] = df['article_id'].apply(lambda x : new_seasons[x])

CPU times: user 416 ms, sys: 8.35 ms, total: 424 ms
Wall time: 422 ms


In [17]:
%%time

df = df.merge(pd.DataFrame(new_seasons), left_on='article_id', right_index=True)

CPU times: user 702 ms, sys: 265 ms, total: 967 ms
Wall time: 1.04 s


In [13]:
df.head()

Unnamed: 0,article_id,transaction_date,markdown_start_date,markdown_end_date,original_price,sells_price,discount,markdown,article_count,season,...,avq,Abteilung,WHG,WUG,time_on_sale,revenue,season_buy,month,weekday,0
0,900001,2014-03-27,2013-10-18,2014-05-31,59.95,53.95,6.0,0.0,2,spring,...,0.133422,Abteilung005,WHG021,WUG073,0,107.9,spring,3,3,spring
1,900001,2014-03-28,2013-10-18,2014-05-31,59.95,59.95,0.0,0.0,2,spring,...,0.266845,Abteilung005,WHG021,WUG073,1,119.9,spring,3,4,spring
2,900001,2014-03-29,2013-10-18,2014-05-31,59.95,59.95,0.0,0.0,3,spring,...,0.466978,Abteilung005,WHG021,WUG073,2,179.85,spring,3,5,spring
3,900001,2014-03-31,2013-10-18,2014-05-31,59.95,59.55,0.4,0.0,5,spring,...,0.800534,Abteilung005,WHG021,WUG073,4,297.75,spring,3,0,spring
4,900001,2014-04-01,2013-10-18,2014-05-31,59.95,59.95,0.0,0.0,6,spring,...,1.200801,Abteilung005,WHG021,WUG073,5,359.7,spring,4,1,spring


### timing on small scale

In [22]:
df = df.head().copy()

In [23]:
%%time

# expand transaction_date information
season = list()
weekday = list()
months = list()

for i in df.transaction_date:
    month = i.month
    # meteorological seasons
    if 2 < month < 6:
        season.append('spring')
    elif 5 < month < 9:
        season.append("summer")
    elif 8 < month < 12:
        season.append("fall")
    else:
        season.append("winter")

    months.append(month)
    day = i.weekday()
    weekday.append(day)

df['season_buy'] = pd.Series(season, index=df.index)
df['month'] = pd.Series(months, index=df.index)
df['weekday'] = pd.Series(weekday, index=df.index)

CPU times: user 17.4 ms, sys: 31.7 ms, total: 49 ms
Wall time: 48.2 ms


In [24]:
%%time

# expand transaction_date information


def get_season(month):
    if 2 < month < 6:
        return 'spring'
    elif 5 < month < 9:
        return "summer"
    elif 8 < month < 12:
        return "fall"
    else:
        return "winter"


df['month'] = df['transaction_date'].apply(lambda x: x.month)
df['season_buy'] = df['month'].apply(lambda x: get_season(x))
df['weekday'] = df['transaction_date'].apply(lambda x: x.weekday())

CPU times: user 9.17 ms, sys: 2.41 ms, total: 11.6 ms
Wall time: 10.4 ms
