In [1]:
"""
Feature engineering 

Following on from the last notebook, feature engineering needs to do the following:
- exclude 0 sales
- exclude store closure
- state holiday: one hot encoding
- date: add year, month, week, quarter
"""

'\nFeature engineering \n\nFollowing on from the last notebook, feature engineering needs to do the following:\n- exclude 0 sales\n- exclude store closure\n- state holiday: one hot encoding\n- date: add year, month, week, quarter\n'

In [2]:
import os
import warnings

import pandas as pd
import seaborn as sns

# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed instead of truncating wide frames.
pd.set_option("display.max_columns", None)

In [3]:
# Locations are relative to the notebook's working directory and are joined by
# plain string concatenation below, so the directory values keep their trailing '/'.
input_path = 'rossmann/'            # directory holding the raw data
input_file = 'train.csv'            # raw training file read into df_train
processed_path = 'processed/'       # directory the engineered data is written to
processed_file = 'processed_data.csv'  # output file name


In [4]:
# Load the raw training data. StateHoliday appears to mix the integer 0 with the
# letter codes 'a'/'b'/'c' (hence the 0 -> '0' normalisation in the encoding
# step), so it is read as text to give the column a single, consistent dtype.
df_train = pd.read_csv(input_path + input_file, dtype={'StateHoliday': str})

# Exclude 0 sales and store closure

In [5]:
def exclude_store_closure_and_zero(df):
    """Keep only the rows of open stores that recorded sales.

    A row is retained when ``Sales`` is greater than 0 and ``Open`` equals 1.
    The result is a new frame with a fresh 0..n-1 index; the input frame is
    left untouched.
    """
    has_sales = df['Sales'] > 0
    is_open = df['Open'] == 1
    return df[has_sales & is_open].reset_index(drop=True)

In [6]:
# Step-by-step walkthrough: start from the raw frame and drop the
# closed-store / zero-sales rows.
df = exclude_store_closure_and_zero(df_train)

In [7]:
# Row and column count remaining after the filter.
df.shape

(844338, 9)

In [8]:
# Inspect the filtered frame.
df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1
...,...,...,...,...,...,...,...,...,...
844333,682,2,2013-01-01,3375,566,1,0,a,1
844334,733,2,2013-01-01,10765,2377,1,0,a,1
844335,769,2,2013-01-01,5035,1248,1,0,a,1
844336,948,2,2013-01-01,4491,1039,1,0,a,1


# Deal with state holiday

In [9]:
def one_hot_encoding_state_holiday(df):
    """Replace ``StateHoliday`` with one indicator column per holiday type.

    Codes: a = public holiday, b = Easter holiday, c = Christmas, 0 = None.
    The integer ``0`` and the string ``'0'`` are treated as the same code.

    The four indicator columns (``holiday_no``, ``holiday_public``,
    ``holiday_easter``, ``holiday_christmas``) are always produced, in that
    order, even when a code does not occur in ``df``; this keeps the output
    schema stable across different subsets of the data. Any unexpected code
    still gets its own column, named after the raw value, placed after the
    four known ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``StateHoliday`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` without ``StateHoliday``, followed by the
        indicator columns.
    """
    column_names = {'0': 'holiday_no', 'a': 'holiday_public',
                    'b': 'holiday_easter', 'c': 'holiday_christmas'}

    # Work on a detached object-typed copy so the caller's column is never
    # written to, and so '0' can be stored even if the column was parsed as int.
    holiday = df['StateHoliday'].astype(object)
    holiday = holiday.where(holiday != 0, '0')

    # A categorical with a fixed category list makes get_dummies emit a column
    # for every known code, present or not.
    known = list(column_names)
    unknown = [value for value in holiday.dropna().unique() if value not in known]
    holiday = holiday.astype(pd.CategoricalDtype(categories=known + unknown))

    state_dummy = pd.get_dummies(holiday)
    # Assigning a plain list replaces the categorical column index with
    # ordinary labels while applying the readable names.
    state_dummy.columns = [column_names.get(code, code) for code in state_dummy.columns]

    return pd.concat([df.drop(columns='StateHoliday'), state_dummy], axis=1)

In [10]:
# Replace StateHoliday with its indicator columns.
# NOTE: this cell is not re-runnable on its own -- afterwards `df` no longer has
# a StateHoliday column, so running it a second time raises KeyError.
df = one_hot_encoding_state_holiday(df)

# Date

In [11]:
def add_year_month(df):
    """Parse ``Date``, sort chronologically and add calendar features.

    Adds two columns derived from ``Date``:

    * ``year`` -- calendar year
    * ``weekofyear`` -- ISO 8601 week number (1-53)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column that ``pandas.to_datetime`` can parse.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted ascending by ``Date`` (original index labels are
        kept), with ``Date`` converted to datetime and the feature columns
        appended.
    """
    # assign() builds a new frame, so the caller's Date column keeps its
    # original values and dtype.
    df = df.assign(Date=pd.to_datetime(df['Date']))
    df = df.sort_values('Date', ascending=True)
    df['year'] = df['Date'].dt.year
    # month / quarter features are currently disabled:
    # df['month'] = df['Date'].dt.month
    # df['quarter'] = df['Date'].dt.quarter

    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is the
    # supported equivalent. It returns a nullable UInt32 column, so cast back to
    # the plain dtype the old accessor produced (float if any date is missing).
    iso_week = df['Date'].dt.isocalendar().week
    df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    return df

In [12]:
# Parse Date, sort chronologically and append the calendar features.
df = add_year_month(df)

In [13]:
# Inspect the frame after the date step (rows are now ordered by Date).
df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,holiday_no,holiday_public,holiday_easter,holiday_christmas,year,weekofyear
844337,1097,2,2013-01-01,5961,1405,1,0,1,0,1,0,0,2013,1
844321,85,2,2013-01-01,4220,619,1,0,1,0,1,0,0,2013,1
844322,259,2,2013-01-01,6851,1444,1,0,1,0,1,0,0,2013,1
844323,262,2,2013-01-01,17267,2875,1,0,1,0,1,0,0,2013,1
844324,274,2,2013-01-01,3102,729,1,0,1,0,1,0,0,2013,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743,745,5,2015-07-31,8363,746,1,1,1,1,0,0,0,2015,31
744,746,5,2015-07-31,9082,638,1,1,1,1,0,0,0,2015,31
745,747,5,2015-07-31,10708,826,1,1,1,1,0,0,0,2015,31
739,741,5,2015-07-31,11253,1137,1,1,1,1,0,0,0,2015,31


# Combine all together 

In [14]:
def feature_engineering(df):
    """Run the full feature-engineering pipeline on a raw sales frame.

    Applies, in order: the closed-store / zero-sales filter, the StateHoliday
    one-hot encoding, and the date-derived features, then returns the
    resulting frame.
    """
    return (
        df.pipe(exclude_store_closure_and_zero)
          .pipe(one_hot_encoding_state_holiday)
          .pipe(add_year_month)
    )

In [15]:
# Re-run the whole pipeline from the raw frame in a single call.
df = feature_engineering(df_train)

In [16]:
# Inspect the pipeline output.
df

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,holiday_no,holiday_public,holiday_easter,holiday_christmas,year,weekofyear
844337,1097,2,2013-01-01,5961,1405,1,0,1,0,1,0,0,2013,1
844321,85,2,2013-01-01,4220,619,1,0,1,0,1,0,0,2013,1
844322,259,2,2013-01-01,6851,1444,1,0,1,0,1,0,0,2013,1
844323,262,2,2013-01-01,17267,2875,1,0,1,0,1,0,0,2013,1
844324,274,2,2013-01-01,3102,729,1,0,1,0,1,0,0,2013,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743,745,5,2015-07-31,8363,746,1,1,1,1,0,0,0,2015,31
744,746,5,2015-07-31,9082,638,1,1,1,1,0,0,0,2015,31
745,747,5,2015-07-31,10708,826,1,1,1,1,0,0,0,2015,31
739,741,5,2015-07-31,11253,1137,1,1,1,1,0,0,0,2015,31


# Save to processed folder

In [17]:
# Persist the engineered features. The target directory is created first so the
# write does not fail when it is missing; the row index is not written.
os.makedirs(processed_path, exist_ok=True)
df.to_csv(processed_path + processed_file, index=False)