In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [17]:
def build_features(features, data):
    """Add model features to a merged Rossmann sales/store frame, in place.

    Parameters
    ----------
    features : list
        Extended in place with the names of the feature columns used or
        created here. Pass a throw-away list when the names are not needed.
    data : pandas.DataFrame
        Sales rows (``Date`` as datetime, ``Open``, ``Promo``, ...) merged with
        the store columns (``StoreType``, ``Assortment``, ``Promo2SinceYear``,
        ``PromoInterval``, ...). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the new columns added.
    """
    # Remove NaNs. A missing ``Open`` flag must become 1, so it has to be
    # filled *before* the blanket fill below turns every remaining NaN into 0.
    data.loc[data.Open.isnull(), 'Open'] = 1
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features
    encoded_columns = ['StoreType', 'Assortment', 'StateHoliday']
    features.extend(encoded_columns)
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in encoded_columns:
        # Assign the result back instead of replacing in place on a column
        # accessor, which is not guaranteed to write through to ``data``.
        # Values without a mapping (e.g. the integer 0 left by the fill above)
        # are kept unchanged.
        data[column] = data[column].map(lambda value: mappings.get(value, value))

    # Calendar parts of the sale date
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    if hasattr(data.Date.dt, 'isocalendar'):
        # ``Series.dt.weekofyear`` no longer exists in recent pandas releases.
        data['WeekOfYear'] = data.Date.dt.isocalendar().week.astype(int)
    else:
        data['WeekOfYear'] = data.Date.dt.weekofyear

    # CompetitionOpen and PromoOpen from
    # https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Time the competition has been open, in months.
    # NOTE(review): rows whose CompetitionOpenSinceYear was NaN (now 0) get a
    # very large value here; unlike PromoOpen they are not reset to 0.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)

    # Time Promo2 has been running, in months (weeks are approximated as 1/4 month).
    features.append('PromoOpen')
    promo_open = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    # Negative durations (promo not started yet) count as 0.
    data['PromoOpen'] = promo_open.clip(lower=0)
    # Promo2SinceYear == 0 means the store has no Promo2 start date at all.
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.extend(['IsPromoMonth', 'IsPromoNextMonth', 'IsPromoLastMonth'])
    # 'Sept' (not 'Sep') is deliberate: names must match PromoInterval entries.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    # Wrap around the year boundary: December -> January and January -> December.
    data['nextMonthStr'] = (data.Month % 12 + 1).map(month2str)
    data['lastMonthStr'] = ((data.Month - 2) % 12 + 1).map(month2str)

    # A missing PromoInterval was turned into 0 by the fill above.
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    data['IsPromoNextMonth'] = 0
    data['IsPromoLastMonth'] = 0

    for interval in data.PromoInterval.unique():
        if interval == '':
            continue
        promo_months = interval.split(',')
        has_interval = data.PromoInterval == interval
        data.loc[has_interval & data.monthStr.isin(promo_months), 'IsPromoMonth'] = 1
        data.loc[has_interval & data.nextMonthStr.isin(promo_months), 'IsPromoNextMonth'] = 1
        data.loc[has_interval & data.lastMonthStr.isin(promo_months), 'IsPromoLastMonth'] = 1

    return data

In [20]:
features = []
# Column dtypes forced while parsing the CSV files.
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train = pd.read_csv("../data/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("../data/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("../data/store_features.pd")

# Every store column whose name contains an underscore
# (e.g. SalesDayOfWeek4_Median) is used as a model feature.
features += [column for column in store.columns if '_' in column]

train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

# Keep only training rows where the store was open and had sales.
train = train[train["Open"] != 0]
train = train[train["Sales"] > 0]
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')
train = build_features(features, train)
test = build_features([], test)

# Preview the prepared training data. This must come after `train` is built,
# otherwise the cell fails with a NameError on a fresh kernel.
print(train.sample(10))


        Store  DayOfWeek       Date  Sales  Customers  Open  Promo  \
50338      67          4 2014-01-17   6159        691     1      0   
15828      21          5 2013-11-09   4649        512     1      0   
304523    403          2 2013-03-27   9009        897     1      1   
256777    341          4 2014-04-25   4510        670     1      0   
792037   1049          5 2015-06-20   2642        403     1      0   
298064    395          2 2014-06-25   3404        456     1      0   
283958    377          1 2015-04-14   9976       1166     1      1   
123135    165          1 2015-06-09   4137        416     1      0   
803281   1063          0 2013-05-13   6056        725     1      1   
188263    251          5 2014-11-15  19215       2533     1      0   

        StateHoliday  SchoolHoliday  StoreType  Assortment  \
50338              0              0          1           3   
15828              0              0          3           3   
304523             0              1        

In [19]:
# Random ten-row preview of the training frame, restricted to the feature columns.
train.loc[:, features].sample(10)

Unnamed: 0,SalesDayOfWeek4_Median,SalesDayOfWeek4_25th,SalesDayOfWeek4_75th,SalesDayOfWeek3_Median,SalesDayOfWeek3_25th,SalesDayOfWeek3_75th,SalesDayOfWeek2_Median,SalesDayOfWeek2_25th,SalesDayOfWeek2_75th,SalesDayOfWeek1_Median,SalesDayOfWeek1_25th,SalesDayOfWeek1_75th,SalesDayOfWeek0_Median,SalesDayOfWeek0_25th,SalesDayOfWeek0_75th,SalesDayOfWeek5_Median,SalesDayOfWeek5_25th,SalesDayOfWeek5_75th,SalesDayOfWeek6_Median,SalesDayOfWeek6_25th,SalesDayOfWeek6_75th,Store,CompetitionDistance,Promo,Promo2,SchoolHoliday,StoreType,Assortment,StateHoliday,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth,IsPromoNextMonth,IsPromoLastMonth
259674,4540.0,4063.25,4989.5,4376.5,3868.0,5038.5,4375.0,3903.0,5052.0,4678,3845.75,5735.0,6084.0,4320.0,6775.0,3265.5,3042.5,3505.0,0,0,0,345,120,1,1,0,1,1,0,3,1,9,2014,2,24169,19.0,1,0,0
288128,8667.5,7569.0,9490.5,8265.0,6697.5,9315.25,7912.0,5939.25,8653.0,8169,5688.0,9297.0,11008.0,6143.0,12112.5,5744.5,5379.0,6248.75,0,0,0,382,26130,0,0,1,3,3,0,4,5,30,2014,22,138,0.0,0,0,0
364904,4612.0,4058.25,5043.25,5603.0,4827.5,6253.0,4779.5,4024.25,5625.75,6538,5046.0,7439.0,6692.0,4543.0,7434.0,1658.0,1497.0,1905.75,0,0,0,483,2310,0,1,0,1,3,0,4,10,17,2014,42,37,55.25,1,0,0
103271,7448.0,6630.0,8189.0,7941.5,6850.75,8989.25,7456.5,6140.5,8739.0,8058,6371.5,9499.5,10276.5,6657.75,11543.25,3104.5,2792.25,3461.25,0,0,0,138,25360,1,0,1,1,3,0,4,2,20,2015,8,4,0.0,0,0,0
625721,5655.0,4654.0,6383.0,5664.0,4598.0,6347.75,5560.0,4397.25,6454.75,6432,4433.0,7339.0,7233.0,4912.5,8078.0,5647.5,5335.5,6189.0,0,0,0,829,110,0,0,0,3,1,0,5,5,30,2015,22,24185,0.0,0,0,0
779336,9741.5,8731.0,10722.0,9022.5,8011.75,10191.25,8679.0,7425.25,9826.75,10492,8364.0,11802.0,12256.0,8101.5,13623.5,5056.5,4648.5,5516.5,0,0,0,1032,270,0,1,0,4,3,0,2,5,14,2014,20,15,19.0,0,0,1
792728,7703.0,6573.0,8528.0,6971.5,5876.25,7931.0,6763.5,5583.25,8026.0,7005,5357.5,8140.0,9188.5,6055.25,10242.0,6636.5,6232.75,7250.25,0,0,0,1050,13170,0,1,0,4,3,0,2,3,25,2015,13,7,49.0,1,0,0
630477,5236.0,4641.75,6138.5,4691.5,4056.25,5370.25,4858.5,4004.5,5657.75,5222,4069.75,6102.5,6878.0,4446.25,7973.25,4013.0,3724.5,4360.5,0,0,0,835,2890,1,1,0,1,1,0,0,3,2,2015,10,87,12.0,1,0,0
317471,9278.0,8622.0,10919.0,8095.0,7093.0,9011.5,7949.0,7083.0,9338.0,8128,6854.25,9316.75,9675.5,7628.75,11176.5,11981.5,10687.25,13486.25,0,0,0,421,3530,0,1,0,3,3,0,3,9,11,2014,37,27,24.5,1,0,0
223084,8903.5,8161.0,9984.5,7776.0,6996.0,8749.5,7508.0,6451.0,8685.0,7641,6173.75,8840.25,9813.0,7402.0,11154.0,8104.0,7612.5,8591.0,0,0,0,298,19840,1,0,0,4,1,0,1,3,17,2015,12,68,0.0,0,0,0
