In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [16]:
def formatFeature(features, data):
    data.fillna(0, inplace=True)
    data.loc[data.Open.isnull(), 'Open'] = 1
    features += ['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday', \
                 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek',\
                 'Promo2SinceYear']

    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    data.StateHoliday.replace(mappings, inplace=True)
    
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['DayOfWeek'] = data.Date.dt.dayofweek
    data['WeekOfYear'] = data.Date.dt.weekofyear
    
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)
    features.append('PromoOpen')
    data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)
    
    features.extend(['IsPromoMonth', 'IsPromoNextMonth', 'WasPromoLastMonth'])
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    data['NextMonth'] = data.Month + 1
    data['LastMonth'] = data.Month - 1
    data['nextMonthStr'] = data.NextMonth.map(month2str)
    data['lastMonthStr'] = data.LastMonth.map(month2str)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    data['IsPromoNextMonth'] = 0
    data['WasPromoLastMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval),\
                         'IsPromoMonth'] = 1
                data.loc[(data.nextMonthStr == month) & (data.PromoInterval == interval),\
                         'IsPromoNextMonth'] = 1
                data.loc[(data.lastMonthStr == month) & (data.PromoInterval == interval),\
                         'WasPromoLastMonth'] = 1
    

In [17]:
# googleTrends = pd.read_csv("../data/Rossmann_DE.csv")
# googleTrends

In [18]:
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train = pd.read_csv("../data/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("../data/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("../data/store.csv")

train.fillna(1, inplace=True)
test.fillna(1, inplace=True)

# Consider only open stores for training. Closed stores wont count into the score.
train = train[train["Open"] != 0]
# Use only Sales bigger then zero. Simplifies calculation of rmspe
train = train[train["Sales"] > 0]

In [19]:
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

features = []
formatFeature(features, train)
formatFeature([], test)

In [20]:
train[features].sample(3)

Unnamed: 0,Store,CompetitionDistance,Promo,Promo2,SchoolHoliday,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2SinceWeek,Promo2SinceYear,StoreType,...,DayOfWeek,Month,Day,Year,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth,IsPromoNextMonth,WasPromoLastMonth
162466,217,3150,0,0,0,0,0,0,0,3,...,2,1,29,2014,5,24169,24169.25,0,0,0
639517,846,8860,0,1,0,4,2004,37,2009,1,...,5,1,5,2013,1,105,39.0,1,0,0
75487,100,17930,0,0,0,0,0,0,0,4,...,0,5,6,2013,19,24161,24160.75,0,0,0


In [21]:
X = train[features].values
y = np.log1p(train.Sales)
test = test[features].values

In [22]:
np.save(open('../data/X_pickle','wb'), X)
np.save(open('../data/y_pickle','wb'), y)
np.save(open('../data/test_pickle','wb'), test)
np.save(open('../data/features_pickle','wb'), features)