In [36]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt
import cPickle as pickle

def rmspe(y, yhat):
    """Root mean squared percentage error of ``yhat`` against ``y``.

    Computes sqrt(mean((yhat / y - 1) ** 2)).

    Parameters
    ----------
    y : array-like
        Actual values.
    yhat : array-like
        Predicted values; must be broadcastable against ``y``.

    Returns
    -------
    float
        The RMSPE over all entries whose actual value is non-zero, or NaN
        when there is no such entry (including empty input).

    Notes
    -----
    Entries with ``y == 0`` are excluded: their percentage error is
    undefined and would otherwise turn the whole score into inf/nan.
    Inputs are converted to float so integer arrays are never subject to
    floor division.
    """
    actual, predicted = np.broadcast_arrays(np.asarray(y, dtype=float),
                                            np.asarray(yhat, dtype=float))
    scorable = actual != 0
    if not scorable.any():
        # Nothing to score; avoid taking the mean of an empty selection.
        return np.nan
    pct_error = predicted[scorable] / actual[scorable] - 1
    return np.sqrt(np.mean(pct_error ** 2))

def rmspe_xg(yhat, y):
    """Score predictions with RMSPE after undoing a log1p transform.

    Presumably meant as a custom xgboost evaluation function (inferred from
    the name and signature - confirm against the training call).

    Parameters
    ----------
    yhat : array-like
        Predictions; passed through ``np.expm1`` before scoring.
    y : object exposing ``get_label()``
        Labels are read via ``y.get_label()`` and passed through
        ``np.expm1`` as well.

    Returns
    -------
    tuple
        ``("rmspe", score)``.
    """
    actual = np.expm1(y.get_label())
    predicted = np.expm1(yhat)
    score = rmspe(actual, predicted)
    return "rmspe", score

def build_features(features, data):
    """Engineer model features on ``data`` in place and record their names.

    Parameters
    ----------
    features : list
        Mutated in place: the name of every feature column is appended.
    data : pandas.DataFrame
        Mutated in place. Must provide the columns Open, Store,
        CompetitionDistance, Promo, Promo2, SchoolHoliday, StoreType,
        Assortment, StateHoliday, Date (datetime-like, read through ``.dt``),
        CompetitionOpenSinceYear, CompetitionOpenSinceMonth, Promo2SinceYear,
        Promo2SinceWeek and PromoInterval.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the added columns Year, Month, Day,
        DayOfWeek, WeekOfYear, CompetitionOpen, PromoOpen, IsPromoMonth and
        the helper column monthStr (which is not registered as a feature).
    """
    # Remove NaNs. A missing 'Open' flag defaults to 1 and has to be handled
    # BEFORE the blanket fill, otherwise it would already have become 0.
    data.loc[data.Open.isnull(), 'Open'] = 1
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features. Values without a mapping are kept as they
    # are. The result is assigned back to the frame explicitly: an in-place
    # replace on an attribute-accessed column is a chained assignment that
    # newer pandas versions do not propagate to the frame.
    features.extend(['StoreType', 'Assortment', 'StateHoliday'])
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    for column in ('StoreType', 'Assortment', 'StateHoliday'):
        data[column] = data[column].map(lambda label: mappings.get(label, label))

    # Calendar parts of the date
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    dates = data.Date.dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    data['DayOfWeek'] = dates.dayofweek
    if hasattr(dates, 'isocalendar'):
        # .dt.weekofyear no longer exists in recent pandas; the ISO calendar
        # week is its replacement.
        data['WeekOfYear'] = dates.isocalendar().week.astype('int64')
    else:
        data['WeekOfYear'] = dates.weekofyear

    # CompetitionOpen and PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Time since the competition opened, in months
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \
        (data.Month - data.CompetitionOpenSinceMonth)

    # Time since Promo2 started, in months (weeks / 4). Non-positive values
    # are set to 0, as are rows whose Promo2SinceYear is 0.
    features.append('PromoOpen')
    promo_open = 12 * (data.Year - data.Promo2SinceYear) + \
        (data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['PromoOpen'] = promo_open.where(promo_open > 0, 0)
    data.loc[data.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Indicate that sales on that day are in promo interval
    features.append('IsPromoMonth')
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \
             7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    # The blanket fill above turned a missing interval into 0; use '' instead.
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            # Rows with this interval whose month is one of its listed months.
            in_promo_month = (data.PromoInterval == interval) & \
                data.monthStr.isin(interval.split(','))
            data.loc[in_promo_month, 'IsPromoMonth'] = 1

    return data


In [37]:
# Names of the model input columns; extended below and by build_features().
features = []

# Explicit dtypes handed to read_csv for the raw train/test files.
types = {
    'CompetitionOpenSinceYear': np.dtype(int),
    'CompetitionOpenSinceMonth': np.dtype(int),
    'StateHoliday': np.dtype(str),
    'Promo2SinceWeek': np.dtype(int),
    'SchoolHoliday': np.dtype(float),
    'PromoInterval': np.dtype(str),
}
train = pd.read_csv("../data/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("../data/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("../data/store_features.pd")

# Every store column with an underscore in its name is used as a feature.
features.extend(column for column in store.columns if '_' in column)

# Missing values in the raw train/test frames are filled with 1.
for raw_frame in (train, test):
    raw_frame.fillna(1, inplace=True)

# Keep only training rows with Open != 0 and Sales > 0.
train = train[(train["Open"] != 0) & (train["Sales"] > 0)]

# Attach the per-store columns to both frames.
train = train.merge(store, on='Store')
test = test.merge(store, on='Store')

# build_features() modifies the frame it is given, so its return value is not
# needed here; only the call on train contributes to the `features` list.
build_features(features, train)
build_features([], test)
print(train.count())

Store                        844338
DayOfWeek                    844338
Date                         844338
Sales                        844338
Customers                    844338
Open                         844338
Promo                        844338
StateHoliday                 844338
SchoolHoliday                844338
StoreType                    844338
Assortment                   844338
CompetitionDistance          844338
CompetitionOpenSinceMonth    844338
CompetitionOpenSinceYear     844338
Promo2                       844338
Promo2SinceWeek              844338
Promo2SinceYear              844338
PromoInterval                844338
SalesDayOfWeek4_Median       844338
SalesDayOfWeek4_25th         844338
SalesDayOfWeek4_75th         844338
SalesDayOfWeek3_Median       844338
SalesDayOfWeek3_25th         844338
SalesDayOfWeek3_75th         844338
SalesDayOfWeek2_Median       844338
SalesDayOfWeek2_25th         844338
SalesDayOfWeek2_75th         844338
SalesDayOfWeek1_Median      

In [38]:
# Rows dated 1 Aug - 17 Sep (bounds inclusive) of 2013 or 2014 become
# validation candidates; all remaining rows stay in train.
in_validation_window = (train.Date.between('2013-08-01', '2013-09-17') |
                        train.Date.between('2014-08-01', '2014-09-17'))
validation_candidates = train[in_validation_window]
train = train[~in_validation_window]
for part in (train, validation_candidates):
    print(part.shape)

(760259, 47)
(84079, 47)


In [39]:
# Randomly set aside 10% of the validation candidates as X_valid (fixed seed);
# the other 90% are appended to the training rows to form X_train.
candidates_for_training, X_valid = train_test_split(
    validation_candidates, test_size=0.1, random_state=1337)
X_train = pd.concat([train, candidates_for_training], axis=0)

In [40]:
# Report the sizes of the final training and validation sets.
for split in (X_train, X_valid):
    print(split.shape)

(835930, 47)
(8408, 47)
