In [33]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import operator
import matplotlib
matplotlib.use("Agg") #Needed to save figures
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)

In [60]:
print("Load the training, test and store data using pandas")
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train = pd.read_csv("../data/train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("../data/test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("../data/store.csv")

train.fillna(1, inplace=True)
test.fillna(1, inplace=True)
train = train[train["Open"] != 0]
train = train[train["Sales"] > 0]

train = pd.merge(train, store, on='Store')

Load the training, test and store data using pandas


In [61]:
#     store['Sales25th'] = 0
#     store['Sales50th'] = 0
#     store['Sales75th'] = 0
#     store['SalesMedian'] = 0

#     for store_id in train.Store.unique():
#         Sales25th = train[train.Store == store_id].Sales.quantile(0.25)
#         Sales50th = train[train.Store == store_id].Sales.quantile(0.50)
#         Sales75th = train[train.Store == store_id].Sales.quantile(0.75)
#         SalesMedian = train[train.Store == store_id].Sales.median()

#         store.loc[store.Store == store_id, 'Sales25th'] = Sales25th
#         store.loc[store.Store == store_id, 'Sales50th'] = Sales50th
#         store.loc[store.Store == store_id, 'Sales75th'] = Sales75th
#         store.loc[store.Store == store_id, 'SalesMedian'] = SalesMedian

In [65]:
def build_features_store(store, train):
    train = train.copy()
    train['Sales'] = train['Sales'].apply(np.log1p)
    train['Year'] = train.Date.dt.year
    train['Month'] = train.Date.dt.month
    train['Day'] = train.Date.dt.day
    train['DayOfWeek'] = train.Date.dt.dayofweek
    train['WeekOfYear'] = train.Date.dt.weekofyear
    
    store_ids = store.Store.unique()
    for store_id in store_ids:
        for day in train.DayOfWeek.unique():
            keyword = 'SalesDayOfWeek'
            name_median = keyword + str(day) + "_Median"
            name_std = keyword + str(day) + "_std"
            store[name_median] = 0
            store[name_std] = 0
        for month in train.Month.unique():
            keyword = 'SalesMonth'
            name_median = keyword + str(month) + "_Median"
            name_std = keyword + str(month) + "_std"
            store[name_median] = 0
            store[name_std] = 0
        for year in train.Year.unique():
            keyword = 'SalesYear'
            name_median = keyword + str(year) + "_Median"
            name_std = keyword + str(year) + "_std"
            store[name_median] = 0
            store[name_std] = 0
            
    
    for store_id in store_ids:
        for day in train.DayOfWeek.unique():
            keyword = 'SalesDayOfWeek'
            name_median = keyword + str(day) + "_Median"
            name_std = keyword + str(day) + "_std"
            store.loc[store.Store == store_id, name_median] = train[(train.Store == store_id) & (train.DayOfWeek == day)].Sales.median()
            store.loc[store.Store == store_id, name_std] = train[(train.Store == store_id) & (train.DayOfWeek == day)].Sales.std()
        for month in train.Month.unique():
            keyword = 'SalesMonth'
            name_median = keyword + str(month) + "_Median"
            name_std = keyword + str(month) + "_std"
            store.loc[store.Store == store_id, name_median] = train[(train.Store == store_id) & (train.Month == month)].Sales.median()
            store.loc[store.Store == store_id, name_std] = train[(train.Store == store_id) & (train.Month == month)].Sales.std()
        for year in train.Year.unique():
            keyword = 'Salesyear'
            name_median = keyword + str(year) + "_Median"
            name_std = keyword + str(year) + "_stdth"
            store.loc[store.Store == store_id, name_median] = train[(train.Store == store_id) & (train.Year == year)].Sales.median()
            store.loc[store.Store == store_id, name_std] = train[(train.Store == store_id) & (train.Year == year)].Sales.std()
            
    store.fillna(0, inplace=True)        
    
build_features_store(store, train)
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,SalesDayOfWeek4_Median,SalesDayOfWeek4_std,SalesDayOfWeek3_Median,SalesDayOfWeek3_std,SalesDayOfWeek2_Median,SalesDayOfWeek2_std,SalesDayOfWeek1_Median,SalesDayOfWeek1_std,SalesDayOfWeek0_Median,SalesDayOfWeek0_std,SalesDayOfWeek5_Median,SalesDayOfWeek5_std,SalesDayOfWeek6_Median,SalesDayOfWeek6_std,SalesMonth7_Median,SalesMonth7_std,SalesMonth6_Median,SalesMonth6_std,SalesMonth5_Median,SalesMonth5_std,SalesMonth4_Median,SalesMonth4_std,SalesMonth3_Median,SalesMonth3_std,SalesMonth2_Median,SalesMonth2_std,SalesMonth1_Median,SalesMonth1_std,SalesMonth12_Median,SalesMonth12_std,SalesMonth11_Median,SalesMonth11_std,SalesMonth10_Median,SalesMonth10_std,SalesMonth9_Median,SalesMonth9_std,SalesMonth8_Median,SalesMonth8_std,SalesYear2015_Median,SalesYear2015_std,SalesYear2014_Median,SalesYear2014_std,SalesYear2013_Median,SalesYear2013_std,Salesyear2015_Median,Salesyear2015_stdth,Salesyear2014_Median,Salesyear2014_stdth,Salesyear2013_Median,Salesyear2013_stdth
0,1,c,a,1270,9,2008,0,0,0,0,8.445053,0.181857,8.385027,0.198358,8.422991,0.197095,8.442792,0.219252,8.576113,0.230407,8.473447,0.163825,0,0,8.395703,0.161158,8.37609,0.180397,8.462948,0.169759,8.42442,0.215894,8.507143,0.224599,8.459138,0.169321,8.458292,0.161443,8.747352,0.306682,8.528924,0.171775,8.367365,0.160959,8.337109,0.163525,8.33687,0.167175,0,0,0,0,0,0,8.396606,0.177956,8.435549,0.21345,8.477412,0.205454
1,2,a,a,570,11,2007,1,13,2010,"Jan,Apr,Jul,Oct",8.449343,0.173979,8.519191,0.197962,8.686337,0.212665,8.603921,0.280389,8.784391,0.313915,7.914252,0.166431,0,0,8.494129,0.299773,8.471568,0.343332,8.494641,0.318278,8.467583,0.331592,8.481151,0.35437,8.423102,0.325753,8.412933,0.319542,8.699515,0.444657,8.535426,0.300319,8.474112,0.309068,8.457443,0.327836,8.404696,0.304362,0,0,0,0,0,0,8.47449,0.315386,8.481151,0.334413,8.460829,0.346059
2,3,a,a,14130,12,2006,1,14,2011,"Jan,Apr,Jul,Oct",8.869527,0.202244,8.845201,0.203843,8.919854,0.246554,8.978274,0.287067,9.104313,0.334649,8.348064,0.205398,0,0,8.810161,0.303451,8.786762,0.308194,8.839277,0.28015,8.804475,0.30015,8.791182,0.332803,8.699343,0.293161,8.692614,0.307784,9.024131,0.366182,8.930759,0.276791,8.701915,0.306763,8.696009,0.345084,8.708144,0.326504,0,0,0,0,0,0,8.806873,0.289417,8.792999,0.324283,8.802219,0.325755
3,4,c,c,620,9,2009,0,0,0,0,9.143559,0.173277,9.093807,0.191063,9.094143,0.171656,9.169569,0.200862,9.315375,0.211665,9.202256,0.127745,0,0,9.143132,0.199868,9.153453,0.171077,9.18932,0.166268,9.131189,0.225918,9.153453,0.203924,9.166597,0.16256,9.135455,0.155201,9.388821,0.242493,9.171288,0.169721,9.121068,0.154609,9.104535,0.185138,9.141954,0.163893,0,0,0,0,0,0,9.179417,0.178562,9.160099,0.188938,9.125817,0.200464
4,5,a,a,29910,4,2015,0,0,0,0,8.474909,0.165918,8.471149,0.20696,8.577253,0.229853,8.555354,0.273898,8.831566,0.341444,7.606136,0.180172,0,0,8.490028,0.392968,8.411829,0.402407,8.458268,0.403609,8.349721,0.440445,8.358197,0.444875,8.369937,0.409664,8.390268,0.414723,8.61559,0.497578,8.532476,0.409266,8.446322,0.39631,8.369853,0.389926,8.40268,0.428911,0,0,0,0,0,0,8.435982,0.397012,8.423871,0.416979,8.452548,0.442798


In [66]:
store.describe()

60


In [67]:
store.to_csv("../data/store_features.pd", index=False)