In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost
from sklearn.cross_validation import train_test_split

  from numpy.core.umath_tests import inner1d


In [64]:
types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}
train_data=pd.read_csv("train.csv",parse_dates=[2],low_memory=False,dtype=types)
test_data=pd.read_csv("test.csv",parse_dates=[3],low_memory=False,dtype=types)
store_data=pd.read_csv("store.csv")

In [65]:
def build_features_train(train,store):
    features=[]
    data=pd.merge(train,store,on="Store")
    data.loc[data.Open.isnull(), 'Open'] = 1
    data.fillna(0, inplace=True)
    features.append('Store')
    features.append('DayOfWeek')
    features.append('Promo')
    features.append('SchoolHoliday')
    data['Year'] = data.Date.dt.year
    data['Month'] = data.Date.dt.month
    data['Day'] = data.Date.dt.day
    data['WeekOfYear'] = data.Date.dt.weekofyear
    mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
    data.StateHoliday.replace(mappings, inplace=True)
    features.append('StateHoliday')
    data.StoreType.replace(mappings, inplace=True)
    data.Assortment.replace(mappings, inplace=True)
    features.append('StoreType')
    features.append('Assortment')
    features.append('CompetitionDistance')
    features.append('Promo2')
    data['CompetitionOpenSince']=12*(data.Year-data.CompetitionOpenSinceYear)+(data.Month - data.CompetitionOpenSinceMonth)
    features.append('CompetitionOpenSince')
    features.append('Promo2OpenSince')
    data['Promo2OpenSince']=12 * (data.Year - data.Promo2SinceYear)+(data.WeekOfYear - data.Promo2SinceWeek) / 4.0
    data['Promo2OpenSince'] = data.Promo2OpenSince.apply(lambda x: x if x > 0 else 0)
    data.loc[data.Promo2SinceYear == 0, 'Promo2OpenSince'] = 0
    data.loc[data.Promo2 == 0, 'Promo2OpenSince'] = 0
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    data['monthStr'] = data.Month.map(month2str)
    data.loc[data.PromoInterval == 0, 'PromoInterval'] = ''
    data['IsPromo2Month'] = 0
    for interval in data.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromo2Month'] = 1
    features.append('IsPromo2Month')
    #features.append('Month')
    features.append('Day')
    #features.append('WeekOfYear')
    return data,features

In [66]:
train_data,features=build_features_train(train_data,store_data)

In [67]:
test_data,f=build_features_train(test_data,store_data)

In [54]:
test_data.to_csv("test_data_processed2.csv")
train_data.to_csv("train_data_processed2.csv")

In [68]:
random_model=RandomForestRegressor()
random_model.fit(train_data[features].values,train_data['Sales'].values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [69]:
predictions2=random_model.predict(test_data[features].values)
result = pd.DataFrame({"Id": test_data["Id"], 'Sales': predictions2})
result.to_csv("Submission16.csv")

In [73]:
xg_model=xgboost.XGBRegressor(max_depth=10,booster="gbtree")
xg_model.fit(train_data[features].values,train_data['Sales'].values)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [77]:
predictions2=xg_model.predict(test_data[features].values)
result = pd.DataFrame({"Id": test_data["Id"], 'Sales': predictions2})
result.to_csv("Submission17.csv")