In [8]:
import numpy as np
import pandas as pd
import xgboost as xgb

# Load the engineered feature table and split it chronologically on `week`:
# weeks below 136 are used for training, 136..145 for validation, and
# everything after 145 is the set that predictions are produced for.
df = pd.read_csv('../data/df_feature.csv')
train_da = df[df['week'] < 136]
valid_da = df[(df['week'] >= 136) & (df['week'] <= 145)]
test_da = df[df['week'] > 145]

# The row id, the target and the split key are not model inputs.
ignorecols = ['id', 'num_orders', 'week']
features = [name for name in df.columns if name not in ignorecols]

# The target is modelled on the log1p scale.
X_train = train_da[features]
y_train = train_da['num_orders'].map(np.log1p)
X_valid = valid_da[features]
y_valid = valid_da['num_orders'].map(np.log1p)
X_test = test_da[features]


# Validation model: fit on the training split and track RMSE (on the log1p
# target) for both the training and the validation split, stopping once the
# validation score has not improved for 30 consecutive rounds.
# `eval_metric` is set on the estimator next to `early_stopping_rounds`:
# passing it to `fit()` is deprecated (and later removed) in the xgboost
# releases that accept `early_stopping_rounds` in the constructor.
xgb_clf = xgb.XGBRegressor(n_estimators=1000,
                           max_depth=9,
                           learning_rate=0.07,
                           subsample=0.75,
                           colsample_bytree=0.75,
                           colsample_bylevel=0.75,
                           early_stopping_rounds=30,
                           eval_metric='rmse',
                           reg_lambda=1.0,
                           random_state=731)

# The last entry of `eval_set` (the validation split) is the one used for
# early stopping; it is reported as `validation_1` in the log and in
# `evals_result()`.
xgb_clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            verbose=True)

# Pick the boosting round with the lowest validation RMSE. `argmin` returns a
# 0-based round index, so the number of trees to keep is that index + 1.
evals_result = xgb_clf.evals_result()
best_iteration = int(np.argmin(evals_result['validation_1']['rmse']))

# Refit on all weeks before 146, i.e. the training and validation ranges
# combined, with the target again on the log1p scale.
train_da_all = df.loc[df.week < 146, :]
X_train, y_train = train_da_all[features], train_da_all.num_orders.map(np.log1p)

# The tuned round count is only meaningful for the configuration it was tuned
# with, so the final model reuses the validated hyperparameters. Previously the
# tuned value was overwritten by a hard-coded 800 and the refit used a
# different max_depth / learning_rate, which made the validation run pointless.
# No early stopping here: there is no held-out split left to monitor.
xgb_clf = xgb.XGBRegressor(n_estimators=best_iteration + 1,
                           max_depth=9,
                           learning_rate=0.07,
                           subsample=0.75,
                           colsample_bytree=0.75,
                           colsample_bylevel=0.75,
                           reg_lambda=1.0,
                           random_state=731)

xgb_clf.fit(X_train, y_train, verbose=True)

# Back-transform the log1p-scale predictions to order counts. Values are
# clipped at zero first so a slightly negative back-transform cannot round to
# -1, then rounded to the nearest integer; a bare `.astype(int)` truncates
# toward zero and therefore biases every forecast low.
predict = np.rint(np.clip(np.expm1(xgb_clf.predict(X_test)), 0, None)).astype(int)

# NOTE(review): the assignment is positional -- it assumes the sample
# submission rows are in the same order as `test_da`; confirm.
sub_df = pd.read_csv('../data/sample_submission_hSlSoT6.csv')
sub_df['num_orders'] = predict
sub_df.to_csv('../sub/xgb.csv', index=False)


[0]	validation_0-rmse:4.24139	validation_1-rmse:4.1704
[1]	validation_0-rmse:3.94949	validation_1-rmse:3.88287
[2]	validation_0-rmse:3.67848	validation_1-rmse:3.61638
[3]	validation_0-rmse:3.42688	validation_1-rmse:3.36871
[4]	validation_0-rmse:3.19288	validation_1-rmse:3.14032
[5]	validation_0-rmse:2.97561	validation_1-rmse:2.92784
[6]	validation_0-rmse:2.77394	validation_1-rmse:2.73069
[7]	validation_0-rmse:2.58679	validation_1-rmse:2.54723
[8]	validation_0-rmse:2.41316	validation_1-rmse:2.37717
[9]	validation_0-rmse:2.25227	validation_1-rmse:2.21905
[10]	validation_0-rmse:2.10306	validation_1-rmse:2.07311
[11]	validation_0-rmse:1.96474	validation_1-rmse:1.93711
[12]	validation_0-rmse:1.83673	validation_1-rmse:1.81107
[13]	validation_0-rmse:1.71828	validation_1-rmse:1.69492
[14]	validation_0-rmse:1.60863	validation_1-rmse:1.58773
[15]	validation_0-rmse:1.50734	validation_1-rmse:1.48928
[16]	validation_0-rmse:1.41387	validation_1-rmse:1.3988
[17]	validation_0-rmse:1.32778	validation_1

KeyboardInterrupt: 

In [3]:
# Rebuild the chronological split (train < 136, validation 136..145,
# prediction set > 145) and the prediction feature matrix from `df`.
train_da = df[df['week'] < 136]
valid_da = df[(df['week'] >= 136) & (df['week'] <= 145)]
test_da = df[df['week'] > 145]
X_test = test_da[features]

In [4]:
# Back-transform the log1p-scale predictions to order counts. Values are
# clipped at zero first so a slightly negative back-transform cannot round to
# -1, then rounded to the nearest integer; a bare `.astype(int)` truncates
# toward zero and therefore biases every forecast low.
# Relies on `xgb_clf` (already fitted) and `X_test` from earlier cells.
predict = np.rint(np.clip(np.expm1(xgb_clf.predict(X_test)), 0, None)).astype(int)

# NOTE(review): the assignment is positional -- it assumes the sample
# submission rows are in the same order as `test_da`; confirm.
sub_df = pd.read_csv('../data/sample_submission_hSlSoT6.csv')
sub_df['num_orders'] = predict
sub_df.to_csv('../sub/xgb.csv', index=False)

In [17]:
# Back-transform the log1p-scale predictions to order counts. Values are
# clipped at zero first so a slightly negative back-transform cannot round to
# -1, then rounded to the nearest integer; a bare `.astype(int)` truncates
# toward zero and therefore biases every forecast low.
# `xgb_clf` must already be fitted when this cell runs; calling `predict` on
# an unfitted estimator raises XGBoostError.
predict = np.rint(np.clip(np.expm1(xgb_clf.predict(X_test)), 0, None)).astype(int)

# NOTE(review): the assignment is positional -- it assumes the sample
# submission rows are in the same order as `test_da`; confirm.
sub_df = pd.read_csv('../data/sample_submission_hSlSoT6.csv')
sub_df['num_orders'] = predict
sub_df.to_csv('../sub/xgb.csv', index=False)

XGBoostError: need to call fit or load_model beforehand

In [15]:
# Summary statistics of the raw (untransformed) target column.
df['num_orders'].describe()

count    456548.000000
mean        261.872760
std         395.922798
min          13.000000
25%          54.000000
50%         136.000000
75%         324.000000
max       24299.000000
Name: num_orders, dtype: float64

In [14]:
# Display the importance scores of the current `xgb_clf` model as a raw array.
# NOTE(review): presumably ordered like `features`, the columns the model was
# fitted on -- confirm before mapping scores back to column names.
xgb_clf.feature_importances_

array([0.02514558, 0.03123346, 0.02991001, 0.0082054 , 0.00502912,
       0.00370566, 0.00873478, 0.00794071, 0.01482266, 0.00291159,
       0.00794071, 0.0354685 , 0.02302806, 0.02779248, 0.07781895,
       0.02276337, 0.02805717, 0.02593965, 0.03308629, 0.00132345,
       0.01482266, 0.03599788, 0.01217575, 0.01429328, 0.01932239,
       0.01985177, 0.01191106, 0.00158814, 0.0108523 , 0.00926416,
       0.03202753, 0.01111699, 0.01694018, 0.00952885, 0.00211752,
       0.00582319, 0.01349921, 0.00582319, 0.00079407, 0.01111699,
       0.00608788, 0.01005823, 0.        , 0.00052938, 0.        ,
       0.00132345, 0.        , 0.00158814, 0.        , 0.00079407,
       0.00370566, 0.00423505, 0.        , 0.00211752, 0.00185283,
       0.01879301, 0.00079407, 0.02011646, 0.08946533, 0.04949709,
       0.03070408, 0.02408682, 0.0219693 , 0.03017469, 0.00132345,
       0.00105876], dtype=float32)