In [25]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')


In [26]:

# Read the three competition files in one pass: training rows, rows to
# predict, and the submission template.
train, test, sample_submit = map(
    pd.read_csv,
    ('dataset/train.csv', 'dataset/test.csv', 'dataset/sample_submit.csv'),
)

# Stack train on top of test into one frame; ignore_index gives the combined
# frame a fresh 0..n-1 row index.
df = pd.concat(objs=[train, test], axis=0, ignore_index=True)

def lag_feature_adv(df, lags, col):
    '''
    Add "value N weeks earlier" lag features for one column.

    For every lag ``i`` in ``lags`` a copy of the ``week``/``shop_id``/
    ``item_id``/``col`` columns is taken, its ``week`` is advanced by ``i``
    and the copy is left-joined back onto ``df``. Each row therefore receives
    the value ``col`` had for the same ``shop_id``/``item_id`` at
    ``week - i`` in a new column named ``<col>_lag_<i>_adv``. Rows without a
    matching earlier row get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``week``, ``shop_id``, ``item_id`` and ``col``.
    lags : iterable of int
        Week offsets to generate; an empty iterable returns ``df`` unchanged.
    col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame with one extra column per lag; the input is not modified.
    '''
    keys = ['week', 'shop_id', 'item_id']
    # Snapshot taken once, before the loop, so every lag is derived from the
    # original values and never from a previously merged lag column.
    history = df[keys + [col]]
    for i in lags:
        lag_col = col + '_lag_' + str(i) + '_adv'
        shifted = history.copy()
        shifted.columns = keys + [lag_col]
        # A value observed in week w becomes the lag-i feature of week w + i.
        shifted['week'] += i
        df = pd.merge(df, shifted, on=keys, how='left')
    return df

# Attach the 1-, 2- and 3-week lagged sales to every row.
df = lag_feature_adv(df, lags=[1, 2, 3], col='weekly_sales')

# Rows with week < 33 form the training set; rows with week == 33 are the
# ones to predict. The target column is removed from both feature frames.
x_train = df.loc[df.week < 33].drop(columns=['weekly_sales'])
y_train = df.loc[df.week < 33, 'weekly_sales']
x_test = df.loc[df.week == 33].drop(columns=['weekly_sales'])

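# Tuning note (translated): this comment originally recorded a change to 'num_leaves': 30, 'max_depth': 3; it is stale, since cv_model below uses 'num_leaves': 25, 'max_depth': 5.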
def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'):
    '''
    Train one LightGBM regressor per fold of a shuffled 5-fold split and
    return out-of-fold and fold-averaged test predictions.

    Parameters
    ----------
    clf : module-like
        Object exposing ``Dataset`` and ``train`` the way the ``lightgbm``
        module does (the notebook passes ``lgb`` itself).
    train_x : pandas.DataFrame
        Training features; rows are selected with ``.iloc``. Must contain the
        columns named in ``categorical_feature`` below.
    train_y : array-like
        Targets aligned with ``train_x`` by position (a pandas Series with
        any index, or a plain array).
    test_x : pandas.DataFrame
        Features to predict with every fold's model.
    clf_name : str
        Prefix used in the printed score summary.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold predictions, one per row of ``train_x``, and the test
        predictions averaged over the folds.
    '''
    folds = 5
    seed = 1024
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields *positional* row numbers. Indexing a pandas Series with
    # them via [] is a label lookup, which is only correct when the index is
    # exactly 0..n-1; a plain array makes the lookup positional in all cases.
    labels = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])

    categorical_feature = ['shop_id','item_id','item_category_id']
    cv_scores = []

    # The hyper-parameters are the same for every fold, so build them once.
    # NOTE(review): LightGBM documents early stopping as unavailable when
    # boosting_type is 'dart' (confirm for the installed version). With this
    # setting each fold is expected to run all 20000 rounds and
    # model.best_iteration does not select a truncated model. Use 'gbdt' or a
    # smaller round count if early stopping is wanted.
    params = {
        'boosting_type': 'dart',
        'objective': 'mse',
        'metric': 'mse',
        'min_child_weight': 5,
        'num_leaves': 25,
        'max_depth':5,
        'lambda_l2': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 6,
        'learning_rate': 0.02,
        'seed': 1024,
        'n_jobs':-1,
        'silent': True,
        'verbose': -1,
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # NOTE(review): the verbose_eval / early_stopping_rounds keywords were
        # removed from train() in LightGBM 4.0 in favour of callbacks; this
        # call presumably targets an older release - confirm before upgrading.
        # A shallow copy keeps the shared params dict untouched between folds.
        model = clf.train(dict(params), train_matrix, 20000, valid_sets=[train_matrix, valid_matrix],
                          categorical_feature = categorical_feature,
                          verbose_eval=500,early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        # Each training row is in exactly one validation fold, so this fills
        # the out-of-fold vector without overlap.
        oof_pred[valid_index] = val_pred
        test_pred_mean += test_pred / kf.n_splits
        cv_scores.append(mean_squared_error(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean

In [27]:
# Fit the cross-validated model: lgb_train receives the out-of-fold
# predictions, lgb_test the fold-averaged predictions for x_test.
lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test)


# Clamp non-positive predictions to zero in one vectorised step. np.where
# keeps the original rule (a value survives only if it is > 0).
# NOTE(review): predictions are written by position, which assumes the rows
# of sample_submit are in the same order as x_test - confirm.
sample_submit['weekly_sales'] = np.where(lgb_test > 0, lgb_test, 0)
sample_submit.to_csv('res_v3_3.csv', index=False)

************************************ 1 ************************************
[500]	training's l2: 2.30105	valid_1's l2: 2.3379
[1000]	training's l2: 1.96008	valid_1's l2: 2.1008
[1500]	training's l2: 1.7435	valid_1's l2: 1.9921
[2000]	training's l2: 1.59889	valid_1's l2: 1.91396
[2500]	training's l2: 1.49772	valid_1's l2: 1.887
[3000]	training's l2: 1.42666	valid_1's l2: 1.87596
[3500]	training's l2: 1.36994	valid_1's l2: 1.86477
[4000]	training's l2: 1.32388	valid_1's l2: 1.86796
[4500]	training's l2: 1.2744	valid_1's l2: 1.86697
[5000]	training's l2: 1.24282	valid_1's l2: 1.86854
[5500]	training's l2: 1.2089	valid_1's l2: 1.86628
[6000]	training's l2: 1.17652	valid_1's l2: 1.87136
[6500]	training's l2: 1.14438	valid_1's l2: 1.86928
[7000]	training's l2: 1.12331	valid_1's l2: 1.87364
[7500]	training's l2: 1.09751	valid_1's l2: 1.87977
[8000]	training's l2: 1.07199	valid_1's l2: 1.88089
[8500]	training's l2: 1.05366	valid_1's l2: 1.87447
[9000]	training's l2: 1.03409	valid_1's l2: 1.882

[13000]	training's l2: 0.927609	valid_1's l2: 1.50071
[13500]	training's l2: 0.915896	valid_1's l2: 1.49496
[14000]	training's l2: 0.906283	valid_1's l2: 1.49183
[14500]	training's l2: 0.895096	valid_1's l2: 1.49079
[15000]	training's l2: 0.884113	valid_1's l2: 1.48798
[15500]	training's l2: 0.876106	valid_1's l2: 1.48419
[16000]	training's l2: 0.867713	valid_1's l2: 1.4857
[16500]	training's l2: 0.857777	valid_1's l2: 1.48841
[17000]	training's l2: 0.849699	valid_1's l2: 1.48617
[17500]	training's l2: 0.841958	valid_1's l2: 1.48734
[18000]	training's l2: 0.83388	valid_1's l2: 1.48858
[18500]	training's l2: 0.826754	valid_1's l2: 1.49129
[19000]	training's l2: 0.819934	valid_1's l2: 1.49079
[19500]	training's l2: 0.811685	valid_1's l2: 1.48756
[20000]	training's l2: 0.804006	valid_1's l2: 1.4873
[1.8643995165558418, 1.946998510445819, 1.9229835499434669, 1.487300300885907]
************************************ 5 ************************************
[500]	training's l2: 2.2594	valid_1's 