In [20]:
import pandas as pd
from tqdm import tqdm
import warnings
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import re

warnings.filterwarnings('ignore')

In [21]:
# Read the calendar table and the training rows, then attach the calendar
# columns to every training row by date (left join, so unmatched dates keep NaN).
wkd = pd.read_csv('data/wkd_v1.csv')
train = pd.read_csv('train_v2.csv').merge(
    wkd, left_on='date', right_on='ORIG_DT', how='left'
)

# Total amount per (date, biz_type, periods) group, broadcast back to each row.
train['amount_hour_sum'] = (
    train.groupby(['date', 'biz_type', 'periods'])['amount'].transform('sum')
)

# 'Day' is the integer after the last '/' of the date string;
# 'd' is the first seven characters of the date string.
train['Day'] = train['date'].map(lambda date_str: int(date_str.rsplit('/', 1)[-1]))
train['d'] = train['date'].map(lambda date_str: date_str[:7])

In [22]:
# Distinct biz_type values found in the training data; later cells iterate over
# biz_list[:-1], so the position of the last entry matters.
biz_list = train['biz_type'].unique()
from sklearn import preprocessing

def get_df_period(x, frame=None, periods_per_day=48):
    """Build the shifted calendar-code features for one biz_type.

    Parameters
    ----------
    x : value compared against the 'biz_type' column.
    frame : DataFrame, optional
        Table to read from. Defaults to the notebook-level ``train`` frame.
        Must contain 'biz_type', 'date', 'periods' and 'WKD_TYP_CD'.
    periods_per_day : int, default 48
        Number of rows WKD_TYP_CD is shifted by to reach the neighbouring block.
        NOTE(review): this assumes rows are ordered by date and period with
        exactly ``periods_per_day`` rows per date — confirm against the data.

    Returns
    -------
    DataFrame
        A copy of the selected rows (one per (date, periods) pair) with the
        added columns 'shift_WKD', 'shift_WKD_', 'WKD_+1', 'WKD_-1' and 'WKD_'.
        The input frame is not modified.
    """
    source = train if frame is None else frame

    # Explicit copy: the columns added below must not be written through a
    # filtered slice of the source frame.
    k = source[source['biz_type'] == x].drop_duplicates(['date', 'periods']).copy()

    k['shift_WKD'] = k['WKD_TYP_CD'].shift(periods_per_day)
    k['shift_WKD_'] = k['WKD_TYP_CD'].shift(-periods_per_day)

    # Fill only the calendar-code columns. A frame-wide fillna('-1') would also
    # write the string '-1' into numeric columns and turn them into object dtype.
    wkd_cols = ['WKD_TYP_CD', 'shift_WKD', 'shift_WKD_']
    k[wkd_cols] = k[wkd_cols].fillna('-1')

    # String concatenations of the current code with its shifted neighbours.
    k['WKD_+1'] = k['WKD_TYP_CD'] + k['shift_WKD']
    k['WKD_-1'] = k['WKD_TYP_CD'] + k['shift_WKD_']
    k['WKD_'] = k['shift_WKD'] + k['WKD_TYP_CD'] + k['shift_WKD_']
    return k

In [23]:
# Build the shifted-calendar frame for every biz_type except the last entry of
# biz_list, then stack the per-type frames row-wise into one table.
df = [get_df_period(biz) for biz in tqdm(biz_list[:-1])]
k = pd.concat(df, axis=0)

100%|██████████████████████████████████████████| 13/13 [00:01<00:00, 10.97it/s]


In [24]:
# Read the test period grid and attach the calendar columns by date.
test_v1_periods = pd.read_csv('test_v2_periods.csv')
test_v1_periods = pd.merge(test_v1_periods, wkd, left_on='date', right_on='ORIG_DT', how='left')

# Explicit copy of the post 'A' rows: the feature columns below are added to
# this frame only, never written through a slice of test_v1_periods.
test_v1_periods_A = test_v1_periods[test_v1_periods['post_id'] == 'A'].copy()

# Calendar code 48 rows earlier / later; rows without such a neighbour get 'SN'.
# NOTE(review): a 48-row shift assumes 48 ordered rows per date — confirm.
test_v1_periods_A['shift_WKD'] = test_v1_periods_A['WKD_TYP_CD'].shift(48).fillna('SN')
test_v1_periods_A['shift_WKD_'] = test_v1_periods_A['WKD_TYP_CD'].shift(-48).fillna('SN')

# String concatenations of the current code with its shifted neighbours.
test_v1_periods_A['WKD_+1'] = test_v1_periods_A['WKD_TYP_CD'] + test_v1_periods_A['shift_WKD']
test_v1_periods_A['WKD_-1'] = test_v1_periods_A['WKD_TYP_CD'] + test_v1_periods_A['shift_WKD_']
test_v1_periods_A['WKD_'] = (
    test_v1_periods_A['shift_WKD']
    + test_v1_periods_A['WKD_TYP_CD']
    + test_v1_periods_A['shift_WKD_']
)

# Integer after the last '/' of the date string.
test_v1_periods_A['Day'] = test_v1_periods_A['date'].apply(lambda x: int(x.split('/')[-1]))

In [25]:
# Train a separate model for each fine-grained biz_type (every entry of biz_list
# except the last one) and predict the test frame test_v1_periods_A with it.
features = ['WKD_TYP_CD', 'shift_WKD', 'periods', 'Day']
cat_cols = ['WKD_TYP_CD', 'shift_WKD', 'periods']

# NOTE(review): the objective is L1 regression while the monitored metric (and
# therefore early stopping) is 'poisson' — confirm this pairing is intentional.
params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'poisson',
    'n_jobs': -1,
    'seed': 2019,
    'verbosity': -1,
}

df_test = []    # one prediction frame per biz_type
mape_list = []  # one validation error per biz_type
for i in tqdm(biz_list[:-1]):
    # Build both masks on k itself and combine them. The previous
    # k[mask_a][mask_b] form applied a mask indexed like k to an already
    # filtered frame and only worked through implicit reindexing.
    is_biz = k['biz_type'] == i
    is_holdout = k['d'] == '2020/11'

    # Rows whose 'd' is '2020/11' are held out for validation; the rest train.
    train1 = k[is_biz & ~is_holdout].copy()
    train1_y = train1['amount_hour_sum']

    valid1 = k[is_biz & is_holdout].copy()
    valid1_y = valid1['amount_hour_sum']

    # Tag a private copy of the test frame instead of writing biz_type into the
    # shared test_v1_periods_A on every iteration.
    test = test_v1_periods_A.copy()
    test['biz_type'] = i

    train1[cat_cols] = train1[cat_cols].astype('category')
    valid1[cat_cols] = valid1[cat_cols].astype('category')
    test[cat_cols] = test[cat_cols].astype('category')

    train_set = lgb.Dataset(train1[features], train1_y)
    val_set = lgb.Dataset(valid1[features], valid1_y)

    # NOTE(review): newer LightGBM releases take early stopping and logging as
    # callbacks rather than keyword arguments — confirm against the installed version.
    model = lgb.train(dict(params), train_set, num_boost_round=5000,
                      valid_sets=(train_set, val_set), early_stopping_rounds=50,
                      verbose_eval=50,
                      categorical_feature=cat_cols
                      )
    oof_train = model.predict(valid1[features])
    test_predict = model.predict(test[features])
    test['amount'] = test_predict
    df_test.append(test)

    # Mean absolute error relative to (target + 1); the +1 keeps the
    # denominator non-zero when the target is 0.
    mape = (abs(valid1_y - oof_train) / (valid1_y + 1)).mean()
    mape_list.append(mape)
    print(mape)

  8%|███▎                                       | 1/13 [00:00<00:06,  1.89it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -0.848714	valid_1's poisson: -0.309527
Early stopping, best iteration is:
[18]	training's poisson: -0.778559	valid_1's poisson: -0.316356
0.15038460158183703


 15%|██████▌                                    | 2/13 [00:00<00:05,  1.97it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -2.52855	valid_1's poisson: -1.2853
Early stopping, best iteration is:
[39]	training's poisson: -2.5229	valid_1's poisson: -1.29344
0.24637955121176336


 23%|█████████▉                                 | 3/13 [00:01<00:04,  2.07it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: 0.452102	valid_1's poisson: 0.449271
Early stopping, best iteration is:
[30]	training's poisson: 0.453475	valid_1's poisson: 0.445186
0.2094217977571942


 31%|█████████████▏                             | 4/13 [00:01<00:04,  2.19it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -23.3711	valid_1's poisson: -3.66495
Early stopping, best iteration is:
[5]	training's poisson: -19.6662	valid_1's poisson: -5.76322
0.18154919905402594


 38%|████████████████▌                          | 5/13 [00:02<00:03,  2.28it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -2.45944	valid_1's poisson: -0.464015
Early stopping, best iteration is:
[10]	training's poisson: -2.1502	valid_1's poisson: -0.642784
0.21509168191949896


 46%|███████████████████▊                       | 6/13 [00:02<00:03,  2.33it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -62.4593	valid_1's poisson: -49.9387
Early stopping, best iteration is:
[20]	training's poisson: -62.1825	valid_1's poisson: -50.0931
0.12887821575107483


 54%|███████████████████████▏                   | 7/13 [00:03<00:02,  2.37it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: 0.2475	valid_1's poisson: 0.095941
Early stopping, best iteration is:
[1]	training's poisson: 0.2475	valid_1's poisson: 0.095941
0.0020833333333333333


 62%|██████████████████████████▍                | 8/13 [00:03<00:02,  2.34it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -302.241	valid_1's poisson: -300.065
[100]	training's poisson: -302.257	valid_1's poisson: -300.055
Early stopping, best iteration is:
[50]	training's poisson: -302.241	valid_1's poisson: -300.065
0.06297867904574153


 69%|█████████████████████████████▊             | 9/13 [00:03<00:01,  2.40it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -5.07593	valid_1's poisson: -1.4297
Early stopping, best iteration is:
[9]	training's poisson: -4.52991	valid_1's poisson: -1.71166
0.18101233153567323


 77%|████████████████████████████████▎         | 10/13 [00:04<00:01,  2.41it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -25.8912	valid_1's poisson: -14.4512
Early stopping, best iteration is:
[13]	training's poisson: -25.2041	valid_1's poisson: -14.8305
0.13682025746834536


 85%|███████████████████████████████████▌      | 11/13 [00:04<00:00,  2.46it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -4.32783	valid_1's poisson: -1.61597
Early stopping, best iteration is:
[11]	training's poisson: -4.05061	valid_1's poisson: -1.82651
0.17006622332394358


 92%|██████████████████████████████████████▊   | 12/13 [00:05<00:00,  2.43it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -1119.11	valid_1's poisson: -984.778
Early stopping, best iteration is:
[23]	training's poisson: -1117.63	valid_1's poisson: -985.665
0.0846546612221229


100%|██████████████████████████████████████████| 13/13 [00:05<00:00,  2.38it/s]

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -129.614	valid_1's poisson: -61.027
Early stopping, best iteration is:
[10]	training's poisson: -125.465	valid_1's poisson: -63.7105
0.16573573054729093





In [26]:
# Unweighted mean of the per-biz_type validation errors collected in mape_list.
sum(mape_list) / len(mape_list)

0.14885048182706503

In [27]:
# Same feature construction as for post 'A', this time for the post 'B' rows.
# The variable keeps the name test_v1_periods_A because the next cell reads it.
# Explicit copy: the columns below are added to this frame only, never written
# through a slice of test_v1_periods.
test_v1_periods_A = test_v1_periods[test_v1_periods['post_id'] == 'B'].copy()

# Calendar code 48 rows earlier / later; rows without such a neighbour get 'SN'.
# NOTE(review): a 48-row shift assumes 48 ordered rows per date — confirm.
test_v1_periods_A['shift_WKD'] = test_v1_periods_A['WKD_TYP_CD'].shift(48).fillna('SN')
test_v1_periods_A['shift_WKD_'] = test_v1_periods_A['WKD_TYP_CD'].shift(-48).fillna('SN')

# String concatenations of the current code with its shifted neighbours.
test_v1_periods_A['WKD_+1'] = test_v1_periods_A['WKD_TYP_CD'] + test_v1_periods_A['shift_WKD']
test_v1_periods_A['WKD_-1'] = test_v1_periods_A['WKD_TYP_CD'] + test_v1_periods_A['shift_WKD_']
test_v1_periods_A['WKD_'] = (
    test_v1_periods_A['shift_WKD']
    + test_v1_periods_A['WKD_TYP_CD']
    + test_v1_periods_A['shift_WKD_']
)

# Integer after the last '/' of the date string.
test_v1_periods_A['Day'] = test_v1_periods_A['date'].apply(lambda x: int(x.split('/')[-1]))

In [28]:
# Train the model for biz_type 'B1' and predict the test frame prepared in
# test_v1_periods_A.
# NOTE(review): this rebinds the notebook-level `k` to the B1-only frame, so the
# per-biz_type training cell cannot be re-run afterwards without rebuilding `k`;
# re-running this cell also appends a second copy of `test` to df_test.
i = 'B1'
k = get_df_period(i)

# Build both masks on k itself and combine them. The previous k[mask_a][mask_b]
# form applied a mask indexed like k to an already filtered frame and only
# worked through implicit reindexing.
is_biz = k['biz_type'] == i
is_holdout = k['d'] == '2020/11'

# Rows whose 'd' is '2020/11' are held out for validation; the rest train.
train1 = k[is_biz & ~is_holdout].copy()
train1_y = train1['amount_hour_sum']

valid1 = k[is_biz & is_holdout].copy()
valid1_y = valid1['amount_hour_sum']

test = test_v1_periods_A.copy()

features = ['WKD_TYP_CD', 'shift_WKD', 'periods', 'Day']
cat_cols = ['WKD_TYP_CD', 'shift_WKD', 'periods']

train1[cat_cols] = train1[cat_cols].astype('category')
valid1[cat_cols] = valid1[cat_cols].astype('category')
test[cat_cols] = test[cat_cols].astype('category')

# NOTE(review): the objective is L1 regression while the monitored metric (and
# therefore early stopping) is 'poisson' — confirm this pairing is intentional.
params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'poisson',
    'n_jobs': -1,
    'seed': 2019,
    'verbosity': -1,
}

train_set = lgb.Dataset(train1[features], train1_y)
val_set = lgb.Dataset(valid1[features], valid1_y)

# NOTE(review): newer LightGBM releases take early stopping and logging as
# callbacks rather than keyword arguments — confirm against the installed version.
model = lgb.train(params, train_set, num_boost_round=5000,
                  valid_sets=(train_set, val_set), early_stopping_rounds=50,
                  verbose_eval=50,
                  categorical_feature=cat_cols
                  )
oof_train = model.predict(valid1[features])
test_predict = model.predict(test[features])
test['amount'] = test_predict
df_test.append(test)

# Mean absolute error relative to (target + 1); the +1 keeps the denominator
# non-zero when the target is 0.
print((abs(valid1_y - oof_train) / (valid1_y + 1)).mean())

Training until validation scores don't improve for 50 rounds
[50]	training's poisson: -253.7	valid_1's poisson: -176.838
Early stopping, best iteration is:
[16]	training's poisson: -251.545	valid_1's poisson: -177.925
0.32619993613737847


In [29]:
# Stack every per-model test prediction frame collected in df_test.
d = pd.concat(df_test, axis=0)
# Total predicted amount over all stacked rows, displayed as the cell output.
sum(d.amount)

564845.7662743723

In [30]:
# Collapse the stacked predictions to one total per (date, post_id, periods),
# then left-join them onto the test grid so the output has one row per grid row.
slot_keys = ['date', 'post_id', 'periods']
slot_totals = d.groupby(slot_keys)['amount'].sum().reset_index()
d = test_v1_periods[slot_keys].merge(slot_totals, on=slot_keys, how='left')

# astype('int') drops the fractional part (no rounding) and requires that no
# grid row is left without a prediction.
d['amount'] = d['amount'].astype('int')
d.to_csv('sub_periods_lgb_hour.txt', index=False)
sum(d.amount)

564288

In [33]:
# scale
# Rescale the per-period amounts so that each (date, post_id) total matches the
# amount read from the daily submission file.
data_day = pd.read_csv('sub_day_lgb_all_666.txt')
data_day['date'] = pd.to_datetime(data_day['date'])

test_ = d.copy()
test_['date'] = pd.to_datetime(test_['date'])

# Current per-period total for each (date, post_id).
test_['day_sum'] = test_.groupby(['date', 'post_id'])['amount'].transform('sum')

# After the join, 'amount_x' is the per-period value and 'amount_y' the value
# from the daily file (pandas suffixes for the clashing 'amount' column).
test_ = test_.merge(data_day, on=['date', 'post_id'], how='left')

# ratio = period total / daily value; dividing by it rescales each period.
test_['ratio'] = test_['day_sum'].div(test_['amount_y'])
scaled_amount = test_['amount_x'].div(test_['ratio'])

# Undefined results (NaN) become 0 before the fractional part is dropped.
test_['amount'] = scaled_amount.fillna(0)
test_['amount'] = test_['amount'].astype('int')

out_cols = ['date', 'post_id', 'periods', 'amount']
test_[out_cols].to_csv('sub_periods_scale_lgb.txt', index=False)

In [34]:
# Total of the 'amount' column currently held in test_.
test_['amount'].sum()

587328

In [35]:
# ensemble
arima = pd.read_csv('sub_periods_scale_arima.txt')
test_['amount'] = (test_['amount'] + arima['amount']) // 2
test_[['date', 'post_id', 'periods', 'amount' ]].to_csv('sub_periods_scale_combine.txt', index=False)
test_['amount'].sum()