In [1]:
import pandas as pd
from tqdm import tqdm
import warnings
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import re

# Silence every warning for the rest of the session. Note that this is a
# blanket filter: warnings raised by pandas or by model fitting in later
# cells are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Load the calendar table and the raw training rows, then attach the calendar
# columns to each training row by matching train.date to wkd.ORIG_DT.
wkd = pd.read_csv('data/wkd_v1.csv')
train = pd.read_csv('train_v2.csv').merge(wkd, left_on='date', right_on='ORIG_DT', how='left')

# Total `amount` per (date, post_id, periods); every row of a group carries
# the same total, so afterwards one row per key is enough.
train['amount_hour_sum'] = train.groupby(['date', 'post_id', 'periods'])['amount'].transform('sum')
train = train.drop_duplicates(['date', 'post_id', 'periods'])

# Rows of post 'A' with a fresh 0..n-1 index. reset_index is used without
# inplace=True so train_A is a new frame rather than a filtered slice of
# `train` that is later mutated (a column is added to train_A further down).
train_A = train[train['post_id'] == 'A'].reset_index(drop=True)

In [3]:
from sklearn import preprocessing
from statsmodels.tsa.arima_model import ARIMA
# Integer-encode the WKD_TYP_CD column of post A; the fitted encoder `le` is
# reused later to encode the matching test rows.
le = preprocessing.LabelEncoder()
le.fit(train_A['WKD_TYP_CD'])
train_A['kk'] = le.transform(train_A['WKD_TYP_CD'])

In [4]:
# Test grid joined with the calendar table (test date matched to wkd.ORIG_DT),
# plus the subset of rows that belongs to post 'A'.
test_v1_periods = (
    pd.read_csv('test_v2_periods.csv')
    .merge(wkd, left_on='date', right_on='ORIG_DT', how='left')
)
test_v1_periods_A = test_v1_periods.loc[test_v1_periods['post_id'] == 'A']

In [5]:
# Find the ARIMA orders p and q that give the lowest BIC
def get_p_q(k, pmax=7, qmax=7):
    """Grid-search the ARMA orders ``(p, q)`` with the smallest BIC.

    For every ``0 <= p <= pmax`` and ``0 <= q <= qmax`` this fits
    ``ARIMA(k.amount_hour_sum, (p, 0, q), exog=k.kk)`` and records the ``bic``
    of the fitted result.

    Parameters
    ----------
    k : object with ``amount_hour_sum`` and ``kk`` attributes
        The series to model and its exogenous regressor (in this notebook a
        DataFrame holding one ``periods`` slot of the training data).
    pmax, qmax : int, default 7
        Largest AR / MA order to try (inclusive).

    Returns
    -------
    tuple of (int, int)
        The ``(p, q)`` pair with the lowest finite BIC. On ties the pair that
        comes first in row-major order (smallest p, then smallest q) wins.

    Raises
    ------
    ValueError
        If no candidate produced a finite BIC.
    """
    # NaN marks "no usable fit". A float grid stays numeric even when a whole
    # row or column of candidates fails, which a list-of-lists holding None
    # does not guarantee once it is turned into a DataFrame.
    bic_grid = np.full((pmax + 1, qmax + 1), np.nan)

    # Evaluate the BIC of each candidate in turn
    for p in range(pmax + 1):
        for q in range(qmax + 1):
            try:
                bic = float(ARIMA(k.amount_hour_sum, (p, 0, q), exog=k.kk).fit().bic)
            except Exception:
                # Best-effort search: a candidate that cannot be fitted is
                # simply left out of the comparison.
                continue
            if np.isfinite(bic):
                bic_grid[p, q] = bic

    if np.isnan(bic_grid).all():
        raise ValueError(
            'no ARIMA(p, 0, q) candidate could be fitted for p <= %d, q <= %d' % (pmax, qmax)
        )

    best_p, best_q = np.unravel_index(np.nanargmin(bic_grid), bic_grid.shape)
    return int(best_p), int(best_q)

In [7]:
# Model each `periods` slot of post A as its own time series and forecast the
# matching test rows. test_df collects one forecast frame per slot.
test_df = []
for i in range(17, 37):
    # Non-inplace reset_index yields a fresh frame with a 0..n-1 index, so the
    # positional cut below does not operate on a slice of train_A.
    k = train_A[train_A['periods'] == i].reset_index(drop=True)
    # Remove a stretch of anomalous data: positions 753..799 of this slot
    k = pd.concat([k[:753], k[800:]])
    p, q = get_p_q(k)
    model = ARIMA(k.amount_hour_sum, order=(p, 0, q), exog=k.kk).fit()

    # Copy before adding columns so nothing is written into a slice of
    # test_v1_periods_A.
    test = test_v1_periods_A[test_v1_periods_A['periods'] == i].copy()
    test['kk'] = le.transform(test.WKD_TYP_CD)
    # One forecast step per test row (rather than a fixed 31), so the horizon
    # always matches the length of the exogenous series passed alongside it.
    t = model.forecast(len(test), exog=test.kk)
    # t[0] is taken as the point forecasts and assigned positionally.
    # NOTE(review): this assumes the rows of `test` are in chronological
    # order - confirm against the test file.
    test['a'] = t[0]
    test_df.append(test[['date', 'post_id', 'periods', 'a']])

























































































In [8]:
# Rows of post 'B' with a fresh 0..n-1 index. reset_index is used without
# inplace=True so train_B is a new frame rather than a filtered slice of
# `train` that then gets a column added to it.
train_B = train[train['post_id'] == 'B'].reset_index(drop=True)
test_v1_periods_B = test_v1_periods[test_v1_periods['post_id'] == 'B']

# `le` is rebound here: from this point on it is the encoder fitted on
# post B's WKD_TYP_CD values.
le = preprocessing.LabelEncoder()
train_B['kk'] = le.fit_transform(train_B.WKD_TYP_CD)

In [9]:
# Same per-slot ARIMA procedure for post B. The forecasts are appended to
# test_df, which must already exist (it is created by the post A loop).
for i in range(17, 38):
    k = train_B[train_B['periods'] == i]
    p, q = get_p_q(k)
    model = ARIMA(k.amount_hour_sum, order=(p, 0, q), exog=k.kk).fit()
    # Mean absolute in-sample residual, printed as a rough fit diagnostic
    mae = np.mean(np.abs(model.resid))
    print(mae)
    # Copy before adding columns so nothing is written into a slice of
    # test_v1_periods_B.
    test = test_v1_periods_B[test_v1_periods_B['periods'] == i].copy()
    test['kk'] = le.transform(test.WKD_TYP_CD)
    # One forecast step per test row (rather than a fixed 31), so the horizon
    # always matches the length of the exogenous series passed alongside it.
    t = model.forecast(len(test), exog=test.kk)
    # t[0] is taken as the point forecasts and assigned positionally.
    # NOTE(review): this assumes the rows of `test` are in chronological
    # order - confirm against the test file.
    test['a'] = t[0]
    test_df.append(test[['date', 'post_id', 'periods', 'a']])





11.914738463245916








13.130239091190205






29.288988432323485








46.04393572048259






44.73567477221934






42.36681227527074








42.66662487830386








36.91941772189971






31.054860583907725






18.436232462707252






16.03804588102499






33.91267811083063






52.51175027482186








53.50302495698986








55.606672236042655








56.173833720123746








56.730765796738936






60.37812580947395






53.43258533357129






32.74169788339434






23.866894738514656


In [10]:
# Stack the per-slot forecast frames of both posts row-wise, then show the
# largest forecast value as a quick sanity check.
d = pd.concat(test_df)
max(d.a)

1499.6618493804986

In [11]:
# Re-attach the forecasts to the full test grid; grid rows without a forecast
# end up with NaN in `a`.
d = (
    test_v1_periods[['date', 'post_id', 'periods']]
    .merge(d, on=['date', 'post_id', 'periods'], how='left')
    .rename(columns={'a': 'amount'})
)
# Missing forecasts become 0, values are cast to int, and any negative
# amounts are raised to 0.
d['amount'] = d['amount'].fillna(0).astype('int').clip(lower=0)

In [13]:
# Grand total of the unscaled per-period amounts
d['amount'].sum()

562212

In [16]:
# scale
# Rescale the per-period amounts so that, within each (date, post_id), they
# follow the `amount` figure found in sub_day_lgb_all_666.txt
# (presumably a day-level prediction - confirm the file's provenance).
data_day = pd.read_csv('sub_day_lgb_all_666.txt')
data_day['date'] = pd.to_datetime(data_day['date'])

# Parse the dates on this side too, so both merge keys are datetimes.
test_ = d.assign(date=pd.to_datetime(d['date']))
test_['day_sum'] = test_.groupby(['date', 'post_id'])['amount'].transform('sum')

# After the merge, `amount_x` is the per-period amount from `d` and
# `amount_y` the value from data_day. ratio = day_sum / amount_y; dividing by
# it rescales each period. NaN results are written as 0 before the int cast.
test_ = test_.merge(data_day, on=['date', 'post_id'], how='left').assign(
    ratio=lambda f: f['day_sum'] / f['amount_y'],
    amount=lambda f: (f['amount_x'] / f['ratio']).fillna(0).astype('int'),
)
test_[['date', 'post_id', 'periods', 'amount']].to_csv('sub_periods_scale_arima.txt', index=False)

In [17]:
# Grand total of the scaled per-period amounts
test_['amount'].sum()

587353