In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Hourly flow dataset covering 20 districts (01-20); the district coding is
# consistent across all tables.
data_hour = pd.read_csv('./data/hourly_dataset.csv')
# Sample submission file.
sub       = pd.read_csv('./data/sample_submission.csv')
# Test set (hourly). The submission must contain the water supply of 20 districts
# for 4 non-consecutive weeks, i.e. a 672 (hours) x 20 (districts) matrix.
test      = pd.read_csv('./data/test_public.csv')
# Shenzhen weather data; the part covering the test periods is assumed unknown.
weather   = pd.read_csv('./data/weather.csv')

def time_transfer(df):
    """Return a copy of ``df`` with ``time`` parsed as datetimes, sorted by time.

    The input frame is not modified: the parsed column is written into a new
    frame (``DataFrame.assign``) instead of being assigned back onto the
    caller's object, so re-running the calling cell is idempotent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order, ``time`` converted
        to datetimes, rows ordered by ascending ``time``. The original index
        labels are kept (no ``reset_index``).
    """
    parsed = df.assign(time=pd.to_datetime(df['time']))
    return parsed.sort_values(by=['time'])

# Parse the timestamps and put both tables into chronological order.
data_hour = time_transfer(data_hour)
weather = time_transfer(weather)
# Weather merge is currently disabled (weather is loaded but not joined):
# data_hour = pd.merge(data_hour,weather,on=['time','train or test'],how='left')
# Clamp negative flow readings to zero. A single .loc call is used instead of
# chained indexing (df[col][mask] = 0), which may write to a temporary copy and
# is silently ignored when pandas Copy-on-Write is active. NaN readings compare
# False against 0 and are therefore left as NaN.
for i in range(20):
    data_hour.loc[data_hour[f'flow_{i+1}'] < 0, f'flow_{i+1}'] = 0

In [4]:
# Collect the first and the last timestamp of every 'train or test' group of the
# test table and sort them into one ascending list. The slicing below treats
# consecutive pairs of that list as (start, end) of a test period.
_times_by_period = test.groupby('train or test')['time']
test_list1 = _times_by_period.first().reset_index()['time'].values.tolist()
test_list2 = _times_by_period.last().reset_index()['time'].values.tolist()
test_list1 = sorted(test_list1 + test_list2)

_when = data_hour['time']


def _rows_before(limit):
    """Rows of data_hour strictly earlier than ``limit``, re-indexed from 0."""
    return data_hour.loc[_when < limit].reset_index(drop=True)


def _rows_strictly_between(lower, upper):
    """Rows of data_hour with lower < time < upper, re-indexed from 0."""
    return data_hour.loc[(_when > lower) & (_when < upper)].reset_index(drop=True)


def _rows_within(lower, upper):
    """Rows of data_hour with lower <= time <= upper, re-indexed from 0."""
    return data_hour.loc[(_when >= lower) & (_when <= upper)].reset_index(drop=True)


# train1 / test1: everything before the first test period, then the period itself
train1_ = _rows_before(test_list1[0])
test1_  = _rows_within(test_list1[0], test_list1[1])
# train2 / test2
train2_ = _rows_strictly_between(test_list1[1], test_list1[2])
test2_  = _rows_within(test_list1[2], test_list1[3])
# train3 / test3
train3_ = _rows_strictly_between(test_list1[3], test_list1[4])
test3_  = _rows_within(test_list1[4], test_list1[5])
# train4 / test4
train4_ = _rows_strictly_between(test_list1[5], test_list1[6])
test4_  = _rows_within(test_list1[6], test_list1[7])

In [5]:
def preprocess_single_flow(train, flow):
    """Fill the missing values of one flow column in place and return the column.

    Steps:

    1. A missing first value is replaced by the column mean.
    2. A missing last value is replaced by 0, so that every run of missing
       values is bounded by a present value on both sides.
    3. For every run of missing values, the reading just before the run is
       divided by ``run length + 1`` and that quotient is written over the
       reading and the whole run (the pre-gap value is spread evenly, so the
       sum over those rows is unchanged).

    All writes go through ``train.loc[rows, flow]``; chained indexing such as
    ``train[flow].loc[a:b] = v`` may write to a temporary copy and is ignored
    under pandas Copy-on-Write.

    Parameters
    ----------
    train : pandas.DataFrame
        Modified in place. Rows are addressed by the labels ``0 .. len(train)-1``
        (the callers in this notebook pass frames after ``reset_index(drop=True)``).
    flow : str
        Name of the numeric column to repair.

    Returns
    -------
    pandas.Series
        ``train[flow]`` after filling. An empty frame is returned unchanged.
    """
    if len(train) == 0:
        return train[flow]

    last = len(train) - 1
    if np.isnan(train.loc[0, flow]):
        train.loc[0, flow] = train[flow].mean()
    # If the last value is NaN, mark it as present so the final gap is closed.
    if np.isnan(train.loc[last, flow]):
        train.loc[last, flow] = 0

    # 1 where the value is missing, 0 otherwise; a +1 step marks the first row
    # of a gap and a -1 step marks the first present row after it.
    step = train[flow].isnull().astype(int).diff()
    begin_index = step[step == 1].index.tolist()
    end_index = step[step == -1].index.tolist()

    for begin, end in zip(begin_index, end_index):
        # Spread the pre-gap value evenly over itself and the missing rows.
        fill_null = train.loc[begin - 1, flow] / (end - begin + 1)
        train.loc[begin - 1:end - 1, flow] = fill_null
    return train[flow]

def preprocess_all_flow(train):
    """Run ``preprocess_single_flow`` on the columns ``flow_1`` .. ``flow_20``.

    Each repaired column is assigned back onto ``train``, and the same frame
    object is returned.
    """
    flow_columns = [f'flow_{number}' for number in range(1, 21)]
    for column in flow_columns:
        train[column] = preprocess_single_flow(train, column)
    return train

# Fill the gaps of every flow column in each of the four training segments.
train1_, train2_, train3_, train4_ = [
    preprocess_all_flow(segment)
    for segment in (train1_, train2_, train3_, train4_)
]

In [6]:
def make_diff_features(df, flow):
    """Append three lag/difference columns derived from ``df[flow]`` and return ``df``.

    Columns added to ``df`` (in this order):

    * ``{flow}_simple_diff1``   - difference to the previous row
    * ``{flow}_simple_shift24`` - value 24 rows earlier
    * ``{flow}_simple_diff168`` - difference to the value 7*24 rows earlier

    Rows without enough history are NaN. ``df`` is modified in place.
    """
    one_week = 7 * 24
    values = df[flow]
    derived = {
        f"{flow}_simple_diff1": values.diff(),
        f"{flow}_simple_shift24": values.shift(24),
        f"{flow}_simple_diff168": values.diff(one_week),
    }
    for column_name, column in derived.items():
        df[column_name] = column
    return df

def make_trigonometric_features(df, flow, n=24):
    """Add sine/cosine encodings of ``df[flow]`` and return just those two columns.

    The angle is ``2*pi/n * df[flow]``, i.e. it is computed from the values of
    the ``flow`` column itself; results are rounded to 6 decimals.
    NOTE(review): the angle uses the flow value, not the hour of day - confirm
    that this is the intended feature.

    The two columns are selected by name. Selecting "the last two columns" by
    position returns the wrong columns whenever ``{flow}_sin`` / ``{flow}_cos``
    already exist in ``df`` and are not its final columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place: ``{flow}_sin`` and ``{flow}_cos`` are added or overwritten.
    flow : str
        Name of the source column.
    n : int or float, default 24
        Period used in the angle.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns ``{flow}_sin`` and ``{flow}_cos``.
    """
    sin_col, cos_col = f'{flow}_sin', f'{flow}_cos'
    angle = 2*np.pi/n*df[flow]
    df[sin_col] = np.sin(angle).round(6)
    df[cos_col] = np.cos(angle).round(6)
    return df[[sin_col, cos_col]]

def make_feas(data):
    """Build the per-flow feature tensor for the columns ``flow_1`` .. ``flow_20``.

    For every flow column the features are, in this order: the flow itself and
    the three columns from ``make_diff_features`` (NaNs linearly interpolated,
    leading NaNs back-filled), followed by the two columns returned by
    ``make_trigonometric_features``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``time``, ``train or test`` and ``flow_1`` .. ``flow_20``.
        The frame is not modified (the two non-flow columns are dropped on a copy).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), 20, 6)``: rows x flows x features.
    """
    data = data.drop(['time', 'train or test'], axis=1)
    cur1, cur2 = [], []
    for flow in tqdm(range(20)):
        flow_col = f'flow_{flow+1}'
        # Explicit copy: the helpers below add columns to this frame in place.
        train_data_x = data.loc[:, [flow_col]].copy()  # ,'R', 'fx', 'T', 'U', 'fs', 'V', 'P']])
        diff_frame = make_diff_features(train_data_x, flow_col)
        # .bfill() replaces fillna(method='bfill'), whose `method` argument is
        # deprecated in pandas 2.x and removed in later releases.
        cur1.append(diff_frame.interpolate().bfill())
        # Called after the diff features so that the sin/cos columns are appended
        # behind them on the shared frame.
        cur2.append(make_trigonometric_features(train_data_x, flow_col, n=24))
    # (flows, rows, features) -> (rows, flows, features)
    cur1 = np.stack(cur1, axis=0).transpose(1, 0, 2)
    cur2 = np.stack(cur2, axis=0).transpose(1, 0, 2)
    return np.concatenate((cur1, cur2), axis=2)

def generate_dataset(data, seq_len, pre_len, split_ratio=0.8):
    """Cut a feature array into (input window, target window) pairs.

    ``data`` is split along its first axis into a leading training part
    (``int(len(data) * split_ratio)`` rows) and a trailing validation part.
    Inside each part, windows start every ``seq_len`` rows; each sample pairs
    ``seq_len`` rows of input with the ``pre_len`` rows that follow them.

    The range end includes the last start position at which a full
    input+target window still fits (``len(part) - seq_len - pre_len``); using
    that value as an exclusive stop dropped the final window whenever it fit
    exactly.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis is the row (time step) axis.
    seq_len : int
        Number of rows in each input window; also the stride between windows.
    pre_len : int
        Number of rows in each target window.
    split_ratio : float, default 0.8
        Fraction of rows assigned to the training part.

    Returns
    -------
    tuple
        ``(train_x, train_y, val_x, val_y, test_x)`` as numpy arrays. A part
        too short for a single window yields empty arrays of shape ``(0,)``.
        ``test_x`` is the last ``pre_len`` rows of ``data`` with a leading
        axis of length 1.
    """
    def _windows(part):
        """Return the lists of input and target windows cut from ``part``."""
        xs, ys = [], []
        last_start = len(part) - seq_len - pre_len
        for start in range(0, last_start + 1, seq_len):
            xs.append(part[start:start + seq_len])
            ys.append(part[start + seq_len:start + seq_len + pre_len])
        return xs, ys

    test_x = np.expand_dims(data[-pre_len:], axis=0)
    split_size = int(len(data) * split_ratio)
    train_x, train_y = _windows(data[:split_size])
    val_x, val_y = _windows(data[split_size:])
    train_x, train_y, val_x, val_y = np.array(train_x), np.array(train_y), np.array(val_x), np.array(val_y)
    return train_x, train_y, val_x, val_y, test_x
# Build the features: a uses the first training segment only, b the first two,
# c the first three and d all four segments, concatenated in order.
_segments = [train1_, train2_, train3_, train4_]
a, b, c, d = [make_feas(pd.concat(_segments[:count])) for count in range(1, 5)]
print(a.shape, b.shape, c.shape, d.shape)
# Build the x / y windows: one week (24*7 rows) of input and one week of target.
_week = 24*7
train1_x, train1_y, val1_x, val1_y, test1_x = generate_dataset(a, _week, _week)
train2_x, train2_y, val2_x, val2_y, test2_x = generate_dataset(b, _week, _week)
train3_x, train3_y, val3_x, val3_y, test3_x = generate_dataset(c, _week, _week)
train4_x, train4_y, val4_x, val4_y, test4_x = generate_dataset(d, _week, _week)

100%|██████████| 20/20 [00:00<00:00, 350.88it/s]
100%|██████████| 20/20 [00:00<00:00, 384.61it/s]
100%|██████████| 20/20 [00:00<00:00, 363.63it/s]
100%|██████████| 20/20 [00:00<00:00, 322.53it/s]

(2880, 20, 6) (3456, 20, 6) (4488, 20, 6) (5064, 20, 6)





In [7]:
def lightgbm_train(train_x, train_y, val_x, val_y, test_x):
    """Train one LightGBM regressor per flow and predict the test rows.

    Every input array is indexed as ``[sample, row, flow, feature]``. For each of
    the 20 flows, the sample and row axes are flattened so that every row becomes
    one training example; the target is feature column 0 of the corresponding
    ``*_y`` row.

    Early stopping is passed as a callback: the ``early_stopping_rounds`` and
    ``verbose_eval`` keyword arguments of ``lgb.train`` were removed in
    LightGBM 4.x, while ``callbacks=[lgb.early_stopping(...)]`` works in both
    3.x and 4.x.

    Parameters
    ----------
    train_x, train_y, val_x, val_y : numpy.ndarray
        4-D arrays ``[sample, row, flow, feature]`` for training and validation.
    test_x : numpy.ndarray
        4-D array with the same layout holding the rows to predict.

    Returns
    -------
    predictions : list of numpy.ndarray
        One 1-D prediction array per flow.
    scores : list of float
        Mean squared error of each model on its own training rows, rounded to
        3 decimals. NOTE: this is a training-set score, not a validation score.
    """
    fea_nums = train_x.shape[-1]
    scores = []
    predictions = []
    for flow in tqdm(range(20)):
        # Flatten (sample, row) into one example axis; column 0 of y is the target.
        train_data_x = pd.DataFrame(train_x[:, :, flow, :].reshape(-1, fea_nums))
        train_data_y = pd.DataFrame(train_y[:, :, flow, :].reshape(-1, fea_nums)).iloc[:, 0]
        val_data_x   = pd.DataFrame(val_x[:, :, flow, :].reshape(-1, fea_nums))
        val_data_y   = pd.DataFrame(val_y[:, :, flow, :].reshape(-1, fea_nums)).iloc[:, 0]
        test_data_x  = pd.DataFrame(test_x[:, :, flow, :].reshape(-1, fea_nums))

        train_part = lgb.Dataset(train_data_x, train_data_y)
        val_part = lgb.Dataset(val_data_x, val_data_y)
        ESR = 100   # early-stopping patience, in boosting rounds
        NBR = 3000  # maximum number of boosting rounds
        lgb_params_best = {'objective': 'regression',
                           'metric': ['mse'],
                           'bagging_seed': 2022,
                           'verbose': -1}
        # A fresh callback per model: the early-stopping callback keeps state.
        lgb_model = lgb.train(lgb_params_best, train_part, num_boost_round=NBR,
                              valid_sets=[train_part, val_part],
                              valid_names=['train', 'valid'],
                              callbacks=[lgb.early_stopping(ESR, verbose=False)])
        score = mean_squared_error(train_data_y, lgb_model.predict(train_data_x))
        scores.append(round(score, 3))
        prediction_test = lgb_model.predict(test_data_x)
        predictions.append(prediction_test)
    return predictions, scores

In [8]:
# Train and predict once per test period, printing the per-flow scores of each run.
_period_inputs = [
    (train1_x, train1_y, val1_x, val1_y, test1_x),
    (train2_x, train2_y, val2_x, val2_y, test2_x),
    (train3_x, train3_y, val3_x, val3_y, test3_x),
    (train4_x, train4_y, val4_x, val4_y, test4_x),
]
_period_outputs = []
for _inputs in _period_inputs:
    _period_outputs.append(lightgbm_train(*_inputs))
    print(_period_outputs[-1][1])
((predictions1, scores1),
 (predictions2, scores2),
 (predictions3, scores3),
 (predictions4, scores4)) = _period_outputs

100%|██████████| 20/20 [00:02<00:00,  7.45it/s]


[88.223, 6.365, 53.366, 36.359, 1.433, 95.098, 5.015, 0.951, 1.032, 0.679, 2.662, 0.618, 1.12, 1.351, 0.456, 5.198, 12.195, 1.682, 0.562, 0.498]


100%|██████████| 20/20 [00:05<00:00,  3.38it/s]


[56.331, 6.9, 7.199, 24.489, 0.718, 74.156, 5.522, 0.978, 0.939, 0.603, 2.258, 0.563, 0.934, 1.078, 0.364, 3.91, 10.423, 1.584, 0.505, 0.522]


100%|██████████| 20/20 [00:05<00:00,  3.86it/s]


[93.212, 6.422, 204.136, 62.252, 1.009, 114.573, 5.75, 0.786, 0.993, 0.812, 1.907, 0.94, 0.874, 0.954, 0.33, 3.134, 8.204, 2.049, 0.654, 0.572]


100%|██████████| 20/20 [00:05<00:00,  3.92it/s]

[96.749, 7.118, 223.561, 51.874, 1.066, 108.037, 5.766, 0.733, 1.019, 0.823, 1.786, 1.218, 0.673, 1.126, 0.32, 2.848, 7.793, 2.198, 0.76, 0.639]





In [9]:
# Turn each period's list of per-flow predictions into a (rows, flows) block
# and stack the four periods on top of each other.
_period_blocks = [
    np.vstack(period_predictions).T
    for period_predictions in (predictions1, predictions2, predictions3, predictions4)
]
result = np.concatenate(_period_blocks)
# Negative predictions are replaced by zero.
result = np.where(result < 0, 0, result)
# Put the submission timestamps in front and reuse the sample file's header.
_flow_frame = pd.DataFrame(result)
result = pd.concat([sub['time'], _flow_frame], axis=1)
result.columns = sub.columns
result.to_csv('./lgb_baseline.csv', index=False, encoding='utf-8')
result

Unnamed: 0,time,flow_1,flow_2,flow_3,flow_4,flow_5,flow_6,flow_7,flow_8,flow_9,...,flow_11,flow_12,flow_13,flow_14,flow_15,flow_16,flow_17,flow_18,flow_19,flow_20
0,2022-05-01 01:00:00,27.709220,10.705443,43.317919,27.647086,2.579005,40.045006,5.340375,1.321228,2.316115,...,3.084011,2.037162,1.606722,1.895501,1.143209,2.900139,4.742417,2.839885,1.376312,1.358427
1,2022-05-01 02:00:00,25.134299,5.711133,35.403256,18.992146,2.276685,27.234427,2.668235,0.670767,1.343225,...,1.840070,1.783863,0.772202,0.842041,0.998554,1.668568,2.140632,2.414376,0.825254,0.412802
2,2022-05-01 03:00:00,18.949712,4.465108,33.047575,16.348291,2.138523,26.524499,2.533850,0.507747,1.309367,...,1.361191,1.825984,0.612063,0.435068,0.485229,1.142895,1.492439,2.617639,0.783921,0.412802
3,2022-05-01 04:00:00,18.661581,3.313725,30.374982,15.938176,1.995615,24.620727,2.533850,0.513311,1.297070,...,1.064824,1.772614,0.564960,0.399965,0.466419,1.033294,1.492439,2.201366,0.822387,0.322086
4,2022-05-01 05:00:00,21.679588,4.078497,33.047575,18.869687,2.132076,32.260014,2.626242,0.627923,1.547474,...,1.311306,1.184882,0.596069,0.608485,0.498242,1.159635,1.581138,1.603479,1.093043,0.922613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-08-27 20:00:00,71.449656,28.298622,96.583382,66.537538,8.342530,115.946775,24.106232,3.230318,10.744994,...,7.722372,4.721157,5.258641,4.858496,3.252256,8.446310,13.474473,15.572347,3.840912,4.052091
668,2022-08-27 21:00:00,57.224577,40.794165,111.646904,70.889847,10.759275,148.211942,33.952243,4.164289,15.144793,...,11.341850,5.777615,5.814295,7.223728,4.556885,11.217212,16.922572,15.587184,5.776395,4.168951
669,2022-08-27 22:00:00,76.290744,42.293585,112.925219,81.616457,11.170417,153.276739,38.007980,4.869408,17.007439,...,14.575433,6.943302,6.845609,7.641990,4.087467,11.337842,20.373526,15.676606,6.915049,4.891003
670,2022-08-27 23:00:00,61.388159,33.177919,94.710262,92.300995,7.314822,115.783065,25.701638,3.911168,11.618414,...,9.778403,5.653574,6.649765,7.192107,4.550628,12.490603,15.652188,15.676606,6.837656,5.139643
