In [9]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
data_hour = pd.read_csv('./data/hourly_dataset.csv') # hourly-interval flow dataset with 20 districts (01-20); district codes are consistent across all tables
sub       = pd.read_csv('./data/sample_submission.csv') # sample submission file
test      = pd.read_csv('./data/test_public.csv') # test set (hourly); the submission must cover the water supply of 20 districts over 4 non-consecutive weeks, i.e. a 672 (hours) x 20 (districts) matrix
weather   = pd.read_csv('./data/weather.csv') # Shenzhen weather data; values for the test periods are assumed unknown

def time_transfer(df):
    """Parse the ``time`` column to datetimes and return the rows sorted by it.

    The frame passed in is not modified; a new frame is returned. Index
    labels are kept as they were (no reset), only the row order changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with ``time`` converted to datetime, sorted ascending by
        ``time``. Column order is unchanged.
    """
    # ``assign`` builds a new frame, so the caller's object keeps its original
    # (unparsed) ``time`` column instead of being mutated as a side effect.
    parsed = df.assign(time=pd.to_datetime(df['time']))
    return parsed.sort_values(by=['time'])

# Parse timestamps and put both tables in chronological order.
data_hour = time_transfer(data_hour)
weather = time_transfer(weather)
# Left join: every hourly flow row is kept; weather columns are attached where
# both 'time' and 'train or test' match.
data_hour = pd.merge(data_hour,weather,on=['time','train or test'],how='left')
# Replace negative flow readings with 0. A single .loc call is used instead of
# chained indexing (frame[col][mask] = 0), which may assign to a temporary copy
# and is a silent no-op under pandas Copy-on-Write.
for i in range(20):
    flow_col = f'flow_{i+1}'
    data_hour.loc[data_hour[flow_col] < 0, flow_col] = 0

In [3]:
# First and last timestamp of every test period (grouped by the
# 'train or test' label), merged into one sorted list of boundaries:
# [start1, end1, start2, end2, ...] as long as the periods do not overlap.
test_time_by_period = test.groupby('train or test')['time']
test_list1 = test_time_by_period.first().reset_index()['time'].values.tolist()
test_list2 = test_time_by_period.last().reset_index()['time'].values.tolist()
test_list1.extend(test_list2)
test_list1.sort()

hour_time = data_hour['time']


def _rows_where(mask):
    """Rows of ``data_hour`` selected by a boolean mask, re-indexed from 0."""
    return data_hour[mask].reset_index(drop=True)


# Period 1: everything before the first test window / the window itself.
train1_ = _rows_where(hour_time < test_list1[0])
test1_ = _rows_where((hour_time >= test_list1[0]) & (hour_time <= test_list1[1]))
# Period 2: rows strictly between test window 1 and 2 / test window 2.
train2_ = _rows_where((hour_time > test_list1[1]) & (hour_time < test_list1[2]))
test2_ = _rows_where((hour_time >= test_list1[2]) & (hour_time <= test_list1[3]))
# Period 3: rows strictly between test window 2 and 3 / test window 3.
train3_ = _rows_where((hour_time > test_list1[3]) & (hour_time < test_list1[4]))
test3_ = _rows_where((hour_time >= test_list1[4]) & (hour_time <= test_list1[5]))
# Period 4: rows strictly between test window 3 and 4 / test window 4.
train4_ = _rows_where((hour_time > test_list1[5]) & (hour_time < test_list1[6]))
test4_ = _rows_where((hour_time >= test_list1[6]) & (hour_time <= test_list1[7]))

In [4]:
def preprocess_single_flow(train, flow):
    """Fill the missing values of one flow column in place and return the column.

    Steps, all applied directly to ``train``:

    1. A missing first value is replaced by the column mean.
    2. A missing last value is replaced by 0, so that every run of missing
       values is closed by a present value.
    3. For every run of missing values, the value just before the run is
       divided by (run length + 1) and that quotient is written over the
       preceding value and the whole run.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the column. Rows are addressed by label with ``0``,
        ``len(train) - 1`` and ``label - 1`` arithmetic, so the index must be
        the default 0..n-1 range (callers use ``reset_index(drop=True)``).
    flow : str
        Name of the column to repair.

    Returns
    -------
    pandas.Series
        ``train[flow]`` after the in-place repair.
    """
    last_label = len(train) - 1
    if np.isnan(train.loc[0, flow]):
        train.loc[0, flow] = train.loc[:, flow].mean()
    # If the last value is NaN, mark it as present (0) so the final gap has an end.
    if np.isnan(train.loc[last_label, flow]):
        train.loc[last_label, flow] = 0

    # 1.0 where the value is missing, 0.0 where present; the diff is +1 on the
    # first missing row of a run and -1 on the first present row after it.
    missing_flag = train[flow].isnull().astype(float)
    edges = missing_flag.diff()
    begin_index = edges[edges == 1].index.values.tolist()
    end_index = edges[edges == -1].index.values.tolist()

    for begin, end in zip(begin_index, end_index):
        # Spread the value preceding the gap evenly over itself and the gap.
        fill_null = train.loc[begin - 1, flow] / (end - begin + 1)
        # Write through the frame with one .loc call; the chained form
        # train[flow].loc[...] = ... may assign to a temporary Series.
        train.loc[begin - 1:end - 1, flow] = fill_null
    return train[flow]

def preprocess_all_flow(train):
    """Run ``preprocess_single_flow`` on columns ``flow_1`` .. ``flow_20``.

    Each repaired column is assigned back into ``train``, which is modified
    in place and also returned.
    """
    flow_columns = [f'flow_{number}' for number in range(1, 21)]
    for column in flow_columns:
        train[column] = preprocess_single_flow(train, column)
    return train

# Repair the missing flow values of the four training segments, in order.
train1_, train2_, train3_, train4_ = (
    preprocess_all_flow(segment)
    for segment in (train1_, train2_, train3_, train4_)
)

In [5]:
def make_diff_features(df, flow):
    """Append lag/difference columns for ``flow`` to ``df`` and return ``df``.

    Three columns are added to the frame that was passed in:
    ``{flow}_simple_diff1`` (difference to the previous row),
    ``{flow}_simple_shift24`` (value 24 rows earlier) and
    ``{flow}_simple_diff168`` (difference to the value 7*24 rows earlier).
    """
    base = df[flow]
    week_rows = 7 * 24
    derived = {
        f"{flow}_simple_diff1": base.diff(),
        f"{flow}_simple_shift24": base.shift(24),
        f"{flow}_simple_diff168": base.diff(week_rows),
    }
    for column_name, values in derived.items():
        df[column_name] = values
    return df

def make_trigonometric_features(df, flow, n=24):
    """Add sine/cosine transforms of the ``flow`` column and return them.

    ``{flow}_sin`` and ``{flow}_cos`` are computed as sin / cos of
    ``2*pi/n * df[flow]``, rounded to 6 decimals, and written into ``df``
    (the frame passed in is modified).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``flow`` column.
    flow : str
        Name of the column to transform.
    n : int, default 24
        Period used in the angle ``2*pi/n * value``.

    Returns
    -------
    pandas.DataFrame
        Two-column frame ``[{flow}_sin, {flow}_cos]``.
    """
    sin_col, cos_col = f'{flow}_sin', f'{flow}_cos'
    angle = 2*np.pi/n*df[flow]
    df[sin_col] = np.sin(angle).round(6)
    df[cos_col] = np.cos(angle).round(6)
    # Select by name rather than by position: if the columns already exist
    # (e.g. the frame is processed twice) they are overwritten where they are
    # and are not necessarily the last two columns.
    return df[[sin_col, cos_col]]

def make_feas(data):
    """Build the per-flow feature tensor for all 20 flow columns.

    For each ``flow_k`` (k = 1..20) the flow column and the weather columns
    ``R, fx, T, U, fs, V, P`` are taken, the difference/lag columns from
    ``make_diff_features`` are added (gaps linearly interpolated, leading
    gaps back-filled), and the two columns from
    ``make_trigonometric_features`` are appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``time``, ``train or test``, ``flow_1`` .. ``flow_20`` and
        the weather columns listed above. It is not modified.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(rows, 20, 13)``: 8 raw + 3 difference/lag + 2
        trigonometric features, with the flow value itself at feature index 0.
    """
    data = data.drop(['time','train or test'],axis=1)
    weather_cols = ['R', 'fx', 'T', 'U', 'fs', 'V', 'P']
    cur1, cur2 = [], []
    for flow in tqdm(range(20)):
        flow_col = f'flow_{flow+1}'
        train_data_x = pd.DataFrame(data.loc[:, [flow_col] + weather_cols])
        # .bfill() is the supported spelling of fillna(method='bfill'), which
        # is deprecated in recent pandas releases; the result is the same.
        cur1.append(make_diff_features(train_data_x, flow_col).interpolate().bfill())
        cur2.append(make_trigonometric_features(train_data_x, flow_col, n=24))
    # (flows, rows, features) -> (rows, flows, features)
    cur1 = np.stack(cur1,axis=0).transpose(1,0,2)
    cur2 = np.stack(cur2,axis=0).transpose(1,0,2)
    return np.concatenate((cur1,cur2), axis=2)

def generate_dataset(data, seq_len, pre_len, split_ratio=0.8):
    """Cut a feature array into (input window, target window) pairs.

    The first ``split_ratio`` share of the rows is used for training, the
    rest for validation. Inside each part, windows start every ``seq_len``
    rows; an input window covers ``seq_len`` rows and its target covers the
    ``pre_len`` rows that immediately follow it.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis is the row (time step) axis.
    seq_len : int
        Number of rows in every input window; also the stride between windows.
    pre_len : int
        Number of rows in every target window.
    split_ratio : float, default 0.8
        Share of rows assigned to the training part.

    Returns
    -------
    tuple
        ``(train_x, train_y, val_x, val_y, test_x)``. ``test_x`` is the last
        ``pre_len`` rows of ``data`` with a leading axis of length 1. If a
        part is too short for a single window, its x/y arrays are empty.
    """
    def _windows(block):
        """Collect the (input, target) window pairs that fit into ``block``."""
        inputs, targets = [], []
        # +1 on the stop: a window whose target ends exactly on the last row
        # is valid and must not be dropped.
        for start in range(0, len(block) - seq_len - pre_len + 1, seq_len):
            boundary = start + seq_len
            inputs.append(block[start:boundary])
            targets.append(block[boundary:boundary + pre_len])
        return np.array(inputs), np.array(targets)

    split_size = int(len(data)*split_ratio)
    train_x, train_y = _windows(data[:split_size])
    val_x, val_y = _windows(data[split_size:])
    test_x = np.expand_dims(data[-pre_len:], axis=0)
    return train_x, train_y, val_x, val_y, test_x
# Build features on a growing history: segment 1 only, then segments 1-2,
# 1-3 and 1-4 concatenated in order.
history_segments = [train1_, train2_, train3_, train4_]
a = make_feas(train1_)
b, c, d = (make_feas(pd.concat(history_segments[:count])) for count in (2, 3, 4))
print(a.shape, b.shape, c.shape, d.shape)
# Build x / y windows; input and target windows are both one week of hours.
hours_per_week = 24*7
train1_x, train1_y, val1_x, val1_y, test1_x = generate_dataset(a, hours_per_week, hours_per_week)
train2_x, train2_y, val2_x, val2_y, test2_x = generate_dataset(b, hours_per_week, hours_per_week)
train3_x, train3_y, val3_x, val3_y, test3_x = generate_dataset(c, hours_per_week, hours_per_week)
train4_x, train4_y, val4_x, val4_y, test4_x = generate_dataset(d, hours_per_week, hours_per_week)

100%|██████████| 20/20 [00:00<00:00, 259.45it/s]
100%|██████████| 20/20 [00:00<00:00, 285.69it/s]
100%|██████████| 20/20 [00:00<00:00, 263.14it/s]
100%|██████████| 20/20 [00:00<00:00, 240.96it/s]

(2880, 20, 13) (3456, 20, 13) (4488, 20, 13) (5064, 20, 13)





In [6]:
def lightgbm_train(train_x, train_y, val_x, val_y, test_x):
    """Train one LightGBM regressor per flow and predict the test window.

    For each of the 20 flows the windowed arrays are flattened to one row per
    time step. The regression target is feature 0 of the corresponding row in
    the target window. The validation set is used for early stopping only.

    Parameters
    ----------
    train_x, train_y, val_x, val_y : numpy.ndarray
        Arrays indexed as ``[sample, step, flow, feature]``.
    test_x : numpy.ndarray
        Array indexed the same way, holding the rows to predict from.

    Returns
    -------
    predictions : list of numpy.ndarray
        One prediction vector per flow, one value per flattened test row.
    scores : list of float
        Mean squared error of each model on its own *training* rows, rounded
        to 3 decimals (not a validation score).
    """
    fea_nums = train_x.shape[-1]
    ESR = 100    # early-stopping patience, in boosting rounds
    NBR = 3000   # upper bound on boosting rounds
    lgb_params_best = {'objective': 'regression',
                       'metric': ['mse'],
                       'bagging_seed': 2022,
                       'verbose': -1}

    def _flatten(windows, flow):
        """All windows of one flow as a (rows, features) DataFrame."""
        return pd.DataFrame(windows[:, :, flow, :].reshape(-1, fea_nums))

    scores = []
    predictions = []
    for flow in tqdm(range(20)):
        train_data_x = _flatten(train_x, flow)
        train_data_y = _flatten(train_y, flow).iloc[:,0]
        val_data_x   = _flatten(val_x, flow)
        val_data_y   = _flatten(val_y, flow).iloc[:,0]
        test_data_x  = _flatten(test_x, flow)
        train_part = lgb.Dataset(train_data_x, train_data_y)
        val_part = lgb.Dataset(val_data_x, val_data_y)
        # Early stopping is passed as a callback: the early_stopping_rounds /
        # verbose_eval keyword arguments of lgb.train were removed in
        # LightGBM 4.0, while the callback works on old and new releases.
        # verbose=False keeps the run silent, as verbose_eval=None did.
        lgb_model = lgb.train(lgb_params_best.copy(), train_part, num_boost_round=NBR,
                              valid_sets=[train_part, val_part],
                              valid_names=['train', 'valid'],
                              callbacks=[lgb.early_stopping(ESR, verbose=False)])
        score = mean_squared_error(train_data_y, lgb_model.predict(train_data_x))
        scores.append(round(score, 3))
        prediction_test = lgb_model.predict(test_data_x)
        predictions.append(prediction_test)
    return predictions, scores

In [7]:
# Train the per-flow models for each of the four periods, printing the
# per-flow training MSE list after every period.
period_inputs = [
    (train1_x, train1_y, val1_x, val1_y, test1_x),
    (train2_x, train2_y, val2_x, val2_y, test2_x),
    (train3_x, train3_y, val3_x, val3_y, test3_x),
    (train4_x, train4_y, val4_x, val4_y, test4_x),
]
period_outputs = []
for period_arrays in period_inputs:
    period_predictions, period_scores = lightgbm_train(*period_arrays)
    print(period_scores)
    period_outputs.append((period_predictions, period_scores))
((predictions1, scores1), (predictions2, scores2),
 (predictions3, scores3), (predictions4, scores4)) = period_outputs

100%|██████████| 20/20 [00:01<00:00, 12.13it/s]


[83.478, 3.426, 42.083, 24.924, 1.143, 58.222, 3.756, 0.862, 0.64, 0.538, 2.115, 0.229, 0.812, 0.939, 0.263, 4.382, 10.463, 1.354, 0.366, 0.375]


100%|██████████| 20/20 [00:02<00:00,  7.53it/s]


[32.12, 5.761, 35.023, 16.655, 0.262, 21.843, 2.776, 0.564, 0.614, 0.395, 1.639, 0.427, 0.84, 0.842, 0.238, 2.685, 7.778, 0.946, 0.306, 0.31]


100%|██████████| 20/20 [00:06<00:00,  3.10it/s]


[71.993, 3.9, 189.756, 48.355, 0.645, 68.122, 4.23, 0.665, 0.714, 0.567, 1.492, 0.922, 0.637, 0.715, 0.046, 2.441, 6.856, 1.89, 0.551, 0.361]


100%|██████████| 20/20 [00:05<00:00,  3.70it/s]

[96.015, 6.15, 202.955, 56.213, 0.938, 96.677, 5.042, 0.633, 1.0, 0.767, 1.579, 0.992, 0.618, 1.267, 0.259, 2.43, 6.336, 2.059, 0.666, 0.585]





In [8]:
# Stack the per-flow predictions of each period into (rows, flows) blocks and
# put the four periods one below the other.
period_blocks = [np.vstack(per_flow).T
                 for per_flow in (predictions1, predictions2, predictions3, predictions4)]
stacked = np.concatenate(period_blocks)
# Negative predictions are replaced by 0.
stacked = np.where(stacked < 0, 0, stacked)
# Attach the submission timestamps and reuse the sample file's column names.
result = pd.concat([sub['time'], pd.DataFrame(stacked)], axis=1)
result.columns = sub.columns
result.to_csv('./lgb_baseline.csv',index=False,encoding='utf-8')
result

Unnamed: 0,time,flow_1,flow_2,flow_3,flow_4,flow_5,flow_6,flow_7,flow_8,flow_9,...,flow_11,flow_12,flow_13,flow_14,flow_15,flow_16,flow_17,flow_18,flow_19,flow_20
0,2022-05-01 01:00:00,27.884680,9.414463,43.022743,25.899429,2.419271,37.322496,5.364615,1.383816,2.337246,...,2.817109,2.055810,1.602279,1.452655,1.067819,2.483773,4.683803,2.424868,1.180567,1.472677
1,2022-05-01 02:00:00,23.819940,5.441234,35.768929,17.606660,2.140117,26.174959,2.849715,0.995091,1.336739,...,1.876682,1.511654,0.680873,0.718495,0.962936,1.483075,2.181961,2.424868,0.678127,1.004957
2,2022-05-01 03:00:00,18.846267,4.092913,33.940910,15.443183,2.041279,25.134394,2.711238,0.886371,1.269010,...,1.427070,1.254413,0.487492,0.384038,0.514949,1.024444,1.526934,2.424868,0.534097,0.993583
3,2022-05-01 04:00:00,18.625972,3.347289,31.334585,14.924122,1.921940,23.583149,2.711238,0.856317,1.270798,...,1.192694,1.272824,0.397926,0.334408,0.410724,0.958909,1.526934,2.365384,0.633631,1.064717
4,2022-05-01 05:00:00,22.001601,3.735552,33.940910,17.912084,2.082309,31.067528,2.849715,0.972939,1.650801,...,1.395369,0.672502,0.410639,0.453890,0.440615,0.985668,1.482745,1.379436,0.831772,1.273502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-08-27 20:00:00,73.001638,27.069443,97.604017,64.451999,8.641944,117.321202,24.190197,3.614768,10.879873,...,7.953987,4.608416,4.905168,4.917250,3.259005,7.863720,13.551693,15.671177,3.953688,3.756890
668,2022-08-27 21:00:00,59.402008,42.980644,104.897147,62.483702,10.654468,149.020297,37.296115,4.189565,15.877773,...,11.906803,5.720183,6.082619,6.963549,4.724220,11.844752,16.817263,15.494305,6.047159,4.613073
669,2022-08-27 22:00:00,81.354017,44.682891,109.861791,66.717336,11.884337,155.555160,38.121479,5.194213,16.576059,...,13.873080,6.550204,6.683672,7.032987,4.112386,12.413095,20.403949,15.585398,6.983046,4.923210
670,2022-08-27 23:00:00,62.229030,33.108590,102.421889,76.192233,6.948875,117.647259,25.702451,4.349030,11.878700,...,10.163486,5.513608,6.703739,6.539339,3.692772,11.234901,15.868987,15.585398,6.891951,5.093643
