In [1]:
import numpy as np
import pandas as pd
import os
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tqdm import trange, tqdm
import gc
import matplotlib.pyplot as plt
%matplotlib inline

from l1qr import L1QR
from dataloader import get_data, get_weather, get_hod, get_dow, get_train_set_qra, get_test_set_qra

# Number of days in each calendar month of a non-leap year (index 0 = January).
# Used below as months[month-1]*24 to bound how many points of a month are kept.
months = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

Using TensorFlow backend.


In [2]:
# Name of the data set; also used further down as a directory component of the
# clustering-result path.
data_set = 'Irish_2010'
# Project root is taken to be two directory levels above the current working
# directory, so this cell depends on where the notebook kernel was started.
path = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# Load the data through the project-local loader. Later cells index the result
# as data[:, month-1, :n], i.e. as a 3-D array.
# NOTE(review): presumably (consumer, month, time step) — confirm in dataloader.
data = get_data(path, data_set)

100%|██████████| 918/918 [00:29<00:00, 31.40it/s]


In [3]:
def qloss_i(y_true, y_pred, q):
    """Mean pinball (quantile) loss for one quantile level.

    ``q`` is expected in percent: it is divided by 100 before being used as
    the quantile level. The reduction is done with the Keras backend, so the
    return value is whatever ``K.mean`` produces.
    """
    tau = q / 100
    residual = y_true - y_pred
    # (tau - 1) * residual is the larger term when the residual is negative,
    # tau * residual when it is positive.
    negative_side = (tau - 1) * residual
    positive_side = tau * residual
    return K.mean(K.maximum(negative_side, positive_side))

def qloss(y_true, y_pred):
    """Mean pinball loss taken jointly over the 99 levels 0.01, 0.02, ..., 0.99.

    The level vector has length 99 and is broadcast against
    ``y_true - y_pred``.
    NOTE(review): this presumably expects the last axis of the residual to
    have size 99 (one output per level) or 1 — confirm against the model that
    uses this loss.
    """
    taus = np.arange(1, 100) / 100
    residual = y_true - y_pred
    return K.mean(K.maximum((taus - 1) * residual, taus * residual))

def train_model_1(train, test, week, day, return_errors=False):
    """Fit one linear-regression point forecaster per lag setting (1..24).

    For every ``lag`` the three feature blocks returned by
    ``get_train_set_qra`` / ``get_test_set_qra`` are concatenated column-wise
    and an ordinary least-squares model is fitted on the whole training set.
    Its in-sample and test predictions are collected, one row per lag.

    Parameters
    ----------
    train, test, week, day
        Passed through unchanged to ``get_train_set_qra`` and
        ``get_test_set_qra``.
    return_errors : bool, default False
        When True, additionally fit a model on the first 80 % of the training
        rows (chronological split, no shuffling) for each lag and return
        summed absolute errors instead of the forecasts. When False (default)
        these diagnostics are skipped entirely.

    Returns
    -------
    tuple
        Default: ``(pred_train, pred_test, trainY_tail, testY)`` where
        ``pred_train`` has shape ``(24, n_samples)``, ``pred_test`` has shape
        ``(24, 168)``, and ``trainY_tail`` / ``testY`` are the targets
        obtained for the last lag (24), the former trimmed to its final
        ``n_samples`` rows.
        With ``return_errors=True``:
        ``(error_train, error_val, error_train_step1, error_test_step1)``,
        each of length 24.
    """
    d = 2
    max_lag = 24
    # Each lag's test forecast must squeeze to this many values; the driver
    # cell holds out the last 168 columns as the test window.
    n_test = 168

    # The sample count at the largest lag fixes the row length of pred_train;
    # every lag's in-sample prediction is trimmed to its last n_samples values.
    # NOTE(review): this presumably relies on smaller lags never producing
    # fewer samples than max_lag — confirm in dataloader.
    _, _, _, trainY = get_train_set_qra(train, week, day, max_lag, d)
    n_samples = trainY.shape[0]

    error_train = np.zeros(max_lag)
    error_val = np.zeros(max_lag)
    error_train_step1 = np.zeros(max_lag)
    error_test_step1 = np.zeros(max_lag)

    pred_train = np.zeros((max_lag, n_samples))
    pred_test = np.zeros((max_lag, n_test))

    for lag in trange(1, max_lag + 1):
        trainX, trainTlag, trainTd, trainY = get_train_set_qra(train, week, day, lag, d)
        testX, testTlag, testTd, testY = get_test_set_qra(train, test, week, day, lag, d)

        # Build each design matrix once instead of re-stacking it per use.
        train_features = np.hstack((trainX, trainTlag, trainTd))
        test_features = np.hstack((testX, testTlag, testTd))

        if return_errors:
            # Chronological 80/20 split: fit on the head, validate on the tail.
            fit_X, val_X, fit_Y, val_Y = train_test_split(
                train_features, trainY, test_size=0.20, shuffle=False)
            holdout = LinearRegression().fit(fit_X, fit_Y)
            error_train[lag - 1] = np.sum(np.abs(holdout.predict(fit_X) - fit_Y))
            error_val[lag - 1] = np.sum(np.abs(holdout.predict(val_X) - val_Y))

        # Final model for this lag, fitted on all training rows.
        linreg = LinearRegression().fit(train_features, trainY)

        # Predict (train)
        pred = linreg.predict(train_features)
        if return_errors:
            error_train_step1[lag - 1] = np.sum(
                np.abs(pred[-n_samples:, :] - trainY[-n_samples:, :]))
        pred_train[lag - 1] = np.squeeze(pred[-n_samples:, :])

        # Predict (test)
        pred = linreg.predict(test_features)
        if return_errors:
            error_test_step1[lag - 1] = np.sum(np.abs(pred - testY))
        pred_test[lag - 1] = np.squeeze(pred)

    # Release the last fitted model and prediction buffer before returning.
    del linreg, pred
    gc.collect()

    if return_errors:
        return error_train, error_val, error_train_step1, error_test_step1
    return pred_train, pred_test, trainY[-n_samples:, :], testY

def train_model_2(trainX_, trainY_, testX_):
    """Combine the per-lag point forecasts into 99 quantile forecasts.

    For each quantile level q in 0.01, 0.02, ..., 0.99 an L1-penalised
    quantile regression (``L1QR``) is fitted on the training forecasts. From
    the coefficient sets it returns (``mdl.b0`` / ``mdl.b``), the one with the
    smallest mean pinball loss *at level q* on the training data is kept and
    applied to the test forecasts.

    Parameters
    ----------
    trainX_ : array of shape (n_features, n_train)
        Used transposed, i.e. one row per sample, one column per feature.
    trainY_ : array
        Training targets; flattened with ``reshape(-1)`` / ``np.squeeze``.
    testX_ : array of shape (n_features, n_test)
        Used transposed in the same way as ``trainX_``.

    Returns
    -------
    numpy.ndarray
        One row per quantile level (99 rows), one column per test sample.
    """
    trainY = pd.Series(np.squeeze(trainY_))
    trainX = pd.DataFrame(trainX_.T)
    y_flat = trainY_.reshape(-1)

    pred = []
    for q in tqdm(np.linspace(0.01, 0.99, 99)):

        mdl = L1QR(y=trainY, x=trainX, alpha=q)
        mdl.fit(s_max=3)
        b0 = mdl.b0.to_numpy()
        b = mdl.b.to_numpy()

        # Score every candidate with the pinball loss at level q. q is already
        # a fraction in (0, 1) here, so it must not be rescaled by 1/100 (the
        # percent convention of qloss_i); doing so scored all candidates at a
        # level close to 0 regardless of q.
        loss_train = np.zeros(len(b0))
        for i in range(len(b0)):
            fitted = b0[i] + np.sum(b[i] * (trainX_.T), axis=1)
            residual = y_flat - fitted
            loss_train[i] = np.mean(np.maximum((q - 1) * residual, q * residual))
        best = np.argmin(loss_train)

        pred.append(b0[best] + np.sum(b[best] * (testX_.T), axis=1))

    return np.array(pred)

In [4]:
# Experiment selection: calendar month (1-based; it is used as month-1 when
# indexing), number of clusters, and the clustering method whose saved labels
# are loaded. `method` contains a '/' and therefore contributes two directory
# levels to the path below.
month = 1
n_clusters = 2
method = 'hierarchical/euclidean'

# Cluster labels are read from a header-less CSV, so columns are addressed by
# integer position (the driver loop uses column month-1).
path_cluster = os.path.join(path, 'result', data_set, 'clustering', 'point', method, f'n_clusters_{n_clusters}.csv')
clusters = pd.read_csv(path_cluster, header=None)

# Select the chosen month and keep its first (days in month) * 24 points along
# the last axis.
# NOTE(review): the factor 24 suggests hourly resolution — confirm in dataloader.
series = data[:, month-1, :months[month-1]*24]
# Month-specific inputs from the project-local loaders; `weather` is stacked
# under the load series in the driver loop, `week` and `day` are passed to
# train_model_1.
weather = get_weather(path, data_set, month)
week = get_dow(data_set, month)
day = get_hod(month)

In [5]:
# Per-cluster pipeline: aggregate the cluster's series, scale, run the
# per-lag linear forecasters, then combine them with quantile regression.
for i in range(n_clusters):

    # Boolean mask over rows of `series`: True where the label stored in
    # column month-1 of `clusters` equals the current cluster id.
    index = list(clusters[month-1] == i)
    sub_series = series[index]
    # Sum the selected rows into a single aggregate series.
    sub_series = np.sum(sub_series, axis=0)

    # Row 0 is the aggregate load; the remaining rows come from `weather`.
    total_series = np.vstack((sub_series, weather))

    # Hold out the last 168 columns as the test window; everything before it
    # is training data. Both are slices of total_series, so the in-place
    # scaling below also writes into total_series.
    test = total_series[:, -168:]
    train = total_series[:, :-168]

    # Min-max scale row 0 using the training maximum (scale[0]) and minimum
    # (scale[1]) only; the same constants are applied to the test window.
    # NOTE(review): a constant training series would make the denominator 0.
    scale = np.zeros(2)
    scale[0] = np.max(train[0])
    scale[1] = np.min(train[0])
    train[0] = (train[0] - scale[1]) / (scale[0] - scale[1])
    test[0] = (test[0] - scale[1]) / (scale[0] - scale[1])

    # Stage 1: per-lag point forecasts (these become the stage-2 features).
    trainX_, testX_, trainY_, testY_ = train_model_1(train, test, week, day)
#     error_train, error_val, error_train_step1, error_test_step1 = train_model_1(train, test, week, day)

    # Stage 2: quantile forecasts for the test window, one row per level.
    pred_series = train_model_2(trainX_, trainY_, testX_)

    # NOTE(review): unconditional break — only cluster 0 is ever processed and
    # pred_series holds that cluster's result alone. Confirm this is intended.
    break

100%|██████████| 24/24 [00:04<00:00,  3.82it/s]
  delta1 = np.delete(residual, ind_e, 0) / gam  # This is s - s_l in (2.14)
  delta2 = np.array(-beta[idx - 1, ind_v] / nu)
100%|██████████| 99/99 [03:26<00:00,  1.07s/it]
