In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import trange
from sklearn.model_selection import train_test_split

from dataloader import get_train_set_, get_test_set_
from msvr import kernelmatrix
from msvr import msvr

months = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

In [3]:
method = 'hierarchical/euclidean'
data_set = 'Irish_2010'
month = 1
n_clusters = 2

path = os.path.abspath(os.path.join(os.getcwd(), '../..'))

attr = pd.read_csv(os.path.join(path, 'data', f'{data_set}_attr_final.csv'))
data = []
for i in trange(len(attr)):
    id = attr['ID'][i]
    df = pd.read_csv(os.path.join(path, 'data', f'{data_set}_monthly_interval', f'{id}.csv'), header = None).values
    data.append(df)
data = np.array(data)

100%|██████████| 918/918 [00:29<00:00, 30.71it/s]


In [4]:
path_cluster = os.path.join(path, 'result', data_set, 'clustering', 'interval', method, f'n_clusters_{n_clusters}.csv')
clusters = pd.read_csv(path_cluster, header=None)

series = data[:, (month-1)*2:month*2, :months[month-1]*24]

In [8]:
error_train = []
error_test = []

for i in range(n_clusters):

    index = list(clusters[month-1] == i)
    sub_series = series[index]
    sub_series = np.sum(sub_series, axis=0)
    test = sub_series[:, -168:]
    train = sub_series[:, :-168]

    scale = np.zeros(2)
    scale[0] = np.max(train)
    scale[1] = np.min(train)
    train = (train - scale[1])/(scale[0] - scale[1])
    test = (test - scale[1])/(scale[0] - scale[1])

    # recency effect
    for lag in range(1,25):
        d = 1

        trainX, trainY = get_train_set_(train, lag, d)
        testX, testY = get_test_set_(train, test, lag, d)

        # Parameters
        ker = 'rbf'
        epsi = 0.001
        tol = 1e-10

        X_train, X_val, y_train, y_val = train_test_split(trainX, trainY, test_size=0.3, random_state=3)

        Cs = np.arange(1, 4.5, 0.1)
        pars = np.arange(1, 64, 1)
        error = np.zeros((len(Cs), len(pars)))
        
        error_test = np.zeros((len(Cs), len(pars)))
        
        for i in range(len(Cs)):
            for j in range(len(pars)):

                C = Cs[i]
                par = pars[j]

                # Train
                Beta = msvr(X_train, y_train, ker, C, epsi, par, tol)

                # Predict with test set
                K = kernelmatrix('rbf', X_val, X_train, par)
                pred = np.dot(K, Beta)
                
                # Predict with test set
                K = kernelmatrix('rbf', testX, X_train, par)
                pred_test = np.dot(K, Beta)

                error[i][j] = np.mean(np.sum((pred - y_val)**2, axis=1))
                
                error_test[i][j] = np.mean(np.sum((pred_test - testY)**2, axis=1))
                
        print('lag:', lag, 'error:', np.min(error), 'error_test:', np.min(error_test))
    
    break

lag: 1 error: 0.014418014472158925 error_test: 0.018083026621952495
lag: 2 error: 0.014702476396996192 error_test: 0.019502823323579603
lag: 3 error: 0.016463203519481617 error_test: 0.02034438249180348
lag: 4 error: 0.01720950825112333 error_test: 0.020179987336080913
lag: 5 error: 0.020063509415270474 error_test: 0.018055731487230768
lag: 6 error: 0.01584724157904799 error_test: 0.017211384742635987
lag: 7 error: 0.016959421088596827 error_test: 0.020984317026740588
lag: 8 error: 0.020050972877760198 error_test: 0.01857902507458113
lag: 9 error: 0.02172202177101848 error_test: 0.025228206216875694
lag: 10 error: 0.0233791756716879 error_test: 0.02583271704062471
lag: 11 error: 0.02283269209275809 error_test: 0.02679232007405986
lag: 12 error: 0.022200119884137042 error_test: 0.024546743973891984
lag: 13 error: 0.02073792632396276 error_test: 0.024073737559604978
lag: 14 error: 0.024892560007814063 error_test: 0.027072421307005917
lag: 15 error: 0.022803852382307758 error_test: 0.0256

In [5]:
np.min(error)

0.004659829768993877

In [6]:
np.where(error==np.min(error))

(array([4]), array([8]))

In [7]:
Cs[4]

1.4000000000000004

In [8]:
pars[8]

9