# 0. Dataset Load

In [None]:
# =============================================================================
# 0. Libraries and settings
# =============================================================================

# Import modules
import os
import sys
import pickle
import time

import pandas as pd
import numpy as np

In [None]:
# Set the project path and import the project's customized functions.
# The absolute project directory is appended to sys.path (once) so that the
# local `utils_` package below can be imported.
project_dir = "./."
project_dir = os.path.abspath(project_dir)
if project_dir not in sys.path:
    sys.path.append(project_dir)
from utils_.data_utils import convert_mdf
from utils_ import arglist
from utils_.Hierarchical_Convolutional_Attention_Regression import HConv

In [None]:
# =============================================================================
# 1. Load the list of data files
# =============================================================================

# Build the data directory path and collect the data files (MDF files, '.dat' extension)
data_dir = os.path.join(project_dir, 'data', arglist.file_dir)
data_filenames = [f for f in os.listdir(data_dir) if f.endswith('.dat')]
data_filenames

In [None]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

In [None]:
# For each data file: set the parameters, load the signals, preprocess them and
# build the windowed dataset. (The original note also mentions model building,
# result generation and graph saving; none of that happens in this cell.)
# NOTE(review): X, y, y_weight and the train/test splits are rebound on every
# iteration, so after the loop only the arrays of the last file remain.
for data_filename in data_filenames:
    
    # 2.1 Create a unique token
    # The current time is used as the token in case the experiment is repeated several times
    unique_seed = time.strftime("%Y%m%d-%H%M%S")

    # 2.2 Parameter settings
    # Load the default parameters from arglist, then override them as needed
    param = arglist.PARAM()
    param.data_filename = data_filename
    param.network_type = 'HConv'

    param.optimizer = tf.keras.optimizers.Adam()
    param.scaler = StandardScaler(with_std=False)

    param.n_attention = 10
    param.embedding_dim = 20

    param.segment_length = 20
    param.segment_stride = 2

    param.convert_log = True
    param.weight_log = True

    param.update_network_arguments()
    assert param.updated
    
    # 2.3 Load the data
    # Read the MDF file
    data = convert_mdf(mdf_filename=os.path.join(data_dir, data_filename),
                       mdf_channels=param.mdf_channels,
                       sample_frequency=0.01,
                       save_file=False,
                       csv_filename=None)
    
    # 2.4 Preprocessing
    # target amount = 100 - target amount
    data[arglist.valve_channels[1]] = -(data[arglist.valve_channels[1]] - 100)

    data = data.drop(arglist.alarm_channels, axis=1)

    # Negative valve values are clipped to 0
    data[arglist.valve_channels] = data[arglist.valve_channels].clip(lower=0)
    egr_act = data[arglist.valve_channels[0]]
    egr_r = data[arglist.valve_channels[1]]

    # Compute |target value - actual value|, used to build the target variable
    egr_diff = np.abs(egr_act - egr_r).values

    # The actual value is excluded from the analysis
    data = data.drop(arglist.valve_channels[0], axis=1).values

    # Scaling: zero-centered scaling that only shifts the mean to 0 ... see param.scaler
    if param.scaler is not None:
        data = param.scaler.fit_transform(data)

    # Sliding window
    # NOTE(review): the range stops before start index
    # data.shape[0] - param.window_size, so the last full window is never
    # emitted -- confirm this is intended.
    X = []
    y = []
    window_start_index = []
    for start_index in range(0, data.shape[0] - param.window_size, param.shift_size):
        X.append(data[start_index:start_index + param.window_size])
        # target of a window = sum of |target - actual| over the window
        y.append(egr_diff[start_index:start_index + param.window_size].sum())
        window_start_index.append(start_index)
    X = np.asarray(X)
    y = np.asarray(y)
    window_start_index = np.asarray(window_start_index)
    print('Total number of windows = {}'.format(len(X)))

    # Sample weights for the training phase
    if param.weight_log:
        y_weight = np.log(1 + y)
    else:
        y_weight = np.ones(shape=y.shape)

    # The target variable is log-transformed for training
    if param.convert_log:
        y = np.log(1 + y)

    # Split the dataset into train:test = 8:2
    X_train, X_test, y_train, y_test, y_weight_train, y_weight_test = train_test_split(
        X, y, y_weight, test_size=0.2, random_state=2019, shuffle=True)

In [None]:
# Display the window length used by the sliding window above.
param.window_size

In [None]:
# The feature extractors below run on the full windowed array X
# (not on the X_train split); display its shape.
train_dat = X
train_dat.shape

# 1. ARIMA

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import chart_studio.plotly as ply
import cufflinks as cf
from pmdarima import auto_arima
import warnings
warnings.simplefilter("ignore", UserWarning)

In [None]:
def ARIMA_X(x_win):
    """Fit an ARIMA model to one univariate window and return a fixed-length
    coefficient vector.

    The (p, d, q) order is selected by ``auto_arima`` with the differencing
    order ``d`` forced to 0, 1, 2 in turn. The first ``d`` for which both the
    order search and the ``ARIMA(...).fit`` call succeed is used. If every
    attempt raises ``ValueError`` or ``np.linalg.LinAlgError`` an all-zero
    vector is returned.

    Parameters
    ----------
    x_win : array-like
        One-dimensional series (a single channel of a single window).

    Returns
    -------
    numpy.ndarray
        Vector of length ``max_p + max_q + 1`` (= 7): the AR coefficients
        zero-padded to ``max_p`` entries, the MA coefficients zero-padded to
        ``max_q`` entries, and finally ``results.params[0]`` (presumably the
        constant term -- confirm against the statsmodels version in use).
    """
    max_p = 3
    max_d = 2
    max_q = 3

    # Defaults double as the fallback when no differencing order can be fitted.
    ar_coef = np.array([])
    ma_coef = np.array([])
    const = 0.0

    for d in range(max_d + 1):
        try:
            par = auto_arima(x_win, start_p=0, start_q=0, d=d,
                             test='adf', max_p=max_p, max_q=max_q, seasonal=False,
                             trace=False, error_action='ignore')
            order = par.order
            model = ARIMA(x_win, order=(order[0], order[1], order[2]))
            results = model.fit(disp=-1)
            # The right-hand side is evaluated completely before anything is
            # bound, so a failure here leaves the previous values untouched.
            ar_coef, ma_coef, const = (results.arparams,
                                       results.maparams,
                                       results.params[0])
        except (ValueError, np.linalg.LinAlgError):
            continue  # try the next differencing order
        break

    # Fixed layout: [AR (padded to max_p) | MA (padded to max_q) | const]
    features = np.zeros(max_p + max_q + 1)
    features[:len(ar_coef)] = ar_coef
    features[max_p:max_p + len(ma_coef)] = ma_coef
    features[-1] = const
    return features

In [None]:
# Extract ARIMA features for every window: one ARIMA_X vector per data column,
# concatenated into a single row per window.
arima = []
for i in range(train_dat.shape[0]):
    arima_win = []
    for j in range(train_dat.shape[2]):
        aim = ARIMA_X(train_dat[i,:,j])
        arima_win.append(aim)
    # flatten the per-column vectors into one (1, n_features) row
    arima_win = np.reshape(arima_win, (1, np.size(arima_win)))
    arima.append(arima_win)
    print(i)  # progress indicator

# (n_windows, 1, n_features) -> (n_windows, n_features)
arima = np.array(arima, dtype=np.float32)
arima = np.reshape(arima, (arima.shape[0], arima.shape[2]))

In [None]:
# Write the ARIMA feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/arima.csv', X=arima, delimiter=',', fmt='%.5f')

# 2. Wavelet

In [None]:
from pywt import wavedec, Wavelet

In [None]:
def WT(x_win):
    """Summarise one univariate window with a 4-level 'sym4' wavelet decomposition.

    Parameters
    ----------
    x_win : array-like
        One-dimensional series (a single channel of a single window).

    Returns
    -------
    numpy.ndarray
        Ten values: the six coefficients of a degree-5 polynomial fitted to
        the level-4 approximation coefficients, followed by the standard
        deviations of the detail coefficients at levels 1, 2, 3 and 4.
    """
    # The decomposition level is requested explicitly. Without it, wavedec
    # picks the maximum level for the input length, and the 5-way unpacking
    # below only works when that maximum happens to be exactly 4.
    cA4, cD4, cD3, cD2, cD1 = wavedec(x_win, Wavelet('sym4'), level=4)

    # Degree-5 polynomial over the approximation coefficients (coarse shape).
    trend = np.polyfit(np.arange(np.size(cA4)), cA4, 5)

    # Spread of each detail band, finest (level 1) first.
    detail_spread = [np.std(band) for band in (cD1, cD2, cD3, cD4)]

    return np.r_[trend, detail_spread]

In [None]:
# Extract wavelet features for every window: one WT vector per data column,
# concatenated into a single row per window.
wavelets = []
for i in range(train_dat.shape[0]):
    wt_win = []
    for j in range(train_dat.shape[2]):
        wts = WT(train_dat[i,:,j])
        wt_win.append(wts)
    # flatten the per-column vectors into one (1, n_features) row
    wt_win = np.reshape(wt_win, (1, np.size(wt_win)))
    wavelets.append(wt_win)
    print(i)  # progress indicator

# (n_windows, 1, n_features) -> (n_windows, n_features)
wavelets = np.array(wavelets, dtype=np.float32)
wavelets = np.reshape(wavelets, (wavelets.shape[0], wavelets.shape[2]))

In [None]:
# Write the wavelet feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/wavelets.csv', X=wavelets, delimiter=',', fmt='%.5f')

# 3. DBN

In [None]:
from __future__ import print_function

import numpy as np

from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neural_network import BernoulliRBM
import matplotlib.pyplot as plt 

from dbn.models import UnsupervisedDBN
# pip install git
# CPU : pip install git+git://github.com/albertbup/deep-belief-network.git
# GPU : pip install git+git://github.com/albertbup/deep-belief-network.git@master_gpu

In [None]:
def DBN(x_win):
    """Train an unsupervised deep belief network on a single window and
    return the window's transformed representation.

    A fresh network (hidden layers 50-30-20, sigmoid activation) is fitted on
    ``x_win`` and then applied to that same ``x_win``.
    """
    settings = {
        'hidden_layers_structure': [50, 30, 20],
        'batch_size': 100,
        'learning_rate_rbm': 0.05,
        'n_epochs_rbm': 100,
        'activation_function': 'sigmoid',
    }
    network = UnsupervisedDBN(**settings)
    network.fit(x_win)
    encoded = network.transform(x_win)
    return encoded

In [None]:
# Extract DBN features: a separate DBN is fitted on each window
# (all rows and columns of that window at once).
dbns = []
for i in range(train_dat.shape[0]):
    dbn_win = []
    
    dbs = DBN(train_dat[i,:,:])
    dbn_win.append(dbs)
    
    # flatten the encoded window into one (1, n_features) row
    dbn_win = np.reshape(dbn_win, (1, np.size(dbn_win)))
    dbns.append(dbn_win)

# (n_windows, 1, n_features) -> (n_windows, n_features)
dbns = np.array(dbns, dtype=np.float32)
dbns = np.reshape(dbns, (dbns.shape[0], dbns.shape[2]))

In [None]:
# Write the DBN feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/dbns.csv', X=dbns, delimiter=',', fmt='%.5f')

# 4. Simple Autoencoder

In [None]:
from IPython.display import Image, SVG
import matplotlib.pyplot as plt

%matplotlib inline

import numpy as np
import keras
from keras.datasets import mnist
from keras.models import Model, Sequential
from keras.layers import Lambda, Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Flatten, Reshape, Conv1D, MaxPooling1D, UpSampling1D
from keras import regularizers
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model
from keras import backend as K
import os
from keras import objectives

In [None]:
def simpleAE(x_win):
    """Train a single-hidden-layer autoencoder on one window and return the
    hidden-layer activations for that window.

    The network is Dense(50, relu) -> Dense(n_columns, sigmoid), trained for
    100 epochs to reproduce ``x_win`` under mean squared error. The returned
    array is the output of the first (encoding) layer applied to ``x_win``.
    """
    n_columns = x_win.shape[1]
    code_size = 50

    autoencoder = Sequential([
        Dense(code_size, input_shape=(n_columns,), activation='relu'),
        Dense(n_columns, activation='sigmoid'),
    ])
    autoencoder.compile(loss='mean_squared_error', optimizer='adam')
    autoencoder.fit(
        x_win, x_win,
        epochs=100, batch_size=100, shuffle=True, verbose=0,
    )

    # Re-wire the trained encoding layer onto a fresh input to read the code.
    probe = Input(shape=(n_columns,))
    trained_encoding_layer = autoencoder.layers[0]
    encoder = Model(probe, trained_encoding_layer(probe))

    return encoder.predict(x_win)

In [None]:
# Extract simple-autoencoder features, one freshly trained autoencoder per window.
simple_ae = []

# NOTE(review): the loop starts at window 518, not 0 -- presumably resuming an
# interrupted run; confirm before reusing this cell.
for i in range(518,train_dat.shape[0]):
    simple_ae_win = []
    
    sae = simpleAE(train_dat[i,:,:])
    simple_ae_win.append(sae)
    
    # flatten the encoded window into one (1, n_features) row
    simple_ae_win = np.reshape(simple_ae_win, (1, np.size(simple_ae_win)))
    simple_ae.append(simple_ae_win)
    print(i)  # progress indicator
    

# (n_windows, 1, n_features) -> (n_windows, n_features)
simple_ae = np.array(simple_ae, dtype=np.float32)
simple_ae = np.reshape(simple_ae, (simple_ae.shape[0], simple_ae.shape[2]))

In [None]:
# Convert the collected rows to a 2-D float32 matrix (n_windows, n_features).
# Written to be idempotent: it accepts both the raw list of (1, n_features)
# rows and an array that has already been flattened to 2-D, so re-running this
# cell no longer fails on a missing third axis.
simple_ae = np.asarray(simple_ae, dtype=np.float32)
simple_ae = simple_ae.reshape(simple_ae.shape[0], -1)

In [None]:
# Write the simple-autoencoder feature matrix to CSV with 5 decimal places
# (the file name carries a "(518_)" suffix).
np.savetxt('c:/tf2/simple_ae(518_).csv', X=simple_ae, delimiter=',', fmt='%.5f')

# 5. Stacked Autoencoder

In [None]:
def stackedAE(x_win):
    """Train a stacked (three encoding layers) autoencoder on one window and
    return the bottleneck activations for that window.

    Layer widths are 100-100-50 (encoder) and 100-100-n_columns (decoder);
    the last layer uses a sigmoid, all others relu. The model is trained for
    100 epochs to reproduce ``x_win`` under mean squared error.
    """
    n_columns = x_win.shape[1]
    code_size = 50
    wide = 2 * code_size

    autoencoder = Sequential([
        # encoder
        Dense(wide, input_shape=(n_columns,), activation='relu'),
        Dense(wide, activation='relu'),
        Dense(code_size, activation='relu'),
        # decoder
        Dense(wide, activation='relu'),
        Dense(wide, activation='relu'),
        Dense(n_columns, activation='sigmoid'),
    ])
    autoencoder.compile(loss='mean_squared_error', optimizer='adam')
    autoencoder.fit(x_win, x_win, epochs=100, batch_size=100, verbose=0)

    # Chain the three trained encoding layers onto a fresh input.
    probe = Input(shape=(n_columns,))
    code = probe
    for trained_layer in autoencoder.layers[:3]:
        code = trained_layer(code)
    encoder = Model(probe, code)

    return encoder.predict(x_win)

In [None]:
# Extract stacked-autoencoder features, one freshly trained autoencoder per window.
stack_ae = []

for i in range(train_dat.shape[0]):
    stack_ae_win = []
    
    sae = stackedAE(train_dat[i,:,:])
    stack_ae_win.append(sae)
    
    # flatten the encoded window into one (1, n_features) row
    stack_ae_win = np.reshape(stack_ae_win, (1, np.size(stack_ae_win)))
    stack_ae.append(stack_ae_win)
    print(i)  # progress indicator
    

# (n_windows, 1, n_features) -> (n_windows, n_features)
stack_ae = np.array(stack_ae, dtype=np.float32)
stack_ae = np.reshape(stack_ae, (stack_ae.shape[0], stack_ae.shape[2]))

In [None]:
# Write the stacked-autoencoder feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/stack_ae.csv', X=stack_ae, delimiter=',', fmt='%.5f')

# 6. Denoising Autoencoder

In [None]:
def deAE(x_win):
    """Train a denoising autoencoder on one window and return the bottleneck
    activations of the noisy window.

    Gaussian noise (standard normal scaled by 0.5) is added to ``x_win`` and
    the result is clipped to [0, 1]. A 100-100-50-100-100-n_columns dense
    network is then trained for 100 epochs to map the noisy window back to
    the clean one. The returned code is computed from the *noisy* input.

    NOTE(review): the clip to [0, 1] presumably assumes inputs scaled to that
    range -- confirm against the scaler applied upstream.
    """
    n_columns = x_win.shape[1]
    code_size = 50
    wide = 2 * code_size
    noise_factor = 0.5

    gaussian = np.random.normal(loc=0.0, scale=1.0, size=x_win.shape)
    corrupted = np.clip(x_win + noise_factor * gaussian, 0., 1.)

    autoencoder = Sequential([
        # encoder
        Dense(wide, input_shape=(n_columns,), activation='relu'),
        Dense(wide, activation='relu'),
        Dense(code_size, activation='relu'),
        # decoder
        Dense(wide, activation='relu'),
        Dense(wide, activation='relu'),
        Dense(n_columns, activation='sigmoid'),
    ])
    autoencoder.compile(loss='mean_squared_error', optimizer='adam')
    autoencoder.fit(corrupted, x_win, epochs=100, batch_size=100, verbose=0)

    # Chain the three trained encoding layers onto a fresh input.
    probe = Input(shape=(n_columns,))
    code = probe
    for trained_layer in autoencoder.layers[:3]:
        code = trained_layer(code)
    encoder = Model(probe, code)

    return encoder.predict(corrupted)

In [None]:
# Extract denoising-autoencoder features, one freshly trained autoencoder per window.
de_ae = []

for i in range(train_dat.shape[0]):
    de_ae_win = []
    
    dae_return = deAE(train_dat[i,:,:])
    de_ae_win.append(dae_return)
    
    # flatten the encoded window into one (1, n_features) row
    de_ae_win = np.reshape(de_ae_win, (1, np.size(de_ae_win)))
    de_ae.append(de_ae_win)
    print(i)  # progress indicator

# (n_windows, 1, n_features) -> (n_windows, n_features)
de_ae = np.array(de_ae, dtype=np.float32)
de_ae = np.reshape(de_ae, (de_ae.shape[0], de_ae.shape[2]))

In [None]:
# Write the denoising-autoencoder feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/de_ae.csv', X=de_ae, delimiter=',', fmt='%.5f')

# 7. Variational Autoencoder(VAE)

In [None]:
def sampling(args):
    """Draw a latent sample ``z = mean + exp(0.5 * log_var) * eps``.

    ``args`` is the pair ``(z_mean, z_log_var)``; ``eps`` is standard normal
    noise with one row per batch item and one column per latent dimension.
    """
    mean, log_var = args
    n_rows = K.shape(mean)[0]
    n_latent = K.int_shape(mean)[1]
    eps = K.random_normal(shape=(n_rows, n_latent))
    std = K.exp(0.5 * log_var)
    return mean + std * eps

In [None]:
def VAE(x_win):
    """Train a variational autoencoder on one window and return the encoder
    outputs for that window.

    Encoder: two Dense(256, relu) layers feeding ``z_mean`` and ``z_log_var``
    (2 units each) and a sampled ``z`` (via ``sampling``). Decoder: two
    Dense(256, relu) layers and a sigmoid output of the input width. The
    model is trained for 100 epochs with binary cross-entropy plus a KL term.

    Returns
    -------
    list of numpy.ndarray
        ``[z_mean, z_log_var, z]`` as predicted by the trained encoder for
        ``x_win``. Note that ``z`` is a random sample, so the output is not
        deterministic.
    """
    input_dim = x_win.shape[1]

    # ---- encoder -----------------------------------------------------------
    inputs = Input(shape=(input_dim,), name='encoder_input')
    x = Dense(256, activation='relu')(inputs)
    x = Dense(256, activation='relu')(x)
    z_mean = Dense(2, name='z_mean')(x)
    z_log_var = Dense(2, name='z_log_var')(x)
    z = Lambda(sampling, output_shape=(2,), name='z')([z_mean, z_log_var])
    encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')

    # ---- decoder -----------------------------------------------------------
    latent_inputs = Input(shape=(2,), name='z_sampling')
    x = Dense(256, activation='relu')(latent_inputs)
    x = Dense(256, activation='relu')(x)
    outputs = Dense(input_dim, activation='sigmoid')(x)  # values in 0~1
    decoder = Model(latent_inputs, outputs, name='decoder')

    # ---- end-to-end model: decode the sampled z ------------------------------
    outputs = decoder(encoder(inputs)[2])
    vae = Model(inputs, outputs, name='vae_mlp')

    def vae_loss(x, x_decoded_mean):
        # binary_crossentropy is the name already imported from keras.losses;
        # it replaces the legacy `keras.objectives` alias.
        xent_loss = binary_crossentropy(x, x_decoded_mean)
        kl_loss = -0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var))
        return xent_loss + kl_loss

    vae.compile(optimizer='adam', loss=vae_loss)
    vae.fit(x_win, x_win, epochs=100, batch_size=100, verbose=0)

    # The trained encoder is used directly. The original re-wrapped
    # vae.layers[0] (the input layer) and vae.layers[1] (this same encoder)
    # in a new Model, which yields the same three outputs.
    return encoder.predict(x_win)

In [None]:
# Extract VAE features, one freshly trained VAE per window.
vae = []

for i in range(train_dat.shape[0]):
    vae_win = []
    
    vae_return = VAE(train_dat[i,:,:])
    vae_win.append(vae_return)
    
    # flatten everything VAE returned for this window into one (1, n_features) row
    vae_win = np.reshape(vae_win, (1, np.size(vae_win)))
    vae.append(vae_win)
    print(i)  # progress indicator

# (n_windows, 1, n_features) -> (n_windows, n_features)
vae = np.array(vae, dtype=np.float32)
vae = np.reshape(vae, (vae.shape[0], vae.shape[2]))

In [None]:
# Write the VAE feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/vae.csv', X=vae, delimiter=',', fmt='%.5f')

# 8. Convolutional Autoencoder(CAE)

In [None]:
def CAE(x_win):
    """Train a 1-D convolutional autoencoder on a batch of windows and return
    the encoder output for that batch.

    Encoder: Conv1D(26) -> MaxPooling1D(5) -> Conv1D(13) -> MaxPooling1D(5)
    -> Conv1D(5). The code is flattened and reshaped to (8, 5) before the
    mirrored decoder (Conv1D / UpSampling1D) ending in Conv1D(26). Trained for
    100 epochs to reproduce ``x_win`` under mean squared error.

    NOTE(review): the hard-coded Reshape((8, 5)) and the final 26 filters tie
    this network to one specific window length / column count -- confirm they
    match the data being fed in.
    """
    sample_shape = x_win.shape[1:]

    def conv(n_filters, **extra):
        # every convolution here uses kernel size 3, relu, 'same' padding
        return Conv1D(n_filters, 3, activation='relu', padding='same', **extra)

    autoencoder = Sequential([
        # encoder
        conv(26, input_shape=sample_shape),
        MaxPooling1D(5, padding='same'),
        conv(13),
        MaxPooling1D(5, padding='same'),
        conv(5),
        # flatten the encoding, then restore (steps, channels) for the decoder
        Flatten(),
        Reshape((8, 5)),
        # decoder
        conv(5),
        UpSampling1D(5),
        conv(13),
        UpSampling1D(5),
        conv(26),
    ])
    autoencoder.compile(loss='mean_squared_error', optimizer='adam')
    autoencoder.fit(x_win, x_win, epochs=100, batch_size=100, verbose=0)

    # Chain the five trained encoder layers onto a fresh input.
    probe = Input(shape=sample_shape)
    code = probe
    for trained_layer in autoencoder.layers[:5]:
        code = trained_layer(code)
    encoder = Model(probe, code)
    return encoder.predict(x_win)

In [None]:
# Helper for batch training: cuts the training set into consecutive batches.
def chunks(l, n):
    """Yield consecutive slices of ``l`` containing at most ``n`` items each."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [None]:
# Number of windows and the batch size used by the batched autoencoders below.
n_train = len(train_dat)
batch_size = 100

In [None]:
# Split the window indices 0..n_train-1 into consecutive batches of batch_size.
train_idx = np.arange(n_train)
train_batch = list(chunks(train_idx, batch_size))

In [None]:
# Extract convolutional-autoencoder features: a separate CAE is trained on
# each batch of windows, and the encoded windows are collected in order.
cae = []
cae_win = []

for batch_idx in train_batch:
    cae_return = CAE(train_dat[batch_idx])
    
    for i in range(cae_return.shape[0]):
        cae_win.append(cae_return[i])
    print(batch_idx[0])  # progress indicator: first window index of the batch

# flatten each encoded window into one row: (n_windows, n_features)
cae_win = np.reshape(cae_win, (train_dat.shape[0], int(np.size(cae_win)/train_dat.shape[0])))
cae = np.array(cae_win, dtype=np.float32)

In [None]:
# Write the CAE feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/cae.csv', X=cae, delimiter=',', fmt='%.5f')

# 9. LSTM Autoencoder

In [None]:
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.utils import plot_model
from keras.datasets import reuters
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Lambda, Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, Flatten, Reshape
from keras.models import Model, Sequential

In [None]:
def LSTM_AE(x_win):
    """Train an LSTM autoencoder on a batch of windows and return the output
    of its first two layers for that batch.

    Model: LSTM(100) -> RepeatVector(n_steps) -> LSTM(200, sequences)
    -> TimeDistributed(Dense(n_columns)), trained for 100 epochs with MSE to
    reproduce ``x_win``.

    NOTE(review): the returned "encoding" includes the RepeatVector layer, so
    the LSTM(100) state is repeated once per time step -- confirm that this
    duplication is wanted.
    """
    sample_shape = x_win.shape[1:]
    n_steps = x_win.shape[1]
    n_columns = x_win.shape[2]

    # define and train the model
    model = Sequential([
        LSTM(100, activation='relu', input_shape=sample_shape),
        RepeatVector(n_steps),
        LSTM(200, activation='relu', return_sequences=True),
        TimeDistributed(Dense(n_columns)),
    ])
    model.compile(loss='mse', optimizer='adam')
    model.fit(x_win, x_win, epochs=100, verbose=0)

    # Chain the first two trained layers onto a fresh input.
    probe = Input(shape=sample_shape)
    code = probe
    for trained_layer in model.layers[:2]:
        code = trained_layer(code)
    encoder = Model(probe, code)
    return encoder.predict(x_win)

In [None]:
# Extract LSTM-autoencoder features: a separate model is trained on each
# batch of windows, and the encoded windows are collected in order.
lstm_ae = []
lstm_ae_win = []

for batch_idx in train_batch:
    lae_return = LSTM_AE(train_dat[batch_idx])
    
    for i in range(lae_return.shape[0]):
        lstm_ae_win.append(lae_return[i])
    print(batch_idx[0])  # progress indicator: first window index of the batch

# flatten each encoded window into one row: (n_windows, n_features)
lstm_ae_win = np.reshape(lstm_ae_win, (train_dat.shape[0], int(np.size(lstm_ae_win)/train_dat.shape[0])))
lstm_ae = np.array(lstm_ae_win, dtype=np.float32)

In [None]:
# Write the LSTM-autoencoder feature matrix to CSV with 5 decimal places.
np.savetxt('c:/tf2/lstm_ae.csv', X=lstm_ae, delimiter=',', fmt='%.5f')

# 10. Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering as HC
from sklearn.metrics import silhouette_score as ss
import pandas as pd
from scipy.stats import entropy
from mvpa2.mappers.som import SimpleSOMMapper
from sklearn.preprocessing import OneHotEncoder as ENC
from sklearn.cluster import KMeans

In [None]:
# Reload every feature matrix from CSV.
# NOTE(review): the files are read from 'c:/feature/fin/', while the cells
# above write to 'c:/tf2/' (and simple_ae under a different file name) --
# presumably the files were moved/renamed by hand; confirm.

# ARIMA and wavelet features: NaNs are replaced by 0
arima = np.loadtxt(fname='c:/feature/fin/arima.csv', delimiter=',', skiprows=0)
arima = np.array(arima)
arima = np.nan_to_num(arima)

wavelets = np.loadtxt(fname='c:/feature/fin/wavelets.csv', delimiter=',', skiprows=0)
wavelets = np.array(wavelets)
wavelets = np.nan_to_num(wavelets)

# Remaining feature sets are loaded as-is
dbns = np.loadtxt(fname='c:/feature/fin/dbns.csv', delimiter=',', skiprows=0)

simple_ae = np.loadtxt(fname='c:/feature/fin/simple_ae.csv', delimiter=',', skiprows=0)

stack_ae = np.loadtxt(fname='c:/feature/fin/stack_ae.csv', delimiter=',', skiprows=0)

de_ae = np.loadtxt(fname='c:/feature/fin/de_ae.csv', delimiter=',', skiprows=0)

vae = np.loadtxt(fname='c:/feature/fin/vae.csv', delimiter=',', skiprows=0)

cae = np.loadtxt(fname='c:/feature/fin/cae.csv', delimiter=',', skiprows=0)

lstm_ae = np.loadtxt(fname='c:/feature/fin/lstm_ae.csv', delimiter=',', skiprows=0)

In [None]:
# Feature matrices to evaluate (data) and their display names (data2), in the same order.
# Note: this rebinds `data`, which held the raw signal table in the loading cell.
data = [arima,wavelets,dbns,simple_ae,stack_ae,de_ae,vae,cae,lstm_ae]
data2 = ['arima','wavelets','dbns','simple_ae','stack_ae','de_ae','vae','cae','lstm_ae']

## 10.1 Hierarchical Clustering

In [None]:
def h_cluster(feature):
    """Cluster ``feature`` with Ward hierarchical clustering and count
    abnormal / normal samples per cluster.

    The number of clusters is chosen from 5..20 by the highest silhouette
    score. Counts are taken against the module-level ``abnormal_idx``
    (assumed to hold one 0/1 or boolean flag per row of ``feature`` --
    confirm where it is defined).

    Parameters
    ----------
    feature : array-like
        Feature matrix, one row per sample.

    Returns
    -------
    tuple of (list, list)
        ``(ab, nor)``: for cluster ids 0, 1, ... the number of abnormal and
        of normal samples in that cluster.

    Raises
    ------
    ValueError
        If ``abnormal_idx`` and the cluster labels differ in length.
    """
    candidate_labels, scores = [], []
    for n_cluster in range(5, 21):
        labels = HC(n_clusters=n_cluster, linkage='ward').fit_predict(feature)
        candidate_labels.append(labels)
        scores.append(ss(feature, labels))
    cluster_labels = np.asarray(candidate_labels[np.argmax(scores)])

    flags = np.asarray(abnormal_idx)
    if len(flags) != len(cluster_labels):
        raise ValueError('abnormal_idx and the cluster labels must have the same length')

    ab = []
    nor = []
    # np.unique replaces the unbound pd.Series.value_counts(ndarray) call
    # that was only used to count the distinct labels.
    for cluster_id in range(len(np.unique(cluster_labels))):
        members = cluster_labels == cluster_id
        ab_i = np.sum(flags[members])
        ab.append(ab_i)
        nor.append(np.sum(members) - ab_i)

    return (ab, nor)

In [None]:
# Run hierarchical clustering on every feature set and collect the
# per-cluster abnormal / normal counts.
h_abnormal = []
h_normal = []
h = []

for i in range(0,len(data)):
    h = h_cluster(data[i])  # (abnormal counts, normal counts)
    h_abnormal.append(h[0])
    h_normal.append(h[1])
    print(data2[i],'finished')

In [None]:
# Size-weighted average of the per-cluster binary entropy (normal vs abnormal)
# for each feature set, rounded to 3 decimals.
h_result = []

for j in range(0,len(data)):
    # a / b: fraction of normal / abnormal samples in each cluster
    a = np.divide(np.array(h_normal[j]), np.array(h_abnormal[j])+np.array(h_normal[j]))
    b = np.divide(np.array(h_abnormal[j]), np.array(h_abnormal[j])+np.array(h_normal[j]))

    # c: total number of samples, d: number of samples per cluster
    c = sum(h_normal[j])+sum(h_abnormal[j])
    d = np.array(h_normal[j])+np.array(h_abnormal[j])
    # entropy of each [a, b] column (one per cluster), weighted by cluster share d/c
    result_data = sum(entropy([a,b],base=2) * d/c)
    
    h_result.append(round(result_data,3))

In [None]:
# Display feature-set names (row 0) next to their hierarchical-clustering scores (row 1).
pd.DataFrame((data2,h_result))

## 10.2 Self-Organizing Map

In [None]:
def som_cluster(feature):
    """Cluster ``feature`` with a self-organizing map and count abnormal /
    normal samples per cluster.

    Square maps of side 5..20 are trained; each sample is labelled by the map
    node it is mapped to, and the map with the highest silhouette score is
    kept. Counts are taken against the module-level ``abnormal_idx``
    (assumed to hold one 0/1 or boolean flag per row of ``feature`` --
    confirm where it is defined).

    Parameters
    ----------
    feature : array-like
        Feature matrix, one row per sample.

    Returns
    -------
    tuple of (list, list)
        ``(ab, nor)``: for cluster ids 0, 1, ... the number of abnormal and
        of normal samples in that cluster.

    Raises
    ------
    ValueError
        If ``abnormal_idx`` and the cluster labels differ in length.
    """
    candidate_labels, scores = [], []
    for n_cluster in range(5, 21):
        som = SimpleSOMMapper((n_cluster, n_cluster), learning_rate=0.02, niter=500)
        som.train(feature)
        # assumes forward() yields one (row, col) grid coordinate per sample
        # (the original code indexed columns 0 and 1) -- TODO confirm
        coords = np.asarray(som.forward(feature))
        # Row-major node id: unique per grid node. The previous
        # `row + col**2` gave several nodes the same id, e.g. (1, 0) and
        # (0, 1), silently merging their samples into one cluster.
        node_id = (coords[:, 0] * n_cluster + coords[:, 1]).ravel()
        # Relabel to consecutive ids 0..k-1 (sorted by node id). This is what
        # the OneHotEncoder + argmax round trip did, without depending on the
        # encoder's `sparse` argument.
        _, som_idx = np.unique(node_id, return_inverse=True)
        scores.append(ss(feature, som_idx))
        candidate_labels.append(som_idx)

    cluster_labels = np.asarray(candidate_labels[np.argmax(scores)])

    flags = np.asarray(abnormal_idx)
    if len(flags) != len(cluster_labels):
        raise ValueError('abnormal_idx and the cluster labels must have the same length')

    ab = []
    nor = []
    for cluster_id in range(len(np.unique(cluster_labels))):
        members = cluster_labels == cluster_id
        ab_i = np.sum(flags[members])
        ab.append(ab_i)
        nor.append(np.sum(members) - ab_i)

    return (ab, nor)

In [None]:
# Run SOM clustering on every feature set and collect the
# per-cluster abnormal / normal counts.
som_abnormal = []
som_normal = []
som = []

for i in range(0,len(data)):
    som = som_cluster(data[i])  # (abnormal counts, normal counts)
    som_abnormal.append(som[0])
    som_normal.append(som[1])
    print(data2[i],'finished')

In [None]:
# Size-weighted average of the per-cluster binary entropy (normal vs abnormal)
# for each feature set, rounded to 3 decimals.
som_result = []

for j in range(0,len(data)):
    # a / b: fraction of normal / abnormal samples in each cluster
    a = np.divide(np.array(som_normal[j]), np.array(som_abnormal[j])+np.array(som_normal[j]))
    b = np.divide(np.array(som_abnormal[j]), np.array(som_abnormal[j])+np.array(som_normal[j]))

    # c: total number of samples, d: number of samples per cluster
    c = sum(som_normal[j])+sum(som_abnormal[j])
    d = np.array(som_normal[j])+np.array(som_abnormal[j])
    # entropy of each [a, b] column (one per cluster), weighted by cluster share d/c
    result_data = sum(entropy([a,b],base=2) * d/c)
    
    som_result.append(round(result_data,3))

## 10.3 K-Means Clustering

In [None]:
def KM_cluster(feature):
    """Cluster ``feature`` with k-means and count abnormal / normal samples
    per cluster.

    The number of clusters is chosen from 5..20 by the highest silhouette
    score. Counts are taken against the module-level ``abnormal_idx``
    (assumed to hold one 0/1 or boolean flag per row of ``feature`` --
    confirm where it is defined).

    Parameters
    ----------
    feature : array-like
        Feature matrix, one row per sample.

    Returns
    -------
    tuple of (list, list)
        ``(ab, nor)``: for cluster ids 0, 1, ... the number of abnormal and
        of normal samples in that cluster.

    Raises
    ------
    ValueError
        If ``abnormal_idx`` and the cluster labels differ in length.
    """
    candidate_labels, scores = [], []
    for n_cluster in range(5, 21):
        # The library default algorithm is used. The explicit
        # algorithm='auto' was that default in older scikit-learn and is
        # rejected by current releases.
        labels = KMeans(n_clusters=n_cluster).fit_predict(feature)
        scores.append(ss(feature, labels))
        candidate_labels.append(labels)

    cluster_labels = np.asarray(candidate_labels[np.argmax(scores)])

    flags = np.asarray(abnormal_idx)
    if len(flags) != len(cluster_labels):
        raise ValueError('abnormal_idx and the cluster labels must have the same length')

    ab = []
    nor = []
    # np.unique replaces the unbound pd.Series.value_counts(ndarray) call
    # that was only used to count the distinct labels.
    for cluster_id in range(len(np.unique(cluster_labels))):
        members = cluster_labels == cluster_id
        ab_i = np.sum(flags[members])
        ab.append(ab_i)
        nor.append(np.sum(members) - ab_i)

    return (ab, nor)

In [None]:
# Run k-means clustering on every feature set and collect the
# per-cluster abnormal / normal counts.
KM_abnormal = []
KM_normal = []
KM = []

for i in range(0,len(data)):
    KM = KM_cluster(data[i])  # (abnormal counts, normal counts)
    KM_abnormal.append(KM[0])
    KM_normal.append(KM[1])
    print(data2[i],'finished')

In [None]:
# Size-weighted average of the per-cluster binary entropy (normal vs abnormal)
# for each feature set, rounded to 3 decimals.
KM_result = []

for j in range(0,len(data)):
    # a / b: fraction of normal / abnormal samples in each cluster
    a = np.divide(np.array(KM_normal[j]), np.array(KM_abnormal[j])+np.array(KM_normal[j]))
    b = np.divide(np.array(KM_abnormal[j]), np.array(KM_abnormal[j])+np.array(KM_normal[j]))

    # c: total number of samples, d: number of samples per cluster
    c = sum(KM_normal[j])+sum(KM_abnormal[j])
    d = np.array(KM_normal[j])+np.array(KM_abnormal[j])
    # entropy of each [a, b] column (one per cluster), weighted by cluster share d/c
    result_data = sum(entropy([a,b],base=2) * d/c)
    
    KM_result.append(round(result_data,3))

In [None]:
# Display the k-means scores as an array.
np.array(KM_result)

In [None]:
# Display feature-set names (row 0) next to their k-means scores (row 1).
pd.DataFrame((data2,KM_result))