In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import IsolationForest
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.model_selection import train_test_split
from numpy.random import seed
import glob
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Data Load & PreProcessing

In [3]:
# Initialize the notebook-wide accumulators; each starts out empty (None)
# and is assigned by later cells.
#   train_data            <- train1~23
#   phase1_test_normal    <- test01,02,03
#   phase1_test_abnormal  <- test_abn1,abn2
#   phase2_final_normal   <- final01, label = 0
#   phase2_final_abnormal <- final01, label = 1
train_data = phase1_test_normal = phase1_test_abnormal = None
phase2_final_normal = phase2_final_abnormal = None

In [4]:
# calculate f1 value
def acc_f1(y_true, y_pred, abnormal = 0):
    """Return ``(accuracy, f1)`` with ``abnormal`` treated as the positive class.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels. Both are converted with
        ``np.asarray`` so plain lists are compared element-wise.
    abnormal : scalar, default 0
        Label value that marks an abnormal (positive) sample.

    Returns
    -------
    (accuracy, f1) : tuple of float
        Precision, recall, F1 and accuracy are defined as 0.0 whenever their
        denominator is zero (e.g. no abnormal sample present or predicted)
        instead of evaluating to NaN.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    true_abn = y_true == abnormal
    pred_abn = y_pred == abnormal

    # Confusion matrix: rows = truth (normal, abnormal), columns = prediction.
    cm = np.zeros((2,2))
    cm[0][0] = np.sum(~true_abn & ~pred_abn)
    cm[0][1] = np.sum(~true_abn & pred_abn)
    cm[1][0] = np.sum(true_abn & ~pred_abn)
    cm[1][1] = np.sum(true_abn & pred_abn)

    predicted_abn = np.sum(cm[:,1])
    actual_abn = np.sum(cm[1,:])
    total = np.sum(cm)

    # Guard every ratio: a missing class must not turn the score into NaN.
    precision = cm[1][1] / predicted_abn if predicted_abn else 0.0
    recall = cm[1][1] / actual_abn if actual_abn else 0.0
    if precision + recall:
        f1 = 2*precision*recall/(precision+recall)
    else:
        f1 = 0.0
    accuracy = (cm[0][0] + cm[1][1]) / total if total else 0.0
    return accuracy, f1

# calculate iou value
def IOU(y_true, y_pred, abnormal = 0):
    """Intersection-over-union of the abnormal samples in truth and prediction.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels. Both are converted with
        ``np.asarray`` so plain lists are compared element-wise.
    abnormal : scalar, default 0
        Label value that marks an abnormal sample.

    Returns
    -------
    float
        ``|truth ∩ prediction| / |truth ∪ prediction|`` over abnormal samples.
        Returns 0.0 when the union is empty (the abnormal label occurs in
        neither input) instead of dividing by zero.
    """
    true_abn = np.asarray(y_true) == abnormal
    pred_abn = np.asarray(y_pred) == abnormal
    union = np.sum(true_abn | pred_abn)
    if union == 0:
        return 0.0
    intersection = np.sum(true_abn & pred_abn)
    return intersection / union

In [5]:
# normalization
def nor(data, interval, strides):
    """Min-max scale ``data`` column-wise and cut it into fixed-length windows.

    Scaling uses the min/max of the ``data`` passed in (not of any other
    dataset). Values that become NaN (for instance a constant column, where
    max == min) are replaced by 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are consecutive samples, columns are features.
    interval : int
        Number of rows per window; must be >= 1.
    strides : int
        Step between the start rows of consecutive windows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, interval, n_features)``. When ``data`` has
        fewer than ``interval`` rows the result has ``n_windows == 0`` rather
        than raising.

    Raises
    ------
    ValueError
        If ``interval`` is smaller than 1.
    """
    if interval < 1:
        raise ValueError("interval must be >= 1, got %r" % (interval,))

    data_norm = (data - data.min()) / (data.max() - data.min())
    data_norm = data_norm.fillna(0)
    values = data_norm.values

    # Only start rows that leave room for a complete window; the incomplete
    # tail is never materialised.
    starts = range(0, values.shape[0] - interval + 1, strides)
    if len(starts) > 0:
        X_data = np.stack([values[i:i + interval] for i in starts], axis=0)
    else:
        # Too few rows for a single window: keep the trailing dimensions so
        # downstream shape checks still work.
        X_data = np.empty((0, interval) + values.shape[1:], dtype=values.dtype)
    print(X_data.shape)
    return X_data

In [6]:
# data accumulation
# delete 3 meaningless columns
def _append_rows(accumulated, data):
    """Stack ``data`` below ``accumulated``, print the resulting shape and return it."""
    if accumulated is None:
        combined = data
    else:
        combined = np.concatenate([accumulated, data], axis=0)
    print(combined.shape)
    return combined


def file_concat(input_path):
    """Read every ``*.csv`` in ``input_path`` and append it to the matching global.

    Files are processed in sorted order and routed by *file name* (the
    directory part of the path is ignored so that a folder name containing
    'train', 'test', 'final' or '_' cannot misroute files):

    * name contains 'train'                 -> ``train_data``
    * name contains 'test' and no '_'       -> ``phase1_test_normal``
    * name contains 'test' and a '_'        -> ``phase1_test_abnormal``

    The first column of every file is dropped. The columns ``sensor_smk``,
    ``sensor_air`` and ``sensor_cycle`` are dropped as well unless the file
    name contains 'final'. Each file path and each updated shape is printed.

    The first file stored in a global is kept as a DataFrame; once a second
    file is appended the global becomes a plain ndarray (``np.concatenate``).
    Calling the function again with another directory keeps accumulating.
    """
    global train_data, phase1_test_normal, phase1_test_abnormal
    for input_file in sorted(glob.glob(os.path.join(input_path,'*.csv'))):
        print(input_file)
        file_name = os.path.basename(input_file)
        data = pd.read_csv(input_file)
        data = data.iloc[:,1:]
        if 'final' not in file_name:
            data = data.drop(['sensor_smk','sensor_air','sensor_cycle'], axis=1)
        if 'train' in file_name:
            train_data = _append_rows(train_data, data)
        if 'test' in file_name:
            if '_' not in file_name:
                phase1_test_normal = _append_rows(phase1_test_normal, data)
            else:
                phase1_test_abnormal = _append_rows(phase1_test_abnormal, data)

In [7]:
# The CSV files live in two directories; load them in this fixed order so the
# rows accumulate in the same sequence on every run.
for problem_dir in ('./../../problem1/', './../problem1/'):
    file_concat(problem_dir)

./../../problem1/test01.csv
(79032, 108)
./../../problem1/test01_PCR.csv
(16342, 108)
./../../problem1/test02.csv
(128043, 108)
./../../problem1/test02_smkLim.csv
(33374, 108)
./../../problem1/test03.csv
(176071, 108)
./../../problem1/train01.csv
(86058, 108)
./../../problem1/train02.csv
(172839, 108)
./../../problem1/train03.csv
(225630, 108)
./../../problem1/train04.csv
(312409, 108)
./../../problem1/train05.csv
(389259, 108)
./../../problem1/train06.csv
(480356, 108)
./../../problem1/train07.csv
(544137, 108)
./../../problem1/train08.csv
(550518, 108)
./../../problem1/train09.csv
(573715, 108)
./../../problem1/train10.csv
(645708, 108)
./../../problem1/train11.csv
(689912, 108)
./../../problem1/train12.csv
(743932, 108)
./../problem1/final01.csv
./../problem1/final_label.csv
./../problem1/train13.csv
(791564, 108)
./../problem1/train14.csv
(798129, 108)
./../problem1/train15.csv
(824643, 108)
./../problem1/train16.csv
(837563, 108)
./../problem1/train17.csv
(891065, 108)
./../proble

In [8]:
# Wrap each accumulated block in a DataFrame so nor() can use the pandas API.
train, test_a, ab_a = (
    pd.DataFrame(block)
    for block in (train_data, phase1_test_normal, phase1_test_abnormal)
)

In [9]:
# Final (phase 2) data: sensor rows plus a separate label file, label = 1 --> abnormal.
# The first column of final01.csv is dropped, as for the other CSVs.
dt_final = pd.read_csv('./../problem1/final01.csv').iloc[:, 1:]
dt_label = pd.read_csv(
    './../problem1/final_label.csv',
    header=None,
    names=['final_label'],
)

# Put the labels next to the sensor columns, then remove the three unused sensors.
X_data_fin_test = pd.concat([dt_final, dt_label], axis=1).drop(
    ['sensor_smk', 'sensor_air', 'sensor_cycle'], axis=1
)

# Split by label and discard the label column itself.
phase2_final_normal = X_data_fin_test.loc[
    X_data_fin_test['final_label'] == 0.0
].drop(['final_label'], axis=1)
phase2_final_abnormal = X_data_fin_test.loc[
    X_data_fin_test['final_label'] == 1.0
].drop(['final_label'], axis=1)

In [10]:
# Window length and step (in rows) used for every dataset.
interval, strides = 10, 2

# Normalize and window the five datasets in a fixed order; nor() prints each
# resulting shape. NOTE: the two phase2_* names are rebound from DataFrames to
# windowed arrays here, so this cell cannot be re-run on its own.
(train_a,
 test_b,
 ab_b,
 phase2_final_normal,
 phase2_final_abnormal) = [
    nor(frame, interval, strides)
    for frame in (train, test_a, ab_a, phase2_final_normal, phase2_final_abnormal)
]

(521815, 10, 108)
(88031, 10, 108)
(16683, 10, 108)
(5414, 10, 108)
(5266, 10, 108)


In [11]:
# Short aliases for the windowed arrays (same objects, no copies).
train_n, test_n, ab_n = train_a, test_b, ab_b
p2_test_n, p2_ab_n = phase2_final_normal, phase2_final_abnormal

In [63]:
# Sanity check: shapes of the five windowed arrays.
tuple(windows.shape for windows in (train_n, test_n, ab_n, p2_test_n, p2_ab_n))

((521815, 10, 108),
 (88031, 10, 108),
 (16683, 10, 108),
 (5414, 10, 108),
 (5266, 10, 108))

# Isolation forest

In [12]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed

In [13]:
# Autoencoder geometry: size of the encoded vector and window length (rows).
n_dimensions, timesteps = 64, 10
# Feature count is read from the last axis of the 3-D training windows.
input_dim = train_a.shape[2]

In [14]:
# Shapes of the five windowed arrays, shown again next to the model constants.
tuple(arr.shape for arr in (train_n, test_n, ab_n, p2_test_n, p2_ab_n))

((521815, 10, 108),
 (88031, 10, 108),
 (16683, 10, 108),
 (5414, 10, 108),
 (5266, 10, 108))

# LSTM encoder 2

In [55]:
# Using auto encoder for the best optimization
def get_model(x):
    """Build an LSTM sequence autoencoder and its encoder sub-model.

    The layer sizes come from the module-level ``timesteps``, ``input_dim``
    and ``n_dimensions``; the argument ``x`` is accepted but never read.

    Returns
    -------
    (autoencoder, encoder)
        ``autoencoder`` maps a window to its reconstruction; ``encoder`` maps
        the same input to the ``n_dimensions``-wide output of the "encoder"
        layer. Both models are built on the same input and encoder layer.
    """
    sequence_in = Input(shape=(timesteps, input_dim))
    # Encoder: collapse the whole window into one vector.
    latent = LSTM(n_dimensions, return_sequences=False, name="encoder")(sequence_in)
    # Decoder: repeat that vector once per timestep and unroll it back.
    repeated = RepeatVector(timesteps)(latent)
    reconstruction = LSTM(input_dim, return_sequences=True, name='decoder')(repeated)
    return Model(sequence_in, reconstruction), Model(sequence_in, latent)

get_model(train_a)

(<keras.engine.training.Model at 0x7f4a97779a90>,
 <keras.engine.training.Model at 0x7f4a978ce390>)

In [56]:
# Training
autoencoder, encoder = get_model(n_dimensions)
autoencoder.compile(
    loss='mse',
    optimizer='rmsprop',
    metrics=['acc', 'cosine_proximity'],
)

# Autoencoder objective: the input windows are also the reconstruction target.
history = autoencoder.fit(x=train_a, y=train_a, epochs=10, batch_size=500)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [57]:
# Encode every dataset with the trained encoder, in a fixed order.
# The two "final" sets are the validation data set.
(encoded_train,
 encoded_test,
 encoded_ab,
 encoded_final_test,
 encoded_final_ab) = map(
    encoder.predict,
    (train_a, test_b, ab_b, phase2_final_normal, phase2_final_abnormal),
)

In [None]:
# Optional on-disk cache of the encoder outputs (currently disabled).
# For each array there is a np.save line that writes it and a np.load line
# that restores it; uncomment whichever one is needed.
# np.save('./encoded_train_0726', encoded_train)
# encoded_train = np.load('./encoded_train_0726.npy')
# np.save('./encoded_test_0726', encoded_test)
# encoded_test = np.load('./encoded_test_0726.npy')
# np.save('./encoded_ab_0726', encoded_ab)
# encoded_ab = np.load('./encoded_ab_0726.npy')
# np.save('./encoded_final_test_0726', encoded_final_test)
# encoded_final_test = np.load('./encoded_final_test_0726.npy')
# np.save('./encoded_final_ab_0726', encoded_final_ab)
# encoded_final_ab = np.load('./encoded_final_ab_0726.npy')

In [58]:
# Sanity check: shapes of the five encoded arrays.
tuple(
    codes.shape
    for codes in (encoded_train, encoded_test, encoded_ab, encoded_final_test, encoded_final_ab)
)

((521815, 64), (88031, 64), (16683, 64), (5414, 64), (5266, 64))

In [104]:
# Isolation forest fitted on the encoded training windows.
clf = IsolationForest(
    max_samples=64,
    max_features=32,
    contamination=0.038,
    behaviour="new",
    random_state=16,
    n_jobs=-1,
)
clf.fit(encoded_train)

#phase 1 -> acc
pred_test, pred_abnor = clf.predict(encoded_test), clf.predict(encoded_ab)
pred_if = np.hstack((pred_test, pred_abnor))
# Ground truth in the same order as the predictions: 1 for normal, -1 for abnormal.
Y_data_if = np.repeat([1, -1], [encoded_test.shape[0], encoded_ab.shape[0]])
result_if = acc_f1(Y_data_if, pred_if, abnormal=-1)

#phase 2  -> iou
pred_fin_test, pred_fin_abnor = (
    clf.predict(encoded_final_test),
    clf.predict(encoded_final_ab),
)
pred_iou = np.hstack((pred_fin_test, pred_fin_abnor))
Y_data_iou = np.repeat(
    [1, -1], [encoded_final_test.shape[0], encoded_final_ab.shape[0]]
)
result_IOU = IOU(Y_data_iou, pred_iou, abnormal=-1)

#print("Accuracy: %.8f"%((result_if[0]+result_if[1])/2))
#print("IOU: %.8f"%(result_IOU))
# Reported number = mean of [mean(phase-1 accuracy, phase-1 F1)] and phase-2 IOU.
print("Accuracy: %.8f" % (
    (((result_if[0] + result_if[1]) / 2) + (result_IOU)) / 2
))

Accuracy: 0.81427548
