In [1]:
import numpy as np
import keras
from keras.models import Sequential , Model
from keras.layers import Input , RepeatVector , TimeDistributed  , GRU , Bidirectional , LSTM , Dense
from keras import regularizers
import os
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
from keras.models import model_from_json
from keras.datasets import mnist
from gensim.models.word2vec import Word2Vec

# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
if gpus:
    try:
        # TensorFlow requires the memory-growth setting to be identical on every
        # visible GPU, so configure all of them rather than only gpus[0].
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be configured at program start, before the GPUs
        # have been initialized.
        print(e)


Using TensorFlow backend.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [28]:
def double_model(maxlen , word_vector):
    """Build a stacked-GRU sequence autoencoder.

    The encoder (GRU 256 -> GRU 128) compresses the whole input sequence into
    a single 128-dimensional vector. The decoder repeats that vector
    ``maxlen`` times and unrolls it (GRU 128 -> GRU ``word_vector`` ->
    per-step Dense) back into a sequence with the input's shape.

    Args:
        maxlen: Number of time steps in every input sequence.
        word_vector: Size of the feature vector at each time step (an integer
            dimension, not the embedding table itself).

    Returns:
        An uncompiled Keras ``Model`` that maps inputs of shape
        ``(batch, maxlen, word_vector)`` to outputs of the same shape.
    """
    inputs = Input(shape=(maxlen, word_vector))

    # Encoder: the second GRU returns only its last state (the bottleneck).
    encoded = GRU(256, return_sequences=True)(inputs)
    encoded = GRU(128, return_sequences=False)(encoded)

    # Decoder: feed the bottleneck vector to every time step and expand it.
    decoded = RepeatVector(maxlen)(encoded)
    decoded = GRU(128, return_sequences=True)(decoded)
    decoded = GRU(word_vector, return_sequences=True)(decoded)
    output = TimeDistributed(Dense(word_vector))(decoded)

    # The previous version also built Model(inputs, encoded) here and threw
    # it away; that unused model has been dropped.
    return Model(inputs, output)

In [3]:
def data_load(BASEPATH, max_files=20000):
    """Unpickle the files found directly inside a directory.

    Args:
        BASEPATH: Directory to read. Every entry in it is opened as a pickle
            file.
        max_files: Upper bound on the number of files to load, or ``None`` to
            load all of them. The default (20000) is the cap that used to be
            hard-coded.

    Returns:
        A list with one unpickled object per file, in sorted file-name order.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes both the
    # selected subset and the resulting sample order reproducible across runs.
    filenames = sorted(os.listdir(BASEPATH))[:max_files]

    data_files = []
    for filename in tqdm(filenames):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only point this function at directories with trusted content.
        with open(os.path.join(BASEPATH, filename), 'rb') as f:
            data_files.append(pickle.load(f))
    return data_files
    
    

In [4]:
def pre_processing(data_files, word_vector, func_len , vector_size, min_len=15):
    """Turn loaded samples into fixed-length sequences of embedding vectors.

    Each element of ``data_files`` is iterated as a collection of functions;
    each function is a sequence of blocks, and each block is an iterable of
    mnemonics. For every function, the mnemonics of its first ``func_len``
    blocks are looked up in ``word_vector`` and concatenated in order.

    Args:
        data_files: Iterable of loaded samples (see structure above).
        word_vector: Mapping from mnemonic to embedding vector. It must
            contain the key ``'unknown'``, which is used for mnemonics that
            are missing from the mapping.
        func_len: Length of every output sequence. Longer functions are
            truncated; shorter ones are zero-padded at the end.
        vector_size: Length of one embedding vector (sets the padding width).
        min_len: Functions with fewer embedded mnemonics than this are
            dropped. The default (15) is the value that used to be hard-coded.

    Returns:
        ``numpy.ndarray`` of shape ``(n_functions, func_len, vector_size)``.
        When no function qualifies the array is empty but keeps that 3-D shape.
    """
    unknown = 'unknown'
    zero_padding = [0] * vector_size

    # Only the first `func_len` vectors are kept and the length filter only
    # needs to know whether `min_len` was reached, so embedding more than
    # max(func_len, min_len) mnemonics can never change the result.
    max_needed = max(func_len, min_len)

    samples = []
    for data_file in tqdm(data_files):
        for blocks in data_file:
            vec_block = []
            for block in blocks[:func_len]:
                for mnemonic in block:
                    try:
                        vec_block.append(word_vector[mnemonic])
                    except KeyError:
                        # Out-of-vocabulary mnemonic: use the shared fallback vector.
                        vec_block.append(word_vector[unknown])
                    if len(vec_block) >= max_needed:
                        break
                if len(vec_block) >= max_needed:
                    break

            if len(vec_block) < min_len:
                continue  # too short to be a useful sample

            vec_block = vec_block[:func_len]
            vec_block.extend([zero_padding] * (func_len - len(vec_block)))
            samples.append(vec_block)

    if not samples:
        # np.array([]) would be 1-D; keep the documented 3-D layout instead.
        return np.empty((0, func_len, vector_size))
    X_train = np.array(samples)
    return X_train

In [5]:
# Load the saved Word2Vec model and keep a handle to its keyed vectors,
# which is what the preprocessing step indexes by mnemonic.
WORD2VEC_PATH = 'word2vec_0402_64_upgrade.wv'
word2vec = Word2Vec.load(WORD2VEC_PATH)
word_vector = word2vec.wv


In [6]:
# Load the pickled samples stored under the `ben` data directory.
BEN_DATA_DIR = r'C:\capstone\modeling\3\data\ben'
data_files = data_load(BEN_DATA_DIR)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:33<00:00, 46.79it/s]


In [7]:
# Embed every function as an 80-step sequence of 64-dimensional vectors.
pre_datas = pre_processing(data_files, word_vector, func_len=80, vector_size=64)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:26<00:00, 25.90it/s]


In [11]:
# Positional 90/10 split: the first 90% of samples train the model and the
# remainder is held out.
# NOTE(review): the split is made over individual samples, not over source
# files, so samples from one file can end up on both sides -- confirm that
# this is intended.
rate = 0.9
data_size = len(pre_datas)
split_at = int(data_size * rate)
X_train, X_test = pre_datas[:split_at], pre_datas[split_at:]

print('Train : ', len(X_train))
print('Test : ', len(X_test))


Train :  3175998
Test :  352889


In [29]:
# Build the autoencoder for 80-step sequences of 64-dim vectors and compile it
# with mean absolute error as the reconstruction loss.
# NOTE: this rebinds the name `double_model` from the builder function to the
# model instance, so this cell can only be re-run after re-running the cell
# that defines the function.
double_model = double_model(maxlen=80, word_vector=64)
double_model.compile(loss='mae', optimizer='adam')
double_model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 80, 64)            0         
_________________________________________________________________
gru_9 (GRU)                  (None, 80, 256)           246528    
_________________________________________________________________
gru_10 (GRU)                 (None, 128)               147840    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 80, 128)           0         
_________________________________________________________________
gru_11 (GRU)                 (None, 80, 128)           98688     
_________________________________________________________________
gru_12 (GRU)                 (None, 80, 64)            37056     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 80, 64)            4160

In [24]:
# Sanity check: display the training array's shape before fitting.
X_train.shape

(3175998, 80, 64)

In [30]:
# Train the autoencoder to reproduce its own input (targets == inputs) and
# monitor reconstruction loss on the held-out split.
# NOTE(review): shuffle=False keeps batches in file order; the samples look
# independent rather than a time series, so confirm this is intentional.
double_model.fit(
    X_train,
    X_train,
    validation_data=(X_test, X_test),
    epochs=25,
    batch_size=2048,
    shuffle=False,
    verbose=2,
)

Train on 3175998 samples, validate on 352889 samples
Epoch 1/25
 - 1449s - loss: 0.0782 - val_loss: 0.0691
Epoch 2/25
 - 1450s - loss: 0.0688 - val_loss: 0.0676
Epoch 3/25
 - 1382s - loss: 0.0674 - val_loss: 0.0653
Epoch 4/25
 - 1726s - loss: 0.0660 - val_loss: 0.0636
Epoch 5/25
 - 1571s - loss: 0.0649 - val_loss: 0.0620
Epoch 6/25
 - 1357s - loss: 0.0636 - val_loss: 0.0612
Epoch 7/25
 - 1444s - loss: 0.0626 - val_loss: 0.0625
Epoch 8/25
 - 1413s - loss: 0.0614 - val_loss: 0.0588
Epoch 9/25
 - 1446s - loss: 0.0604 - val_loss: 0.0577
Epoch 10/25
 - 1672s - loss: 0.0587 - val_loss: 0.0559
Epoch 11/25
 - 1458s - loss: 0.0568 - val_loss: 0.0570
Epoch 12/25
 - 1429s - loss: 0.0554 - val_loss: 0.0526
Epoch 13/25
 - 1440s - loss: 0.0538 - val_loss: 0.0518
Epoch 14/25
 - 1406s - loss: 0.0525 - val_loss: 0.0496
Epoch 15/25
 - 1451s - loss: 0.0516 - val_loss: 0.0495
Epoch 16/25
 - 1620s - loss: 0.0508 - val_loss: 0.0495
Epoch 17/25
 - 1439s - loss: 0.0553 - val_loss: 0.1079
Epoch 18/25
 - 1414s 

<keras.callbacks.callbacks.History at 0x222ed2d8dc8>

In [40]:
# Persist the model in two parts: architecture as JSON, weights as HDF5.
ARCH_PATH = "model_ida_gru_0408.json"
WEIGHTS_PATH = "model_ida_gru_0408.h5"

model_json = double_model.to_json()
with open(ARCH_PATH, "w") as json_file:
    json_file.write(model_json)

double_model.save_weights(WEIGHTS_PATH)
print("Saved model to disk0408")




Saved model to disk0408


In [93]:
# Score held-out `ben` samples by reconstruction error and flag those whose
# mean absolute error exceeds THRESHOLD.
THRESHOLD = 0.1

# Number of held-out samples to score.
# NOTE(review): 54175 equals the row count shown for `mal_data` elsewhere in
# this notebook, presumably so both evaluations use the same number of
# samples -- confirm.
N_BEN_EVAL = 54175

X_ben_eval = X_test[:N_BEN_EVAL]
X_test_pred = double_model.predict(X_ben_eval)

# Per-time-step MAE (axis 2 is the feature axis), then the mean over the
# actual sequence length. This replaces a Python double loop that divided by
# a hard-coded 80.
test_mae_loss = np.mean(np.abs(X_test_pred - X_ben_eval), axis=2)
loss_data = test_mae_loss.mean(axis=1)

test_score_df = pd.DataFrame({'loss': loss_data})
test_score_df['threshold'] = THRESHOLD
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold

anomalies_ben = test_score_df[test_score_df.anomaly]

In [94]:
# Count the flagged `ben` samples per loss range, then show the flagged rows.
ben_losses = anomalies_ben['loss']

nine_count = int((ben_losses < 0.1).sum())                             # loss < 0.1
one_count = int(((ben_losses >= 0.1) & (ben_losses <= 0.15)).sum())    # 0.1 <= loss <= 0.15
one_to_five = int(((ben_losses > 0.15) & (ben_losses <= 0.2)).sum())   # 0.15 < loss <= 0.2
two_count = len(ben_losses) - nine_count - one_count - one_to_five     # everything else

for bucket_total in (nine_count, one_count, one_to_five, two_count):
    print(bucket_total)

anomalies_ben

0
8697
2222
304


Unnamed: 0,loss,threshold,anomaly
20,0.109733,0.1,True
37,0.100678,0.1,True
190,0.109438,0.1,True
268,0.101253,0.1,True
297,0.133012,0.1,True
...,...,...,...
54152,0.118925,0.1,True
54153,0.186058,0.1,True
54154,0.129177,0.1,True
54155,0.129198,0.1,True


In [41]:
# Rebuild the architecture from its JSON description, then restore the weights.
# The context manager closes the file even if reading fails.
with open("model_ida_gru_0408.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

loaded_model.load_weights("model_ida_gru_0408.h5")
print('complte')


complte


In [44]:
# Load the pickled samples stored under the `mal` data directory.
MAL_DATA_DIR = r'C:\capstone\modeling\3\data\mal'
mal_files = data_load(MAL_DATA_DIR)

100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:02<00:00, 187.31it/s]


In [83]:
# Embed the `mal` samples with the same sequence length (80) and vector size
# (64) used for the training data.
mal_data = pre_processing(mal_files, word_vector, func_len=80, vector_size=64)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [00:07<00:00, 68.65it/s]


In [84]:
# Sanity check: display the shape of the embedded `mal` samples.
mal_data.shape

(54175, 80, 64)

In [85]:
# Score the `mal` samples by reconstruction error and flag those whose mean
# absolute error exceeds THRESHOLD.
# NOTE(review): scoring uses the in-memory `double_model`; the `loaded_model`
# restored from disk in an earlier cell is not used here -- confirm.
THRESHOLD = 0.1

X_test_pred = double_model.predict(mal_data)

# Per-time-step MAE (axis 2 is the feature axis), then the mean over the
# actual sequence length. This replaces a Python double loop that divided by
# a hard-coded 80.
test_mae_loss = np.mean(np.abs(X_test_pred - mal_data), axis=2)
loss_data = test_mae_loss.mean(axis=1)

test_score_df = pd.DataFrame({'loss': loss_data})
test_score_df['threshold'] = THRESHOLD
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold

anomalies_mal = test_score_df[test_score_df.anomaly]

In [86]:
# Count the flagged `mal` samples per loss range, then show the flagged rows.
mal_losses = anomalies_mal['loss']

nine_count = int((mal_losses < 0.1).sum())                             # loss < 0.1
one_count = int(((mal_losses >= 0.1) & (mal_losses <= 0.15)).sum())    # 0.1 <= loss <= 0.15
one_to_five = int(((mal_losses > 0.15) & (mal_losses <= 0.2)).sum())   # 0.15 < loss <= 0.2
two_count = len(mal_losses) - nine_count - one_count - one_to_five     # everything else

for bucket_total in (nine_count, one_count, one_to_five, two_count):
    print(bucket_total)

anomalies_mal

0
11299
1367
2086


Unnamed: 0,loss,threshold,anomaly
0,0.307676,0.1,True
1,0.307676,0.1,True
2,0.223715,0.1,True
3,0.303514,0.1,True
4,0.300381,0.1,True
...,...,...,...
54156,0.138105,0.1,True
54164,0.101490,0.1,True
54169,0.109547,0.1,True
54172,0.133012,0.1,True
