In [0]:
# 구글 드라이브 마운트, 주피터에선 사용 X
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# 필수 라이브러리 import
import numpy as np
import pandas as pd
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM
from torch.utils.data import DataLoader

Using TensorFlow backend.


In [0]:
# 변수 설정
normal_path = '/content/gdrive/My Drive/캡스톤/pre_processing_data2/' #정상파일 경로
mal_path = '/content/gdrive/My Drive/캡스톤/mal_pre/' #악성파일 경로
model_path = '/content/gdrive/My Drive/캡스톤/' #모델 가중치 저장 경로
max_len = 3 #n-그램 설정
batch_size = 1024
epochs = 10

In [0]:
### 함수 설정 ###

# 데이터 불러오기
def data_load(path):
  file_names = os.listdir(path)
  data = []

  for file_name in file_names:
    f = open(path + file_name)
    tokens = f.read()
    data.append(tokens.split('\n'))
  
  return data

# 데이터 전처리
## word -> index 변환
def get_index(word, t):
  if word in t.word_index:
    return [t.word_index[word]]
  else:
    return [0]

## 시퀀스 데이터 생성
def seqs_gen(data, t):
  seqs = []
  zero_vec = [0] * (max_len - 2)
  for words in data:
    seqs.append(zero_vec + get_index(words[0], t) + get_index(words[1], t))
    for i in range(2, len(words)):
      seqs.append(seqs[i - 2][1:] + get_index(words[i], t))
  return np.array(seqs)

## x와 y형태로 분할
def slice_nparray(seqs):
  x = seqs[:,:-1]
  y = to_categorical(seqs[:,-1], num_classes=vocab_size)
  
  return x, y

In [0]:
# 데이터 불러오기 및 train, valid 분할
normal_data = data_load(normal_path)
mal_data = data_load(mal_path)

train = normal_data[:80]
valid = normal_data[80:]

In [0]:
# word_index 생성
t = Tokenizer()
t.fit_on_texts(train)
vocab_size = len(t.word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

단어 집합의 크기 : 593


In [0]:
# 시퀀스 데이터 생성
normal_train_seqs = seqs_gen(train, t)
normal_valid_seqs = seqs_gen(valid, t)
mal_seqs = seqs_gen(mal_data, t)

In [0]:
# x와 y로 분할
x_train, y_train = slice_nparray(normal_train_seqs)
x_valid, y_valid = slice_nparray(normal_valid_seqs)
x_test, y_test = slice_nparray(mal_seqs)

In [0]:
# 모델 정의
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_len-1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))






In [0]:
# 모델 최초 생성
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train,
          y_train,
          epochs=epochs,
          batch_size=batch_size,
          verbose=1)
model.save_weights(model_path + "lstm_model.h5")



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
# 모델 가중치 불러오기
model.load_weights(model_path + 'lstm_model.h5')
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [0]:
# 모델 평가 함수
def evaluate(x, y, dataset):
  pred_y = model.predict(x)
  valid_len = len(pred_y)

  threshold = round(vocab_size * 0.9)
  count = 0
  well = 0
  index = 0
  abnormal = 0
  abnormals = []

  for i in range(valid_len):
    print("\r{} / {}".format(i + 1, valid_len), end="")
    valid_index = np.where(y[i] == 1)[0]

    if valid_index == 0:
      pred_per = 0
    else:
      pred_per = pred_y[i][valid_index]
      pred_y[i].sort()

    if pred_per >= pred_y[i][threshold]:
      well += 1

    count += 1
    if count == len(dataset[index]):
      abnormal = count - well
      score = abnormal / len(dataset[index])
      index += 1
      count = 0
      well = 0
      abnormals.append(score)
  print()
  abnormals = np.array(abnormals)
  print("악성의 비율: {}".format(abnormals.mean()))


In [0]:
# 모델 평가
evaluate(x_train, y_train, normal_data[:80])
evaluate(x_valid, y_valid, normal_data[80:])
evaluate(x_test, y_test, mal_data)

2696067 / 2696067
악성의 비율: 0.05003652134733598
677198 / 677198
악성의 비율: 0.06196408747917996
518339 / 518339
악성의 비율: 0.13876476357540685
