## RNN 모델 N-Byte 방식 (함수정보 포함 vs 미포함 => 1:1 비율)

## (1) 데이터로드

In [1]:
# (1) Load the data
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

# Make the notebook display the value of every expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Read the CSV; its first column is used as the index.
# The columns used later in this notebook are 'bin' and 'label'.
gcc6_0_32 = pd.read_csv(
    "../data/binutils_gcc3~9_op0~4_exec_csv/" + 'gcc6' + "_0_32_exec.csv",
    index_col=0,
)

# Show (rows, columns)
print(gcc6_0_32.shape)

# Replace the index read from the file with a fresh 0..n-1 index, because
# the N-byte windowing below does arithmetic on index values.
gcc6_0_32 = gcc6_0_32.reset_index(drop=True)

print('reset_index 완료')
print('input data shape')
gcc6_0_32.head()

(13936744, 2)
reset_index 완료
input data shape


Unnamed: 0,bin,label
0,53,1
1,83,0
2,ec,0
3,08,0
4,e8,0


In [2]:
# (2-1) Sanity check 1: number of distinct values in the 'bin' column.
# The model further down is built for 256 one-hot columns, so this is
# expected to print 256.
distinct_bytes = gcc6_0_32['bin'].unique()
print(len(distinct_bytes))

# (2-2) Sanity check 2: balance of the 'label' column.
# Rows labelled 1 are the ones used as window starts below, so the count of
# 1s is the number of positive windows.
label_counts = gcc6_0_32['label'].value_counts()
print(label_counts)

256
0    13892354
1       44390
Name: label, dtype: int64


## (3) N Byte씩 자르기

In [3]:
# Build the positive samples: one fixed-length byte window starting at every
# row whose label is 1.
idx = gcc6_0_32[gcc6_0_32['label']==1].index  # positions with label == 1
ls = list(idx)

# Each window covers the offsets left_idx .. right_idx-1 relative to its
# start, i.e. 4 bytes with the values below.
# NOTE: the later cells use right_idx as the window length, which is only
# correct while left_idx stays 0.
left_idx, right_idx = 0, 4
window_len = right_idx - left_idx

# Keep only the starts whose whole window fits inside the data. This replaces
# the former "filter out-of-range indices, then trim the remainder" steps.
starts = idx.to_numpy()
starts = starts[starts + right_idx <= len(gcc6_0_32)]

# One row of indices per window, flattened window by window.
# The flat list is deliberately NOT sorted: sorting interleaves the rows of
# two windows whose starts are closer than window_len, and the later
# reshape(-1, right_idx) would then cut groups that are not contiguous bytes.
offsets = np.arange(left_idx, right_idx)
ls_idx = (starts.reshape(-1, 1) + offsets).ravel().tolist()
print(len(ls_idx))

# Only whole windows are generated, so the remainder is always 0; it is
# still computed and printed to keep the cell output unchanged.
sub = len(ls_idx) % window_len
print('나머지', sub)

print('최종 길이', len(ls_idx))

print('gcc6_0_32', len(ls_idx))

# The index was reset to 0..n-1 earlier, so labels equal positions and .loc
# may be used with the (possibly repeating) position list.
gcc6_0_32_Ngram = gcc6_0_32.loc[ls_idx,:].copy()

177560
나머지 0
최종 길이 177560
gcc6_0_32 177560


## (4) false data 만들기

In [None]:
# Build the negative samples: randomly placed windows of right_idx bytes
# that contain no row labelled 1.

# Number of negative windows wanted = number of positive windows.
# Integer division keeps the target an int so the loop below always ends.
goal = len(gcc6_0_32_Ngram) // right_idx
count = 0

# Work on a plain array of labels: `1 not in series` would test the Series
# *index*, not its values, so the check is done on the values explicitly.
labels = gcc6_0_32['label'].to_numpy()
n_start_positions = len(gcc6_0_32) - right_idx

accepted_starts = []

# Keep drawing until enough clean windows have been collected.
# NOTE: windows are drawn with replacement, so two accepted windows may
# overlap or coincide.
while count < goal:

    # Draw a NEW start position on every attempt.
    random_idx = np.random.randint(n_start_positions)

    # Reject the window if any of its bytes is labelled 1.
    if (labels[random_idx:random_idx + right_idx] == 1).any():
        continue

    # Progress indicator, printed once per 1000 accepted windows.
    if count % 1000 == 0:
        print(count, end=' ')

    accepted_starts.append(random_idx)
    count += 1

# Materialise all windows with a single positional lookup instead of one
# pd.concat per window (which is quadratic in the number of windows).
window_rows = (
    np.asarray(accepted_starts, dtype=np.int64).reshape(-1, 1)
    + np.arange(right_idx)
).ravel()
d = gcc6_0_32.iloc[window_rows]

print('완료')

0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 

## (5) False Data + True Data 합치기

In [None]:
# Interleave positive and negative windows:
#   true window 0, false window 0, true window 1, false window 1, ...
# The positive windows come from gcc6_0_32_Ngram. Slicing the raw
# gcc6_0_32 frame here would just take its first rows, not the windows
# that start at a label-1 row.
n_windows = min(len(gcc6_0_32_Ngram), len(d)) // right_idx
n_rows = n_windows * right_idx

stacked = pd.concat([gcc6_0_32_Ngram.iloc[:n_rows], d.iloc[:n_rows]])

# Row positions of `stacked`, viewed as (source, window, byte) with
# source 0 = true and 1 = false. Swapping the first two axes gives
# (window, source, byte), which is exactly the interleaved order.
interleaved = (
    np.arange(2 * n_rows)
    .reshape(2, n_windows, right_idx)
    .transpose(1, 0, 2)
    .ravel()
)

# One positional lookup replaces a pd.concat per window (quadratic).
ff = stacked.iloc[interleaved]
ff.shape

## (6) one hot encoding

In [None]:
# One-hot encode the byte column of the training data.
# The category list is taken from the FULL data set rather than from the
# sample in `ff`: get_dummies only creates columns for values it sees, so a
# byte value missing from the sample would otherwise shrink the encoding
# below the 256 columns the model input expects.
byte_categories = pd.CategoricalDtype(
    categories=sorted(gcc6_0_32['bin'].dropna().unique())
)
gcc6_0_32_onehot_Ngram = pd.get_dummies(ff['bin'].astype(byte_categories))

# Put the label in front so that column 0 is the target and the remaining
# columns are the one-hot features.
gcc6_0_32_onehot_Ngram = pd.concat([ff['label'], gcc6_0_32_onehot_Ngram], axis=1)

print('원핫인코딩완료')
print(gcc6_0_32_onehot_Ngram.shape)

In [None]:
# Training features (every column after the label) and training labels.
# The dtypes are set explicitly: depending on the pandas version the one-hot
# columns may be bool, and the label column may have become object dtype
# during the earlier concatenations; neither is a reliable input for
# Keras / scikit-learn.
x_gcc6_0_32_3 = gcc6_0_32_onehot_Ngram.iloc[:, 1:].to_numpy().astype(np.uint8)
y_gcc6_0_32_3 = gcc6_0_32_onehot_Ngram['label'].to_numpy().astype(np.int64)
print(x_gcc6_0_32_3.shape, y_gcc6_0_32_3.shape)

# Group consecutive rows into windows of right_idx bytes:
#   x -> (windows, right_idx, one-hot width), y -> (windows, right_idx, 1)
n_features = x_gcc6_0_32_3.shape[1]
x_gcc6_0_32_3 = x_gcc6_0_32_3.reshape(-1, right_idx, n_features)
y_gcc6_0_32_3 = y_gcc6_0_32_3.reshape(-1, right_idx, 1)

print(x_gcc6_0_32_3.shape, y_gcc6_0_32_3.shape)

## (7) 모델

In [None]:
# Bidirectional LSTM model definition
from keras.models import Model, Sequential
from keras.layers import SimpleRNN, Input, Dense, LSTM
from keras.layers import Bidirectional, TimeDistributed

# Early-stopping callback used by the training cell (patience of 3 epochs)
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(patience=3)

# Input: windows of right_idx time steps, each a 256-wide one-hot vector.
xInput = Input(batch_shape=(None, right_idx, 256))

# The LSTM returns its output at every time step; the forward and backward
# outputs are concatenated.
sequence_lstm = LSTM(32, return_sequences=True)
xBiLstm = Bidirectional(sequence_lstm, merge_mode='concat')(xInput)

# The same Dense(1, sigmoid) is applied to every time step, so the model
# emits one value per byte of the window.
per_step_classifier = Dense(1, activation='sigmoid')
xOutput = TimeDistributed(per_step_classifier)(xBiLstm)

model1 = Model(xInput, xOutput)
model1.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model1.summary()

## (8) 학습 - 10 KFold

In [None]:
# K-fold cross-validation
from sklearn.model_selection import KFold

# Accuracy, Precision, Recall, F1-Score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Confusion Matrix, ROC Curve
from sklearn.metrics import confusion_matrix, roc_auc_score

# Per-fold metric values; they are averaged in the next cell.
accuracy, recall, precision, f1score, cm = [], [], [], [], []

# 10 folds, shuffled before splitting.
kf = KFold(n_splits=10, shuffle=True, random_state=None)

# Snapshot of the weights before any training. Every fold restarts from this
# snapshot; otherwise the model would keep the weights learned in earlier
# folds, whose training data contains the current validation fold.
# NOTE: re-run the model-definition cell before re-running this cell, so
# that the snapshot is taken from an untrained model.
# NOTE(review): the optimizer state is not reset between folds.
initial_weights = model1.get_weights()

for train, validation in kf.split(x_gcc6_0_32_3, y_gcc6_0_32_3):
    print('======Training stage======')
    model1.set_weights(initial_weights)

    # validation_split holds out part of the TRAINING fold so that the
    # early-stopping callback has a validation loss to monitor; without any
    # validation data the callback never triggers. The held-out fold itself
    # is not used for early stopping, only for the metrics below.
    model1.fit(x_gcc6_0_32_3[train],
               y_gcc6_0_32_3[train],
               epochs = 10,
               batch_size = 32,
               validation_split = 0.1,
               callbacks=[early_stopping])

    # Predict on the held-out fold and threshold the sigmoid output at 0.5.
    k_pr = model1.predict(x_gcc6_0_32_3[validation])
    pred = np.round(np.asarray(k_pr)).ravel()
    y_test = np.asarray(y_gcc6_0_32_3[validation]).ravel()

    # Per-byte metrics for this fold
    k_accuracy = float(accuracy_score(y_test, pred))
    k_recall =  float(recall_score(y_test, pred))
    k_precision = float(precision_score(y_test, pred))
    k_f1_score = float(f1_score(y_test, pred))

    print('accuracy_score', k_accuracy)
    print('recall_score', k_recall)
    print('precision_score', k_precision)
    print('f1_score', k_f1_score)

    accuracy.append(k_accuracy)
    recall.append(k_recall)
    precision.append(k_precision)
    f1score.append(k_f1_score)

# Per-fold results
print('\nK-fold cross validation Accuracy: {}'.format(accuracy))
print('\nK-fold cross validation Recall: {}'.format(recall))
print('\nK-fold cross validation Precision: {}'.format(precision))
print('\nK-fold cross validation F1-Score: {}'.format(f1score))

## (9) 평가지표

In [None]:
# Report the mean of each metric over the folds.
fold_summaries = [
    ('10-Fold Cross_validation. Accuracy :', accuracy),
    ('10-Fold Cross_validation. Recall :', recall),
    ('10-Fold Cross_validation. Precision :', precision),
    ('10-Fold Cross_validation. F1-Score :', f1score),
]
for caption, fold_values in fold_summaries:
    print(caption, np.mean(fold_values))