In [1]:
import pandas as pd
import numpy as np
import re, os
from string import printable
from sklearn import model_selection
import tensorflow as tf
from keras.models import Sequential, Model, model_from_json, load_model
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import Input, ELU, LSTM, Embedding, Convolution2D, MaxPooling2D, \
BatchNormalization, Convolution1D, MaxPooling1D, concatenate
from keras.preprocessing import sequence
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
from keras import backend as K
from pathlib import Path
import json
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from keras.layers import Bidirectional, SimpleRNN
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard, ModelCheckpoint

Using TensorFlow backend.



## Deep Learning 


Requirements:

- tensorflow 1.4.1
- keras 2.1.2


## Preprocessing

In [2]:
# Load the labelled code-snippet dataset from CSV.
# NOTE(review): DATA_HOME is assigned here but not used by any code visible in this notebook.
DATA_HOME = 'DL/'
Final_Labeled_Dataset = pd.read_csv(
    'Dataset of IoT OSs code - training.csv',
    encoding='unicode_escape',
)
# Preview five randomly chosen rows (sample() already returns exactly five).
Final_Labeled_Dataset.sample(n=5)

Unnamed: 0,code,isMalicious
1699,else { dcoConst = * ( ( float * ) &cs -> r...,1
4878,staticvoid G1 ( ) { int k ; for ( k ...,0
4738,staticvoid GG2B2 ( ) { char * data ; dat...,0
1709,"if ( BI ( CS -> CTL0, CS8 ) ) { dcoConst =...",1
4195,namespace CWE415_Double_Free__malloc_free_char...,0


In [3]:
# Discard rows with missing values first, then exact duplicate rows.
Final_Labeled_Dataset = Final_Labeled_Dataset.dropna().drop_duplicates()
# Preview five randomly chosen rows of the cleaned frame.
Final_Labeled_Dataset.sample(n=5)

Unnamed: 0,code,isMalicious
3459,staticvoid G2 ( ) { if ( STATIC_CONST...,0
4682,staticvoid GB2G1 ( ) { char * data ; da...,0
757,int32_t control4 = get_CONTROL ( ) ; retu...,1
3884,staticvoid G1 ( ) { int k ; for ( k ...,0
375,void harc ( ) { size_t lenc ; Ca ( lenc ...,1


In [4]:
# (rows, columns) of the dataset after missing values and duplicates were dropped.
Final_Labeled_Dataset.shape

(4598, 2)

In [5]:
# Character-level encoding: each printable character becomes its 1-based
# position in string.printable; characters outside that set are skipped.
# Ids start at 1 so they stay distinct from the value pad_sequences pads with
# (0 by default).
char_to_id = {ch: pos + 1 for pos, ch in enumerate(printable)}
code_snippet_int_tokens = [
    [char_to_id[ch] for ch in snippet if ch in char_to_id]
    for snippet in Final_Labeled_Dataset.code
]

# Force every sequence to exactly max_len ids.
# NOTE(review): with the Keras defaults both padding and truncation happen at
# the start of a sequence, so only the last max_len characters of a long
# snippet are kept - confirm this is intended.
max_len = 150
X = sequence.pad_sequences(code_snippet_int_tokens, maxlen=max_len)

# Labels as a 1-D NumPy array aligned row-for-row with X.
target = np.array(Final_Labeled_Dataset.isMalicious)
print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

Matrix dimensions of X:  (4598, 150) Vector dimension of target:  (4598,)


In [6]:
# Hold-out split (not k-fold cross-validation): 30% of the samples are set
# aside for testing; random_state pins the shuffle so the split is repeatable.
X_train, X_test, target_train, target_test = model_selection.train_test_split(
    X,
    target,
    test_size=0.30,
    random_state=30,
)

In [7]:
def print_layers_dims(model):
    """Print each layer of ``model`` followed by its input and output shapes.

    Args:
        model: object exposing a ``layers`` sequence whose items have
            ``input_shape`` and ``output_shape`` attributes.

    The leading ``None`` in a reported shape stands for the batch size.
    """
    for layer in model.layers:
        print(layer)
        print('Input Shape: ', layer.input_shape, 'Output Shape: ', layer.output_shape)

# GENERAL save model to disk function!
def save_model(fileModelJSON, fileWeights, model=None):
    """Write a model's architecture (JSON) and weights to disk.

    Args:
        fileModelJSON: path of the JSON architecture file to (re)create.
        fileWeights: path of the weights file to (re)create; writing it is
            delegated to ``model.save_weights`` (the original note says h5py
            must be installed).
        model: object providing ``to_json()`` and ``save_weights(path)``.
            When omitted, a module-level variable named ``model`` is used,
            which is what this function always did before the parameter
            existed.

    Raises:
        NameError: if ``model`` is omitted and no global ``model`` exists.

    The architecture string is passed through ``json.dump`` as-is, so the file
    holds a JSON-encoded string; ``load_model`` in this notebook reads it back
    with ``json.load`` before calling ``model_from_json``.
    """
    if model is None:
        # Backward-compatible fallback to the global the old code relied on.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError(
                "save_model() needs a model: pass model=... or define a global 'model'"
            ) from None

    # Serialise before touching the disk so that a failure in to_json()
    # cannot leave us with the previous file already deleted.
    json_string = model.to_json()

    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    with open(fileModelJSON, 'w') as f:
        json.dump(json_string, f)

    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)
    

# GENERAL load model from disk function!
# NOTE(review): this definition shadows the ``load_model`` imported from
# keras.models at the top of the notebook.
def load_model(fileModelJSON, fileWeights):
    """Rebuild a model from a JSON architecture file and load its weights.

    Args:
        fileModelJSON: path of a JSON file whose decoded content is handed to
            ``model_from_json``.
        fileWeights: path passed to the rebuilt model's ``load_weights``.

    Returns:
        The model returned by ``model_from_json`` after its weights are loaded.
    """
    with open(fileModelJSON, 'r') as json_file:
        architecture = json.load(json_file)

    restored = model_from_json(architecture)
    restored.load_weights(fileWeights)
    return restored

In [8]:
# Deep Learning model definition - bidirectional SimpleRNN classifier
# Main input: one row of max_len integer character ids per code snippet.
main_input = Input(shape=(max_len,), dtype='int32')

# Embedding layer: maps each character id to a 32-dimensional vector.
# - input_length follows max_len so it cannot drift from the Input shape above.
# - embeddings_regularizer is the Keras 2 name of the Keras 1 `W_regularizer`
#   argument.
# - Token ids run from 1 to len(printable), so input_dim=150 leaves some rows
#   of the embedding table unused; kept as-is to preserve the architecture.
Emb_Layer = Embedding(input_dim=150, output_dim=32, input_length=max_len,
                      embeddings_regularizer=regularizers.l2(1e-4))(main_input)

# Bidirectional recurrent layer: only the final state of each direction is
# returned (return_sequences=False), giving 2 * 150 features per snippet.
Emb_Layer = Bidirectional(
    SimpleRNN(150, return_sequences=False, dropout=0.0, recurrent_dropout=0.0)
)(Emb_Layer)

# Output layer.
# NOTE(review): 55 softmax units are used although the target column
# (isMalicious) appears to be binary in the previewed rows - confirm whether
# 2 classes (or a single sigmoid unit) was intended.
Emb_Layer = Dense(55, activation='softmax')(Emb_Layer)

# RNN model settings: Adam optimiser, integer class labels.
RNN_model = Model(inputs=main_input, outputs=Emb_Layer)
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
RNN_model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
RNN_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 32)           4800      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300)               54900     
_________________________________________________________________
dense_1 (Dense)              (None, 55)                16555     
Total params: 76,255
Trainable params: 76,255
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fit the model on the training split, then score it once on the held-out
# test split.
RNN_history = RNN_model.fit(X_train, target_train, epochs=800, batch_size=16)
loss, accuracy = RNN_model.evaluate(X_test, target_test, verbose=1)
print('\nTesting Accuracy =', accuracy, '\n')

# Plot the per-epoch training accuracy.
print(RNN_history.history.keys())
# The metric is logged as 'acc' by the Keras version listed in this notebook's
# requirements and as 'accuracy' by newer releases; accept either so the plot
# does not fail with a KeyError.
acc_key = 'acc' if 'acc' in RNN_history.history else 'accuracy'
plt.plot(RNN_history.history[acc_key])
plt.title('The RNN model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Accuracy'], loc='lower right')
plt.show()

# NOTE(review): this reports the same held-out test accuracy printed above;
# no k-fold cross-validation is performed in this cell.
print('\nFinal Cross-Validation Accuracy of RNN training model', accuracy, '\n')

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800