In [1]:
import pandas as pd
import numpy as np
import re, os
from string import printable
from sklearn import model_selection
import tensorflow as tf
from keras.models import Sequential, Model, model_from_json, load_model
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import Input, ELU, LSTM, Embedding, Convolution2D, MaxPooling2D, \
BatchNormalization, Convolution1D, MaxPooling1D, concatenate
from keras.preprocessing import sequence
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
from keras import backend as K
from pathlib import Path
import json
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

Using TensorFlow backend.



## Deep Learning 


Requirements:

- tensorflow 1.4.1
- keras 2.1.2


## Preprocess 

In [2]:
## Load the labelled code-snippet dataset
DATA_HOME = 'DL/'
TRAINING_CSV = 'Dataset of IoT OSs code - training.csv'
Final_Labeled_Dataset = pd.read_csv(TRAINING_CSV, encoding='unicode_escape')
# Preview five randomly chosen rows.
Final_Labeled_Dataset.sample(n=5)

Unnamed: 0,code,isMalicious
4375,staticvoid GB2G2 ( ) { char * data ; dat...,0
4993,staticvoid G2 ( ) { if ( globalFive ==...,0
4071,staticvoid G1 ( ) { if ( globalReturns...,0
2051,"espn ( size_t destAddr01, const void *srcAddr,...",1
3638,"staticvoid G1 ( ) { if ( fputs ( "" ...",0


In [3]:
# (rows, columns) of the dataset as loaded from the CSV.
Final_Labeled_Dataset.shape

(5080, 2)

In [4]:
# Discard rows with missing values, then exact duplicate rows, and preview
# five randomly chosen rows of what is left.
Final_Labeled_Dataset = Final_Labeled_Dataset.dropna().drop_duplicates()
Final_Labeled_Dataset.sample(n=5)

Unnamed: 0,code,isMalicious
3353,namespace CWE665_Improper_Initialization__char...,0
5042,namespace CWE676 { staticvoid G2 ( ) { ...,0
91,void vGattDemoSvcStartAS ( void ) { ( void ...,1
3827,staticvoid G1 ( ) { if ( 0 ) { prin...,0
1002,"CK ( CK1 CK_BYTE , CK_U , CK_BYTE , CK_UL )...",1


In [5]:
# (rows, columns) after dropping rows with missing values and duplicate rows.
Final_Labeled_Dataset.shape

(4598, 2)

In [6]:
# Character-level encoding: every character of string.printable gets a 1-based
# integer id; characters outside that set are dropped. Ids start at 1 so that 0
# stays free for the padding value inserted by pad_sequences.
# The lookup table is built once instead of calling printable.index() (a linear
# scan) for every character of every snippet; the resulting ids are identical.
char_to_int = {ch: position + 1 for position, ch in enumerate(printable)}
code_snippet_int_tokens = [[char_to_int[ch] for ch in code_snippet if ch in char_to_int]
                           for code_snippet in Final_Labeled_Dataset.code]

# Fixed sequence length fed to the network. With pad_sequences' default settings,
# shorter snippets are zero-padded and longer ones truncated, both at the front.
max_len = 150
X = sequence.pad_sequences(code_snippet_int_tokens, maxlen=max_len)
target = np.array(Final_Labeled_Dataset.isMalicious)
print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

Matrix dimensions of X:  (4598, 150) Vector dimension of target:  (4598,)


In [7]:
# Hold-out validation: 70% of the samples are used for training and 30% are
# kept aside for testing; the fixed random_state makes the split repeatable.
X_train, X_test, target_train, target_test = model_selection.train_test_split(
    X,
    target,
    test_size=0.30,
    random_state=30,
)

In [8]:

# GENERAL get layer dimensions for any model!
def print_layers_dims(model):
    """Print every layer of ``model`` together with its input and output shapes.

    Args:
        model: object exposing a ``layers`` sequence whose items have
            ``input_shape`` and ``output_shape`` attributes.
    """
    # A leading None in a reported shape stands for the batch size.
    for layer in model.layers:
        print(layer)
        print('Input Shape: ', layer.input_shape, 'Output Shape: ', layer.output_shape)

# GENERAL save model to disk function!
def save_model(fileModelJSON, fileWeights, model=None):
    """Write a model's architecture (JSON) and weights to disk.

    Args:
        fileModelJSON: path of the JSON file receiving the architecture.
        fileWeights: path of the file receiving the weights (h5py must be
            installed for the weights to be written).
        model: the model to save; it must provide ``to_json()`` and
            ``save_weights(path)``. When omitted, the module-level variable
            named ``model`` is used, which was the only behaviour before this
            parameter existed. Pass the model explicitly to avoid silently
            saving a different (e.g. untrained) instance.

    Raises:
        NameError: if ``model`` is omitted and no module-level ``model`` exists.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-global ``model``.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("name 'model' is not defined")

    json_string = model.to_json()
    # The JSON string is itself dumped through json.dump (i.e. stored as a JSON
    # string literal) because the matching loader reads it back with json.load
    # before handing it to model_from_json. Mode 'w' truncates an existing file,
    # so no explicit removal is needed beforehand.
    with open(fileModelJSON, 'w') as f:
        json.dump(json_string, f)

    # Remove a stale weights file first so save_weights always starts clean.
    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)
    

# GENERAL load model from disk function!
def load_model(fileModelJSON, fileWeights):
    """Rebuild a model from a JSON architecture file plus a weights file.

    Args:
        fileModelJSON: path of the JSON file holding the architecture.
        fileWeights: path of the file holding the weights.

    Returns:
        The model created by ``model_from_json`` with the weights loaded.

    NOTE(review): this definition shadows the ``load_model`` imported from
    ``keras.models`` at the top of the notebook.
    """
    with open(fileModelJSON, 'r') as json_file:
        architecture = json.load(json_file)

    restored = model_from_json(architecture)
    restored.load_weights(fileWeights)
    return restored

## Training Model 2 - 1D Convolutions and Fully Connected Layers

In [9]:
# Deep Learning model Definition - CNN Model for Binary Classification

def CNN(max_len=150, emb_dim=32, max_vocab_len=150, W_reg=regularizers.l2(1e-4), num_classes=55):
    """Build and compile a character-level 1D-convolutional classifier.

    The network embeds each integer-encoded character, runs four parallel
    convolution branches (kernel sizes 2, 3, 4 and 5), sums every branch over
    the sequence axis, concatenates the results and feeds them through two
    fully connected blocks and a softmax output layer.

    Args:
        max_len: length of each input sequence.
        emb_dim: size of the character embedding vectors.
        max_vocab_len: number of distinct integer ids the embedding accepts.
        W_reg: regularizer applied to the embedding matrix.
        num_classes: width of the softmax output layer. Defaults to 55, the
            value that used to be hard-coded, so existing callers are
            unaffected. NOTE(review): the notebook's target column appears to
            be binary (isMalicious 0/1), for which 2 would suffice - confirm
            before changing the default.

    Returns:
        A compiled Keras ``Model`` using Adam (lr=1e-4) and
        sparse categorical cross-entropy, reporting accuracy.
    """
    # Input: one integer id per character position.
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Embedding layer (Keras 2 keyword: embeddings_regularizer, formerly W_regularizer).
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    def sum_1d(X):
        # Collapse the sequence axis by summation.
        return K.sum(X, axis=1)

    def get_conv_layer(emb, kernel_size=5, filters=150):
        # One branch: convolution -> ELU -> sum over the sequence axis -> dropout.
        # Keras 2 keyword: padding (formerly border_mode).
        conv = Convolution1D(kernel_size=kernel_size, filters=filters,
                             padding='same')(emb)
        conv = ELU()(conv)
        conv = Lambda(sum_1d, output_shape=(filters,))(conv)
        conv = Dropout(0.5)(conv)
        return conv

    # Parallel convolution branches, built in the same order as before so the
    # automatically generated layer names do not change.
    conv1 = get_conv_layer(emb, kernel_size=2, filters=150)
    conv2 = get_conv_layer(emb, kernel_size=3, filters=150)
    conv3 = get_conv_layer(emb, kernel_size=4, filters=150)
    conv4 = get_conv_layer(emb, kernel_size=5, filters=150)

    # Fully connected layers on top of the concatenated branch outputs.
    merged = concatenate([conv1, conv2, conv3, conv4], axis=1)

    hidden1 = Dense(1024)(merged)
    hidden1 = ELU()(hidden1)
    # The Keras 1 `mode` argument no longer exists in Keras 2 and is dropped.
    hidden1 = BatchNormalization()(hidden1)
    hidden1 = Dropout(0.5)(hidden1)

    hidden2 = Dense(1024)(hidden1)
    hidden2 = ELU()(hidden2)
    hidden2 = BatchNormalization()(hidden2)
    hidden2 = Dropout(0.5)(hidden2)

    # Output layer (last fully connected layer).
    output = Dense(num_classes, activation='softmax', name='output')(hidden2)

    # Compile model and define optimizer (Keras 2 keywords: inputs / outputs).
    model = Model(inputs=[main_input], outputs=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [10]:
# Instantiate the network with its default hyper-parameters and print the
# layer-by-layer summary (output shapes and parameter counts).
model = CNN()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 32)      4800        main_input[0][0]                 
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 150, 32)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 150, 150)     9750        dropout_1[0][0]                  
__________________________________________________________________________________________________
conv1d_2 (

In [11]:
# Train Model 2 (convolutions + fully connected layers) on the training split,
# then score it once on the held-out test split.
epochs = 800
batch_size = 64
CNN_model = CNN()
history = CNN_model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = CNN_model.evaluate(X_test, target_test, verbose=1)

print(history.history.keys())

# Training accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['acc'])
ax.set_title('The CNN model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Accuracy'], loc='lower right')
plt.show()

print('\nFinal Cross-Validation Accuracy of CNN training model', accuracy, '\n')

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800

KeyboardInterrupt: 