In [1]:
# Load Libraries - Make sure to run this cell!
import pandas as pd
import numpy as np
import re
from string import printable
from sklearn import model_selection

#import gensim
import tensorflow as tf
from keras.models import load_model
from keras.models import Sequential, Model
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation, Lambda, Flatten, Merge
from keras.layers import Input, ELU, LSTM, Embedding, Convolution2D, MaxPooling2D, \
BatchNormalization, Convolution1D, MaxPooling1D, merge
from keras.preprocessing import sequence
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
from keras import initializations
from keras import backend as K

# Workaround: expose the top-level `tf` module under the name
# `tf.python.control_flow_ops`.
# NOTE(review): presumably required because the Keras version used here looks
# up `tf.python.control_flow_ops`, which this TensorFlow build does not
# provide -- confirm against the installed versions.
tf.python.control_flow_ops = tf

# Suppress every Python warning for the rest of the notebook.
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.



## Featureless Deep Learning

This notebook shows three commonly used neural network architectures for detecting malicious URLs using **featureless Deep Learning**. [Keras](https://keras.io/) is used as the high-level API on top of the [TensorFlow](https://www.tensorflow.org/) backend.



## Preprocess raw URLs

In [2]:
## Load the labelled URL data set from disk

DATA_HOME = 'data/'
df = pd.read_csv(DATA_HOME + 'url_data_mega_deep_learning.csv')

# Show 25 randomly drawn rows as a quick sanity check
df.sample(n=25)

Unnamed: 0,url,isMalicious
57628,theacsi.org/industries/restaurant/full-service...,0
42355,imodules.com/s/1374/giving/giving.aspx?sid=137...,0
192681,musicforyoudjs.com/dj/index.php?action=login,1
184396,rsu52.us/dwi-dui-lbi/wp-includes/www.paypal.co...,1
60704,ecomagination.com/digital/about-ge-digital,0
97221,wikipedia.org/wiki/Miller_House,0
663,leviton.com/www.leviton.com/en/products/networ...,0
17351,bartleby.com/topics/Obesity-America-Essay,0
174136,benefitauctionevents.net/includes/live.com/ind...,1
170303,google.com/ServiceLogin?service=CPanel&amp;pas...,1


In [3]:
# Initial Data Preparation URL

# Step 1: Convert each raw URL string into a list of integer tokens. Only characters
# contained in "printable" are kept; each one is encoded as its position in "printable"
# plus 1, which leaves 0 free for the padding added in step 2.
# The lookup table is built once so every character costs a single dict lookup instead
# of two linear scans of "printable" (`in` followed by `.index`).
# NOTE(review): token ids range over 1..len(printable); the model builders in this
# notebook default to max_vocab_len=100 for the Embedding input_dim -- confirm that this
# covers the largest id that can occur.
char_to_token = {char: position + 1 for position, char in enumerate(printable)}
url_int_tokens = [[char_to_token[x] for x in url if x in char_to_token] for url in df.url]

# Step 2: Force every token list to exactly max_len entries. pad_sequences is called with
# its defaults, i.e. padding='pre' and truncating='pre': shorter URLs are zero-padded at
# the front and longer URLs keep only their last max_len tokens.
max_len=75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

# Step 3: Extract labels from df to numpy array
target = np.array(df.isMalicious)

print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

Matrix dimensions of X:  (194798, 75) Vector dimension of target:  (194798,)


In [4]:
# Hold-out validation: keep 75% of the rows for training and 25% for testing.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, target_train, target_test = model_selection.train_test_split(
    X,
    target,
    test_size=0.25,
    random_state=33,
)

In [5]:
# GENERAL get layer dimensions for any model!
def print_layers_dims(model):
    """Print each layer of ``model`` followed by its input and output shapes.

    Args:
        model: Object exposing a ``layers`` sequence whose items have
            ``input_shape`` and ``output_shape`` attributes.

    Note:
        A ``None`` entry in a reported shape is always the batch size.
    """
    for layer in model.layers:
        print(layer)
        print('Input Shape: ', layer.input_shape, 'Output Shape: ', layer.output_shape)

## Architecture 1 - Simple LSTM

In [6]:
## Deep Learning model Definition --- A --- (Simple LSTM)


# Sentinel meaning "W_reg was not supplied". It is kept distinct from None so
# that an explicit W_reg=None is still passed through to the Embedding layer.
_SIMPLE_LSTM_DEFAULT_W_REG = object()


def simple_lstm(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32,
                W_reg=_SIMPLE_LSTM_DEFAULT_W_REG):
    """Build and compile the simple LSTM URL classifier (Architecture 1).

    Layer stack: integer input -> Embedding (dropout 0.2) -> LSTM ->
    Dropout(0.5) -> Dense(1, sigmoid).

    Args:
        max_len: Length of every input token sequence.
        emb_dim: Output dimension of the embedding layer.
        max_vocab_len: ``input_dim`` of the embedding layer.
        lstm_output_size: Output dimension of the LSTM layer.
        W_reg: Regularizer applied to the embedding weights. When omitted, a
            fresh ``regularizers.l2(1e-4)`` is created on every call.

    Returns:
        The compiled Keras ``Model`` (Adam with lr=1e-4, binary cross-entropy
        loss, accuracy metric).
    """
    # A default regularizer written in the signature is evaluated only once and
    # would be shared by every model this function builds; create it per call.
    # NOTE(review): Keras 1.x regularizer instances are meant to be created once
    # per layer, so a shared default is expected to break a second call -- confirm.
    if W_reg is _SIMPLE_LSTM_DEFAULT_W_REG:
        W_reg = regularizers.l2(1e-4)

    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    dropout=0.2, W_regularizer=W_reg)(main_input)

    # LSTM layer
    lstm = LSTM(lstm_output_size)(emb)
    lstm = Dropout(0.5)(lstm)

    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [7]:
# ARCHITECTURE 1 SIMPLE LSTM: train on the training split, then score the held-out test split
nb_epoch, batch_size = 3, 32

model = simple_lstm()
model.fit(X_train, target_train, batch_size=batch_size, nb_epoch=nb_epoch)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

# Report the accuracy on the held-out split, then the per-layer tensor shapes
print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Final Cross-Validation Accuracy 0.842032854258 

<keras.engine.topology.InputLayer object at 0x7fbfdba36048>
Input Shape:  (None, 75) Output Shape:  (None, 75)
<keras.layers.embeddings.Embedding object at 0x7fbfe3df9ac8>
Input Shape:  (None, 75) Output Shape:  (None, 75, 32)
<keras.layers.recurrent.LSTM object at 0x7fc007f574a8>
Input Shape:  (None, 75, 32) Output Shape:  (None, 32)
<keras.layers.core.Dropout object at 0x7fbfdab2dbe0>
Input Shape:  (None, 32) Output Shape:  (None, 32)
<keras.layers.core.Dense object at 0x7fbfdab3cfd0>
Input Shape:  (None, 32) Output Shape:  (None, 1)


## Architecture 2 - 1D Convolution and LSTM

In [8]:
## Deep Learning model Definition --- B --- (1D Convolution and LSTM)

# Sentinel meaning "W_reg was not supplied". It is kept distinct from None so
# that an explicit W_reg=None is still passed through to the Embedding layer.
_LSTM_CONV_DEFAULT_W_REG = object()


def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32,
              W_reg=_LSTM_CONV_DEFAULT_W_REG):
    """Build and compile the 1D-convolution + LSTM URL classifier (Architecture 2).

    Layer stack: integer input -> Embedding -> Dropout(0.25) ->
    Convolution1D (256 filters of length 5, ELU) -> MaxPooling1D(4) ->
    Dropout(0.5) -> LSTM -> Dropout(0.5) -> Dense(1, sigmoid).

    Args:
        max_len: Length of every input token sequence.
        emb_dim: Output dimension of the embedding layer.
        max_vocab_len: ``input_dim`` of the embedding layer.
        lstm_output_size: Output dimension of the LSTM layer.
        W_reg: Regularizer applied to the embedding weights. When omitted, a
            fresh ``regularizers.l2(1e-4)`` is created on every call.

    Returns:
        The compiled Keras ``Model`` (Adam with lr=1e-4, binary cross-entropy
        loss, accuracy metric).
    """
    # A default regularizer written in the signature is evaluated only once and
    # would be shared by every model this function builds; create it per call.
    # NOTE(review): Keras 1.x regularizer instances are meant to be created once
    # per layer, so a shared default is expected to break a second call -- confirm.
    if W_reg is _LSTM_CONV_DEFAULT_W_REG:
        W_reg = regularizers.l2(1e-4)

    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    W_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    # Conv layer ('same' border mode keeps the sequence length unchanged)
    conv = Convolution1D(filter_length=5, nb_filter=256,
                         border_mode='same', activation=ELU())(emb)

    conv = MaxPooling1D(pool_length=4)(conv)
    #conv = BatchNormalization(mode=0)(conv)
    conv = Dropout(0.5)(conv)

    # LSTM layer
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)

    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [9]:
# ARCHITECTURE 2 CONV + LSTM: train on the training split, then score the held-out test split
nb_epoch, batch_size = 5, 32

model = lstm_conv()
model.fit(X_train, target_train, batch_size=batch_size, nb_epoch=nb_epoch)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

# Report the accuracy on the held-out split, then the per-layer tensor shapes
print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Final Cross-Validation Accuracy 0.889876796739 

<keras.engine.topology.InputLayer object at 0x7fbfcc52d0b8>
Input Shape:  (None, 75) Output Shape:  (None, 75)
<keras.layers.embeddings.Embedding object at 0x7fbfcc52d1d0>
Input Shape:  (None, 75) Output Shape:  (None, 75, 32)
<keras.layers.core.Dropout object at 0x7fbfd359e1d0>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 32)
<keras.layers.convolutional.Convolution1D object at 0x7fbfd3591b38>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 256)
<keras.layers.pooling.MaxPooling1D object at 0x7fbfd35b7550>
Input Shape:  (None, 75, 256) Output Shape:  (None, 18, 256)
<keras.layers.core.Dropout object at 0x7fbfd360e198>
Input Shape:  (None, 18, 256) Output Shape:  (None, 18, 256)
<keras.layers.recurrent.LSTM object at 0x7fbfd360ef98>
Input Shape:  (None, 18, 256) Output Shape:  (None, 32)
<keras.layers.core.Dropout object at 0x7fbfcc47be80>
Input Shape:  (None, 32) Output S

## Architecture 3 - 1D Convolutions and Fully Connected Layers

In [10]:
## Deep Learning model Definition --- C --- (1D Convolutions and Fully Connected Layers)

# Sentinel meaning "W_reg was not supplied". It is kept distinct from None so
# that an explicit W_reg=None is still passed through to the Embedding layer.
_CONV_FULLY_DEFAULT_W_REG = object()


def conv_fully(max_len=75, emb_dim=32, max_vocab_len=100, W_reg=_CONV_FULLY_DEFAULT_W_REG):
    """Build and compile the parallel-convolution URL classifier (Architecture 3).

    Layer stack: integer input -> Embedding -> Dropout(0.25) -> four parallel
    Convolution1D branches (filter lengths 2, 3, 4 and 5; 256 filters each),
    each summed over the sequence axis and followed by Dropout(0.5) ->
    concatenation -> two blocks of Dense(1024, ELU) + BatchNormalization +
    Dropout(0.5) -> Dense(1, sigmoid).

    Args:
        max_len: Length of every input token sequence.
        emb_dim: Output dimension of the embedding layer.
        max_vocab_len: ``input_dim`` of the embedding layer.
        W_reg: Regularizer applied to the embedding weights. When omitted, a
            fresh ``regularizers.l2(1e-4)`` is created on every call.

    Returns:
        The compiled Keras ``Model`` (Adam with lr=1e-4, binary cross-entropy
        loss, accuracy metric).
    """
    # A default regularizer written in the signature is evaluated only once and
    # would be shared by every model this function builds; create it per call.
    # NOTE(review): Keras 1.x regularizer instances are meant to be created once
    # per layer, so a shared default is expected to break a second call -- confirm.
    if W_reg is _CONV_FULLY_DEFAULT_W_REG:
        W_reg = regularizers.l2(1e-4)

    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    W_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    def sum_1d(X):
        """Sum ``X`` over axis 1 (the sequence axis of the conv output)."""
        return K.sum(X, axis=1)

    def get_conv_layer(emb, filter_length=5, nb_filter=256):
        """Return one conv branch: Convolution1D -> sum over axis 1 -> Dropout(0.5)."""
        # Conv layer
        conv = Convolution1D(filter_length=filter_length, nb_filter=nb_filter,
                             border_mode='same', activation=ELU())(emb)

        # Collapse the sequence axis so each branch yields one nb_filter-sized vector
        conv = Lambda(sum_1d, output_shape=(nb_filter,))(conv)
        #conv = BatchNormalization(mode=0)(conv)
        conv = Dropout(0.5)(conv)
        return conv

    # Multiple Conv Layers

    # calling custom conv function from above
    conv1 = get_conv_layer(emb, filter_length=2, nb_filter=256)
    conv2 = get_conv_layer(emb, filter_length=3, nb_filter=256)
    conv3 = get_conv_layer(emb, filter_length=4, nb_filter=256)
    conv4 = get_conv_layer(emb, filter_length=5, nb_filter=256)

    # Fully Connected Layers
    merged = merge([conv1, conv2, conv3, conv4], mode="concat")

    hidden1 = Dense(1024, activation=ELU())(merged)
    hidden1 = BatchNormalization(mode=0)(hidden1)
    hidden1 = Dropout(0.5)(hidden1)

    hidden2 = Dense(1024, activation=ELU())(hidden1)
    hidden2 = BatchNormalization(mode=0)(hidden2)
    hidden2 = Dropout(0.5)(hidden2)

    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(hidden2)

    # Compile model and define optimizer
    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [11]:
# ARCHITECTURE 3 CONV + FULLY CONNECTED: train on the training split, then score the held-out test split
nb_epoch, batch_size = 5, 32

model = conv_fully()
model.fit(X_train, target_train, batch_size=batch_size, nb_epoch=nb_epoch)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

# Report the accuracy on the held-out split, then the per-layer tensor shapes
print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Final Cross-Validation Accuracy 0.811211499012 

<keras.engine.topology.InputLayer object at 0x7fbf8efd0780>
Input Shape:  (None, 75) Output Shape:  (None, 75)
<keras.layers.embeddings.Embedding object at 0x7fbf8efd0710>
Input Shape:  (None, 75) Output Shape:  (None, 75, 32)
<keras.layers.core.Dropout object at 0x7fbf8efd0fd0>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 32)
<keras.layers.convolutional.Convolution1D object at 0x7fbf8ef0dd30>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 256)
<keras.layers.convolutional.Convolution1D object at 0x7fbf8ef70320>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 256)
<keras.layers.convolutional.Convolution1D object at 0x7fbf8eeec908>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 256)
<keras.layers.convolutional.Convolution1D object at 0x7fbf8efb8ac8>
Input Shape:  (None, 75, 32) Output Shape:  (None, 75, 256)
<keras.layers.core.Lambda object at 0x7fbf8ef40ef

In [12]:
# Get the predicted probability (sigmoid output) for every row of the test set.
# Predict in mini-batches, reusing the notebook's batch_size, rather than one sample
# per forward pass (batch_size=1), which is needlessly slow on the full test set.
# NOTE(review): inference outputs are expected not to depend on the batch size
# (dropout is inactive and batch normalisation uses its running statistics) -- confirm.
target_proba = model.predict(X_test, batch_size=batch_size)

In [13]:
# Example: inspect the learned embedding weight matrix of the last trained model.
# layers[1] is the Embedding layer (layers[0] is the input layer); its first
# weight array has shape (max_vocab_len, emb_dim).
l_layers = model.layers
weights = l_layers[1].get_weights()
weights[0].shape

(100, 32)