In [1]:
# Build character map function for encoding URL string

import string
# Vocabulary of known URL characters, in the order their ids are assigned.
ascii_letters = string.ascii_letters  # ids 1-52
digits = string.digits  # ids 53-62
punctuation = string.punctuation  # ids 63-94
total_char = "".join((ascii_letters, digits, punctuation))

# Known characters are numbered from 1 so that id 0 stays free for padding.
charmap = dict(zip(total_char, range(1, len(total_char) + 1)))

# One extra id, right after the last known character, stands for "anything else".
UNKNOWN_CHAR = len(total_char) + 1
# Total number of distinct ids: padding (0) + known characters + unknown.
TOTAL_FEATURES = UNKNOWN_CHAR + 1


def encodeChar(c):
    """Return the integer id of character ``c``.

    Characters present in ``charmap`` map to their 1-based position in
    ``total_char``; every other character maps to ``UNKNOWN_CHAR``.
    """
    if c in charmap:
        return charmap[c]
    return UNKNOWN_CHAR


# Quick sanity check: a letter late in the alphabet, the first letter, a non-ASCII char.
tuple(encodeChar(ch) for ch in ("x", "a", "我"))

(24, 1, 95)

In [2]:
# load dataset

import pandas
import statistics
# Read the URL table, then store each URL's character count in a "len" column.
df = pandas.read_csv("all_urls.csv")
df["len"] = df["url"].map(len)

In [3]:
# Summary statistics of the URL lengths (count, mean, std, quartiles, min/max)
df["len"].describe()

count    23007.000000
mean       116.103099
std        114.441389
min         15.000000
25%         45.000000
50%         79.000000
75%        140.000000
max       1641.000000
Name: len, dtype: float64

In [4]:
# Choose a sequence length that covers most of the sample URLs.
# For each candidate cutoff, print the percentage of URLs that are LONGER than
# the cutoff (i.e. that would be truncated). Only ~2% exceed 400 characters,
# so a length of 400 gives ~98% coverage.
n_urls = len(df.len)
for cutoff in range(200, 1001, 100):
    n_longer = int((df.len > cutoff).sum())
    print("x={} {:.5f}%".format(cutoff, 100 * n_longer / n_urls))

x=200 18.16404%
x=300 4.48994%
x=400 2.16891%
x=500 1.51693%
x=600 1.19094%
x=700 0.52593%
x=800 0.25644%
x=900 0.19559%
x=1000 0.10432%


In [5]:
# sampling train/test dataset

from sklearn.model_selection import train_test_split

# Fixed seed so the subsample and the train/test split (and therefore every
# number reported further down) are reproducible after a kernel restart.
RANDOM_SEED = 42

# Work on a random subset of 20,000 URLs, then hold out 20% of it for testing.
sub_df = df.sample(20000, random_state=RANDOM_SEED)
url_train, url_test, y_train, y_test = train_test_split(
    sub_df.url,
    sub_df.label,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

print('Loading data...')
print(len(url_train), 'train sequences')
print(len(url_test), 'test sequences')

Loading data...
16000 train sequences
4000 test sequences


In [6]:
# Hyper-parameters used by the preprocessing, model-building and training cells

# Embedding
max_features = TOTAL_FEATURES  # number of distinct character ids (incl. padding and unknown)
maxlen = 400  # ~98% coverage, paper uses 96% coverage
embedding_size = 128

# Convolution
filters, kernel_size = 64, 5
pool_size = 2

# LSTM
lstm_output_size = 70

# Dropout ratio
Dropout_ratio = 0.25

# Training (both values are the paper's parameters)
batch_size, epochs = 64, 20

In [141]:
# Encode every URL as a fixed-length sequence of integer character ids.
# These are index sequences, NOT one-hot vectors: the Embedding layer of the
# model is what turns each id into a dense vector.
#
# NOTE(review): `pad_sequences` is not imported by any cell shown in this
# notebook; it must come from an earlier import cell -- confirm before running
# on a fresh kernel.

def encode_urls(urls, length):
    """Convert an iterable of URL strings into a 2-D integer matrix.

    Each character is mapped through ``encodeChar``. ``pad_sequences`` then
    forces every row to exactly ``length`` ids:

    * shorter URLs are padded at the front (``padding='pre'``);
    * longer URLs are cut at the front (``truncating='pre'``), i.e. only the
      LAST ``length`` characters are kept. This is the library default and is
      spelled out here so the choice is visible.

    Parameters
    ----------
    urls : iterable of str
        URLs to encode.
    length : int
        Number of ids per row in the result.

    Returns
    -------
    The array returned by ``pad_sequences``, one row per URL.
    """
    # Plain lists are enough: pad_sequences does the array conversion itself.
    id_sequences = [[encodeChar(c) for c in url] for url in urls]
    return pad_sequences(id_sequences,
                         maxlen=length,
                         padding='pre',
                         truncating='pre')


print('Pad sequences (samples x time)')

x_train = encode_urls(url_train, maxlen)
x_test = encode_urls(url_test, maxlen)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (16000, 400)
x_test shape: (4000, 400)


In [143]:
print('Build model...')

# Character-level binary classifier:
#   embedding -> 1-D convolution -> max-pooling -> LSTM -> dropout -> one output unit
model = Sequential(name="CNN-LSTM for phishing detection")
# Maps each of the `max_features` character ids to an `embedding_size`-dim vector.
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
model.add(Dropout(Dropout_ratio))
model.add(Dense(1))
# Sigmoid, not softmax: softmax normalises across the units of the layer, so
# on a single unit it always outputs exactly 1.0 -- the prediction would be
# constant and the loss gradient zero. Sigmoid yields the single probability
# that binary_crossentropy expects.
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Build model...
Model: "CNN-LSTM for phishing detection"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 400, 128)          12288     
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 396, 64)           41024     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 198, 64)           0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 70)                37800     
_________________________________________________________________
dropout_7 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 71        
_________________________________________________________________
activation_8 (Activa

In [144]:
print('Train...')

# The held-out split is passed as validation data during fitting and is then
# used again for the final evaluation.
held_out = (x_test, y_test)
model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=held_out,
)

# The model was compiled with metrics=['accuracy'], so evaluate() yields
# the loss followed by the accuracy.
score, acc = model.evaluate(*held_out, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16000 samples, validate on 4000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20

KeyboardInterrupt: 