# Binary LSTM classification model

In this notebook, we've replicated the binary LSTM model for the DGA/non-DGA classification of a domain name; from the Endgame paper:

"Predicting Domain Generation Algorithms with Long Short-Term Memory Networks"
http://arxiv.org/abs/1611.00791v1


In [24]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.preprocessing import sequence
from keras.preprocessing import text

from tensorflow.python.client import device_lib

In [2]:
# Check if gpu can be utilized for acceleration
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9544853544771074043
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 233287680
locality {
  bus_id: 1
  links {
  }
}
incarnation: 18310093570657416723
physical_device_desc: "device: 0, name: GeForce GTX 780M, pci bus id: 0000:01:00.0, compute capability: 3.0"
]


In [40]:
# Read DGA and Cisco high confidence data
dga_df = pd.read_csv('..\\data\\2018_0923\\dga-feed-high.csv', header=None, skiprows=15)
cisco_df = pd.read_csv('..\\data\\2018_0923\\top-1m.csv', header=None)

# Path and file variables for saving model information
path_dir = '.\\saved_models\\'
name_encoder      = path_dir + 'binary_tokenizer.pkl'
name_model        = path_dir + 'binary_LSTM.json'
name_weights      = path_dir + 'binary_LSTM.h5'
name_categories   = path_dir + 'binary_categories.pkl'

In [41]:
# display head
def display_df(dga_df_, cisco_df_):
    display("DGA feed sample: {}".format( dga_df_.shape) )
    display(dga_df_.head())
    display("Cisco feed sample: {}".format( cisco_df_.shape) )
    display(cisco_df_.head())

In [42]:
# Remove unused columns, add output label 'dga'

dga_df_slim =   dga_df.drop(columns=range(1,dga_df.shape[1]), inplace=False)
dga_df_slim.columns = ['domain']
cisco_df_slim = cisco_df.drop(columns=[0], inplace=False)
cisco_df_slim.columns = ['domain']
dga_df_slim['dga'] = 'DGA'
cisco_df_slim['dga'] = 'non-DGA'

display_df(dga_df_slim, cisco_df_slim)
unified_df = pd.concat([cisco_df_slim, dga_df_slim], ignore_index=True)
unified_df['dga'], labels = pd.factorize(unified_df['dga'], sort=True)   # binary factorization and potentially realigning the DGA categories
with open(name_categories, 'wb') as catEnc:
    pickle.dump(labels, catEnc, protocol=pickle.HIGHEST_PROTOCOL)

'DGA feed sample: (381953, 2)'

Unnamed: 0,domain,dga
0,plvklpgwivery.com,DGA
1,dnuxdhcgblsgy.net,DGA
2,qjlullhfkiowp.biz,DGA
3,elkidddodxdly.ru,DGA
4,rnbfwuprlwfor.org,DGA


'Cisco feed sample: (1000000, 2)'

Unnamed: 0,domain,dga
0,netflix.com,non-DGA
1,api-global.netflix.com,non-DGA
2,prod.netflix.com,non-DGA
3,push.prod.netflix.com,non-DGA
4,google.com,non-DGA


In [7]:
# Separate input sequences (domains) and output labels (DGA 0/1), and do train/test split

X = unified_df['domain']
Y = unified_df['dga']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=23)

In [20]:
# Binary classification LSTM model

TRAIN_MODEL = True                                          # Load saved model otherwise
max_features = 1000                                          # length of vocabulary
batch_size = 128                                             # input batch size
num_epochs = 1                                               # epochs to train
    
if TRAIN_MODEL != True:
    file = open(name_model, 'r')
    model_load = file.read()
    file.close()
    model = model_from_json(model_load)
    model.load_weights(name_weights)
    with open(name_encoder, 'rb') as tokenEnc:
        encoder = pickle.load(tokenEnc)
    with open(name_categories, 'rb') as catEnc:
        labels = pickle.load(catEnc)
    print('SKIPPED MODEL TRAINING.\nSAVED MODEL IS NOW LOADED!')

else:                                                        # train the model
    # encode string characters to integers
    encoder = text.Tokenizer(num_words=500, char_level=True)
    encoder.fit_on_texts(X_train)                            # build character indices
    X_train_tz = encoder.texts_to_sequences(X_train)
    
    # Model definition - this is the core model from Endgame
    model=Sequential()
    model.add(Embedding(max_features, 128, input_length=75))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    
    # Pad sequence where sequences are case insensitive characters encoded to
    # integers from 0 to number of valid characters
    X_train_pad=sequence.pad_sequences(X_train_tz, maxlen=75)
    
    # Train where Y_train is 0-1
    model.fit(X_train_pad, Y_train, batch_size=batch_size, epochs=num_epochs)

Epoch 1/1


For a typical training on a dual core CPU, each epoc took about 2.5 to 3 times more training time, compared to training with a GPU.

In [21]:
# Validation on test dataset

X_test_pad = sequence.pad_sequences(encoder.texts_to_sequences(X_test), maxlen=75)
Y_pred = model.predict_classes(X_test_pad)
acc = accuracy_score(Y_test, Y_pred)
print("Model accuracy = {:8.3f} %".format(acc*100))

Model accuracy =   98.610 %


In [38]:
# Inspect a few prediction probabilities

Y_pred1_prob = model.predict(X_test_pad)
Y_pred_prob = np.hstack([1-Y_pred1_prob, Y_pred1_prob])   # in using sigmoid, the output probability is of class 1
print(Y_pred_prob.shape)
backup_ = Y_pred_prob.copy()
pred_table = X_test.to_frame()
pred_table.columns = ['domain']
pred_table['trueClass'] = [labels[i] for i in Y_test]
pred_table['predClass'] = [labels[i[0]] for i in Y_pred]
pred_table['predProb'] = [Y_pred_prob[idx][Y_pred[idx]] for idx in range(0, Y_pred.shape[0]) ]

print('\nCorrectly predicted Domains:')
display(pred_table[pred_table['trueClass'] == pred_table['predClass'] ].head(10) )

print('\nMis-predicted Domains:')
display(pred_table[pred_table['trueClass'] != pred_table['predClass'] ].head(10) )

(276391, 2)

Correctly predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
124546,ns47.domaincontrol.com,non-DGA,non-DGA,[0.99998236]
660921,britishlibrary.typepad.co.uk,non-DGA,non-DGA,[0.99999034]
446456,a538.casalemedia.com,non-DGA,non-DGA,[0.99999726]
600919,ign-ar8de21s8pinm-8d3d0d118-4d8d69dgoogleplayd...,non-DGA,non-DGA,[0.99999845]
1186650,gbggekvj.eu,DGA,DGA,[0.96344274]
115543,ewr-66.ewr-rtb1.rfihub.com,non-DGA,non-DGA,[0.9999989]
1360357,vsagkcaahpxrfbmqljnnxutj.com,DGA,DGA,[0.9976635]
464912,static.bladeandsoul.com,non-DGA,non-DGA,[0.99999106]
1097547,dlpyniywfxxp.com,DGA,DGA,[0.99842197]
606453,trans11212.addressy.com,non-DGA,non-DGA,[0.9999838]



Mis-predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
90390,6htb5ck86hk8i9.com,non-DGA,DGA,[0.98788714]
610853,wvxlsagkeuye.ir,non-DGA,DGA,[0.8436637]
1259769,gsilnc.net,DGA,non-DGA,[0.9438188]
1376705,lyryrirc.com,DGA,non-DGA,[0.9907708]
655678,1bxencbsmr.ddns.net,non-DGA,DGA,[0.9302808]
1227223,ycarusapao.com,DGA,non-DGA,[0.9550892]
1108369,hinmlowklleu.com,DGA,non-DGA,[0.7869851]
961259,idiidowowowofjgjgisos.su,non-DGA,DGA,[0.9819254]
1302961,bilvdat.me,DGA,non-DGA,[0.96486735]
1224773,wxvsxols.com,DGA,non-DGA,[0.7490418]


In [39]:
# Save model and weights
if TRAIN_MODEL == True:
    model_save = model.to_json()
    with open(name_model, 'w') as file:
        file.write(model_save)
    model.save_weights(name_weights)
    with open(name_encoder, 'wb') as tokenEnc:
        pickle.dump(encoder, tokenEnc, protocol=pickle.HIGHEST_PROTOCOL)
    print('MODEL SAVED TO DISK!')
else:
    print('MODEL AREADY SAVED TO DISK.')

MODEL SAVED TO DISK!


## Look ahead and next steps:
__1__ Look closer at the misclassified domains. Any particular DGA category stands out? What do we need to improve? 

__2__ Improving classification accuracy - more balanced dataset especially for the multiclass classification.

__3__ Learning from scratch takes significant time. Need to implement model update in batches of new domain dataset.

__4__ Modify the model to do multiclass classification across the various DGA categories. Do we need to trim down the categories - dataset shows 60+ categories and new ones may be added any time.