# Binary LSTM classification model
## Trained on older feed (Sept), inferred on latest feed (Nov)

**This notebook trains the binary classification model on an older (Sept '18) high-confidence (HC) DGA feed and then runs inference on a more recent feed from Nov '18.**

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.preprocessing import sequence
from keras.preprocessing import text

from tensorflow.python.client import device_lib

In [3]:
# Check if gpu can be utilized for acceleration:
# prints every device TensorFlow reports; an entry with device_type "GPU"
# in the output means a GPU is visible to the backend.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11208480337296706510
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2384576512
locality {
  bus_id: 1
  links {
  }
}
incarnation: 662418254486582015
physical_device_desc: "device: 0, name: GeForce GTX 780M, pci bus id: 0000:01:00.0, compute capability: 3.0"
]


In [16]:
# Read DGA and Cisco high confidence data
#
# Paths use forward slashes so the notebook runs unchanged on Windows, Linux
# and macOS (Python accepts '/' as a separator on all of them).
# skiprows=15 skips the leading lines of each DGA feed file
# (presumably a comment/header preamble -- TODO confirm against the feed format).

dga_df = pd.read_csv('../data/2018_0923/dga-feed-high.csv', header=None, skiprows=15)
cisco_df = pd.read_csv('../data/2018_0923/top-1m.csv', header=None)
dga_new_df = pd.read_csv('../data/2018-11-12/dga-feed-high.csv', header=None, skiprows=15)

# Path and file variables for saving model information
path_dir = './saved_models/trainOld_inferNew/'
# Later cells open files inside path_dir for writing; make sure it exists so
# they do not fail with FileNotFoundError on a fresh checkout.
os.makedirs(path_dir, exist_ok=True)
name_encoder      = path_dir + 'binary_tokenizer.pkl'    # pickled character tokenizer
name_model        = path_dir + 'binary_LSTM.json'        # model architecture (JSON)
name_weights      = path_dir + 'binary_LSTM.h5'          # model weights
name_categories   = path_dir + 'binary_categories.pkl'   # pickled label categories

In [17]:
# display head
def display_df(dga_df_, cisco_df_, dga_all_df_):
    """Show the shape and the first rows of each of the three feeds.

    For every frame a caption containing its ``shape`` is displayed, followed
    by ``head()`` of the frame. The order of the output is: DGA feed, combined
    high/low confidence DGA feed, Cisco feed.

    Parameters
    ----------
    dga_df_ : DataFrame shown under the "DGA feed sample" caption.
    cisco_df_ : DataFrame shown under the "Cisco feed sample" caption.
    dga_all_df_ : DataFrame shown under the
        "DGA feed high and low confidence sample" caption.
    """
    captioned_frames = (
        ("DGA feed sample: {}", dga_df_),
        ("DGA feed high and low confidence sample: {}", dga_all_df_),
        ("Cisco feed sample: {}", cisco_df_),
    )
    for caption, frame in captioned_frames:
        display(caption.format(frame.shape))
        display(frame.head())

In [18]:
# Reduce every feed to a single 'domain' column and attach the output label 'dga'

# Older DGA feed: keep column 0 only, label every row 1
dga_df_slim = dga_df.drop(columns=range(1, dga_df.shape[1]))
dga_df_slim.columns = ['domain']
dga_df_slim['dga'] = 1

# Newer DGA feed: same treatment
dga_new_df_slim = dga_new_df.drop(columns=range(1, dga_new_df.shape[1]))
dga_new_df_slim.columns = ['domain']
dga_new_df_slim['dga'] = 1

# Cisco feed: discard column 0, the remaining column is the domain, label every row 0
cisco_df_slim = cisco_df.drop(columns=[0])
cisco_df_slim.columns = ['domain']
cisco_df_slim['dga'] = 0

# Training frame: Cisco rows followed by the older DGA rows, re-indexed from 0
unified_df = pd.concat([cisco_df_slim, dga_df_slim], ignore_index=True)

# Replace the label column by its sorted integer codes and keep the matching
# category values in `labels`, which are pickled for later reuse
unified_df['dga'], labels = pd.factorize(unified_df['dga'], sort=True)
with open(name_categories, 'wb') as categories_file:
    pickle.dump(labels, categories_file, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# Separate input sequences (domains) from output labels (DGA 0/1).
# No random split is done here: the unified older data is the training set
# and the newer DGA feed is the test set.

X_train, Y_train = unified_df['domain'], unified_df['dga']
X_test, Y_test = dga_new_df_slim['domain'], dga_new_df_slim['dga']

In [23]:
# Binary classification LSTM model
#
# With TRAIN_MODEL = True a character-level LSTM is trained on X_train/Y_train.
# Otherwise the architecture, weights, tokenizer and label categories are
# restored from the files named by name_model / name_weights / name_encoder /
# name_categories.

TRAIN_MODEL = True                                           # Load saved model otherwise
max_features = 1000                                          # embedding input dimension (upper bound on token ids)
max_len = 75                                                 # every sequence is padded/truncated to this length
batch_size = 128                                             # input batch size
num_epochs = 1                                               # epochs to train

if not TRAIN_MODEL:
    # Rebuild the network from its JSON description, then load the weights
    with open(name_model, 'r') as file:
        model_load = file.read()
    model = model_from_json(model_load)
    model.load_weights(name_weights)
    # NOTE: pickle.load can execute arbitrary code -- only load files that
    # were written by this notebook itself.
    # The tokenizer is stored under name_encoder (the save cell writes it
    # there); the previously used name_tokenizer was never defined.
    with open(name_encoder, 'rb') as tokenEnc:
        encoder = pickle.load(tokenEnc)
    with open(name_categories, 'rb') as catEnc:
        labels = pickle.load(catEnc)
    print('MODEL TRAINING SKIPPED.\nSAVED MODEL IS NOW LOADED!')

else:                                                        # train the model
    # encode string characters to integers
    # num_words must not exceed max_features so that every token id is a
    # valid index into the embedding below
    encoder = text.Tokenizer(num_words=500, char_level=True)
    encoder.fit_on_texts(X_train)                            # build character indices
    X_train_tz = encoder.texts_to_sequences(X_train)

    # Model definition - this is the core model from Endgame
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_len))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))                         # single probability output
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # Pad sequence where sequences are case insensitive characters encoded to
    # integers from 0 to number of valid characters
    X_train_pad = sequence.pad_sequences(X_train_tz, maxlen=max_len)

    # Train where Y_train is 0-1
    model.fit(X_train_pad, Y_train, batch_size=batch_size, epochs=num_epochs)

Epoch 1/1


In [24]:
# Validation on test dataset (the newer DGA feed).
# Every row of this test set carries label 1, so the reported accuracy is the
# share of new DGA domains that the model classifies as DGA.

X_test_tz = encoder.texts_to_sequences(X_test)
X_test_pad = sequence.pad_sequences(X_test_tz, maxlen=75)
Y_pred = model.predict_classes(X_test_pad)
acc = accuracy_score(Y_test, Y_pred)
print("Model accuracy on new DGA feed = {:8.3f} %".format(100 * acc))

Model accuracy on new DGA feed =   97.887 %


**Skipping the model's prediction probability retrieval in this notebook.**

In [25]:
# Save model and weights
#
# After training, persist the architecture (JSON), the weights and the fitted
# tokenizer so that a later run with TRAIN_MODEL = False can restore them
# instead of retraining. Nothing is written when the model was loaded from disk.
if TRAIN_MODEL:
    model_save = model.to_json()
    with open(name_model, 'w') as file:
        file.write(model_save)
    model.save_weights(name_weights)
    with open(name_encoder, 'wb') as tokenEnc:
        pickle.dump(encoder, tokenEnc, protocol=pickle.HIGHEST_PROTOCOL)
    print('MODEL SAVED TO DISK!')
else:
    print('MODEL ALREADY SAVED TO DISK.')

MODEL SAVED TO DISK!
