# Multiclass LSTM classification model

In this notebook, we replicate the multiclass LSTM model that classifies a domain name into one of several DGA categories, based on the Endgame paper:

"Predicting Domain Generation Algorithms with Long Short-Term Memory Networks"
http://arxiv.org/abs/1611.00791v1


In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils import to_categorical

from tensorflow.python.client import device_lib

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# List the devices TensorFlow can see, to confirm a GPU is available for acceleration
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1878327588361705297
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2954407936
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12261155028882633162
physical_device_desc: "device: 0, name: GeForce GTX 780M, pci bus id: 0000:01:00.0, compute capability: 3.0"
]


In [3]:
# Read the high-confidence DGA feed and the Cisco top-1M domain list.
# Forward slashes are used so the relative paths resolve on Windows, Linux and
# macOS alike (the backslash form only resolves as a path on Windows).
# Neither file has a header row; the first 15 lines of the DGA feed are skipped.
dga_df = pd.read_csv('../data/2018_0923/dga-feed-high.csv', header=None, skiprows=15)
cisco_df = pd.read_csv('../data/2018_0923/top-1m.csv', header=None)

In [4]:
""" Display head/tail/sample of the DGA and/or nonDGA data frames """
def _display_frame(title, frame, sample, seed):
    """Show a '<title>: <shape>' banner for one frame, then the requested slice of it."""
    display("{}: {}".format(title, frame.shape))
    if sample == 'head':
        display(frame.head())
    elif sample == 'tail':
        display(frame.tail())
    elif 'sample' in sample:
        # 'sample5' -> 5: strip() removes the keyword's letters from both ends,
        # leaving only the requested row count
        cnt = int(sample.strip('sample'))
        display(frame.sample(n=cnt, random_state=seed))


def display_df(dga_df_=None, cisco_df_=None, sample='head', seed=21):
    """Display the head, tail or a random sample of the DGA and/or nonDGA data frames.

    Parameters
    ----------
    dga_df_ : DataFrame or None
        DGA feed frame; skipped when None.
    cisco_df_ : DataFrame or None
        Cisco (nonDGA) frame; skipped when None.
    sample : str
        'head', 'tail', or 'sample<N>' (e.g. 'sample5') for N random rows.
        Any other value shows only the shape banner.
    seed : int
        Random state used for the 'sample<N>' mode.

    Raises
    ------
    ValueError
        If `sample` contains 'sample' but no parsable row count.
    """
    if dga_df_ is not None:
        _display_frame("DGA feed sample", dga_df_, sample, seed)

    if cisco_df_ is not None:
        _display_frame("Cisco feed sample", cisco_df_, sample, seed)
            

In [5]:
# Preview the first rows of both raw feeds
display_df(dga_df_=dga_df, cisco_df_=cisco_df)

'DGA feed sample: (381953, 4)'

Unnamed: 0,0,1,2,3
0,plvklpgwivery.com,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
1,dnuxdhcgblsgy.net,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
2,qjlullhfkiowp.biz,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
3,elkidddodxdly.ru,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
4,rnbfwuprlwfor.org,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt


'Cisco feed sample: (1000000, 2)'

Unnamed: 0,0,1
0,1,netflix.com
1,2,api-global.netflix.com
2,3,prod.netflix.com
3,4,push.prod.netflix.com
4,5,google.com


In [6]:
# Keep only the domain and description columns of the DGA feed, keep only the
# domain column of the Cisco list, and label every Cisco domain as 'nonDGA'.
# At this point the DGA 'dga' column still holds the feed's description text.

unused_dga_cols = range(2, dga_df.shape[1])
dga_df_slim = dga_df.drop(columns=unused_dga_cols)
dga_df_slim.columns = ['domain', 'dga']

cisco_df_slim = cisco_df.drop(columns=[0])
cisco_df_slim.columns = ['domain']
cisco_df_slim['dga'] = 'nonDGA'

display_df(dga_df_slim, None, 'sample5')

'DGA feed sample: (381953, 2)'

Unnamed: 0,domain,dga
57569,a94421b9c998056fb42456ad25ea55bfb9.hk,Domain used by dyre DGA for 26 Jun 2018
171315,wjb92vcerh.net,Domain used by shiotob/urlzone/bebloh DGA - no...
267753,vhnvvwx.net,Domain used by pykspa (varying date seeds)
250789,csyyhnyiwejluy.su,Domain used by ranbyus (uses previous 31 days ...
154897,ypuyuvscckuc.pw,Domain used by tinba DGA for 25 Jun 2018


In [7]:
# Markers that introduce trailing detail after the category name; tried in this order
SUFFIXES = [' DGA', ' (', ' -']


def strip_cat(input_str_row, lstrip_str="Domain used by ", rtrunc_str=SUFFIXES, verbose=False):
    """Extract the DGA category name from a row's description string.

    Parameters
    ----------
    input_str_row : mapping or Series
        Row whose 'dga' entry holds the description text.
    lstrip_str : str
        Prefix removed from the start of the description.
    rtrunc_str : sequence of str
        Suffix markers, tried in order; the text is cut at the first marker
        that occurs, and no further markers are tried.
    verbose : bool
        Print the intermediate strings when True.

    Returns
    -------
    str
        The description with the leading prefix removed and everything from
        the first matching marker onwards dropped.
    """
    description = input_str_row['dga']
    if verbose:
        print('-'*50, '\nInput:    ', description)

    # Remove the prefix only where it leads the string; a blanket replace()
    # would also delete later occurrences inside the category text.
    if lstrip_str and description.startswith(lstrip_str):
        category = description[len(lstrip_str):]
    else:
        category = description
    if verbose:
        print('Lstrip:   ', category)

    for marker in rtrunc_str:
        cut = category.find(marker)
        if cut != -1:
            category = category[:cut]
            if verbose:
                print('Trimmed:  ', category)
            break
    return category

In [8]:
# Reduce each description to its bare DGA category name.
# NOTE: this overwrites the 'dga' column, so the cell is not idempotent: running
# it again would cut names that themselves contain a suffix marker
# (e.g. 'Cryptolocker - Flashback' contains ' -').

verbosity = False


def _row_to_category(row):
    return strip_cat(row, verbose=verbosity)


dga_df_slim['dga'] = dga_df_slim.apply(_row_to_category, axis=1)

display_df(dga_df_slim, None, 'sample5')


'DGA feed sample: (381953, 2)'

Unnamed: 0,domain,dga
57569,a94421b9c998056fb42456ad25ea55bfb9.hk,dyre
171315,wjb92vcerh.net,shiotob/urlzone/bebloh
267753,vhnvvwx.net,pykspa
250789,csyyhnyiwejluy.su,ranbyus
154897,ypuyuvscckuc.pw,tinba


In [9]:
# Collect the distinct DGA categories, then add the benign class to get the label set

categories = dga_df_slim['dga'].unique().tolist()
print("Categories of DGA domains: {}\n".format(len(categories)))
print(categories)

categories += ['nonDGA']
num_categories = len(categories)
print("\nTotal output classes will be: {}\n".format(num_categories))

Categories of DGA domains: 43

['Cryptolocker - Flashback', 'Post Tovar GOZ', 'geodo', 'dyre', 'corebot', 'symmi', 'padcrypt', 'locky', 'tinba', 'pushdo', 'P2P Gameover Zeus', 'shiotob/urlzone/bebloh', 'hesperbot', 'cryptowall', 'ramnit', 'dircrypt', 'ranbyus', 'pykspa', 'murofet', 'Volatile Cedar / Explosive', 'beebone', 'bedep', 'fobber', 'necurs', 'qakbot', 'tempedreve', 'ramdo', 'kraken', 'bamital', 'vawtrak', 'sisron', 'chinad', 'gozi', 'sphinx', 'proslikefan', 'vidro', 'madmax', 'dromedan', 'g01', 'pandabanker', 'mirai', 'unknownjs', 'unknowndropper']

Total output classes will be: 44



In [10]:
# Check skewness in the dataset with respect to DGA categories

# Name the count column explicitly. pandas >= 2.0 calls the value_counts()
# result 'count' instead of reusing the source column name, and the
# category-merging code reads the counts through the 'dga' column.
counts = dga_df_slim['dga'].value_counts().to_frame(name='dga')
print("\nMost frequent categories:", end='')
display(counts.head())
print("\nLeast frequent categories:", end='')
display(counts.tail())


Most frequent categories:

Unnamed: 0,dga
tinba,66688
Post Tovar GOZ,66000
ramnit,56174
necurs,43008
murofet,28520



Least frequent categories:

Unnamed: 0,dga
gozi,24
mirai,3
dromedan,2
madmax,1
g01,1


The counts above show a heavily skewed, highly imbalanced dataset: more than half of the DGA categories, those in the lower part of the sorted list, have comparatively few samples. This may hurt both training and prediction for those categories.

In [11]:
THRESHOLD_COUNT = 50                     # categories with fewer samples than this are merged together
MERGED_CAT_STR = 'mergedDGA'             # name of merged category

MERGED_CAT_LIST = []                     # names of the merged categories, filled in by trim_categories


def trim_categories(input_row, threshold=THRESHOLD_COUNT):
    """Map a sparse category to the merged category and record its name.

    Parameters
    ----------
    input_row : Series
        Row of the counts frame: `input_row.name` is the category name and
        `input_row['dga']` its sample count.
    threshold : int
        Categories with a count strictly below this value are merged.

    Returns
    -------
    MERGED_CAT_STR for a sparse category, otherwise the category's own name.

    Side effects
    ------------
    Appends each merged category name to the module-level MERGED_CAT_LIST.
    """
    if input_row['dga'] < threshold:
        # Record each merged category only once, so that re-running the apply
        # cell (or any repeated call for the same row) does not add duplicates.
        if input_row.name not in MERGED_CAT_LIST:
            MERGED_CAT_LIST.append(input_row.name)
        return MERGED_CAT_STR
    return input_row.name


In [12]:
# Tag every category with the class name it will be trained under
trimmed_names = counts.apply(trim_categories, axis=1)
counts['newCat'] = trimmed_names
print("Merged categories: ", MERGED_CAT_LIST)
display(counts)

Merged categories:  ['pandabanker', 'gozi', 'mirai', 'dromedan', 'madmax', 'g01']


Unnamed: 0,dga,newCat
tinba,66688,tinba
Post Tovar GOZ,66000,Post Tovar GOZ
ramnit,56174,ramnit
necurs,43008,necurs
murofet,28520,murofet
ranbyus,26040,ranbyus
qakbot,20000,qakbot
pykspa,14215,pykspa
shiotob/urlzone/bebloh,12521,shiotob/urlzone/bebloh
kraken,8988,kraken


In [13]:
# Update DGA frame with new categories

def update_categories(input_row):
    """Return MERGED_CAT_STR when the row's 'dga' value is in MERGED_CAT_LIST, else that value."""
    current = input_row['dga']
    return MERGED_CAT_STR if current in MERGED_CAT_LIST else current

# Rewrite the 'dga' column row by row with the (possibly merged) category
relabelled = dga_df_slim.apply(update_categories, axis=1)
dga_df_slim['dga'] = relabelled


In [14]:
# List the categories that remain after merging
remaining_categories = dga_df_slim['dga'].unique()
display(remaining_categories)

array(['Cryptolocker - Flashback', 'Post Tovar GOZ', 'geodo', 'dyre',
       'corebot', 'symmi', 'padcrypt', 'locky', 'tinba', 'pushdo',
       'P2P Gameover Zeus', 'shiotob/urlzone/bebloh', 'hesperbot',
       'cryptowall', 'ramnit', 'dircrypt', 'ranbyus', 'pykspa', 'murofet',
       'Volatile Cedar / Explosive', 'beebone', 'bedep', 'fobber',
       'necurs', 'qakbot', 'tempedreve', 'ramdo', 'kraken', 'bamital',
       'vawtrak', 'sisron', 'chinad', 'mergedDGA', 'sphinx',
       'proslikefan', 'vidro', 'unknownjs', 'unknowndropper'],
      dtype=object)

In [15]:
# Stack the nonDGA and DGA frames, then map each category name to an integer
# code; `labels` holds the sorted category names, indexed by that code

unified_df = pd.concat([cisco_df_slim, dga_df_slim], ignore_index=True)
temp_df = unified_df.copy()
category_codes, labels = pd.factorize(unified_df['dga'], sort=True)
unified_df['catIndex'] = category_codes
num_categories = len(labels)

In [16]:
# Separate the input sequences (domains) from the output labels (integer
# category codes), hold out 20% for testing, and one-hot encode the training labels

X, Y = unified_df['domain'], unified_df['catIndex']
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=23
)
Y_train_binarized = to_categorical(Y_train, num_classes=num_categories)

In [17]:
# Multiclass LSTM model: either restore a previously saved model and tokenizer
# from disk, or build and train the model from scratch.

TRAIN_MODEL = False                                          # Load saved model otherwise
max_features = 1000                                          # size of the embedding vocabulary
batch_size = 128                                             # input batch size
num_epochs = 1                                               # epochs to train
num_labels = num_categories                                  # final number of output classes, after potentially merging DGA categories
max_seq_len = 75                                             # length every encoded domain is padded to (the validation cell uses the same value)

if not TRAIN_MODEL:
    # Restore the architecture, the weights and the fitted character tokenizer.
    # The context manager closes the JSON file even if reading fails.
    with open('.\\saved_models\\multiclass_LSTM.json', 'r') as file:
        model_load = file.read()
    model = model_from_json(model_load)
    model.load_weights('.\\saved_models\\multiclass_LSTM.h5')
    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load a tokenizer pickle that was produced by this notebook.
    with open('.\\saved_models\\multiclass_LSTM.pickle', 'rb') as tokenEnc:
        encoder = pickle.load(tokenEnc)
    print('MODEL TRAINING SKIPPED.\nSAVED MODEL IS NOW LOADED!')

else:                                                        # train the model
    # Encode string characters to integers (character-level tokenizer)
    encoder = text.Tokenizer(num_words=500, char_level=True)
    encoder.fit_on_texts(X_train)                            # build character indices
    X_train_tz = encoder.texts_to_sequences(X_train)

    # Model definition - this is the core model from Endgame,
    # with a softmax output over num_labels classes
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_seq_len))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    # Pad the integer-encoded character sequences to a common length
    X_train_pad = sequence.pad_sequences(X_train_tz, maxlen=max_seq_len)

    # Train against the one-hot encoded category labels
    model.fit(X_train_pad, Y_train_binarized, batch_size=batch_size, epochs=num_epochs)

SKIPPED MODEL TRAINING.
SAVED MODEL IS NOW LOADED!


In [18]:
# Validation on test dataset

X_test_pad = sequence.pad_sequences(encoder.texts_to_sequences(X_test), maxlen=75)
# Run inference once and derive the class ids from the softmax probabilities.
# This replaces predict_classes(), which repeated the forward pass and has been
# removed from recent Keras releases; for a softmax output it was this argmax.
Y_pred_prob = model.predict(X_test_pad)
Y_pred = Y_pred_prob.argmax(axis=-1)

In [19]:
# Build a per-domain table of the true class, the predicted class and the
# probability the model gave to the predicted class, then show a few hits and misses

backup_ = Y_pred_prob.copy()
pred_table = X_test.to_frame(name='domain')
pred_table['trueClass'] = [labels[code] for code in Y_test]
pred_table['predClass'] = [labels[code] for code in Y_pred]
pred_table['predProb'] = [row_probs[cls] for row_probs, cls in zip(Y_pred_prob, Y_pred)]

is_correct = pred_table['trueClass'] == pred_table['predClass']
is_wrong = pred_table['trueClass'] != pred_table['predClass']

print('\nCorrectly predicted Domains:')
display(pred_table[is_correct].head(10))

print('\nMis-predicted Domains:')
display(pred_table[is_wrong].head(10))


Correctly predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
124546,ns47.domaincontrol.com,nonDGA,nonDGA,0.999999
660921,britishlibrary.typepad.co.uk,nonDGA,nonDGA,0.999989
446456,a538.casalemedia.com,nonDGA,nonDGA,0.999999
600919,ign-ar8de21s8pinm-8d3d0d118-4d8d69dgoogleplayd...,nonDGA,nonDGA,0.999999
1186650,gbggekvj.eu,ramnit,ramnit,0.734519
115543,ewr-66.ewr-rtb1.rfihub.com,nonDGA,nonDGA,0.999999
1360357,vsagkcaahpxrfbmqljnnxutj.com,qakbot,qakbot,0.61369
464912,static.bladeandsoul.com,nonDGA,nonDGA,0.999998
1097547,dlpyniywfxxp.com,tinba,tinba,0.956867
606453,trans11212.addressy.com,nonDGA,nonDGA,0.999999



Mis-predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
1359727,uxamuoylbidlktngprh.com,qakbot,ramnit,0.976411
90390,6htb5ck86hk8i9.com,nonDGA,chinad,0.486218
610853,wvxlsagkeuye.ir,nonDGA,necurs,0.95437
1259769,gsilnc.net,pykspa,nonDGA,0.758437
1158840,ibcmcycuemsvstbepeybarwpbey.info,P2P Gameover Zeus,qakbot,0.372635
1183450,epvnqmksvxxvdxlw.com,ramnit,murofet,0.530508
1245532,bcmcfmtkyiuiul.com,ranbyus,ramnit,0.516562
1376705,lyryrirc.com,vawtrak,nonDGA,0.901365
1290765,slsykrrahowsxw.net,murofet,ranbyus,0.66252
1342033,yapyerh.su,necurs,nonDGA,0.430539


In [20]:
# Overall accuracy, followed by per-class precision / recall / F1
acc = accuracy_score(Y_test, Y_pred)
print("Model's overall accuracy = {:8.3f} %\n".format(acc*100))
# Pass the full list of class ids explicitly. Without it, classification_report
# infers the classes from Y_test/Y_pred and fails when a rare class is absent
# from both, because the inferred classes no longer match target_names.
class_ids = list(range(len(labels)))
metrics_report = classification_report(Y_test, Y_pred, labels=class_ids, target_names=labels)
print(metrics_report)

Model's overall accuracy =   95.299 %

                            precision    recall  f1-score   support

  Cryptolocker - Flashback       0.45      0.56      0.50      1223
         P2P Gameover Zeus       0.48      0.18      0.26       393
            Post Tovar GOZ       1.00      1.00      1.00     12996
Volatile Cedar / Explosive       0.89      1.00      0.94       204
                   bamital       1.00      0.87      0.93        47
                     bedep       0.00      0.00      0.00        42
                   beebone       0.00      0.00      0.00        36
                    chinad       0.93      0.91      0.92       317
                   corebot       0.94      0.98      0.96        62
                cryptowall       0.00      0.00      0.00        23
                  dircrypt       0.00      0.00      0.00       139
                      dyre       0.98      1.00      0.99      1593
                    fobber       0.00      0.00      0.00        99
        

  'precision', 'predicted', average, warn_for)


In [21]:
# Save the model architecture (JSON), the weights (HDF5) and the fitted
# tokenizer (pickle), but only when the model was trained in this session
if TRAIN_MODEL:
    model_save = model.to_json()
    with open('.\\saved_models\\multiclass_LSTM.json', 'w') as file:
        file.write(model_save)
    model.save_weights('.\\saved_models\\multiclass_LSTM.h5')
    with open('.\\saved_models\\multiclass_LSTM.pickle', 'wb') as tokenEnc:
        pickle.dump(encoder, tokenEnc, protocol=pickle.HIGHEST_PROTOCOL)
    print('MODEL SAVED TO DISK!')
else:
    # The model was loaded from disk, so there is nothing new to persist
    print('MODEL ALREADY SAVED TO DISK.')

MODEL AREADY SAVED TO DISK.


## Look ahead and next steps:
__1__ Look closer at the misclassified domains. Does any particular DGA category stand out? What do we need to improve?

__2__ Improving classification accuracy - more balanced dataset especially for the multiclass classification.

__3__ Learning from scratch takes significant time. We need to implement incremental model updates using batches of new domain data.

__4__ We may get a dynamic dataset with, for example, more than 60 categories, and the code needs to step in and trim the categories down to an upper limit of, say, 50. Alternatively, this could be implemented by dropping the categories with inadequate data, say fewer than 1000 available domain names.