# Multiclass LSTM classification model - on DGA non-high feed

In this notebook, we replicate the multiclass LSTM model that classifies which DGA category (malware family) a domain name belongs to, based on the Endgame paper:

"Predicting Domain Generation Algorithms with Long Short-Term Memory Networks"
http://arxiv.org/abs/1611.00791v1


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.utils import to_categorical

from tensorflow.python.client import device_lib

Using TensorFlow backend.


In [2]:
# Confirm gpu is being picked up for acceleration
# list_local_devices() reports the devices TensorFlow can use; check the printed list for a GPU entry.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3932626750999856979
]


In [3]:
# Read the high-confidence DGA feed, the full (non-high-confidence) DGA feed and the Cisco top-1M list.
# Forward slashes keep these relative paths valid on Windows as well as on Linux/macOS
# (the previous backslash-separated paths only resolved on Windows).
# skiprows skips the leading lines of each DGA feed file (presumably a comment header whose
# length differs between the two snapshots; verify against the raw files).
dga_df = pd.read_csv('../data/2018_0923/dga-feed-high.csv', header=None, skiprows=15)
dga_all_df = pd.read_csv('../data/2018-10-14/dga-feed.csv', header=None, skiprows=14)
cisco_df = pd.read_csv('../data/2018_0923/top-1m.csv', header=None)

In [4]:
""" Display head/tail/sample of the DGA and/or nonDGA data frames """
def display_df(dga_df_=None, cisco_df_=None, sample='head', seed=21):
    """Show a banner with the shape, then a preview, of the DGA and/or Cisco frames.

    Parameters
    ----------
    dga_df_, cisco_df_ : DataFrame or None
        Frames to preview; a frame passed as None is skipped entirely.
    sample : str
        'head' or 'tail' for the first/last rows, or a string containing
        'sample' plus a row count (e.g. 'sample5') for a random sample.
        Any other value prints only the banner.
    seed : int
        random_state used for the random sample, so previews are repeatable.
    """
    feeds = (
        ("DGA feed sample: {}", dga_df_),
        ("Cisco feed sample: {}", cisco_df_),
    )
    for banner, frame in feeds:
        if frame is None:
            continue
        display(banner.format(frame.shape))
        if sample == 'head':
            display(frame.head())
        elif sample == 'tail':
            display(frame.tail())
        elif 'sample' in sample:
            # strip() removes the letters of 'sample' from both ends, leaving the row count
            cnt = int(sample.strip('sample'))
            display(frame.sample(n=cnt, random_state=seed))
            

In [5]:
display_df(dga_df, cisco_df)

'DGA feed sample: (381953, 4)'

Unnamed: 0,0,1,2,3
0,plvklpgwivery.com,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
1,dnuxdhcgblsgy.net,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
2,qjlullhfkiowp.biz,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
3,elkidddodxdly.ru,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt
4,rnbfwuprlwfor.org,Domain used by Cryptolocker - Flashback DGA fo...,2018-06-23,http://osint.bambenekconsulting.com/manual/cl.txt


'Cisco feed sample: (1000000, 2)'

Unnamed: 0,0,1
0,1,netflix.com
1,2,api-global.netflix.com
2,3,prod.netflix.com
3,4,push.prod.netflix.com
4,5,google.com


In [6]:
# Remove unused columns, add output label 'dga'

# DGA feeds: drop every column from position 2 onwards, leaving (domain, description).
dga_extra_cols = range(2, dga_df.shape[1])
dga_df_slim = dga_df.drop(columns=dga_extra_cols)
dga_df_slim.columns = ['domain', 'dga']

dga_all_extra_cols = range(2, dga_all_df.shape[1])
dga_all_df_slim = dga_all_df.drop(columns=dga_all_extra_cols)
dga_all_df_slim.columns = ['domain', 'dga']

# Cisco feed: drop column 0 (presumably the popularity rank; verify), keep the domain,
# and label every row as a legitimate domain.
cisco_df_slim = cisco_df.drop(columns=[0])
cisco_df_slim.columns = ['domain']
cisco_df_slim['dga'] = 'nonDGA'

for slim_feed in (dga_df_slim, dga_all_df_slim):
    display_df(slim_feed, None, 'sample5')

'DGA feed sample: (381953, 2)'

Unnamed: 0,domain,dga
57569,a94421b9c998056fb42456ad25ea55bfb9.hk,Domain used by dyre DGA for 26 Jun 2018
171315,wjb92vcerh.net,Domain used by shiotob/urlzone/bebloh DGA - no...
267753,vhnvvwx.net,Domain used by pykspa (varying date seeds)
250789,csyyhnyiwejluy.su,Domain used by ranbyus (uses previous 31 days ...
154897,ypuyuvscckuc.pw,Domain used by tinba DGA for 25 Jun 2018


'DGA feed sample: (852819, 2)'

Unnamed: 0,domain,dga
226102,csbdllaabettingk.com,Domain used by banjori - not date seeded
435079,pdfnalitydevonianizuwb.com,Domain used by banjori - not date seeded
586794,xzlvidablyhoosieraw.com,Domain used by banjori - not date seeded
620903,zzznsemitismgavenuteq.com,Domain used by banjori - not date seeded
489670,sgbsmachuslazaroqok.com,Domain used by banjori - not date seeded


In [7]:
SUFFIXES = [' DGA', ' (', ' -']


def strip_cat(input_str_row, lstrip_str="Domain used by ", rtrunc_str=SUFFIXES, verbose=False):
    """Extract the DGA category from the description string of one feed row.

    Parameters
    ----------
    input_str_row : mapping (e.g. a DataFrame row)
        Must provide the description text under the key 'dga'.
    lstrip_str : str
        Boilerplate that is removed from the description (every occurrence).
    rtrunc_str : list of str
        Markers that start the trailing part of the description. They are
        tried in list order and the description is cut at the first marker
        that occurs at all, not at the marker that occurs earliest in the text.
    verbose : bool
        Print the intermediate strings when True.

    Returns
    -------
    str
        The description with the prefix removed and the tail cut off.
    """
    description = input_str_row['dga']
    if verbose:
        print('-'*50, '\nInput:    ', description)
    category = description.replace(lstrip_str, '')
    if verbose:
        print('Lstrip:   ', category)

    # Position of the first marker (in list order) that is present, if any
    marker_positions = (category.find(marker) for marker in rtrunc_str)
    cut = next((pos for pos in marker_positions if pos != -1), None)
    if cut is not None:
        category = category[:cut]
        if verbose:
            print('Trimmed:  ', category)
    return category

In [8]:
# Trim description down to the DGA category names
#
# NOTE: this cell overwrites the 'dga' column in place, so run it once per kernel session.
# Running it again on already-trimmed labels shortens any label that still contains one of
# the SUFFIXES markers (e.g. 'Cryptolocker - Flashback' would become 'Cryptolocker').

verbosity = False

for feed in (dga_df_slim, dga_all_df_slim):
    # extra keyword arguments of DataFrame.apply are forwarded to strip_cat
    feed['dga'] = feed.apply(strip_cat, axis=1, verbose=verbosity)

for feed in (dga_df_slim, dga_all_df_slim):
    display_df(feed, None, 'sample5')


'DGA feed sample: (381953, 2)'

Unnamed: 0,domain,dga
57569,a94421b9c998056fb42456ad25ea55bfb9.hk,dyre
171315,wjb92vcerh.net,shiotob/urlzone/bebloh
267753,vhnvvwx.net,pykspa
250789,csyyhnyiwejluy.su,ranbyus
154897,ypuyuvscckuc.pw,tinba


'DGA feed sample: (852819, 2)'

Unnamed: 0,domain,dga
226102,csbdllaabettingk.com,banjori
435079,pdfnalitydevonianizuwb.com,banjori
586794,xzlvidablyhoosieraw.com,banjori
620903,zzznsemitismgavenuteq.com,banjori
489670,sgbsmachuslazaroqok.com,banjori


In [9]:
# Extract unique DGA categories (high-confidence feed), then add the 'nonDGA' class

categories = dga_df_slim['dga'].unique().tolist()
print("Categories of DGA domains: {}\n".format(len(categories)))
print(categories)
categories += ['nonDGA']
num_categories = len(categories)
print("\nTotal output classes will be: {}\n".format(num_categories))

Categories of DGA domains: 43

['Cryptolocker - Flashback', 'Post Tovar GOZ', 'geodo', 'dyre', 'corebot', 'symmi', 'padcrypt', 'locky', 'tinba', 'pushdo', 'P2P Gameover Zeus', 'shiotob/urlzone/bebloh', 'hesperbot', 'cryptowall', 'ramnit', 'dircrypt', 'ranbyus', 'pykspa', 'murofet', 'Volatile Cedar / Explosive', 'beebone', 'bedep', 'fobber', 'necurs', 'qakbot', 'tempedreve', 'ramdo', 'kraken', 'bamital', 'vawtrak', 'sisron', 'chinad', 'gozi', 'sphinx', 'proslikefan', 'vidro', 'madmax', 'dromedan', 'g01', 'pandabanker', 'mirai', 'unknownjs', 'unknowndropper']

Total output classes will be: 44



In [10]:
# Extract unique DGA categories on DGA non-high feed, then add the 'nonDGA' class

categoriesNHC = dga_all_df_slim['dga'].unique().tolist()
print("Categories of DGA domains in non-high confidence feed: {}\n".format(len(categoriesNHC)))
print(categoriesNHC)
categoriesNHC += ['nonDGA']
num_categoriesNHC = len(categoriesNHC)
print("\nTotal output classes will be: {}\n".format(num_categoriesNHC))

Categories of DGA domains in non-high confidence feed: 52

['Cryptolocker - Flashback', 'Post Tovar GOZ', 'geodo', 'dyre', 'corebot', 'symmi', 'nymaim', 'padcrypt', 'virut', 'locky', 'matsnu', 'tinba', 'pushdo', 'P2P Gameover Zeus', 'shifu', 'shiotob/urlzone/bebloh', 'banjori', 'hesperbot', 'cryptowall', 'ramnit', 'dircrypt', 'ranbyus', 'simda', 'pykspa', 'murofet', 'Volatile Cedar / Explosive', 'beebone', 'bedep', 'suppobox', 'fobber', 'necurs', 'qakbot', 'tempedreve', 'ramdo', 'kraken', 'bamital', 'vawtrak', 'sisron', 'chinad', 'gozi', 'sphinx', 'proslikefan', 'vidro', 'madmax', 'dromedan', 'g01', 'pandabanker', 'pizd', 'mirai', 'tofsee', 'unknownjs', 'unknowndropper']

Total output classes will be: 53



In [11]:
# Compare the two category lists: which categories occur only in the non-high-confidence feed?
setHC, setNHC = set(categories), set(categoriesNHC)
unseenCategories = setNHC.difference(setHC)
print("New UNSEEEN categories in NHC feed: ", unseenCategories)

New UNSEEEN categories in NHC feed:  {'simda', 'shifu', 'suppobox', 'banjori', 'pizd', 'matsnu', 'nymaim', 'tofsee', 'virut'}


In [12]:
# Check skewness in the dataset with respect to DGA categories
# (despite its name, `counts` holds each category's FRACTION of the feed, because normalize=True)

category_fractions = dga_df_slim['dga'].value_counts(normalize=True)
counts = category_fractions.to_frame()
top_fractions, bottom_fractions = counts.head(), counts.tail()

print("\nFraction of Most frequent categories in HC (training) feed:", end='')
display(top_fractions)
print("\nFraction of Least frequent categories in HC (training) feed:", end='')
display(bottom_fractions)



Fraction of Most frequent categories in HC (training) feed:

Unnamed: 0,dga
tinba,0.174597
Post Tovar GOZ,0.172796
ramnit,0.14707
necurs,0.1126
murofet,0.074669



Fraction of Least frequent categories in HC (training) feed:

Unnamed: 0,dga
gozi,6.3e-05
mirai,8e-06
dromedan,5e-06
madmax,3e-06
g01,3e-06


The above data shows a heavy skew: the dataset is highly imbalanced, with more than half of the DGA categories (those at the lower end of the sorted list) being severely under-represented. This might affect the training and prediction of the model for these categories.

In [13]:
# Check the share of the new categories, if any, in the NHC feed
# (values are fractions of the whole NHC feed, because normalize=True)
countsNHC = dga_all_df_slim['dga'].value_counts(normalize=True).to_frame()
print("\nFraction of unseen categories in NHC feed:", end='')
unseen_rows = countsNHC.loc[list(unseenCategories)]
display(unseen_rows)



Fraction of unseen categories in NHC feed:

Unnamed: 0,dga
simda,0.017301
shifu,0.002733
suppobox,0.001189
banjori,0.515025
pizd,0.000598
matsnu,5.6e-05
nymaim,0.007035
tofsee,4.7e-05
virut,0.000704


In [11]:
THRESHOLD_COUNT = 50                     # lower count limit beyond which categories are merged together
MERGED_CAT_STR = 'mergedDGA'             # name of merged category

MERGED_CAT_LIST = []                     # categories folded into MERGED_CAT_STR (filled by trim_categories)


def trim_categories(input_row, threshold=THRESHOLD_COUNT):
    """Decide whether a category is too sparse and must be merged.

    Intended for ``DataFrame.apply(..., axis=1)`` on a per-category frame.

    Parameters
    ----------
    input_row : pandas.Series
        One row whose label (``input_row.name``) is the category name and
        whose 'dga' entry is the number of samples in that category. The
        value must be an absolute count: a normalized fraction is always
        below the default threshold and would merge every category.
    threshold : int
        Categories with fewer samples than this are merged.

    Returns
    -------
    str
        MERGED_CAT_STR for a sparse category, otherwise the category name.

    Side effects
    ------------
    Sparse category names are recorded in the module-level MERGED_CAT_LIST.
    A name is recorded only once, so repeated calls for the same row (cell
    re-runs, or apply() evaluating a row more than once) do not create
    duplicate entries.
    """
    if input_row['dga'] < threshold:
        if input_row.name not in MERGED_CAT_LIST:
            MERGED_CAT_LIST.append(input_row.name)
        return MERGED_CAT_STR
    return input_row.name


In [12]:
# `counts` was built earlier with normalize=True, so its values are fractions in [0, 1]; every
# one of them is below THRESHOLD_COUNT and every category would be merged. Rebuild the frame
# from absolute frequencies before thresholding. rename('dga') keeps the column name stable
# across pandas versions.
counts = dga_df_slim['dga'].value_counts().rename('dga').to_frame()

# Empty the merge list in place so that re-running this cell starts from a clean state.
del MERGED_CAT_LIST[:]

counts['newCat'] = counts.apply(trim_categories, axis=1)
print("Merged categories: ", MERGED_CAT_LIST)
display(counts)

Merged categories:  ['pandabanker', 'gozi', 'mirai', 'dromedan', 'g01', 'madmax']


Unnamed: 0,dga,newCat
tinba,66688,tinba
Post Tovar GOZ,66000,Post Tovar GOZ
ramnit,56174,ramnit
necurs,43008,necurs
murofet,28520,murofet
ranbyus,26040,ranbyus
qakbot,20000,qakbot
pykspa,14215,pykspa
shiotob/urlzone/bebloh,12521,shiotob/urlzone/bebloh
kraken,8988,kraken


In [13]:
# Update DGA frame with new categories

def update_categories(input_row):
    """Return the final category label for one row of the DGA frame.

    Rows whose 'dga' value is listed in MERGED_CAT_LIST are relabelled as
    MERGED_CAT_STR; all other rows keep their current 'dga' value.
    """
    category = input_row['dga']
    return MERGED_CAT_STR if category in MERGED_CAT_LIST else category

# Relabel every row whose category was merged; all other rows keep their category.
dga_df_slim['dga'] = dga_df_slim['dga'].mask(dga_df_slim['dga'].isin(MERGED_CAT_LIST), MERGED_CAT_STR)


In [14]:
display(dga_df_slim['dga'].unique())

array(['Cryptolocker - Flashback', 'Post Tovar GOZ', 'geodo', 'dyre',
       'corebot', 'symmi', 'padcrypt', 'locky', 'tinba', 'pushdo',
       'P2P Gameover Zeus', 'shiotob/urlzone/bebloh', 'hesperbot',
       'cryptowall', 'ramnit', 'dircrypt', 'ranbyus', 'pykspa', 'murofet',
       'Volatile Cedar / Explosive', 'beebone', 'bedep', 'fobber',
       'necurs', 'qakbot', 'tempedreve', 'ramdo', 'kraken', 'bamital',
       'vawtrak', 'sisron', 'chinad', 'mergedDGA', 'sphinx',
       'proslikefan', 'vidro', 'unknownjs', 'unknowndropper'],
      dtype=object)

In [15]:
# Combine DGA/nonDGA dataframes, and factorize categories (mapping to integer indices)

unified_df = pd.concat([cisco_df_slim, dga_df_slim], ignore_index=True)
temp_df = unified_df.copy()
# sort=True makes the integer codes follow the sorted category names; `labels[i]` is the
# category name for code i.
unified_df['catIndex'], labels = pd.factorize(unified_df['dga'], sort=True)

# num_categories was counted before the sparse categories were merged into one, so it is
# larger than the number of classes that actually exist now. Refresh it, so that the one-hot
# width and the size of the model's output layer match the real label set.
num_categories = len(labels)

In [16]:
# Separate input sequences (domains) and output labels (integer category index),
# do an 80/20 train/test split with a fixed seed, and one-hot encode the training labels

X, Y = unified_df['domain'], unified_df['catIndex']
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=23,
)
Y_train_binarized = to_categorical(Y_train, num_classes=num_categories)

In [21]:
# Multiclass LSTM model

TRAIN_MODEL = False                                          # Load saved model otherwise
max_features = 1000                                          # length of vocabulary
batch_size = 128                                             # input batch size
num_epochs = 5                                               # epochs to train
num_labels = num_categories                                  # number of output classes

# Encode string characters to integers.
# The encoder is built in BOTH paths: the validation cell uses `encoder` to transform X_test,
# so creating it only when training left it undefined whenever the saved model was loaded.
# NOTE(review): a loaded model is only meaningful if it was trained with a tokenizer fitted on
# this same training split; consider saving the tokenizer together with the model.
# NOTE(review): num_words (500) differs from max_features (1000); confirm this is intended.
encoder = text.Tokenizer(num_words=500, char_level=True)
encoder.fit_on_texts(X_train)                                # build character indices

if not TRAIN_MODEL:
    # Restore the architecture from JSON, then the weights; `with` closes the file on error too
    with open('.\\saved_models\\multiclass_LSTM.json', 'r') as file:
        model_load = file.read()
    model = model_from_json(model_load)
    model.load_weights('.\\saved_models\\multiclass_LSTM.h5')
    print('SKIPPED MODEL TRAINING.\nSAVED MODEL IS NOW LOADED!')

else:                                                        # train the model
    X_train_tz = encoder.texts_to_sequences(X_train)

    # Model definition - this is the core model from Endgame
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=75))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    # Pad sequence where sequences are case insensitive characters encoded to
    # integers from 0 to number of valid characters; maxlen must match the
    # Embedding input_length above
    X_train_pad = sequence.pad_sequences(X_train_tz, maxlen=75)

    # Train on the one-hot encoded category labels
    model.fit(X_train_pad, Y_train_binarized, batch_size=batch_size, epochs=num_epochs)

SKIPPED MODEL TRAINING.
SAVED MODEL IS NOW LOADED!


In [None]:
# Validation on test dataset: encode and pad the test domains exactly like the training
# domains (same encoder, same maxlen), then predict one class index per domain

X_test_tz = encoder.texts_to_sequences(X_test)
X_test_pad = sequence.pad_sequences(X_test_tz, maxlen=75)
Y_pred = model.predict_classes(X_test_pad)

In [None]:
# Overall accuracy, followed by the per-category precision/recall/F1 report
acc = accuracy_score(Y_test, Y_pred)
print("Model's overall accuracy = {:8.3f} %\n".format(acc*100))

# target_names maps each integer class index back to its category name
metrics_report = classification_report(
    Y_test,
    Y_pred,
    target_names=labels,
)
print(metrics_report)

In [None]:
# Save model and weights (only after a fresh training run; a loaded model is already on disk)
if TRAIN_MODEL != True:
    print('MODEL AREADY SAVED TO DISK.')
else:
    # architecture goes to JSON, weights to HDF5
    model_save = model.to_json()
    with open('.\\saved_models\\multiclass_LSTM.json', 'w') as file:
        file.write(model_save)
    model.save_weights('.\\saved_models\\multiclass_LSTM.h5')
    print('MODEL SAVED TO DISK!')

## Look ahead and next steps:
__1__ Look closer at the misclassified domains. Does any particular DGA category stand out? What do we need to improve? 

__2__ Improving classification accuracy - more balanced dataset especially for the multiclass classification.

__3__ Learning from scratch takes significant time. We need to implement incremental model updates, trained on batches of newly collected domains.

__4__ We may get a dynamic dataset with, for example, more than 60 categories, and the code needs to step in and trim the categories down to an upper limit, say 50 at most. Alternatively, this could be implemented by dropping the categories with inadequate data, say fewer than 1000 available domain names.