In [1]:
import numpy

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM

import random

In [2]:
# PARAMS
# Path of the saved Keras model that is loaded and evaluated further down
# (presumably an HDF5 file, judging by the .h5 extension).
# The trailing note is the author's description of the configuration; the
# notebook itself only confirms the 50-unit LSTM via model.summary().
model_file = "/models/model-proposed.h5" # 50 neurons + 10 batch + softsign

In [3]:
# Locations of the two input domain lists (clean and malware), read line by line below
clean_file, malware_file = "/data/alexa-32k.txt", "/data/dga-32k.txt"

# Seed NumPy's global random generator so random draws repeat from run to run
numpy.random.seed(1234)

In [4]:
# Read both domain lists, one entry per line.
# Each list gets a trailing 67-character sentinel ("xxx...x"); it takes part in
# padding further down and is sliced off again with [:-1] afterwards
# (presumably to force a padded length of at least 67 — confirm against the model input).
with open(clean_file) as f:
    clean_domains = f.read().splitlines() + [67 * "x"]

with open(malware_file) as f:
    malware_domains = f.read().splitlines() + [67 * "x"]

In [5]:
# Character vocabulary plus its forward (char -> index) and inverse (index -> char) tables
charset = [symbol for symbol in "abcdefghijklmnopqrstuvwxyz0123456789.-"]
dictionary = {symbol: position for position, symbol in enumerate(charset)}
reverse_dictionary = {position: symbol for position, symbol in enumerate(charset)}

In [6]:
# Translate Domain Name to Vector
def domain_to_vector(domain, dictionary):
    """One-hot encode a domain name character by character.

    Args:
        domain: the string to encode.
        dictionary: mapping from a character to its position in the one-hot row.

    Returns:
        A list with one row per character of ``domain``. Every row is a fresh
        list of ``len(dictionary)`` floats: 0.0 everywhere except a 1.0 at the
        position the dictionary assigns to that character.

    Raises:
        KeyError: if ``domain`` contains a character that is not in ``dictionary``.
    """
    width = len(dictionary)
    encoded = []
    for ch in domain:
        row = [0.0] * width
        row[dictionary[ch]] = 1.0
        encoded.append(row)
    return encoded

In [7]:
# Translate Domain List to DataSet format
def domainlist_to_dataset(domainlist, result, dictionary):
    """Turn a list of domain names into parallel feature and label lists.

    Args:
        domainlist: iterable of domain strings.
        result: label value attached to every domain of the list.
        dictionary: character-to-index mapping forwarded to ``domain_to_vector``.

    Returns:
        A tuple ``(x, y)``: ``x`` holds the encoding of each domain as produced
        by ``domain_to_vector``; ``y`` holds one single-element list
        ``[result]`` per domain.
    """
    features = [domain_to_vector(name, dictionary) for name in domainlist]
    labels = [[result] for _ in features]
    return features, labels

In [8]:
# Encode both lists (label 0 = clean, label 1 = malware) and join them, clean first.
# The clean list's trailing sentinel is dropped here; the malware sentinel is kept,
# so it ends up as the very last element of the merged lists.
x_clean_noarray, y_clean_noarray = domainlist_to_dataset(
    domainlist=clean_domains, result=0, dictionary=dictionary)
x_malware_noarray, y_malware_noarray = domainlist_to_dataset(
    domainlist=malware_domains, result=1, dictionary=dictionary)
x_noarray = [*x_clean_noarray[:-1], *x_malware_noarray]
y_noarray = [*y_clean_noarray[:-1], *y_malware_noarray]

In [9]:
# Padding zeros & Convert to Array
# All-zero one-hot row, used as the filler for positions past the end of a domain
novalue = [float(0)] * len(dictionary)
# padding='post' puts the filler after each domain's characters. No maxlen is given,
# so pad_sequences pads to the longest sequence in the list (the 67-character
# sentinel, assuming no real domain is longer — TODO confirm).
x_noarray_pad = pad_sequences(x_noarray, dtype=float, value=novalue, padding='post')
# [:-1] removes the last row, i.e. the sentinel that closes the malware list.
# Rows are still ordered by class at this point: clean samples first, then malware.
x_sorted = numpy.array(x_noarray_pad, dtype=float)[:-1]
y_sorted = numpy.array(y_noarray, dtype=float)[:-1]

In [10]:
# Randomize dataset
# train_test_split shuffles by default and returns (x_train, x_test, y_train, y_test).
# With test_size=0.9 the "test" part kept here (x, y) is a shuffled 90% of the samples;
# the other 10% (x_discard, y_discard) is not used in the cells shown in this notebook section.
# random_state=4 makes the shuffle repeatable.
x_discard, x, y_discard, y = train_test_split(x_sorted, y_sorted, test_size=0.9, random_state=4)

In [11]:
# Get malware samples in a separate dataset
# Same padding as for the merged dataset; the list still ends with the 67-character
# sentinel, which takes part in padding and is then removed by [:-1].
x_malware_noarray_pad = pad_sequences(x_malware_noarray, dtype=float, value=novalue, padding='post')
# Row i of x_malware / y_malware corresponds to malware_domains[i]
x_malware = numpy.array(x_malware_noarray_pad, dtype=float)[:-1]
y_malware = numpy.array(y_malware_noarray, dtype=float)[:-1]

In [12]:
# Get clean samples in a separate dataset
# x_clean_noarray still contains its trailing 67-character sentinel (only the merged
# list dropped it), so it takes part in padding here and is removed by [:-1].
x_clean_noarray_pad = pad_sequences(x_clean_noarray, dtype=float, value=novalue, padding='post')
# Row i of x_clean / y_clean corresponds to clean_domains[i]
x_clean = numpy.array(x_clean_noarray_pad, dtype=float)[:-1]
y_clean = numpy.array(y_clean_noarray, dtype=float)[:-1]

In [13]:
# Load LSTM Model
# Restore the saved network from model_file and print its layer summary
model = load_model(model_file)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 50)                17800     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 17,851
Trainable params: 17,851
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Predict malware samples
# One score per sample (the summary shows a single-unit Dense output layer)
score_malware = numpy.asarray(model.predict(x_malware))
# Threshold at 0.5: 1 = predicted malware, 0 = predicted clean
pred_malware = numpy.where(score_malware >= 0.5, 1, 0)

In [15]:
# Predict clean samples
# One score per sample (the summary shows a single-unit Dense output layer)
score_clean = numpy.asarray(model.predict(x_clean))
# Threshold at 0.5: 1 = predicted malware, 0 = predicted clean
pred_clean = numpy.where(score_clean >= 0.5, 1, 0)

In [16]:
# Obtain false negatives (from malware)
# Row indices of malware samples predicted as clean (0); [0] keeps the row axis
# of the tuple returned by numpy.where
false_negative_index = numpy.where(pred_malware == 0)[0]
# Re-seed so the draw below is the same every time this cell runs
numpy.random.seed(1234)
# NOTE(review): numpy.random.choice samples with replacement by default, so the
# 10 picks may contain duplicates — pass replace=False if distinct domains are wanted
false_negative_index_sample = numpy.random.choice(false_negative_index, 10)
# The indices are valid for both malware_domains and x_malware: both follow the
# same order (malware_domains only carries the extra sentinel at its end)
false_negative_domains = numpy.take(malware_domains, false_negative_index_sample)
x_false_negative_domains = numpy.take(x_malware, false_negative_index_sample, axis=0)

false_negative_domains

array(['nosegrain.net', 'onlletgodftxsels.ru', 'dzrecwimln.com',
       'blaspsacerpotest.com', 'cdpsad.com', 'earnestinelongstaff.net',
       'facenine.net', 'chiefdinner.net', 'tosxxoa.com',
       'persitretinere.com'], dtype='<U67')

In [17]:
# Obtain false positives (from clean)
# Row indices of clean samples predicted as malware (1); [0] keeps the row axis
# of the tuple returned by numpy.where
false_positive_index = numpy.where(pred_clean == 1)[0]
# Unlike the false negatives, this takes the first 10 hits rather than a random sample
false_positive_index_sample = false_positive_index[0:10]
# The indices are valid for both clean_domains and x_clean: both follow the
# same order (clean_domains only carries the extra sentinel at its end)
false_positive_domains = numpy.take(clean_domains, false_positive_index_sample)
x_false_positive_domains = numpy.take(x_clean, false_positive_index_sample, axis=0)

false_positive_domains

array(['doubleclick.net', 'slideshare.net', 'slickdeals.net',
       'adplxmd.com', 'secureserver.net', 'themeforest.net',
       'trackingclick.net', 'daikynguyenvn.com', 'prjcq.com',
       'bookmyshow.com'], dtype='<U67')