# Imports and file paths

In [1]:
import sys
import random
import itertools
import multiprocessing
from os import path

from ngram import NGram 
import numpy as np
import tensorflow as tf

from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split

DATA_DIR = "../../data"
bro_file = path.join(DATA_DIR, "bro.dat")
good_urls_file = path.join(DATA_DIR, "top-1m.csv")

DOMAIN_MATCH = 'Intel::DOMAIN'

NGRAM_N = 3
MAX_DOMAINS = 50000



# Bro

## Parse bro list of bad domains, IPs, and URLs

In [2]:
%%time

bad_domains = []
good_domains = []

with open(bro_file) as f:
    f.readline() # first line is a comment. skip over it now
    for idx, line in enumerate(f):
        l = line.strip().split("\t")
        if len(l) is not 4:
            continue

        if l[1] == DOMAIN_MATCH and len(bad_domains) < MAX_DOMAINS:
            bad_domains.append(l[0].lower())

with open(good_urls_file) as f:
    good_domains = [line.rstrip().split(",")[1].lower() for idx, line in enumerate(f) if idx < MAX_DOMAINS]

CPU times: user 1.36 s, sys: 32 ms, total: 1.39 s
Wall time: 1.39 s


# ML

## N-Grams

In [3]:
ngram_gen = NGram(N=NGRAM_N)
char_list = list("abcdefghijklmnopqrstuvwxyz1234567890.-")

In [4]:
n_perm = ["".join(tup) for tup in itertools.product(char_list, repeat=NGRAM_N)]
perm_lookup = {perm: idx for idx, perm in enumerate(n_perm)}
feature_length = len(perm_lookup)

## Neural Network

In [5]:
# Data vomiter
class data_vomit():
        
    def __init__(self):
        
        class testing():
            def __init__(self):
                self.domains = []
                self.labels = []
        
        self.test = testing()
        self.data_feed = np.array([[d, 1] for d in good_domains] + [[d, 0] for d in bad_domains])
        np.random.shuffle(self.data_feed)
        self.X_train, self.test.domains, self.y_train, self.test.labels = train_test_split(self.data_feed[:,0], self.data_feed[:,1], test_size=0.03)

        self.test.labels = self.to_one_hot(self.test.labels)
        self.y_train = self.to_one_hot(self.y_train)

        self.loc = 0
        self.total_data = len(self.X_train)
        
    def str2ngram(self, domains_arr):
        ngram_result = []
        for d in domains_arr:
            gram_result = np.zeros(feature_length)
            domain = "".join([c for c in list(d) if c in char_list])
            for gram in list(ngram_gen._split(d)):
                gram_result[perm_lookup[gram]] += 1
            ngram_result.append(gram_result)
        return ngram_result
        
    def to_one_hot(self, arr):
        arr = np.array(arr, dtype=np.int8)
        result = np.zeros((len(arr), 2))
        result[np.arange(len(arr)), arr] = 1
        return result
            
    def next_batch(self, n):
        if n + self.loc <= self.total_data:
            x = self.X_train[self.loc:self.loc+n]
            y = self.y_train[self.loc:self.loc+n]
            self.loc = (self.loc + n) % self.total_data
            return x, y
        else:
            x = np.append(self.X_train[self.loc:self.total_data],self.X_train[0:n-self.total_data+self.loc])
            y = np.append(self.y_train[self.loc:self.total_data],self.y_train[0:n-self.total_data+self.loc])
            self.loc = n-self.total_data+self.loc
            return x, y

In [6]:
x = tf.placeholder(tf.float32, [None, feature_length])
W1 = tf.Variable(tf.random_normal([feature_length, 1000]))
b1 = tf.Variable(tf.random_normal([1000]))
hl1 = tf.matmul(x, W1) + b1

W2 = tf.Variable(tf.random_normal([1000, 500]))
b2 = tf.Variable(tf.random_normal([500]))
hl2 = tf.matmul(hl1, W2) + b2

W3 = tf.Variable(tf.random_normal([500, 2]))
b3 = tf.Variable(tf.random_normal([2]))
y = tf.matmul(hl2, W3) + b3

# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 2], name="correct_label")

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
train_step = tf.train.AdamOptimizer().minimize(cross_entropy)

In [7]:
%%time

def evaluate_success(sess):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return sess.run(accuracy, feed_dict={x: data.str2ngram(data.test.domains), y_: data.test.labels}) * 100

NUM_EPOCHS = 2
MINIBATCH_SIZE = 200

sess = tf.InteractiveSession()
tf.initialize_all_variables().run()

data = data_vomit()
for i in range(NUM_EPOCHS):
    for _ in range(int(feature_length / MINIBATCH_SIZE)):
        batch_xs, batch_ys = data.next_batch(MINIBATCH_SIZE)
        sess.run(train_step, feed_dict={x: data.str2ngram(batch_xs), y_: batch_ys})
    print("Epoch: " + str(i+1) + ": " + str(evaluate_success(sess)) + "%")

Epoch: 1: 83.7333321571%
Epoch: 2: 86.2333357334%
CPU times: user 58min 43s, sys: 35.1 s, total: 59min 18s
Wall time: 8min 25s


In [8]:
sess.close()