# Imports and file paths

In [1]:
import sys
import random
import itertools
import multiprocessing
from os import path

from ngram import NGram 
import numpy as np

from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

DATA_DIR = "../../data"
bro_file = path.join(DATA_DIR, "bro.dat")
good_urls_file = path.join(DATA_DIR, "top-1m.csv")

DOMAIN_MATCH = 'Intel::DOMAIN'
ADDR_MATCH = 'Intel::ADDR'
URL_MATCH = 'Intel::URL'

CORES = multiprocessing.cpu_count()

NGRAM_N = 2
MAX_DOMAINS = 50000


# Bro

## Parse bro list of bad domains, IPs, and URLs

In [2]:
%%time

bad_urls = []
bad_addrs = []
bad_domains = []
unknowns = []

with open(bro_file) as f:
    f.readline() # first line is a comment. skip over it now
    for idx, line in enumerate(f):
        l = line.strip()
            
        l = l.split("\t")
        if len(l) is not 4:
            continue
            
        if l[1] == DOMAIN_MATCH:
            bad_domains.append(l[0])
        elif l[1] == ADDR_MATCH:
            bad_addrs.append(l[0])
        elif l[1] == URL_MATCH:
            bad_urls.append(l[0])
        else:
            unknowns.append(l)

CPU times: user 1.32 s, sys: 80 ms, total: 1.4 s
Wall time: 1.4 s


In [3]:
print("IPs: " + str(len(bad_addrs)))
print("URLs: " + str(len(bad_urls)))
print("Domains: " + str(len(bad_domains)))
print("Etc: " + str(len(unknowns)))

IPs: 155313
URLs: 44224
Domains: 1182345
Etc: 9256


In [4]:
print(set([l[1] for l in unknowns]))

{'Intel::FILE_NAME', 'Intel::FILE_HASH', 'Intel::EMAIL'}


# Machine Learning

## N-Grams

Begin by loading in the top 50k "good" domains. Generate n-grams for all domains, and combine in a manner suitable for sklearns classifiers. Do a train / test split at 25%.

In [5]:
ngram_gen = NGram(N=NGRAM_N)
char_list = list("abcdefghijklmnopqrstuvwxyz1234567890.-")

In [6]:
with open(good_urls_file) as f:
    good_domains = [line.rstrip().split(",")[1] for idx, line in enumerate(f) if idx < MAX_DOMAINS]

In [7]:
n_perm = ["".join(tup) for tup in itertools.product(char_list, repeat=NGRAM_N)]
perm_lookup = {perm: idx for idx, perm in enumerate(n_perm)}
feature_length = len(perm_lookup)

In [8]:
#take up to 50k bad domains
if len(bad_domains) > MAX_DOMAINS:
    bad_domains = bad_domains[0:MAX_DOMAINS]

domains_features = np.zeros([len(good_domains) + len(bad_domains), feature_length], dtype=np.int8)

for idx, good_domain in enumerate(good_domains):
    good_url = "".join([c for c in list(good_domain) if c in char_list])
    for gram in list(ngram_gen._split(good_domain)):
        domains_features[idx, perm_lookup[gram]] += 1
        
for idx, bad_url in enumerate(bad_domains):
    bad_url = "".join([c for c in list(bad_url) if c in char_list])
    for gram in list(ngram_gen._split(bad_url)):
        domains_features[idx + len(good_domains), perm_lookup[gram]] += 1   

y = ["good" for i in range(len(good_domains))] + ["bad" for i in range(len(bad_domains))]

## Naive Bayes

In [9]:
%%time

def NB(domains_features, y, alpha=150):
    clf = MultinomialNB(alpha=alpha)

    # Perform the predictions
    scores = cross_val_score(clf, domains_features, y, cv=10)
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean()*100, scores.std() * 2))

NB(domains_features, y)

Accuracy: 84.2660 (+/- 0.0074)
CPU times: user 3.48 s, sys: 836 ms, total: 4.32 s
Wall time: 4.32 s


## Random Forest

In [10]:
%%time

def random_forest(trees, domains_features, y, cores=multiprocessing.cpu_count()):
    clf = RandomForestClassifier(n_estimators=trees, n_jobs=cores)

    # Perform the predictions
    scores = cross_val_score(clf, domains_features, y, cv=10)
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean()*100, scores.std() * 2))
    
random_forest(40, domains_features, y)

Accuracy: 90.5950 (+/- 0.0052)
CPU times: user 41min 6s, sys: 1.53 s, total: 41min 8s
Wall time: 5min 25s


## SVM

In [None]:
%%time

def svm(domains_features, y, kernel='rbf'):
    clf = SVC(kernel=kernel)

    # Perform the predictions
    scores = cross_val_score(clf, domains_features, y, cv=10)
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean()*100, scores.std() * 2))

svm(domains_features, y)