# Imports and file paths

In [1]:
import sys
import random
import itertools
import multiprocessing
from os import path

from ngram import NGram 
import numpy as np

from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Input locations, relative to the notebook's working directory
DATA_DIR = "../../data"
# Bro intel feed: read below as tab-separated lines whose second field is a type tag
bro_file = path.join(DATA_DIR, "bro.dat")
# Comma-separated list of "good" domains; the second field of each line is used below
good_urls_file = path.join(DATA_DIR, "top-1m.csv")

# Type tags compared against the second field of each Bro intel line
DOMAIN_MATCH = 'Intel::DOMAIN'
ADDR_MATCH = 'Intel::ADDR'
URL_MATCH = 'Intel::URL'

# CPU count as reported by multiprocessing
CORES = multiprocessing.cpu_count()

# Size of the character n-grams used as features (2 = bigrams)
NGRAM_N = 2
# Upper bound on how many good and how many bad domains are used
MAX_DOMAINS = 50000




# Bro

## Parse the Bro list of bad domains, IPs, and URLs

In [2]:
%%time

# Buckets for the parsed indicators, keyed by the Intel:: type tag on each line
bad_urls = []
bad_addrs = []
bad_domains = []
unknowns = []

with open(bro_file) as f:
    f.readline() # first line is a comment. skip over it now
    for line in f:
        fields = line.strip().split("\t")
        # Only lines with exactly four tab-separated fields are kept.
        # Compare by value: `is not 4` tests object identity, which only
        # happens to work because CPython caches small integers.
        if len(fields) != 4:
            continue

        # fields[0] is the indicator value, fields[1] its type tag
        indicator, indicator_type = fields[0], fields[1]
        if indicator_type == DOMAIN_MATCH:
            bad_domains.append(indicator)
        elif indicator_type == ADDR_MATCH:
            bad_addrs.append(indicator)
        elif indicator_type == URL_MATCH:
            bad_urls.append(indicator)
        else:
            # Keep the whole record so the unexpected types can be inspected later
            unknowns.append(fields)

CPU times: user 1.28 s, sys: 56 ms, total: 1.34 s
Wall time: 1.33 s


In [3]:
# Report how many indicators landed in each bucket
for label, entries in (
    ("IPs: ", bad_addrs),
    ("URLs: ", bad_urls),
    ("Domains: ", bad_domains),
    ("Etc: ", unknowns),
):
    print(label + str(len(entries)))

IPs: 155313
URLs: 44224
Domains: 1182345
Etc: 9256


In [4]:
# Distinct type tags (second field) among the records that matched no known type
unknown_types = {record[1] for record in unknowns}
print(unknown_types)

{'Intel::EMAIL', 'Intel::FILE_HASH', 'Intel::FILE_NAME'}


# Machine Learning

## N-Grams

Begin by loading the top 50k "good" domains. Generate n-grams for all domains and combine them into a feature matrix suitable for scikit-learn's classifiers. Then do a train/test split, holding out 25% of the samples for testing.

In [5]:
# N-gram splitter configured for NGRAM_N-character grams
ngram_gen = NGram(N=NGRAM_N)
# Alphabet of characters kept when featurizing a domain. The order of this list
# determines the order of the feature columns built from it in the next cell,
# so do not reorder it without rebuilding the features.
char_list = list("abcdefghijklmnopqrstuvwxyz1234567890.-")

In [6]:
# Read only the first MAX_DOMAINS lines and keep the second comma-separated
# field of each. islice stops reading once the limit is reached instead of
# iterating over the rest of the file and discarding it.
with open(good_urls_file) as f:
    good_domains = [line.rstrip().split(",")[1] for line in itertools.islice(f, MAX_DOMAINS)]

In [7]:
# Every possible NGRAM_N-character string over the alphabet, in product order
n_perm = list(map("".join, itertools.product(char_list, repeat=NGRAM_N)))
# Map each n-gram to its column index in the feature matrix
perm_lookup = dict(zip(n_perm, range(len(n_perm))))
feature_length = len(perm_lookup)

In [8]:
# Take at most MAX_DOMAINS bad domains (the first ones in file order)
if len(bad_domains) > MAX_DOMAINS:
    bad_domains = bad_domains[0:MAX_DOMAINS]

# One row per domain (good domains first, then bad), one column per possible n-gram.
# NOTE: int8 tops out at 127 occurrences of a single n-gram in one domain.
domains_features = np.zeros([len(good_domains) + len(bad_domains), feature_length], dtype=np.int8)

for idx, good_domain in enumerate(good_domains):
    # Drop characters outside the alphabet, then count n-grams of the FILTERED
    # string. Splitting the raw domain would raise KeyError in perm_lookup for
    # any n-gram containing a character that is not in char_list.
    good_url = "".join(c for c in good_domain if c in char_list)
    for gram in ngram_gen._split(good_url):
        domains_features[idx, perm_lookup[gram]] += 1

# Rows for bad domains start right after the good ones
bad_offset = len(good_domains)
for idx, bad_domain in enumerate(bad_domains):
    bad_url = "".join(c for c in bad_domain if c in char_list)
    for gram in ngram_gen._split(bad_url):
        domains_features[idx + bad_offset, perm_lookup[gram]] += 1


In [9]:
# Labels line up with the rows of domains_features: good domains first, then bad
y = ["good"] * len(good_domains) + ["bad"] * len(bad_domains)

# Hold out 25% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    domains_features,
    y,
    test_size=0.25,
)

# Drop the name for the full matrix now that the split has been made
del domains_features

## Naive Bayes

In [10]:
def NB(X_train, X_test, y_train, y_test, alpha=150):
    """Fit a multinomial Naive Bayes model and print its test-set metrics.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features to predict on.
        y_train: Training labels passed to ``fit``.
        y_test: True labels the predictions are scored against.
        alpha: Smoothing parameter forwarded to ``MultinomialNB``.

    Prints the accuracy (as a percentage) and a classification report for
    the "good" and "bad" labels. Returns nothing.
    """
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy_pct = accuracy_score(y_test, predictions) * 100
    print("Accuracy = {} %".format(accuracy_pct))

    report = classification_report(y_test, predictions, labels=["good", "bad"])
    print("Classification Report \n {}".format(report))

#NB(X_train, X_test, y_train, y_test)

Accuracy = 84.096 %
Classification Report 
              precision    recall  f1-score   support

       good       0.80      0.91      0.85     12526
        bad       0.90      0.77      0.83     12474

avg / total       0.85      0.84      0.84     25000



## Random Forest

In [11]:
def random_forest(trees, X_train, X_test, y_train, y_test, cores=multiprocessing.cpu_count(), random_state=None):
    """Fit a random forest classifier and print its test-set metrics.

    Args:
        trees: Number of trees, forwarded as ``n_estimators``.
        X_train: Training features passed to ``fit``.
        X_test: Features to predict on.
        y_train: Training labels passed to ``fit``.
        y_test: True labels the predictions are scored against.
        cores: Forwarded as ``n_jobs``. The default is the CPU count at the
            time this function is defined.
        random_state: Optional seed forwarded to ``RandomForestClassifier``
            so a run can be repeated. The default ``None`` leaves the forest
            unseeded, as before.

    Prints the accuracy (as a percentage) and a classification report for
    the "good" and "bad" labels. Returns nothing.
    """
    clf = RandomForestClassifier(n_estimators=trees, n_jobs=cores, random_state=random_state)

    # Fit the model
    clf.fit(X_train, y_train)

    # Perform the predictions
    y_predicted = clf.predict(X_test)

    print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))
    print("Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=["good", "bad"])))
    
#random_forest(40, X_train, X_test, y_train, y_test)

Accuracy = 90.27199999999999 %
Classification Report 
              precision    recall  f1-score   support

       good       0.85      0.97      0.91     12526
        bad       0.97      0.83      0.90     12474

avg / total       0.91      0.90      0.90     25000



## SVM

In [12]:
def svm(X_train, X_test, y_train, y_test, kernel='rbf'):
    """Fit a support vector classifier and print its test-set metrics.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features to predict on.
        y_train: Training labels passed to ``fit``.
        y_test: True labels the predictions are scored against.
        kernel: Kernel name forwarded to ``SVC``.

    Prints the accuracy (as a percentage) and a classification report for
    the "good" and "bad" labels. Returns nothing.
    """
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy_pct = accuracy_score(y_test, predictions) * 100
    print("Accuracy = {} %".format(accuracy_pct))

    report = classification_report(y_test, predictions, labels=["good", "bad"])
    print("Classification Report \n {}".format(report))

#svm(X_train, X_test, y_train, y_test)

## Neural Network

In [13]:
# Data vomiter
def single_epoch_data():
    """Yield every ``[domain, label]`` pair once, in shuffled order.

    Reads the module-level ``good_domains`` and ``bad_domains`` lists and
    labels their entries "good" and "bad" respectively.
    """
    labelled = []
    for domain in good_domains:
        labelled.append([domain, "good"])
    for domain in bad_domains:
        labelled.append([domain, "bad"])

    # Shuffle in place so the two classes are interleaved
    random.shuffle(labelled)

    for pair in labelled:
        yield pair