# Imports and file paths

In [1]:
import sys
import itertools
from os import path

from ngram import NGram 
import numpy as np

from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# --- Input files -----------------------------------------------------------
# Both inputs live in a shared data directory addressed relative to the notebook.
DATA_DIR = "../../data"
bro_file = path.join(DATA_DIR, "bro.dat")            # intel file that is bucketed by indicator type below
good_urls_file = path.join(DATA_DIR, "top-1m.csv")   # CSV whose second column is read as a "good" domain

# --- Indicator types -------------------------------------------------------
# Values compared against the second tab-separated field of each bro_file row.
URL_MATCH = 'Intel::URL'
ADDR_MATCH = 'Intel::ADDR'
DOMAIN_MATCH = 'Intel::DOMAIN'

# --- Feature / sampling parameters -----------------------------------------
NGRAM_N = 2          # n-gram length; also the `repeat` used to enumerate all possible grams
MAX_DOMAINS = 50000  # cap applied separately to the good and the bad domain lists




# Bro

## Parse bro list of bad domains, IPs, and URLs

In [2]:
%%time

# Bucket every indicator in the Bro intel file by its indicator type.
# Rows whose type is not a domain, address or URL are kept whole in `unknowns`
# so the remaining types can be inspected afterwards.
bad_urls = []
bad_addrs = []
bad_domains = []
unknowns = []

with open(bro_file) as f:
    f.readline() # first line is a comment. skip over it now
    for line in f:
        fields = line.strip().split("\t")

        # Keep only rows with exactly four tab-separated fields.
        # Compare by value: `is not 4` tested object identity, which only
        # worked thanks to CPython's small-int cache and triggers a
        # SyntaxWarning on Python 3.8+.
        if len(fields) != 4:
            continue

        indicator, indicator_type = fields[0], fields[1]
        if indicator_type == DOMAIN_MATCH:
            bad_domains.append(indicator)
        elif indicator_type == ADDR_MATCH:
            bad_addrs.append(indicator)
        elif indicator_type == URL_MATCH:
            bad_urls.append(indicator)
        else:
            unknowns.append(fields)

CPU times: user 1.26 s, sys: 56 ms, total: 1.32 s
Wall time: 1.31 s


In [3]:
# Report how many indicators landed in each bucket.
for label, bucket in (
    ("IPs: ", bad_addrs),
    ("URLs: ", bad_urls),
    ("Domains: ", bad_domains),
    ("Etc: ", unknowns),
):
    print(label + str(len(bucket)))

IPs: 155313
URLs: 44224
Domains: 1182345
Etc: 9256


In [4]:
# Distinct indicator types (second field) among the rows that were not bucketed.
print({row[1] for row in unknowns})

{'Intel::FILE_HASH', 'Intel::EMAIL', 'Intel::FILE_NAME'}


# Machine Learning

## N-Grams

Begin by loading the top 50k "good" domains. Generate n-grams for all domains (good and bad), and combine them into a single count matrix suitable for sklearn's classifiers. Then do a train/test split, holding out 25% of the samples for testing.

In [5]:
# Splitter used below to cut each cleaned domain into NGRAM_N-character grams.
ngram_gen = NGram(N=NGRAM_N)

# Alphabet kept when cleaning domains; its order also fixes the feature column order.
char_list = [ch for ch in "abcdefghijklmnopqrstuvwxyz1234567890.-"]

In [6]:
# Read the second comma-separated column of the first MAX_DOMAINS rows.
# islice stops reading once the cap is reached; the previous
# `if idx < MAX_DOMAINS` filter kept iterating over the rest of the file
# while discarding every remaining line.
with open(good_urls_file) as f:
    good_urls = [line.rstrip().split(",")[1] for line in itertools.islice(f, MAX_DOMAINS)]

In [7]:
# Enumerate every possible NGRAM_N-character string over the alphabet and
# give each one a stable column index in the feature matrix.
n_perm = list(map("".join, itertools.product(char_list, repeat=NGRAM_N)))
perm_lookup = dict(zip(n_perm, range(len(n_perm))))
feature_length = len(perm_lookup)

In [8]:
def _count_ngrams(domains, features, row_offset=0):
    """Accumulate n-gram counts for `domains` into consecutive rows of `features`.

    Each domain is first reduced to the characters present in `char_list`;
    every gram produced by `ngram_gen._split` then increments the column
    given by `perm_lookup`.

    Parameters
    ----------
    domains : iterable of str
        Domain names to featurize, one matrix row per entry.
    features : numpy.ndarray
        2-D count matrix that is updated in place.
    row_offset : int, optional
        Row index used for the first domain (default 0).
    """
    allowed = set(char_list)  # constant-time membership instead of scanning the list per character
    for row, domain in enumerate(domains, start=row_offset):
        cleaned = "".join(c for c in domain if c in allowed)
        for gram in ngram_gen._split(cleaned):
            features[row, perm_lookup[gram]] += 1


num_bad_domains = min(MAX_DOMAINS, len(bad_domains))

# Rows [0, len(good_urls)) hold the good domains, followed by the bad domains.
# NOTE(review): int8 counts wrap past 127 occurrences of one gram in a single
# domain -- presumably unreachable for real domain names; confirm.
urls_features = np.zeros([len(good_urls) + num_bad_domains, feature_length], dtype=np.int8)

_count_ngrams(good_urls, urls_features)
_count_ngrams(bad_domains[:num_bad_domains], urls_features, row_offset=len(good_urls))


In [9]:
# Fixed seed so the split -- and every metric reported below -- is reproducible
# across kernel restarts. The split was previously unseeded.
SPLIT_SEED = 42

# Labels follow the row layout of urls_features: good domains first, then bad.
y = ["good"] * len(good_urls) + ["bad"] * num_bad_domains

X_train, X_test, y_train, y_test = train_test_split(
    urls_features, y, test_size=0.25, random_state=SPLIT_SEED
)

# Drop the reference to the full matrix now that the split copies exist.
# Re-running this cell alone will fail until the feature cell is re-run.
del urls_features

## Naive Bayes

In [10]:
%%time

# Multinomial naive Bayes on the raw n-gram counts.
# NOTE(review): alpha=150 is an unusually large smoothing value -- presumably hand-tuned; confirm.
clf = MultinomialNB(alpha=150).fit(X_train, y_train)

# Score the held-out split.
y_predicted = clf.predict(X_test)
accuracy_pct = accuracy_score(y_test, y_predicted) * 100
report = classification_report(y_test, y_predicted, labels=["good", "bad"])

print("Accuracy = {} %".format(accuracy_pct))
print("Classification Report \n {}".format(report))

Accuracy = 84.076 %
Classification Report 
              precision    recall  f1-score   support

       good       0.79      0.92      0.85     12482
        bad       0.90      0.76      0.83     12518

avg / total       0.85      0.84      0.84     25000

CPU times: user 320 ms, sys: 84 ms, total: 404 ms
Wall time: 405 ms


## Random Forest

In [11]:
%%time

# Fixed seed so the forest's bootstrap samples and feature choices, and hence
# the reported metrics, are reproducible. The model was previously unseeded.
FOREST_SEED = 42

clf = RandomForestClassifier(n_estimators=40, n_jobs=8, random_state=FOREST_SEED)

# Fit the model
clf.fit(X_train, y_train)

# Perform the predictions
y_predicted = clf.predict(X_test)

print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))
print("Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=["good", "bad"])))

Accuracy = 90.49199999999999 %
Classification Report 
              precision    recall  f1-score   support

       good       0.85      0.98      0.91     12482
        bad       0.97      0.83      0.90     12518

avg / total       0.91      0.90      0.90     25000

CPU times: user 3min 18s, sys: 316 ms, total: 3min 19s
Wall time: 26.2 s


## SVM

In [12]:
%%time

# RBF-kernel support vector classifier on the raw n-gram counts.
# The recorded run of this cell took over two hours of wall time.
clf = SVC(kernel='rbf').fit(X_train, y_train)

# Score the held-out split.
y_predicted = clf.predict(X_test)
accuracy_pct = accuracy_score(y_test, y_predicted) * 100
report = classification_report(y_test, y_predicted, labels=["good", "bad"])

print("Accuracy = {} %".format(accuracy_pct))
print("Classification Report \n {}".format(report))

Accuracy = 89.02799999999999 %
Classification Report 
              precision    recall  f1-score   support

       good       0.83      0.98      0.90     12482
        bad       0.97      0.80      0.88     12518

avg / total       0.90      0.89      0.89     25000

CPU times: user 2h 18min 36s, sys: 480 ms, total: 2h 18min 37s
Wall time: 2h 18min 37s
