# Imports and file paths

In [1]:
import sys
import itertools
from os import path

from ngram import NGram 
import numpy as np

from sklearn.datasets import fetch_mldata
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Size of the character n-grams used as classifier features.
NGRAM_N = 2

# Indicator-type labels that the parsing cell matches against the second
# tab-separated column of the Bro file.
DOMAIN_MATCH = "Intel::DOMAIN"
ADDR_MATCH = "Intel::ADDR"
URL_MATCH = "Intel::URL"

# Input files, both resolved relative to DATA_DIR.
DATA_DIR = "../../data"
good_urls_file = path.join(DATA_DIR, "top-1m.csv")
bro_file = path.join(DATA_DIR, "bro.dat")



# Bro

## Parse bro list of bad domains, IPs, and URLs

In [2]:
%%time

# Buckets for the indicators read from the Bro file, keyed by the type found
# in the second tab-separated column. Rows of any other type are kept whole
# (as their list of fields) in `unknowns`.
bad_urls = []
addrs = []
domains = []
unknowns = []

with open(bro_file) as f:
    f.readline()  # first line is a comment. skip over it now
    for line in f:
        # NOTE(review): strip() also removes trailing tabs, so a row whose last
        # column is empty ends up with fewer than four fields and is skipped
        # below -- confirm that is intended.
        fields = line.strip().split("\t")

        # Compare by value: `is not 4` tests object identity, which only
        # happens to work for small ints on CPython and is a SyntaxWarning on
        # Python 3.8+.
        if len(fields) != 4:
            continue

        indicator, indicator_type = fields[0], fields[1]
        if indicator_type == DOMAIN_MATCH:
            domains.append(indicator)
        elif indicator_type == ADDR_MATCH:
            addrs.append(indicator)
        elif indicator_type == URL_MATCH:
            bad_urls.append(indicator)
        else:
            unknowns.append(fields)

CPU times: user 1.28 s, sys: 72 ms, total: 1.36 s
Wall time: 1.36 s


In [3]:
# Report how many indicators landed in each bucket.
for label, bucket in (
    ("IPs: ", addrs),
    ("URLs: ", bad_urls),
    ("Domains: ", domains),
    ("Etc: ", unknowns),
):
    print(label + str(len(bucket)))

IPs: 155313
URLs: 44224
Domains: 1182345
Etc: 9256


In [4]:
# Distinct indicator types (second column) among the rows that matched none of
# the three known types.
print({fields[1] for fields in unknowns})

{'Intel::FILE_HASH', 'Intel::FILE_NAME', 'Intel::EMAIL'}


# Machine Learning

## N-Grams

Begin by loading the top 50k "good" domains. Generate character n-grams for every good domain and every bad URL, and combine them into a single count matrix suitable for scikit-learn's classifiers. Hold out 25% of the samples as the test set.

In [5]:
# Alphabet used for features: lowercase ASCII letters followed by digits.
# Characters outside this list are dropped before n-grams are generated.
char_list = [ch for ch in "abcdefghijklmnopqrstuvwxyz1234567890"]

# N-gram splitter configured for NGRAM_N-character grams.
ngram_gen = NGram(N=NGRAM_N)

In [6]:
# Read the first 50,000 lines and keep the second comma-separated field of
# each (presumably "rank,domain" rows -- verify against the file).
# islice stops reading after the 50,000th line; filtering on an enumerate
# index would keep iterating over the rest of the file for no benefit.
with open(good_urls_file) as f:
    good_urls = [line.rstrip().split(",")[1] for line in itertools.islice(f, 50000)]

In [7]:
# Every possible NGRAM_N-character string over char_list, in
# itertools.product order.
n_perm = list(map("".join, itertools.product(char_list, repeat=NGRAM_N)))

# Map each n-gram to its position in n_perm; that position is its feature
# column.
perm_lookup = dict(zip(n_perm, range(len(n_perm))))

# Number of feature columns (one per distinct n-gram).
feature_length = len(perm_lookup)

In [8]:
def _ngram_count_matrix(urls):
    """Build the n-gram count matrix for a list of URL strings.

    Each URL is first reduced to the characters present in ``char_list``
    (everything else is dropped), then split with ``ngram_gen._split`` and
    counted per n-gram column as given by ``perm_lookup``.

    Counts saturate at the int8 maximum instead of wrapping around to a
    negative value when one n-gram occurs more often than int8 can hold in a
    single URL.

    NOTE(review): characters outside ``char_list`` (including uppercase
    letters) are removed rather than normalised, so their neighbours become
    adjacent -- confirm this is intended.

    Parameters
    ----------
    urls : list of str
        The URLs / domains to featurise.

    Returns
    -------
    numpy.ndarray
        int8 array of shape ``(len(urls), feature_length)``.
    """
    counts = np.zeros([len(urls), feature_length], dtype=np.int8)
    int8_max = np.iinfo(np.int8).max
    allowed_chars = set(char_list)  # constant-time membership test

    for row, url in enumerate(urls):
        cleaned = "".join(ch for ch in url if ch in allowed_chars)

        # Tally in plain Python ints first so the int8 cells can never overflow.
        tally = {}
        for gram in ngram_gen._split(cleaned):
            col = perm_lookup[gram]
            tally[col] = tally.get(col, 0) + 1

        for col, occurrences in tally.items():
            counts[row, col] = min(occurrences, int8_max)

    return counts


good_urls_features = _ngram_count_matrix(good_urls)
bad_urls_features = _ngram_count_matrix(bad_urls)


In [9]:
# Stack the good rows on top of the bad rows and build the matching labels.
X = np.append(good_urls_features, bad_urls_features, axis=0)
y = ["good"] * len(good_urls_features) + ["bad"] * len(bad_urls_features)

# Hold out 25% of the samples for testing. The fixed random_state makes the
# same rows land in the test set on every run, so results are reproducible
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## Random Forest

In [10]:
%%time

# 20-tree random forest trained with 8 parallel jobs. The fixed random_state
# makes the fitted forest, and therefore the scores printed below,
# reproducible from run to run.
clf = RandomForestClassifier(n_estimators=20, n_jobs=8, random_state=42)

# Fit the model
clf.fit(X_train, y_train)

# Perform the predictions
y_predicted = clf.predict(X_test)

print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100))
print("Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=["good", "bad"])))

Accuracy = 96.97317031754118 %
Classification Report 
              precision    recall  f1-score   support

       good       0.95      0.99      0.97     12456
        bad       0.99      0.94      0.97     11100

avg / total       0.97      0.97      0.97     23556

CPU times: user 1min 2s, sys: 152 ms, total: 1min 2s
Wall time: 9.49 s


## Naive Bayes

In [11]:
%%time

# Multinomial Naive Bayes with smoothing parameter alpha=150, fitted on the
# training split (fit() returns the estimator itself).
clf = MultinomialNB(alpha=150).fit(X_train, y_train)

# Predict labels for the held-out split.
y_predicted = clf.predict(X_test)

# Overall accuracy as a percentage, then the per-class report.
accuracy_pct = accuracy_score(y_test, y_predicted) * 100
print("Accuracy = {} %".format(accuracy_pct))

report = classification_report(y_test, y_predicted, labels=["good", "bad"])
print("Classification Report \n {}".format(report))

Accuracy = 86.64459161147903 %
Classification Report 
              precision    recall  f1-score   support

       good       0.86      0.90      0.88     12456
        bad       0.88      0.83      0.85     11100

avg / total       0.87      0.87      0.87     23556

CPU times: user 284 ms, sys: 72 ms, total: 356 ms
Wall time: 355 ms


## SVM

In [None]:
%%time

# Support vector classifier with a linear kernel.
# NOTE(review): this cell has no saved output in the notebook -- confirm it
# runs to completion on the full training set.
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Predict labels for the held-out split.
y_predicted = clf.predict(X_test)

# Overall accuracy as a percentage, then the per-class report.
accuracy_pct = accuracy_score(y_test, y_predicted) * 100
print("Accuracy = {} %".format(accuracy_pct))

report = classification_report(y_test, y_predicted, labels=["good", "bad"])
print("Classification Report \n {}".format(report))