# Exploration of Merging Data Files

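This notebook was built to explore how best to merge the Cisco Umbrella 1M Domains file and the Bambenek Consulting DGA High-Confidence Feed. The cells below load the merged CSV, train the binary LSTM classifier on it, and measure its accuracy on a held-out test split.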

In [30]:
# Libraries
import imp  # deprecated since Python 3.4; prefer importlib
import importlib
import os
import re
import sys
import time
import zipfile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import lstm_binary

In [76]:
# Locations of the persisted binary-classifier artifacts, all resolved
# relative to PROJECT_ROOT.
PROJECT_ROOT = "./"
BINARY_TOKENIZER_FILE = f"{PROJECT_ROOT}saved_models/large_data/binary_tokenizer.pkl"
BINARY_CATEGORIES_FILE = f"{PROJECT_ROOT}saved_models/large_data/binary_categories.pkl"
BINARY_MODEL_JSON = f"{PROJECT_ROOT}saved_models/large_data/binary_LSTM.json"
BINARY_MODEL_H5 = f"{PROJECT_ROOT}saved_models/large_data/binary_LSTM.h5"

In [58]:
# Column dtypes are pinned explicitly so pandas does not have to infer them.
dtype_dic = {'domain': str,
             'dga': int}
# Location of the merged dataset. The default is the original author's
# absolute path; set the DGA_MERGED_CSV environment variable to run the
# notebook against a copy stored elsewhere without editing this cell.
merged_csv = os.environ.get(
    'DGA_MERGED_CSV',
    '/home/nscsekhar/1214/team_cyber/data/merged-2018-12-15_2month.csv')
df = pd.read_csv(merged_csv, dtype=dtype_dic)

In [59]:
# Model inputs come from the `domain` column (forced to str); targets come
# from the `dga` column.
X, Y = df['domain'].astype(str), df['dga']

In [60]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [61]:
# Re-execute the lstm_binary module so a fresh LSTMBinary definition is used
# without restarting the kernel. importlib.reload replaces the deprecated
# imp.reload (the imp module is removed in Python 3.12).
importlib.reload(lstm_binary)
train_model = lstm_binary.LSTMBinary()
train_model.train(X_train, Y_train)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [77]:
# Write the trained model's artifacts (tokenizer, JSON, H5) to the configured paths.
train_model.save(
    BINARY_TOKENIZER_FILE,
    BINARY_MODEL_JSON,
    BINARY_MODEL_H5,
)

MODEL SAVED TO DISK!


In [78]:
# Build a separate LSTMBinary instance and restore it from the files written
# by the save step, so the predictions below exercise the persisted model.
testmodel = lstm_binary.LSTMBinary()
testmodel.load(
    BINARY_TOKENIZER_FILE,
    BINARY_MODEL_JSON,
    BINARY_MODEL_H5,
)

SAVED BINARY MODEL IS NOW LOADED!


In [79]:
# importlib.reload replaces the deprecated imp.reload (imp is removed in
# Python 3.12).
# NOTE(review): `testmodel` was instantiated before this reload, so it keeps
# the class definition that was loaded at that time; the reload only affects
# objects created afterwards.
importlib.reload(lstm_binary)
# Smoke test on three hand-picked domains.
urllist = ["www.google.com", "www.netflix.com", "plvklpgwivery.com"]
urltypes = testmodel.predict(urllist)
print("URL type:", urltypes)

URL type: [[0]
 [0]
 [1]]


In [55]:
# Display the held-out domain names that will be fed to the model.
X_test

323493                                www.courthousenews.com
1693266                            brushymountainbeefarm.com
2301105                             u3692416.ct.sendgrid.net
2619340                         wapvunigs83gag9ohh8li10e.biz
3003119                                         fdxgdoqyw.co
2805477                             ioisyavelvcfaentjenry.me
3479417                                  yoomvrqqnglmmwwv.ru
3548027                                 1434141147.localhost
3009887                                        kflxwscnpu.ki
345398                              download-toolbar.avg.com
3440832                                  jpsgysoglwodlfh.biz
1447426                     1hwa8721aexmpd1fx0rpd1b0a50h.biz
2131023                                 1161506543.localhost
60410                ipv4-c199-atl001-ix.1.oca.nflxvideo.net
10082                                        media.admob.com
3266609                                 uxknoudnjfuinqfd.com
3545720                 

In [65]:
# Predict on the whole held-out split. This rebinds `urltypes`, which
# previously held the three-domain smoke-test result.
urltypes = testmodel.predict(X_test)

In [66]:
# Shape of the prediction array.
print(urltypes.shape)

(787345, 1)


In [67]:
# Number of dimensions of the label array, for comparison with the
# prediction array's shape.
print(Y_test.values.ndim)

1


In [21]:
# First ten predictions followed by the first ten true labels.
print(urltypes[:10], Y_test.values[:10], sep="\n")

[[1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]]
[1 0 0 0 1 0 1 0 1 1]


In [68]:
# Count positions where the flattened predictions equal the true labels.
corrects = int(np.sum(urltypes.ravel() == Y_test.values))

In [69]:
# Positional indices where the flattened predictions disagree with the labels;
# np.nonzero returns a one-element tuple for 1-D input.
incorrects = np.nonzero(urltypes.ravel() != Y_test.values)
print(incorrects[0].size)

14721


In [74]:
def incorrect_preds(predictions, Y_test, X_test, filename):
    """Write every misclassified sample to `filename` as `domain,label` lines.

    Args:
        predictions: NumPy array of predicted labels; it is reshaped to 1-D
            before being compared with the true labels.
        Y_test: True labels; must expose `.values` (a pandas Series in this
            notebook), aligned positionally with `predictions`.
        X_test: Inputs corresponding to `Y_test`; must expose `.values`.
        filename: Path of the output file. It is opened in "w" mode, so any
            existing content is overwritten. No header row is written.

    Each output row holds the input value and its TRUE label (not the
    prediction), separated by a comma.
    """
    # Read the underlying arrays once instead of on every loop iteration.
    labels = Y_test.values
    domains = X_test.values
    incorrects = np.nonzero(predictions.reshape((-1,)) != labels)

    # The context manager guarantees the file is flushed and closed even if a
    # write fails; the previous version never closed the handle.
    with open(filename, "w") as handle:
        for val in incorrects[0]:
            handle.write(str(domains[val]) + "," + str(labels[val]) + "\n")

In [75]:
# Dump the misclassified test samples to sample.csv in the working directory.
incorrect_preds(urltypes, Y_test, X_test, "sample.csv")

In [70]:
# Number of test samples whose prediction matched the true label.
print(corrects)

772624


In [71]:
# Total number of predictions (rows of the prediction array).
total = urltypes.shape[0]

In [72]:
# Fraction of test samples classified correctly.
print("Accuracy:", float(corrects) / total)

Accuracy: 0.9813029866195886
