In [18]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
import tldextract as tde

# Load the trained classifier and the Word2Vec model used to index bigrams.
model_name = 'BIGRU_BIGRAM_W2V_MULTICLASS'
w2v_name = 'word2vec'

# compile=False loads the network without compiling it; the notebook only
# calls predict() on it in the cells shown here.
best_model = tf.keras.models.load_model(
    '../Models/trained_models/' + model_name + '.h5',
    compile=False,
)
word2vec = Word2Vec.load('../Models/word2vec/' + w2v_name + '.model')

In [28]:
# Preprocessing helpers: n-gram tokenisation, registered-domain extraction and index encoding
def n_grams(word, n):
    """Split ``word`` into n-grams of length ``n``.

    Windows start at index 0 and advance by ``n - 1`` positions, so
    consecutive windows share one character; for ``n == 2`` this is the usual
    sliding bigram. The final n-gram is always the last ``n`` characters of
    ``word``, even when the stride would not land there exactly.

    Args:
        word: Sliceable sequence (normally a string) to tokenise.
        n: Length of each n-gram; must be at least 1.

    Returns:
        A list of n-grams in order of appearance, or an empty list when
        ``word`` is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # We can't find n-grams if the word has fewer than n letters.
    if n > len(word):
        return []

    # The window moves n - 1 positions each time. Clamp the step to 1 so that
    # n == 1 still makes progress instead of looping forever.
    step = max(n - 1, 1)
    last_start = len(word) - n

    # Every window that ends strictly before the end of the word ...
    output = [word[start:start + n] for start in range(0, last_start, step)]
    # ... followed by the window that ends exactly at the end of the word.
    output.append(word[last_start:])

    return output

# Shared tldextract resolver used by extract_tld().
# - include_psl_private_domains=True: also honour the "private" section of the
#   Public Suffix List.
# - cache_dir: where the suffix list is cached between runs.
# - fallback_to_snapshot=False: do not fall back to the snapshot bundled with
#   tldextract.
extract = tde.TLDExtract(
    include_psl_private_domains=True,
    # suffix_list_urls=["file:///D:/THESIS_LAB/Dataset/public_suffix_list.txt"],
    cache_dir='./cache_tld/',
    fallback_to_snapshot=False,
)

def extract_tld(domain):
    """Run the module-level ``extract`` resolver on ``domain`` and return its result."""
    return extract(domain)

def Preprocess(data, max_len=75):
    """Encode the host names in ``data`` as fixed-width rows of bigram indices.

    Each host name is lower-cased, reduced to ``domain + '.' + suffix`` via
    ``extract_tld`` (dropping any subdomain), split into character bigrams
    and mapped to integer ids through ``word2vec.wv.key_to_index``.

    Args:
        data: DataFrame with a ``'Host Name'`` column.
        max_len: Number of columns in the output. Shorter sequences are left
            zero-padded on the right; longer ones are truncated.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), max_len)`` and dtype
        ``int32``; row ``i`` corresponds to row ``i`` of ``data``.

    Notes:
        * Bigrams missing from the Word2Vec vocabulary are encoded as 0, the
          same value used for padding, instead of raising ``KeyError``.
        * Missing host names (NaN) are treated as empty strings and yield an
          all-zero row, so the output stays aligned with ``data``.
        * NOTE(review): index 0 is also a regular vocabulary id in
          ``key_to_index``; confirm this matches how the model was trained.
    """
    # Normalise the column first so NaN / non-string cells cannot break .lower().
    hostnames = data['Host Name'].fillna('').astype(str).str.lower().tolist()

    # Remove the subdomain: keep only "<domain>.<suffix>".
    clean_domains = []
    for host in hostnames:
        parts = extract_tld(host)
        clean_domains.append(parts.domain + '.' + parts.suffix)

    # Tokenise every cleaned domain into character bigrams.
    corpus = [n_grams(dom, 2) for dom in clean_domains]

    vocab = word2vec.wv.key_to_index
    X = np.zeros([len(corpus), max_len], dtype=np.int32)
    for row, sentence in enumerate(corpus):
        # Truncate so overly long domains cannot index past the last column.
        for col, word in enumerate(sentence[:max_len]):
            X[row, col] = vocab.get(word, 0)

    return X
    


In [51]:
# Score one DNS capture file with the trained classifier.
# Load data

data_home = '../DNSCapture/'
# dnscap = pd.read_csv(data_home+'10.107.21.162_10012023.csv', encoding='ISO-8859-1', sep=',')
dnscap = pd.read_csv(
    data_home + '10.107.1.232_27122022.csv',
    encoding='ISO-8859-1',
    sep=',',
)
print(len(dnscap))
print(dnscap.columns)
# Rows whose "Response Code" is 'Name Error' (NXDomain) are not removed:
# the drop below is disabled.
# dnscap.drop(dnscap.index[dnscap["Response Code"] == 'Name Error'])

X = Preprocess(dnscap)

# Take the index of the highest-scoring model output for every row.
Y_pred = np.argmax(best_model.predict(X), axis=1)

dga_labels_dict = {
    'normal': 0,
    'bamital': 1,
    'banjori': 2,
    'bedep': 3,
    'chinad': 4,
    'conficker': 5,
    'corebot': 6,
    'cryptolocker': 7,
    'dnschanger': 8,
    'dyre': 9,
    'emotet': 10,
    'gozi': 11,
    'locky': 12,
    'matsnu': 13,
    'monerominer': 14,
    'murofet': 15,
    'mydoom': 16,
    'padcrypt': 17,
    'pandabanker': 18,
    'qakbot': 19,
    'rovnix': 20,
    'sisron': 21,
    'sphinx': 22,
    'suppobox': 23,
    'sutra': 24,
    'symmi': 25,
    'szribi': 26,
    'tinynuke': 27,
    'torpig': 28,
    'vidro': 29,
    'virut': 30,
}

# Invert the mapping (class index -> label name) and label every prediction.
inv_label = {idx: name for name, idx in dga_labels_dict.items()}
label_pred = [inv_label.get(idx) for idx in Y_pred]
pred = {'DGA Domain?': label_pred}
col_pred = pd.DataFrame(pred)

# Append the predicted label as a new column next to the capture data.
new_df = pd.concat([dnscap, col_pred], axis=1)

# Exchange the positions of 'Query ID' and 'DGA Domain?' in the column order.
cols = list(new_df.columns)
a, b = cols.index('Query ID'), cols.index('DGA Domain?')
cols[a], cols[b] = cols[b], cols[a]
new_df = new_df[cols]
new_df

1725
Index(['Host Name', 'Port Number', 'Query ID', 'Request Type', 'Request Time',
       'Response Time', 'Duration', 'Response Code', 'Records Count', 'A',
       'CNAME', 'AAAA', 'NS', 'MX', 'SOA', 'PTR', 'SRV', 'TEXT',
       'Source Address', 'Destination Address', 'IP Country'],
      dtype='object')


Unnamed: 0,Host Name,Port Number,DGA Domain?,Request Type,Request Time,Response Time,Duration,Response Code,Records Count,A,...,NS,MX,SOA,PTR,SRV,TEXT,Source Address,Destination Address,IP Country,Query ID
0,contile.services.mozilla.com,57445,normal,A,12/27/2022 10:29:34 AM.160,12/27/2022 10:29:34 AM.163,2 ms,Ok,1,34.117.237.239,...,,,,,,,10.107.1.232,10.10.233.11,,820D
1,contile.services.mozilla.com,57445,normal,AAAA,12/27/2022 10:29:34 AM.165,12/27/2022 10:29:34 AM.167,1 ms,Ok,1,,...,,,"Admin: awsdns-hostmaster.amazon.com, Primary S...",,,,10.107.1.232,10.10.233.11,,F195
2,wpad.pu.go.id,57445,normal,A,12/27/2022 10:29:52 AM.785,12/27/2022 10:29:52 AM.787,1 ms,Name Error,1,,...,,,"Admin: postmaster.pu.go.id, Primary Server: ns...",,,,10.107.1.232,10.10.233.11,,9EBE
3,dns.google,56402,normal,A,12/27/2022 10:30:23 AM.264,12/27/2022 10:30:23 AM.266,2 ms,Ok,2,8.8.4.4 8.8.8.8,...,,,,,,,10.107.1.232,10.10.233.11,,04BA
4,dns.google,64660,normal,,12/27/2022 10:30:23 AM.264,12/27/2022 10:30:23 AM.266,2 ms,Ok,1,,...,,,"Admin: cloud-dns-hostmaster.google.com, Primar...",,,,10.107.1.232,10.10.233.11,,6C49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,config.edge.skype.com,55852,normal,A,12/27/2022 3:32:59 PM.048,12/27/2022 3:32:59 PM.049,1 ms,Ok,5,13.107.42.16,...,,,,,,,10.107.1.232,10.10.233.11,,00B4
1721,web.whatsapp.com,55061,normal,A,12/27/2022 3:33:08 PM.276,12/27/2022 3:33:08 PM.287,11 ms,Ok,2,157.240.208.60,...,,,,,,,10.107.1.232,10.10.233.11,,6E31
1722,mmx-ds.cdn.whatsapp.net,55852,normal,A,12/27/2022 3:33:08 PM.288,12/27/2022 3:33:08 PM.289,1 ms,Ok,1,157.240.208.60,...,,,,,,,10.107.1.232,10.10.233.11,,408D
1723,mmx-ds.cdn.whatsapp.net,55061,normal,AAAA,12/27/2022 3:33:08 PM.290,12/27/2022 3:33:08 PM.292,2 ms,Ok,1,,...,,,,,,,10.107.1.232,10.10.233.11,,7402
