In [1]:
import json
import pandas as pd
import community.community_louvain
import networkx as nx
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import os
import gzip

In [2]:
df = pd.read_csv('dnsdata.csv')
columns = ['query', 'qclass', 'qtype', 'rcode', 'answers', 'TTLs']
df = df.loc[:, columns]
df

Unnamed: 0,query,qclass,qtype,rcode,answers,TTLs
0,145.58.150.184.in-addr.arpa,1.0,12.0,,,
1,sinkhole.paloaltonetworks.com,1.0,28.0,,,
2,time.windows.com,1.0,28.0,,,
3,time.windows.com,1.0,1.0,,,
4,time.milkyway.com,1.0,1.0,,,
...,...,...,...,...,...,...
692347,wpad.dnet.domtar,1.0,1.0,,,
692348,time.milkyway.com,1.0,28.0,,,
692349,cm.iotcplatform.com,1.0,1.0,,,
692350,e28578.d.akamaiedge.net,1.0,1.0,0.0,"['23.43.161.168', '23.43.161.170', '23.43.161....","[10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]"


In [3]:
df['answers'] = df['answers'].str.len()
df

Unnamed: 0,query,qclass,qtype,rcode,answers,TTLs
0,145.58.150.184.in-addr.arpa,1.0,12.0,,,
1,sinkhole.paloaltonetworks.com,1.0,28.0,,,
2,time.windows.com,1.0,28.0,,,
3,time.windows.com,1.0,1.0,,,
4,time.milkyway.com,1.0,1.0,,,
...,...,...,...,...,...,...
692347,wpad.dnet.domtar,1.0,1.0,,,
692348,time.milkyway.com,1.0,28.0,,,
692349,cm.iotcplatform.com,1.0,1.0,,,
692350,e28578.d.akamaiedge.net,1.0,1.0,0.0,119.0,"[10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]"


In [4]:
df['TTLs'] = df['TTLs'].apply(lambda x: [] if pd.isna(x) else eval(x))
df['TTLs'] = df['TTLs'].apply(lambda x: sum(x))

In [5]:
df['TTLs'] = df['TTLs'].replace(0.0, -1.0)
df

Unnamed: 0,query,qclass,qtype,rcode,answers,TTLs
0,145.58.150.184.in-addr.arpa,1.0,12.0,,,-1.0
1,sinkhole.paloaltonetworks.com,1.0,28.0,,,-1.0
2,time.windows.com,1.0,28.0,,,-1.0
3,time.windows.com,1.0,1.0,,,-1.0
4,time.milkyway.com,1.0,1.0,,,-1.0
...,...,...,...,...,...,...
692347,wpad.dnet.domtar,1.0,1.0,,,-1.0
692348,time.milkyway.com,1.0,28.0,,,-1.0
692349,cm.iotcplatform.com,1.0,1.0,,,-1.0
692350,e28578.d.akamaiedge.net,1.0,1.0,0.0,119.0,70.0


In [6]:
df['qlength'] = df['query'].str.len()

In [8]:
df = df.dropna(subset=['query'])
df.fillna(-1.0, inplace=True)
df = df[~df['query'].str.endswith('.arpa')]
df

Unnamed: 0,query,qclass,qtype,rcode,answers,TTLs,qlength
1,sinkhole.paloaltonetworks.com,1.0,28.0,-1.0,-1.0,-1.0,29.0
2,time.windows.com,1.0,28.0,-1.0,-1.0,-1.0,16.0
3,time.windows.com,1.0,1.0,-1.0,-1.0,-1.0,16.0
4,time.milkyway.com,1.0,1.0,-1.0,-1.0,-1.0,17.0
5,time.milkyway.com,1.0,28.0,-1.0,-1.0,-1.0,17.0
...,...,...,...,...,...,...,...
692347,wpad.dnet.domtar,1.0,1.0,-1.0,-1.0,-1.0,16.0
692348,time.milkyway.com,1.0,28.0,-1.0,-1.0,-1.0,17.0
692349,cm.iotcplatform.com,1.0,1.0,-1.0,-1.0,-1.0,19.0
692350,e28578.d.akamaiedge.net,1.0,1.0,0.0,119.0,70.0,23.0


In [8]:
def query_length(query):
    return len(query)

def query_tld_length(query):
    return len(query.split('.')[-1])

def query_domain_length(query):
    try:
        length =  len(query.split('.')[-2])
    except:
        length = 0
    return length

qlengths = [query_length(query) for query in df['query']]
qsufflengths = [query_tld_length(query) for query in df['query']]
qdomlengths = [query_domain_length(query) for query in df['query']]

df['qlength'] = qlengths
df['tldlength'] = qsufflengths
df['domainlength'] = qdomlengths
df

Unnamed: 0,query,qclass,qtype,rcode,answers,TTLs,qlength,query_parts_count,qparts,suffixlength,domainlength
1,sinkhole.paloaltonetworks.com,1.0,28.0,-1.0,-1.0,-1.0,29,3,3,3,16
2,time.windows.com,1.0,28.0,-1.0,-1.0,-1.0,16,3,3,3,7
3,time.windows.com,1.0,1.0,-1.0,-1.0,-1.0,16,3,3,3,7
4,time.milkyway.com,1.0,1.0,-1.0,-1.0,-1.0,17,3,3,3,8
5,time.milkyway.com,1.0,28.0,-1.0,-1.0,-1.0,17,3,3,3,8
...,...,...,...,...,...,...,...,...,...,...,...
692347,wpad.dnet.domtar,1.0,1.0,-1.0,-1.0,-1.0,16,3,3,6,4
692348,time.milkyway.com,1.0,28.0,-1.0,-1.0,-1.0,17,3,3,3,8
692349,cm.iotcplatform.com,1.0,1.0,-1.0,-1.0,-1.0,19,3,3,3,12
692350,e28578.d.akamaiedge.net,1.0,1.0,0.0,119.0,70.0,23,4,4,3,10


In [9]:
data = df[['qtype', 'qlength', 'qparts', 'suffixlength', 'qclass',
           'qtype', 'rcode', 'answers', 'TTLs', 'domainlength']].to_numpy()
data

array([[ 28.,  29.,   3., ...,  -1.,  -1.,  16.],
       [ 28.,  16.,   3., ...,  -1.,  -1.,   7.],
       [  1.,  16.,   3., ...,  -1.,  -1.,   7.],
       ...,
       [  1.,  19.,   3., ...,  -1.,  -1.,  12.],
       [  1.,  23.,   4., ..., 119.,  70.,  10.],
       [ 28.,  15.,   2., ...,  -1.,  -1.,  11.]])

In [14]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Selecting relevant features
features = ['query', 'qclass', 'qtype', 'rcode', 'answers', 'TTLs']
df_selected = df[features]

scaler = StandardScaler()
df_selected_scaled = scaler.fit_transform(data)

X_train, X_test = train_test_split(df_selected_scaled, test_size=0.2, random_state=42)

# Define the architecture of the autoencoder
input_dim = X_train.shape[1]
encoding_dim = 4
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='selu')(input_layer)
decoder = Dense(input_dim, activation='selu')(encoder)

autoencoder = Model(input_layer, decoder)

# Compile & train the model
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, validation_data=(X_test, X_test))

reconstructed_data = autoencoder.predict(df_selected_scaled)

# Calculate the reconstruction error
mse = np.mean(np.power(df_selected_scaled - reconstructed_data, 2), axis=1)

threshold = np.mean(mse) + 2 * np.std(mse)

anomalies = df_selected[mse > threshold]

print("Anomalies found:")
print(anomalies)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Anomalies found:
               query  qclass  qtype  rcode  answers  TTLs
18645   version.bind     3.0   16.0   -1.0     -1.0  -1.0
30090   version.bind     3.0   16.0   -1.0     -1.0  -1.0
68376   version.bind     3.0   16.0   -1.0     -1.0  -1.0
114561  version.bind     3.0   16.0   -1.0     -1.0  -1.0
131673  version.bind     3.0   16.0   -1.0     -1.0  -1.0
...              ...     ...    ...    ...  

In [15]:
anomalies

Unnamed: 0,query,qclass,qtype,rcode,answers,TTLs
18645,version.bind,3.0,16.0,-1.0,-1.0,-1.0
30090,version.bind,3.0,16.0,-1.0,-1.0,-1.0
68376,version.bind,3.0,16.0,-1.0,-1.0,-1.0
114561,version.bind,3.0,16.0,-1.0,-1.0,-1.0
131673,version.bind,3.0,16.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...
669573,version.bind,3.0,16.0,-1.0,-1.0,-1.0
670986,version.bind,3.0,16.0,-1.0,-1.0,-1.0
671232,version.bind,3.0,16.0,-1.0,-1.0,-1.0
672268,version.bind,3.0,16.0,-1.0,-1.0,-1.0


In [16]:
y = anomalies['query'].unique()
y

array(['version.bind',
       'lb._dns-sd._udp.\x90ct\x01\x08{u\x01{c62976d4-38f0-465c-bfbd-cbff42493045}',
       'gem.gbc.criteo.com', 'gbc4.va.us.criteo.com',
       'lb._dns-sd._udp.Ø±m\x018¬x\x01{1bdd55c9-f886-4d46-8eb0-1f22b8cfdfe5}',
       'ag.gbc.criteo.com',
       'lb._dns-sd._udp.\x14$)\x01\x14&)\x01\x14()\x01\x14*)\x01\x14,)\x01e'],
      dtype=object)

In [17]:
z = anomalies[~anomalies['query'].str.contains('version.bind', case=False, na=False)]
z

Unnamed: 0,query,qclass,qtype,rcode,answers,TTLs
179469,lb._dns-sd._udp.ct{u{c62976d4-38f0-465c-bf...,1.0,12.0,3.0,-1.0,-1.0
179510,lb._dns-sd._udp.ct{u{c62976d4-38f0-465c-bf...,1.0,12.0,3.0,-1.0,-1.0
179530,lb._dns-sd._udp.ct{u{c62976d4-38f0-465c-bf...,1.0,12.0,3.0,-1.0,-1.0
181022,gem.gbc.criteo.com,1.0,1.0,0.0,529.0,2481270.0
244062,gbc4.va.us.criteo.com,1.0,1.0,0.0,504.0,2411612.0
305617,lb._dns-sd._udp.ct{u{c62976d4-38f0-465c-bf...,1.0,12.0,3.0,-1.0,-1.0
305641,lb._dns-sd._udp.ct{u{c62976d4-38f0-465c-bf...,1.0,12.0,3.0,-1.0,-1.0
305712,lb._dns-sd._udp.ct{u{c62976d4-38f0-465c-bf...,1.0,12.0,3.0,-1.0,-1.0
362775,lb._dns-sd._udp.Ø±m8¬x{1bdd55c9-f886-4d46-8e...,1.0,12.0,3.0,-1.0,-1.0
362822,lb._dns-sd._udp.Ø±m8¬x{1bdd55c9-f886-4d46-8e...,1.0,12.0,3.0,-1.0,-1.0


In [24]:
anomalies.to_csv("Anomalies.csv")