### Amazon Route 53 Resolver Query Logs
##### Read CSV to DataFrame

In [None]:
import pandas as pd

# Lift pandas' display limits (None = no limit) so wide and long tables are
# rendered in full by the cells that follow.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the exported query-log CSV into the DataFrame every later cell reads.
csv_path = '/tmp/7beb115e-69e3-44e3-8a0f-3793d59ceac7.csv'
df = pd.read_csv(csv_path, sep=',')

# Show the first record as a quick look at the available columns.
df.head(1)

##### Identify Query Source

In [None]:
# Row keys for the table, starting with the queried hostname and ending with
# the source IP.
query_source_keys = (
    "query_hostname",
    "query_type",
    "accountid",
    "region",
    "src_vpc_uid",
    "src_instance_uid",
    "src_ip",
)

# Count log rows per key combination, one column per rcode value;
# margins=True adds the 'All' totals row and column.
query_source_counts = pd.crosstab(
    index=[df[key] for key in query_source_keys],
    columns=[df["rcode"]],
    margins=True,
)

# Order the columns by label (descending), then rank the rows by their total.
query_source_counts = query_source_counts.sort_index(axis=1, ascending=False)
query_source_counts.sort_values(by=['All'], ascending=False)

##### Identify Host Source

In [None]:
# Row keys for the table, starting with the source IP and ending with the
# queried hostname.
host_source_keys = (
    "src_ip",
    "src_instance_uid",
    "src_vpc_uid",
    "region",
    "accountid",
    "query_type",
    "query_hostname",
)

# Count log rows per key combination, one column per rcode value;
# margins=True adds the 'All' totals row and column.
host_source_counts = pd.crosstab(
    index=[df[key] for key in host_source_keys],
    columns=[df["rcode"]],
    margins=True,
)

# Order the columns by label (descending), then rank the rows by their total.
host_source_counts = host_source_counts.sort_index(axis=1, ascending=False)
host_source_counts.sort_values(by=['All'], ascending=False)

##### Query Times

In [None]:
import matplotlib.pyplot as plt

# Tally how many log rows share each `time` value, then order the tally by time.
time_tally = df['time'].value_counts()
df1 = time_tally.rename_axis('time').reset_index(name='counts')
df1 = df1.sort_values(by=['time'], ascending=True)

# One point per distinct time value: x = time, y = number of rows at that time.
plt.figure(figsize=(20,10))
plt.scatter(x=df1['time'], y=df1['counts'])
plt.show()

##### Query Deviations

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of log rows per distinct queried hostname.
df2 = df['query_hostname'].value_counts().rename_axis('query_hostname').reset_index(name='counts')

# Summary statistics of the per-hostname counts.
records = df2.loc[:,'counts'].to_numpy()
mean = records.mean()
median = np.median(records)
stddev = np.std(records)

# Histogram of the per-hostname counts; log=True puts the bar heights on a
# log scale.
plt.figure(figsize=(20,10))
plt.hist(records, log=True)

# Mark the statistics with full-height vertical lines. axvline spans the axes
# regardless of the data range, so no hard-coded line height is needed and
# nothing is plotted at y=0 on the log-scaled axis. Colours are given
# explicitly so the four markers stay distinguishable from each other and
# from the bars.
deviation_markers = (
    (mean, "Mean", "C1"),
    (median, "Median", "C2"),
    (mean + stddev, "+1 std", "C3"),
    (mean + (stddev * 2), "+2 std", "C4"),
)
for marker_x, marker_label, marker_color in deviation_markers:
    plt.axvline(marker_x, label=marker_label, color=marker_color)

plt.legend()
plt.show()

##### Identify Query Answers

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Keep only the rows that carry an answers value.
df3 = df[['query_hostname','answers']]
df3 = df3[df3['answers'].notna()]

# Translation table that deletes the list/record brackets in a single pass.
bracket_stripper = str.maketrans('', '', '[]{}')

items = []

# Iterate the two columns directly; iterrows() would build a Series per row.
for hostname, raw_answers in zip(df3['query_hostname'], df3['answers']):
    # Each answers string holds one or more '{...}' records separated by '}, {'.
    for answer in raw_answers.split('}, {'):
        item = answer.translate(bracket_stripper).split(', ')
        if len(item) < 2:
            # Fragment without a second field (e.g. an empty "[]" list):
            # nothing to extract, and indexing item[1] would raise IndexError.
            continue
        # Assumes the second field looks like 'rdata=<value>', so [6:] drops a
        # six-character prefix -- TODO confirm against the export format.
        items.append([hostname, item[1][6:]])

# One row per (hostname, answer value) pair.
df4 = pd.DataFrame(items, columns=['query_hostname','answer'])

# Count how often each answer value was returned for each hostname;
# margins=True adds the 'All' totals row and column.
answer_counts = pd.crosstab(
    index=[df4["query_hostname"]],
    columns=[df4["answer"]],
    margins=True,
)

# Order the columns by label (descending), then rank the rows by their total.
answer_counts = answer_counts.sort_index(axis=1, ascending=False)
answer_counts.sort_values(by=['All'], ascending=False)


##### Answer Deviations

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Number of occurrences of each distinct answer value.
df5 = df4['answer'].value_counts().rename_axis('answer').reset_index(name='counts')

# Summary statistics of the per-answer counts.
records = df5.loc[:,'counts'].to_numpy()
mean = records.mean()
median = np.median(records)
stddev = np.std(records)

# Histogram of the per-answer counts; log=True puts the bar heights on a
# log scale.
plt.figure(figsize=(20,10))
plt.hist(records, log=True)

# Mark the statistics with full-height vertical lines. axvline spans the axes
# regardless of the data range, so no hard-coded line height is needed and
# nothing is plotted at y=0 on the log-scaled axis. Colours are given
# explicitly so the four markers stay distinguishable from each other and
# from the bars.
deviation_markers = (
    (mean, "Mean", "C1"),
    (median, "Median", "C2"),
    (mean + stddev, "+1 std", "C3"),
    (mean + (stddev * 2), "+2 std", "C4"),
)
for marker_x, marker_label, marker_color in deviation_markers:
    plt.axvline(marker_x, label=marker_label, color=marker_color)

plt.legend()
plt.show()