# Technical Demonstration Notebook
##### This notebook is the work of Michael Baek and Richard Youn for ECE 4972/3

In [1]:
# import all relevant libraries
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

print(__version__)
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()
print("Import Successful")

2.0.12


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


Import Successful


## Loading and splitting data into smaller dataframes based on local machine number

In [2]:
# loading Data into pandas dataframe
df = pd.read_csv("cs448b_ipasn.csv")
        
#creating smaller dataframes for each local node
node0df = df[df['l_ipn'] ==0]
node1df = df[df['l_ipn'] ==1]
node2df = df[df['l_ipn'] ==2]
node3df = df[df['l_ipn'] ==3]
node4df = df[df['l_ipn'] ==4]
node5df = df[df['l_ipn'] ==5]
node6df = df[df['l_ipn'] ==6]
node7df = df[df['l_ipn'] ==7]
node8df = df[df['l_ipn'] ==8]
node9df = df[df['l_ipn'] ==9]
        
# Are there any r_asns that only the compromised machines connect to? Kaggle states that machines 1,3,4,5, and 6 are compromised
# making sets of all the nodes' r_asn #s and going to '&&' them together to find the like r_asns
node1set = set(node1df['r_asn'])
node3set = set(node3df['r_asn'])
node4set = set(node4df['r_asn'])
node5set = set(node5df['r_asn'])
node6set = set(node6df['r_asn'])
r_asn_same_compromised = node1set&node3set&node4set&node5set&node6set
r_asn_same_compromised = sorted((list(r_asn_same_compromised)))
#print("List of remote machines shared by all compromised machines\n",r_asn_same_compromised, "\n# of remote machines shared:",len(r_asn_same_compromised))

### Creating Functions to speed up exploration

In [3]:
# function to check if a r_asn is in the set of shared compromised machines
def check_asn_in_compromised(n):
    if n in r_asn_same_compromised:
        print('R_Asn ',n,' is in the set')
    else:
        print('R_Asn ',n,' is NOT in the set')
        
# simpler plot functions to call upon plot.ly's API
# scatterplot function
def scatterplot(titlename, xlabel, xdata, ylabel, ydata):
    layout=dict(title=titlename, 
                yaxis=dict(title=ylabel),
                xaxis=dict(title=xlabel))
    trace=go.Scatter(x=xdata, y=ydata, mode='markers')
    data=[trace]
    fig=dict(data=data,layout=layout)
    iplot(fig)

# lineplot function
def lineplot(titlename, xlabel, xdata, ylabel, ydata):
    layout=dict(title=titlename, 
                yaxis=dict(title=ylabel),
                xaxis=dict(title=xlabel))
    trace=go.Scatter(x=xdata, y=ydata)
    data=[trace]
    fig=dict(data=data,layout=layout)
    iplot(fig)


In [4]:
lst =[ 1,2,3,4]
for num in lst:
    check_asn_in_compromised(num)

R_Asn  1  is NOT in the set
R_Asn  2  is NOT in the set
R_Asn  3  is NOT in the set
R_Asn  4  is in the set


## To begin, let's see a plot of all flows over all time

In [5]:
lineplot('Total Flows vs Time', 'Time', df.date, 'Flows', df.f) #move to a different part of presentation

On Sept 17 and Sept 18, there are 2 very significant spikes in flow. What machines are involved in this?

In [6]:
sept17_all = df[df['date']=='2006-09-17']
sept17_all.sort_values('f',ascending=False).head(5)

Unnamed: 0,date,l_ipn,r_asn,f
18237,2006-09-17,4,3671,274011
18236,2006-09-17,4,3265,646
18170,2006-09-17,0,9370,471
18202,2006-09-17,2,9370,421
18293,2006-09-17,8,6517,235


In [7]:
sept18_all = df[df['date']=='2006-09-18']
sept18_all.sort_values('f',ascending=False).head(5)

Unnamed: 0,date,l_ipn,r_asn,f
18392,2006-09-18,4,3671,784234
18391,2006-09-18,4,3265,548
18351,2006-09-18,2,1239,542
18305,2006-09-18,0,1239,541
18468,2006-09-18,8,15169,372


In [8]:
#Checking top 5 instances of flows in this dataset
df.sort_values('f',ascending = False).head(5)

Unnamed: 0,date,l_ipn,r_asn,f
18392,2006-09-18,4,3671,784234
18237,2006-09-17,4,3671,274011
20584,2006-09-29,8,19916,7902
18833,2006-09-20,8,19916,7899
18748,2006-09-20,2,9316,5214


For both dates, r_asn 3671 is where the bulk of the flows come in. How does this remote machine behave for all previous dates?

## Node 1 Exploration

In [9]:
title='Node 1 Flows vs Time (lineplot)'
xlabel='Time'
x=node1df.date
ylabel='Flows'
y=node1df.f
lineplot(title,xlabel,x,ylabel,y)

In [10]:
title='Node 1 Flows vs Time (scatterplot)'
xlabel='Time'
x=node1df.date
ylabel='Flows'
y=node1df.f
scatterplot(title,xlabel,x,ylabel,y)

We can see a gap in actvity in the between August 24 - 28th. I would like to make the assumption that the machines that were active on the final days were deemed suspicious and were the reason why this node was shut down so early.

In [11]:
node1df[node1df['date']=='2006-08-24'].sort_values('f',ascending=False)

Unnamed: 0,date,l_ipn,r_asn,f
13322,2006-08-24,1,3307,2044
13345,2006-08-24,1,33438,1663
13320,2006-08-24,1,2108,1660
13335,2006-08-24,1,6746,34
13347,2006-08-24,1,39632,15
13323,2006-08-24,1,3561,10
13327,2006-08-24,1,4538,4
13342,2006-08-24,1,17200,4
13334,2006-08-24,1,6478,4
13316,2006-08-24,1,73,4


Suspcious r_asns on Aug 24: 3307, 33438, and 2108. All others have a relatively normal level of flow.

In [12]:
# final day of activity
node1df[node1df['date']=='2006-08-28'].sort_values('f',ascending=False)

Unnamed: 0,date,l_ipn,r_asn,f
14143,2006-08-28,1,3561,5
14142,2006-08-28,1,73,4
14144,2006-08-28,1,4134,2


Suspicious r_asns on Aug 28: 3561, 73, and 4134. These 3 were also active on the final day of activity prior to the first shut down. How do these r_asns behave over the entire span of the timeline?

In [13]:
#[73, 2108, 3307,3561, 4134, 33438] #these 3 are the common ones. lets see if they behave as beacons

n1_73=node1df[node1df['r_asn']==73]
n1_2108=node1df[node1df['r_asn']==2108]
n1_3307=node1df[node1df['r_asn']==3307]
n1_3561=node1df[node1df['r_asn']==3561]
n1_4134=node1df[node1df['r_asn']==4134]
n1_33438=node1df[node1df['r_asn']==33438]

t0= go.Scatter(x=n1_73.date,y=n1_73.f, mode='markers',name='R-ASN 73')
t1= go.Scatter(x=n1_2108.date,y=n1_2108.f, mode='markers',name='R-ASN 2108')
t2= go.Scatter(x=n1_3307.date,y=n1_3307.f, mode='markers',name='R-ASN 3307')
t3= go.Scatter(x=n1_3561.date,y=n1_3561.f, mode='markers',name='R-ASN 3561')
t4= go.Scatter(x=n1_4134.date,y=n1_4134.f, mode='markers',name='R-ASN 4134')
t5= go.Scatter(x=n1_33438.date,y=n1_33438. f, mode='markers',name='R-ASN 33438')

fig = tools.make_subplots(rows=3,cols=2)
fig['layout'].update(height=500, width=975, title='R-ASN 73, 2108, 3307, 3561, 4134, 33438 ')
fig.append_trace(t0, 1, 1)
fig.append_trace(t1, 1, 2)
fig.append_trace(t2, 2, 1)
fig.append_trace(t3, 2, 2)
fig.append_trace(t4, 3, 1)
fig.append_trace(t5, 3, 2)
iplot(fig)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]
[ (3,1) x5,y5 ]  [ (3,2) x6,y6 ]



In node 1, the remote machines that seem to be active throughout are r_asn 4134 and r_asn 3561. We would like to assume that this means that they are serving as beacons for the bot master.

R_ASN 2108, 3307, 33438 all have increased activity closer to when the attacks occurred, which may mean that the bot master may have activated these nodes only for the sake of attacking node 1.

There is a spike in r_asn on Aug 19, which lines up with when r_asn 3307, 2108, and 33438 were reactivated into causing attacks.

### Just for curiousity sake, lets see if the nodes above are in the set of all nodes that are shared between all of the confirmed compromised machines

In [14]:
r_asn_list = [71, 2108, 3307, 3561, 4134, 33438 ]
for asn in r_asn_list:
    check_asn_in_compromised(asn)

R_Asn  71  is NOT in the set
R_Asn  2108  is NOT in the set
R_Asn  3307  is NOT in the set
R_Asn  3561  is in the set
R_Asn  4134  is in the set
R_Asn  33438  is NOT in the set


We see that r_asn 3561 and 4134 have some sort of connection to all nodes in the network that were compromised. Both of these also have served as beacons for the bot master for node 1. As we move on to node 5, lets see how these 2 asns behave.

### ***check other spikes for the suspicious nodes above to narrow down the search for other attacks.

## Node 5 Exploration

Start off the same as we did with node 1, keeping an eye out for r_asn 3561 and r_asn 4134

Also, do the other r_asns from node 1 have any influence on node 5?

In [15]:
node5r_asn_set = set(node5df.r_asn)
node1_sus_r_asn_set = set(r_asn_list)
checkedset = node5r_asn_set and node1_sus_r_asn_set
checkedset
#node5r_asn_set

{71, 2108, 3307, 3561, 4134, 33438}

All 6 are active in node 5. We should watch out for this and see if they are used again

In [16]:
title='Node 5 Flows vs Time (lineplot)'
xlabel='Time'
x=node5df.date
ylabel='Flows'
y=node5df.f
lineplot(title,xlabel,x,ylabel,y)

In [17]:
title='Node 5 Flows vs Time (scatterplot)'
xlabel='Time'
x=node5df.date
ylabel='Flows'
y=node5df.f
scatterplot(title,xlabel,x,ylabel,y)

### Significant spikes : July 15, Aug 19

Aug 19 was also the date of the final series of spikes for node 1. lets inspect this first.

Additionally, 4134 had a spike in activity on this very date before the other r_asns that had no activity began to turn on

In [18]:
n5_aug19 = node5df[node5df['date']=='2006-08-19']
n5_aug19.sort_values('f',ascending=False)

Unnamed: 0,date,l_ipn,r_asn,f
12175,2006-08-19,5,4134,322
12176,2006-08-19,5,4713,7
12179,2006-08-19,5,8070,7
12171,2006-08-19,5,2152,5
12186,2006-08-19,5,21844,3
12180,2006-08-19,5,8075,3
12170,2006-08-19,5,1659,2
12185,2006-08-19,5,13867,2
12183,2006-08-19,5,9930,2
12169,2006-08-19,5,137,2


How many connections did r_asn 4134 make to node 1 on August 19 to compare to node 5?

In [19]:
node1df[node1df['r_asn']==4134].sort_values('f',ascending=False).head()

Unnamed: 0,date,l_ipn,r_asn,f
12071,2006-08-19,1,4134,335
1472,2006-07-07,1,4134,129
11438,2006-08-16,1,4134,47
7450,2006-07-31,1,4134,37
4435,2006-07-21,1,4134,36


#### Checking July 15

In [20]:
node5df[node5df['date']=='2006-07-15'].sort_values('f', ascending=False)

Unnamed: 0,date,l_ipn,r_asn,f
3221,2006-07-15,5,4837,308
3227,2006-07-15,5,13749,5
3223,2006-07-15,5,8075,4
3220,2006-07-15,5,4436,4
3225,2006-07-15,5,9800,4
3229,2006-07-15,5,17785,2
3222,2006-07-15,5,5078,2
3213,2006-07-15,5,81,2
3219,2006-07-15,5,4134,2
3215,2006-07-15,5,701,2


Here, 4837 is the most active node, but 4134 is still active on this date.

How about Sept 4th - the day it was deemed compromised?

In [21]:
node5df[node5df['date']=='2006-09-04'].sort_values('f',ascending=False)

Unnamed: 0,date,l_ipn,r_asn,f
15672,2006-09-04,5,4134,23
15677,2006-09-04,5,8075,7
15668,2006-09-04,5,2152,5
15673,2006-09-04,5,4837,5
15669,2006-09-04,5,2828,4
15676,2006-09-04,5,8070,4
15679,2006-09-04,5,13768,3
15670,2006-09-04,5,3462,2
15674,2006-09-04,5,6356,2
15680,2006-09-04,5,13867,2


r_asn 4134 is active on the day node 5 was deemed an issue.

In [22]:
node5df[node5df['date']=='2006-09-05']

Unnamed: 0,date,l_ipn,r_asn,f
15832,2006-09-05,5,4134,3
15833,2006-09-05,5,6356,2


r_asn 4134 is active on the final day of node 5's life cycle.

Suspicious r_asns: 4134,6356,4837

In [23]:
check_asn_in_compromised(4134) # explain the set notation logic here
check_asn_in_compromised(6356) # explain the set notation logic here
check_asn_in_compromised(4837) # explain the set notation logic here

R_Asn  4134  is in the set
R_Asn  6356  is NOT in the set
R_Asn  4837  is in the set


6356 is not in the set of r_asns shared between all compromised nodes. However, 4837 is, and 4837 seems to be heavily involved in the activity of node 5

r_asn 4134 sent had 335 connections to node 1 and 322 connections to node 5. I'm going to assume there is a relationship between the attacks on node 1 and node 5, both being manipulated by r_asn 4134. Does it seem to have any relationship with any of the original attacks in node 1?

In [24]:
suspicious_list_n5 = [4134, 4837, 6356]

n5_4134 = node5df[node5df['r_asn'] == 4134]
n5_4837 = node5df[node5df['r_asn'] == 4837]
n5_6356 = node5df[node5df['r_asn'] == 6356]

t0= go.Scatter(x=n5_4134.date,y=n5_4134.f, mode='markers',name='R-ASN 4134')
t1= go.Scatter(x=n5_4837.date,y=n5_4837.f, mode='markers',name='R-ASN 4837')
t2= go.Scatter(x=n5_6356.date,y=n5_6356.f, mode='markers',name='R-ASN 6356')


fig = tools.make_subplots(rows=3,cols=1)
fig['layout'].update(height=500, width=975, title='R-ASN 4134, 4837, and 6356')
fig.append_trace(t0, 1, 1)
fig.append_trace(t1, 2, 1)
fig.append_trace(t2, 3, 1)
iplot(fig)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]



In [25]:
suspicious_list_n1 = [2108, 3307, 3561, 4134, 33438]

n5_73=node5df[node5df['r_asn']==73]
n5_2108=node5df[node5df['r_asn']==2108]
n5_3307=node5df[node5df['r_asn']==3307]
n5_3561=node5df[node5df['r_asn']==3561]
n5_4134=node5df[node5df['r_asn']==4134]
n5_33438=node5df[node5df['r_asn']==33438]

t0= go.Scatter(x=n5_73.date,y=n5_73.f, mode='markers',name='R-ASN 73')
t1= go.Scatter(x=n5_2108.date,y=n5_2108.f, mode='markers',name='R-ASN 2108')
t2= go.Scatter(x=n5_3307.date,y=n5_3307.f, mode='markers',name='R-ASN 3307')
t3= go.Scatter(x=n5_3561.date,y=n5_3561.f, mode='markers',name='R-ASN 3561')
t4= go.Scatter(x=n5_4134.date,y=n5_4134.f, mode='markers',name='R-ASN 4134')
t5= go.Scatter(x=n5_33438.date,y=n5_33438. f, mode='markers',name='R-ASN 33438')

fig = tools.make_subplots(rows=3,cols=2)
fig['layout'].update(height=500, width=975, title='R-ASN 73, 2108, 3307, 3561, 4134, 33438 ')
fig.append_trace(t0, 1, 1)
fig.append_trace(t1, 1, 2)
fig.append_trace(t2, 2, 1)
fig.append_trace(t3, 2, 2)
fig.append_trace(t4, 3, 1)
fig.append_trace(t5, 3, 2)
iplot(fig)  


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]
[ (3,1) x5,y5 ]  [ (3,2) x6,y6 ]



Here, not as much evidence is conclusive. What I want to speculate is that 4134 is still suspicious because it is active during the entire timespan of node 5's life cycle.

# Checking if 4134 was involved with several other remote asns on Aug 19

In [26]:
node3df[node3df['date']=='2006-08-19'].sort_values('f',ascending=False).head()

Unnamed: 0,date,l_ipn,r_asn,f
12146,2006-08-19,3,4134,313
12147,2006-08-19,3,4713,7
12149,2006-08-19,3,6461,6
12154,2006-08-19,3,12006,3
12153,2006-08-19,3,9930,2


In [27]:
node6df[node6df['date']=='2006-08-19'].sort_values('f',ascending=False).head()

Unnamed: 0,date,l_ipn,r_asn,f
12193,2006-08-19,6,4134,530
12189,2006-08-19,6,3215,4
12194,2006-08-19,6,4713,3
12195,2006-08-19,6,4812,3
12187,2006-08-19,6,137,2


In [28]:
node0df[node0df['date']=='2006-08-19'].sort_values('f',ascending=False).head()

Unnamed: 0,date,l_ipn,r_asn,f
12045,2006-08-19,0,9318,163
12049,2006-08-19,0,13462,44
12033,2006-08-19,0,3561,32
12036,2006-08-19,0,4134,8
12038,2006-08-19,0,4837,6


### on Aug 19, 4134 had a high level of activity on all of the other compromised nodes.

#### Plotting all of 4134's activity on all of the different nodes

In [29]:
node1_4134 = node1df[node1df['r_asn']==4134]
node3_4134 = node3df[node3df['r_asn']==4134]
node4_4134 = node4df[node4df['r_asn']==4134]
node5_4134 = node5df[node5df['r_asn']==4134]
node6_4134 = node6df[node6df['r_asn']==4134]

t0= go.Scatter(x=node1_4134.date,y=node1_4134.f, mode='markers',name='node 1')
t1= go.Scatter(x=node3_4134.date,y=node3_4134.f, mode='markers',name='node 3')
t2= go.Scatter(x=node4_4134.date,y=node4_4134.f, mode='markers',name='node 4')
t3= go.Scatter(x=node5_4134.date,y=node5_4134.f, mode='markers',name='node 5')
t4= go.Scatter(x=node6_4134.date,y=node6_4134.f, mode='markers',name='node 6')
fig = tools.make_subplots(rows=5,cols=1)
fig['layout'].update(height=700, width=975, title='R_ASN 4134 on all of the compromised nodes')
fig.append_trace(t0, 1, 1)
fig.append_trace(t1, 2, 1)
fig.append_trace(t2, 3, 1)
fig.append_trace(t3, 4, 1)
fig.append_trace(t4, 5, 1)
iplot(fig)  


This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]
[ (4,1) x4,y4 ]
[ (5,1) x5,y5 ]



### 4134 seems to behave as a master controller that activates other machines to prepare attacks on targetted machines. At this time, we decided to talk with the Splunk team to see what they had to show.

# END ANACONDA DEMONSTRATION

In [31]:
len(r_asn_same_compromised)

185

In [3]:
node0_4134 = node0df[node0df['r_asn']==4134]
node1_4134 = node1df[node1df['r_asn']==4134]
node2_4134 = node2df[node2df['r_asn']==4134]
node3_4134 = node3df[node3df['r_asn']==4134]
node4_4134 = node4df[node4df['r_asn']==4134]
node5_4134 = node5df[node5df['r_asn']==4134]
node6_4134 = node6df[node6df['r_asn']==4134]
node7_4134 = node7df[node7df['r_asn']==4134]
node8_4134 = node8df[node8df['r_asn']==4134]
node9_4134 = node9df[node9df['r_asn']==4134]


t0= go.Scatter(x=node0_4134.date,y=node0_4134.f, mode='markers',name='node 0')
t1= go.Scatter(x=node1_4134.date,y=node1_4134.f, mode='markers',name='node 1')
t2= go.Scatter(x=node2_4134.date,y=node2_4134.f, mode='markers',name='node 2')
t3= go.Scatter(x=node3_4134.date,y=node3_4134.f, mode='markers',name='node 3')
t4= go.Scatter(x=node4_4134.date,y=node4_4134.f, mode='markers',name='node 4')
t5= go.Scatter(x=node5_4134.date,y=node5_4134.f, mode='markers',name='node 5')
t6= go.Scatter(x=node6_4134.date,y=node6_4134.f, mode='markers',name='node 6')
t7= go.Scatter(x=node7_4134.date,y=node7_4134.f, mode='markers',name='node 7')
t8= go.Scatter(x=node8_4134.date,y=node8_4134.f, mode='markers',name='node 8')
t9= go.Scatter(x=node9_4134.date,y=node9_4134.f, mode='markers',name='node 9')

fig = tools.make_subplots(rows=5,cols=2)
fig['layout'].update(height=700, width=975, title='R_ASN 4134 on all nodes in network')
fig.append_trace(t0, 1, 1)
fig.append_trace(t1, 1, 2)
fig.append_trace(t2, 2, 1)
fig.append_trace(t3, 2, 2)
fig.append_trace(t4, 3, 1)
fig.append_trace(t5, 3, 2)
fig.append_trace(t6, 4, 1)
fig.append_trace(t7, 4, 2)
fig.append_trace(t8, 5, 1)
fig.append_trace(t9, 5, 2)
iplot(fig)  


This is the format of your plot grid:
[ (1,1) x1,y1 ]    [ (1,2) x2,y2 ]  
[ (2,1) x3,y3 ]    [ (2,2) x4,y4 ]  
[ (3,1) x5,y5 ]    [ (3,2) x6,y6 ]  
[ (4,1) x7,y7 ]    [ (4,2) x8,y8 ]  
[ (5,1) x9,y9 ]    [ (5,2) x10,y10 ]



### At this point, we shared with the Spunk team that we believed that r_asn 4134 is a bot controller, manipulating other machines to do the actual damaging attacks. 

# END ANACONDA TEAM DEMONSTRATION