# Raw Extraction of required data 

Note that the raw extraction of the required data takes a very long time. Pre-computed values can instead be imported in the Analysis section.

In [1]:
import pandas as pd
import numpy as np
import tqdm
import csv
from datetime import datetime

In [2]:
# Ordered list of mal-activity class names; a class's position in this list
# is its numeric label.
attack_map = ['Malware', 'Phishing', 'Exploits', 'Fraudlent Services',
            'Unwanted Programs', 'Spammers', 'Unlabelled']

# Reverse lookup: class name -> numeric label (its position in attack_map).
indexes = {}
for position, attack_name in enumerate(attack_map):
    indexes[attack_name] = position

In [3]:
# Memo of already-parsed dates, keyed by the 'day-month-year' string.
date_calculation_cache = {}
def make_date(day, month, year):
    """Return the ISO calendar triple for a date given as separate fields.

    The three fields are joined into a 'day-month-year' string and parsed
    with the "%d-%m-%Y" format. The result of ``isocalendar()`` (ISO year,
    ISO week number, ISO weekday) is stored in ``date_calculation_cache``
    so each distinct date string is parsed only once.

    Raises whatever ``datetime.strptime`` raises when the string does not
    match the format.
    """
    key = '%s-%s-%s' % (day, month, year)
    cached = date_calculation_cache.get(key)
    if cached is None:
        # First time this date is seen: parse it and remember the result.
        cached = datetime.strptime(key, "%d-%m-%Y").isocalendar()
        date_calculation_cache[key] = cached
    return cached

In [4]:
def _count_report(counts, label, week, host):
    """Add one report for `host` in time bin `week` under class `label`.

    `counts` is the nested dict {label: {week: {host: n_reports}}}; missing
    week / host levels are created on first use.
    """
    per_week = counts[label].setdefault(week, {})
    per_week[host] = per_week.get(host, 0) + 1


# Nested counters: {attack class: {week bin: {host: number of reports}}}.
# 'Labelled' aggregates every report whose final label is not 'Unlabelled'.
host_attacks = {attk: {} for attk in indexes}
host_attacks['Labelled'] = {}

for attack in indexes:
    print(attack)
    if attack == 'Unlabelled':
        # There are 23 files containing Unlabelled reports
        files = [('./../Datastore/%s_reports_%s.csv' % (attack, part), attack, part)
                 for part in range(23)]
    else:
        files = [('./../Datastore/%s_reports.csv' % attack, attack, None)]

    # Process the files
    for file_name, a, file_num in files:
        print(file_name)
        if attack == 'Unlabelled':
            # One predicted class index per report row, in file order.
            activity_relabel = pd.read_csv('./../Unlabelled_predictions/%s_predictions_%s.csv'%(a, file_num),
                                           index_col=[0], header=None)
            activity_relabel = activity_relabel.transpose().values[0]
        with open(file_name) as data_file:
            filereader = csv.reader(data_file)
            for n, report in enumerate(tqdm.tqdm(filereader)):
                try:
                    # Unpack row information. Column 4 is not used here; a row
                    # with fewer than nine fields raises and is skipped below.
                    (ip, _cc, asn, _org, _unused,
                     day, month, year, _datasource) = report[0:9]

                    # Compute week bins
                    year_num, week_num, _weekday_num = make_date(day, month, year)
                except Exception as e:
                    # Malformed row: report it and skip it. Without the
                    # `continue` the row would be counted using the previous
                    # row's ip / asn / week values.
                    print(str(e))
                    continue

                # Invalid Timestamp, likely 0.
                if year_num == 1970:
                    continue
                # NOTE(review): ISO years can contain 53 weeks, so with a
                # multiplier of 52 week 53 of one year shares a bin with
                # week 1 of the next year. Left unchanged to keep the bin
                # numbering comparable with previously saved results.
                week = (year_num-1970)*52 + week_num

                if ip is None or ip == '':
                    print("Skipping invalid IP: %s" % ip)
                    continue

                # Relabelling unlabelled reports
                if attack == 'Unlabelled':
                    attack_label = attack_map[activity_relabel[n]]
                else:
                    attack_label = attack

                # Reports are grouped per week and per AS number.
                host = asn
                _count_report(host_attacks, attack_label, week, host)

                # We maintain a distinct class for Labelled Data
                if attack_label != 'Unlabelled':
                    _count_report(host_attacks, 'Labelled', week, host)

print('Finished')

6918it [00:00, 52317.54it/s]
0it [00:00, ?it/s]

Spammers
./../Datastore/Spammers_reports.csv
Malware
./../Datastore/Malware_reports.csv


3190529it [00:25, 123196.75it/s]
11750it [00:00, 117463.42it/s]

Exploits
./../Datastore/Exploits_reports.csv


201108it [00:01, 121376.14it/s]
8601it [00:00, 85980.77it/s]

Phishing
./../Datastore/Phishing_reports.csv


2382889it [00:27, 87473.31it/s]
9742it [00:00, 97388.75it/s]

Fraudlent Services
./../Datastore/Fraudlent Services_reports.csv


829301it [00:07, 113371.10it/s]
8006it [00:00, 80032.03it/s]

Unwanted Programs
./../Datastore/Unwanted Programs_reports.csv


698700it [00:09, 69953.22it/s]


Unlabelled
./../Datastore/Unlabelled_reports_0.csv


  mask |= (ar1 == a)
2000138it [00:22, 90371.78it/s]


./../Datastore/Unlabelled_reports_1.csv


2000007it [00:21, 95130.93it/s]


./../Datastore/Unlabelled_reports_2.csv


2000047it [00:21, 93057.30it/s]


./../Datastore/Unlabelled_reports_3.csv


2000022it [00:18, 107886.41it/s]


./../Datastore/Unlabelled_reports_4.csv


2000471it [00:23, 86526.97it/s]


./../Datastore/Unlabelled_reports_5.csv


2000567it [00:31, 63379.57it/s]


./../Datastore/Unlabelled_reports_6.csv


2000130it [00:21, 94260.54it/s]


./../Datastore/Unlabelled_reports_7.csv


2000541it [00:21, 95142.57it/s] 


./../Datastore/Unlabelled_reports_8.csv


2000282it [00:18, 106890.38it/s]


./../Datastore/Unlabelled_reports_9.csv


2000016it [00:17, 112952.71it/s]


./../Datastore/Unlabelled_reports_10.csv


2000165it [00:19, 100539.43it/s]


./../Datastore/Unlabelled_reports_11.csv


2000021it [00:18, 109050.65it/s]


./../Datastore/Unlabelled_reports_12.csv


2000114it [00:19, 100111.94it/s]


./../Datastore/Unlabelled_reports_13.csv


2000038it [00:17, 111243.67it/s]


./../Datastore/Unlabelled_reports_14.csv


2000304it [00:20, 96168.77it/s]


./../Datastore/Unlabelled_reports_15.csv


2000019it [00:19, 102045.94it/s]


./../Datastore/Unlabelled_reports_16.csv


2000455it [00:19, 100620.39it/s]


./../Datastore/Unlabelled_reports_17.csv


2000040it [00:17, 111157.46it/s]


./../Datastore/Unlabelled_reports_18.csv


2000023it [00:18, 109771.76it/s]


./../Datastore/Unlabelled_reports_19.csv


2000058it [00:18, 107627.36it/s]


./../Datastore/Unlabelled_reports_20.csv


2000016it [00:16, 124703.87it/s]


./../Datastore/Unlabelled_reports_21.csv


2000294it [00:19, 102401.47it/s]


./../Datastore/Unlabelled_reports_22.csv


332782it [00:02, 116526.84it/s]

Finished





In [5]:
# Turn the dictionary of dictionaries into dictionaries of DataFrames.
# Combinations with no recorded reports come out as NaN and are set to 0.
df = {}
for attack, week_counts in host_attacks.items():
    frame = pd.DataFrame(week_counts)
    frame.fillna(0, inplace=True)
    df[attack] = frame

In [6]:
# Transpose the dataframe, such that the weeks are on the index.
# compute_lifetimes() walks each column row by row and treats consecutive
# rows as consecutive weeks, so the index must be chronological and must
# not skip weeks in which the class received no reports at all.
for attack in host_attacks:
    weekly = df[attack].transpose().sort_index()
    if len(weekly.index) > 0:
        first_week = int(weekly.index.min())
        last_week = int(weekly.index.max())
        # Insert all-zero rows for weeks that had no report for any host.
        weekly = weekly.reindex(list(range(first_week, last_week + 1)),
                                fill_value=0)
    df[attack] = weekly

In [7]:
# Our temporal analysis does not consider the remaining Unlabelled reports.
# pop() with a default keeps this cell safe to re-run once the key is gone.
df.pop('Unlabelled', None)

# Churn Model Fitting

In [8]:
# Second name for the dict of per-attack DataFrames. This is an alias of the
# same dict object, not a copy.
new_df = df

# The array should be a sparse array, with sequentially increasing
# time bins, each containing the number of reports for a host.
def compute_lifetimes(arr):
    """Summarise alternating active ('Life') and inactive ('Death') runs.

    `arr` is iterated once. An entry greater than zero extends or starts a
    Life run; any other entry extends or starts a Death run. For each run
    its length (number of entries) and the sum of its entries are recorded.

    Returns None when no Life run was seen, otherwise the tuple
        (mean Life length, mean Death length,
         mean Life report total, mean Death report total,
         mean over Life runs of (report total / length),
         number of Life runs, number of Death runs).

    NOTE(review): tracking starts in the Death state with length 0, so a
    series whose first entry is positive records a zero-length Death run.
    That run is included in the Death mean and count.
    """
    # Length of every completed run, per state.
    durations = {'Death': [], 'Life': []}
    # Sum of the entries of every completed run, per state.
    report_totals = {'Death': [], 'Life': []}

    # Start all hosts in a death state
    state = 'Death'
    run_length = 0
    run_reports = 0

    for reports in arr:
        observed = 'Life' if reports > 0 else 'Death'
        if observed == state:
            # Same state as before: the current run continues.
            run_length += 1
            run_reports += reports
            continue
        # State flipped: close the finished run and open a new one.
        durations[state].append(run_length)
        report_totals[state].append(run_reports)
        state = observed
        run_length = 1
        run_reports = reports

    # Close the run that was still open when the data ended.
    durations[state].append(run_length)
    report_totals[state].append(run_reports)

    if not durations['Life']:
        return None

    assert len(durations['Life']) == len(report_totals['Life'])

    life_lengths = durations['Life']
    death_lengths = durations['Death']
    life_reports = report_totals['Life']
    death_reports = report_totals['Death']
    reports_per_week = np.divide(life_reports, life_lengths)

    return (np.mean(life_lengths), np.mean(death_lengths),
            np.mean(life_reports), np.mean(death_reports),
            np.mean(reports_per_week),
            len(life_lengths), len(death_lengths)
           )
# Empty container for the per-attack lifetime statistics.
res_arr = {}

In [9]:
# Compute lifetime statistics for every host.
# Hosts for which compute_lifetimes() returns None are left out.
res_arr = {}
for attack in new_df:
    print(attack)
    weekly_counts = new_df[attack]
    host_stats = {}
    for col in weekly_counts:
        res = compute_lifetimes(weekly_counts[col])
        if res is None:
            continue
        host_stats[col] = res
    res_arr[attack] = host_stats

print('Finished')

Spammers
Malware
Exploits
Phishing
Fraudlent Services
Unwanted Programs
Labelled
Finished


In [10]:
# For every attack class build a table with one column per host and one row
# per statistic returned by compute_lifetimes(), plus a final row holding
# 1 / (mean Life + mean Death).
life_death_recipr_df_holder = {}
for attack in new_df:
    life_death_df = pd.DataFrame(res_arr[attack])

    life_death_df.index = ['Life', 'Death', 'Life_S', 'Death_S',
                           'Severity', 'n_Life', 'n_Death']

    # Vectorised over all hosts (columns) at once.
    reciprocal_mean_duration = 1.0 / (life_death_df.loc['Life']
                                      + life_death_df.loc['Death'])

    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    life_death_recipr_df = pd.concat(
        [life_death_df, reciprocal_mean_duration.to_frame().transpose()])

    life_death_recipr_df.index = ['Life', 'Death', 'Life_S', 'Death_S',
                                  'Severity', 'Number of Lives',
                                  'Number of Deaths',
                                  'Reciprocal Mean Duration']

    life_death_recipr_df_holder[attack] = life_death_recipr_df

In [11]:
# Drop invalid entries: host columns labelled '', '-1' or 'unknown'.
# errors='ignore' makes a missing label a no-op.
for attk in life_death_recipr_df_holder:
    stats_table = life_death_recipr_df_holder[attk]
    for invalid_host in ('', '-1', 'unknown'):
        stats_table.drop(invalid_host, axis=1, inplace=True,
                         errors='ignore')

In [12]:
# If carrying through to next section
# week_l_d_s_r abbreviates "weekly life / death / severity / reciprocal".
# This binds a second name to the same dict object; nothing is copied.
week_l_d_s_r = life_death_recipr_df_holder

In [13]:
# Write one CSV per attack class, keeping both the column header and the
# row index.
print("Saving Statistics in:")
for attack in life_death_recipr_df_holder:
    values = life_death_recipr_df_holder[attack]
    out_file_name = "./as_churn_stats/%s_weekly_churn_AS.csv" % (attack,)
    print(out_file_name)
    values.to_csv(out_file_name, encoding='utf-8', index=True, header=True)
print("Finished")

Saving Statistics in:
./as_churn_stats/Spammers_weekly_churn_AS.csv
./as_churn_stats/Malware_weekly_churn_AS.csv
./as_churn_stats/Exploits_weekly_churn_AS.csv
./as_churn_stats/Phishing_weekly_churn_AS.csv
./as_churn_stats/Fraudlent Services_weekly_churn_AS.csv
./as_churn_stats/Unwanted Programs_weekly_churn_AS.csv
./as_churn_stats/Labelled_weekly_churn_AS.csv
Finished


# Analysis Code

## Import previously computed statistics

In [14]:
# Read the per-class CSV files back, using the first CSV column as the
# row index.
print("Loading Statistics from:")
attack_classes = ['Spammers', 'Malware', 'Exploits', 'Phishing',
                  'Fraudlent Services', 'Unwanted Programs', 'Labelled']
week_l_d_s_r = {}
for attack in attack_classes:
    in_file_name = "./as_churn_stats/%s_weekly_churn_AS.csv" % (attack,)
    print(in_file_name)
    week_l_d_s_r[attack] = pd.read_csv(in_file_name, index_col=[0])
print("Finished")

Loading Statistics from:
./as_churn_stats/Spammers_weekly_churn_AS.csv
./as_churn_stats/Malware_weekly_churn_AS.csv
./as_churn_stats/Exploits_weekly_churn_AS.csv
./as_churn_stats/Phishing_weekly_churn_AS.csv
./as_churn_stats/Fraudlent Services_weekly_churn_AS.csv
./as_churn_stats/Unwanted Programs_weekly_churn_AS.csv
./as_churn_stats/Labelled_weekly_churn_AS.csv
Finished


## Top Lifetime (Persistent)

In [26]:
# One row per host; show the ten hosts with the largest 'Life' value.
temp = week_l_d_s_r['Labelled'].T
temp.sort_values(by='Life', ascending=False).iloc[:10]

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
4134,147.0,23.333333,208362.333333,0.0,476.624905,3.0,3.0,0.005871
4837,38.909091,6.916667,33749.636364,0.0,157.383452,11.0,12.0,0.021822
9800,38.0,236.5,521.0,0.0,13.710526,1.0,2.0,0.003643
32613,28.133333,5.5625,1080.933333,0.0,11.367625,15.0,16.0,0.029677
28753,26.1875,5.411765,1243.8125,0.0,39.717811,16.0,17.0,0.031646
24940,25.611111,2.777778,3832.888889,0.0,21.727808,18.0,18.0,0.035225
16276,23.25,2.3,52089.75,0.0,135.970418,20.0,20.0,0.039139
46514,23.0,244.0,2714.0,0.0,118.0,1.0,2.0,0.003745
34040,22.0,244.5,4543.0,0.0,206.5,1.0,2.0,0.003752
35994,21.3,27.090909,166678.2,0.0,1707.235979,10.0,11.0,0.020665


## Lowest Deathtime (Resilient)

In [22]:
# One row per host; show the ten hosts with the smallest 'Death' value.
temp = week_l_d_s_r['Labelled'].T
temp.sort_values(by='Death', ascending=True).iloc[:10]

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
36351,18.8,1.576923,3021.04,0.0,16.288273,25.0,26.0,0.049075
26496,20.608696,1.608696,19390.608696,0.0,77.321736,23.0,23.0,0.04501
40034,8.816327,1.612245,4181.591837,0.0,22.066605,49.0,49.0,0.09589
13335,7.103448,1.677966,4207.862069,0.0,21.891005,58.0,59.0,0.113877
14618,9.042553,1.829787,22422.893617,0.0,85.142362,47.0,47.0,0.091977
21740,6.238095,1.873016,667.539683,0.0,5.545725,63.0,63.0,0.123288
13768,6.047619,2.063492,517.84127,0.0,6.865311,63.0,63.0,0.123288
16509,6.433333,2.083333,429667.833333,0.0,1816.67134,60.0,60.0,0.117417
133618,4.185185,2.123457,449.901235,0.0,6.919654,81.0,81.0,0.158513
46606,6.389831,2.233333,1020.542373,0.0,7.643284,59.0,60.0,0.115967


## Top Rate of Arrival (Recurrent)

In [23]:
# One row per host; show the ten hosts with the largest
# 'Reciprocal Mean Duration' value.
temp = week_l_d_s_r['Labelled'].T
temp.sort_values(by='Reciprocal Mean Duration', ascending=False).iloc[:10]

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
8001,1.771739,3.741935,7.434783,0.0,4.5701,92.0,93.0,0.181367
9931,2.808989,2.9,15.494382,0.0,4.167158,89.0,90.0,0.175162
46636,2.011364,3.752809,10.772727,0.0,4.068479,88.0,89.0,0.173485
13649,2.272727,3.494382,27.568182,0.0,5.890317,88.0,89.0,0.173397
31103,2.825581,3.08046,18.895349,0.0,5.160857,86.0,87.0,0.169318
11403,2.047059,3.918605,26.482353,0.0,8.925098,85.0,86.0,0.167626
7303,2.235294,3.732558,15.764706,0.0,5.066471,85.0,86.0,0.167564
20021,1.761905,4.270588,29.702381,0.0,14.297166,84.0,85.0,0.165769
45753,2.385542,3.72619,16.0,0.0,5.031928,83.0,84.0,0.16362
34788,2.614458,3.542169,17.771084,0.0,4.174377,83.0,83.0,0.162427


## Highest Severity (reports per week)

In [18]:
# One row per host; show the ten hosts with the largest 'Severity' value.
temp = week_l_d_s_r['Labelled'].T
temp.sort_values(by='Severity', ascending=False).iloc[:10]

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
7276,2.142857,62.0,8540.714286,0.0,2205.514286,7.0,8.0,0.01559
6762,1.888889,25.105263,6238.055556,0.0,2153.19537,18.0,19.0,0.037045
16509,6.433333,2.083333,429667.833333,0.0,1816.67134,60.0,60.0,0.117417
35994,21.3,27.090909,166678.2,0.0,1707.235979,10.0,11.0,0.020665
53684,8.666667,121.25,3076.333333,0.0,1606.724638,3.0,4.0,0.007697
134418,1.888889,25.105263,1575.388889,0.0,1557.978704,18.0,19.0,0.037045
5511,2.736842,10.435897,5066.894737,0.0,958.83124,38.0,39.0,0.075914
2637,1.52381,21.772727,1027.380952,0.0,774.968254,21.0,22.0,0.042925
7415,1.137931,15.933333,1238.931034,0.0,771.0,29.0,30.0,0.058578
20721,1.0,255.0,648.0,0.0,648.0,1.0,2.0,0.003906


# Additional Analysis

## Average Severity between types of mal-activity

In [24]:
# Print each class name followed by the mean of its 'Severity' row
# (the mean is taken over all hosts).
for attack, class_stats in week_l_d_s_r.items():
    print(attack)
    print(class_stats.loc['Severity'].mean())

Spammers
1.9272943468
Malware
7.41372654721
Exploits
2.4861656092
Phishing
5.37770462092
Fraudlent Services
6.26642420191
Unwanted Programs
2.63710178686
Labelled
6.53123532393


## Proportion of (average) 1 Week offenders

In [25]:
# Fraction of hosts whose 'Life' value is exactly 1.
temp = week_l_d_s_r['Labelled'].loc['Life']

all_hosts = len(temp)
onewk = len(temp.loc[temp == 1])

print(float(onewk) / all_hosts)

0.565173745174


In [35]:
# Check Uni of Houston AS
# Prints every statistic stored in the 'Labelled' table for the host column
# keyed by the string '7276'.
print(week_l_d_s_r['Labelled']['7276'])

Life                           2.142857
Death                         62.000000
Life_S                      8540.714286
Death_S                        0.000000
Severity                    2205.514286
Number of Lives                7.000000
Number of Deaths               8.000000
Reciprocal Mean Duration       0.015590
Name: 7276, dtype: float64


In [33]:
# Proportion with 1 severity (one report per week):
# fraction of hosts whose 'Severity' value is exactly 1.
temp = week_l_d_s_r['Labelled'].loc['Severity']

all_hosts = len(temp)
onewk = len(temp.loc[temp == 1])

print(float(onewk) / all_hosts)

0.273976833977


In [34]:
# Number of hosts with >10K severity (reports per week), the total number
# of hosts, and the ratio of the two.
temp = week_l_d_s_r['Labelled'].loc['Severity']

all_hosts = len(temp)
found_hosts = len(temp.loc[temp > 10000])

for figure in (found_hosts, all_hosts, float(found_hosts) / all_hosts):
    print(figure)

0
12950
0.0
