# Raw Extraction of required data 

Note that the raw extraction of required data takes a very long time, the import of pre-computed values is avaliable at the Analysis section.

In [1]:
import pandas as pd
import numpy as np
import tqdm
import csv
from datetime import datetime

In [2]:
# Defining the list indexes
indexes = {
    'Malware':0,
    'Phishing':1,
    'Exploits':2,
    'Fraudlent Services':3,
    'Unwanted Programs':4,
    'Spammers':5,
    'Unlabelled':6,
}

attack_map = ['Malware', 'Phishing', 'Exploits', 'Fraudlent Services',
            'Unwanted Programs', 'Spammers', 'Unlabelled']

In [3]:
# Convert date fields back into a python date object
date_calculation_cache = {}
def make_date(day, month, year):
    date = '%s-%s-%s' % (day, month, year)
    # Check cache before recomputation
    if date not in date_calculation_cache:
        iso_cal = datetime.strptime(date, "%d-%m-%Y").isocalendar()
        date_calculation_cache[date] = iso_cal
    else:
        iso_cal = date_calculation_cache[date]
    return iso_cal

In [4]:
host_attacks = {attk: {} for attk in indexes}
host_attacks['Labelled'] = {}

for attack in indexes:
    print(attack)
    files = []
    if attack == 'Unlabelled':
        # There are 23 files containing Unlabelled reports
        for i in range(23):
            files.append(('./../Datastore/%s_reports_%s.csv'%(attack,i), attack, i))
    else:
        files.append(('./../Datastore/%s_reports.csv'%attack, attack, None))
    #########################
    
    # Process the files
    for file_name, a, file_num in files:
        print(file_name)
        with open(file_name) as data_file:
            filereader = csv.reader(data_file)
            if attack == 'Unlabelled':
                activity_relabel = pd.read_csv('./../Unlabelled_predictions/%s_predictions_%s.csv'%(a, file_num),
                                               index_col=[0], header=None)
                activity_relabel = activity_relabel.transpose().values[0]
            for n, report in enumerate(tqdm.tqdm(filereader)):
                try:
                    # Unpack Row information
                    splits = report[0:9]

                    day = splits[5]
                    month = splits[6]
                    year = splits[7]
                    datasource = splits[8]        
                    org = splits[3]
                    asn = splits[2]
                    cc = splits[1]
                    ip = splits[0]
                    
                    # Compute week bins
                    iso_cal = make_date(day, month, year)
                    year_num, week_num, weekday_num = iso_cal

                    # Invalid Timestamp, likely 0.
                    if year_num == 1970:
                        continue
                    week = (year_num-1970)*52 + week_num

                except Exception, e: 
                    print(str(e))

                if ip is None or ip == '':
                    print("Skipping invalid IP: %s" % ip)
                    continue

                # Relabelling unlabelled reports
                if (attack == 'Unlabelled'):
                    attack_label = attack_map[activity_relabel[n]]
                else:
                    attack_label = attack

                # Selected key 'i' for scrutiny
                i = week
                host = cc
                if i not in host_attacks[attack_label]:
                    host_attacks[attack_label][i] = {}
                # Initialise the host holder for the key
                if host not in host_attacks[attack_label][i]:
                    host_attacks[attack_label][i][host] = 0
                # Increment the attack, key, host value
                host_attacks[attack_label][i][host] += 1

                # We maintain a distinct class for Labelled Data
                if attack_label != 'Unlabelled':
                    if i not in host_attacks['Labelled']:
                        host_attacks['Labelled'][i] = {}
                    # Initialise the host holder for the key
                    if host not in host_attacks['Labelled'][i]:
                        host_attacks['Labelled'][i][host] = 0
                    # Increment the attack, key, host value
                    host_attacks['Labelled'][i][host] += 1

print('Finished')

6918it [00:00, 58817.49it/s]
0it [00:00, ?it/s]

Spammers
./../Datastore/Spammers_reports.csv
Malware
./../Datastore/Malware_reports.csv


3190529it [00:36, 87249.48it/s]
9547it [00:00, 95443.24it/s]

Exploits
./../Datastore/Exploits_reports.csv


201108it [00:02, 92887.57it/s]
10112it [00:00, 101090.69it/s]

Phishing
./../Datastore/Phishing_reports.csv


2382889it [00:32, 73424.97it/s]
9478it [00:00, 94754.34it/s]

Fraudlent Services
./../Datastore/Fraudlent Services_reports.csv


829301it [00:08, 99992.20it/s] 
9373it [00:00, 93705.51it/s]

Unwanted Programs
./../Datastore/Unwanted Programs_reports.csv


698700it [00:16, 42920.60it/s]


Unlabelled
./../Datastore/Unlabelled_reports_0.csv


  mask |= (ar1 == a)
2000138it [00:26, 75155.74it/s]


./../Datastore/Unlabelled_reports_1.csv


2000007it [00:21, 93480.70it/s]


./../Datastore/Unlabelled_reports_2.csv


2000047it [00:33, 60253.07it/s]


./../Datastore/Unlabelled_reports_3.csv


2000022it [00:24, 80903.43it/s]


./../Datastore/Unlabelled_reports_4.csv


2000471it [00:25, 78177.01it/s]


./../Datastore/Unlabelled_reports_5.csv


2000567it [00:24, 81363.71it/s]


./../Datastore/Unlabelled_reports_6.csv


2000130it [00:23, 85091.09it/s]


./../Datastore/Unlabelled_reports_7.csv


2000541it [00:19, 102067.72it/s]


./../Datastore/Unlabelled_reports_8.csv


2000282it [00:26, 76861.63it/s]


./../Datastore/Unlabelled_reports_9.csv


2000016it [00:22, 89398.30it/s]


./../Datastore/Unlabelled_reports_10.csv


2000165it [00:19, 100967.71it/s]


./../Datastore/Unlabelled_reports_11.csv


2000021it [00:20, 99644.28it/s] 


./../Datastore/Unlabelled_reports_12.csv


2000114it [00:34, 58652.94it/s]


./../Datastore/Unlabelled_reports_13.csv


2000038it [00:24, 81822.29it/s]


./../Datastore/Unlabelled_reports_14.csv


2000304it [00:20, 99157.40it/s] 


./../Datastore/Unlabelled_reports_15.csv


2000019it [00:18, 110092.70it/s]


./../Datastore/Unlabelled_reports_16.csv


2000455it [00:26, 74930.11it/s]


./../Datastore/Unlabelled_reports_17.csv


2000040it [00:23, 84238.70it/s]


./../Datastore/Unlabelled_reports_18.csv


2000023it [00:17, 114153.31it/s]


./../Datastore/Unlabelled_reports_19.csv


2000058it [00:16, 123284.38it/s]


./../Datastore/Unlabelled_reports_20.csv


2000016it [00:18, 106725.48it/s]


./../Datastore/Unlabelled_reports_21.csv


2000294it [00:19, 102035.49it/s]
0it [00:00, ?it/s]

./../Datastore/Unlabelled_reports_22.csv


332782it [00:05, 55915.15it/s] 

Finished





In [5]:
df = {}
# Turn the dictionary of dictionaries into dictionaries of DataFrames
for attack in host_attacks:
    df[attack] = pd.DataFrame(host_attacks[attack])
    df[attack].fillna(0, inplace=True)

In [6]:
# Transpose the dataframe, such that the weeks are on the index.
for attack in host_attacks:
    df[attack] = df[attack].transpose()

In [7]:
# Our temporal analysis does not consider the remaining Unlabelled reports
del df['Unlabelled']

# Churn Model Fitting

In [8]:
new_df = df

# The array should be a sparse array, with sequentially increasing
# time bins, each containing the number of reports for a host.
def compute_lifetimes(arr):
    # To maintain time of each mal-activity campaign
    results = {'Death': [],'Life': []}
    # To maintain number of reports of each mal-activity campaign
    s_results = {'Death': [],'Life': []}
    # Start all hosts in a death state
    curr_track = 'Death'
    # Current life or death week tracking
    track_time = 0
    track_severity = 0
    for time_dur in arr:
        if time_dur > 0:
            if curr_track == 'Life':
                track_time += 1
                track_severity += time_dur
            else:
                results[curr_track].append(track_time)
                s_results[curr_track].append(track_severity)
                track_time = 1
                track_severity = time_dur
                curr_track = 'Life'

        else:
            if curr_track == 'Death':
                track_time += 1
                track_severity += time_dur
            else:
                results[curr_track].append(track_time)
                s_results[curr_track].append(track_severity)
                track_time = 1
                track_severity= time_dur
                curr_track = 'Death'
                
    results[curr_track].append(track_time)
    s_results[curr_track].append(track_severity)
                
    if len(results['Life']) == 0:
        return None
    
    assert len(results['Life']) == len(s_results['Life'])
    
    return (np.mean(results['Life']), np.mean(results['Death']),
            np.mean(s_results['Life']), np.mean(s_results['Death']),
            np.mean(np.divide(s_results['Life'], results['Life'])),
            len(results['Life']), len(results['Death'])
           )
res_arr = {}

In [9]:
# Compute lifetime statistice for every host
res_arr = {}
for attack in new_df:
    print(attack)
    res_arr[attack] = {}
    for col in new_df[attack]:

        res = compute_lifetimes(new_df[attack][col])

        if res is not None:
            res_arr[attack][col] = res

print('Finished')

Spammers
Malware
Exploits
Phishing
Fraudlent Services
Unwanted Programs
Labelled
Finished


In [10]:
life_death_recipr_df_holder = {}
for attack in new_df:
    life_death_df = pd.DataFrame(res_arr[attack])

    life_death_df.index = ['Life', 'Death', 'Life_S', 'Death_S',
                           'Severity', 'n_Life', 'n_Death']
    
    recipricol_mean_duration = {}
    for host in life_death_df:
        l, d, l_s, d_s, s, n_l, n_d = life_death_df[host]
        recipricol_mean_duration[host] = 1/(l+d)
        
    life_death_recipr_df = life_death_df.append(
                pd.DataFrame(recipricol_mean_duration, index=[0]))

    life_death_recipr_df.index = ['Life', 'Death', 'Life_S', 'Death_S',
                                  'Severity', 'Number of Lives',
                                  'Number of Deaths',
                                  'Reciprocal Mean Duration']

    life_death_recipr_df_holder[attack] = life_death_recipr_df

In [11]:
# Drop Invalid entries
for attk in life_death_recipr_df_holder:
    life_death_recipr_df_holder[attk].drop('', axis=1, inplace=True,
                                           errors='ignore')
    life_death_recipr_df_holder[attk].drop('-1', axis=1, inplace=True,
                                           errors='ignore')
    life_death_recipr_df_holder[attk].drop('unknown', axis=1, inplace=True,
                                           errors='ignore')

In [12]:
# If carrying through to next section
# weekly_life_death_severity_reciprical
week_l_d_s_r = life_death_recipr_df_holder

In [13]:
print("Saving Statistics in:")
for attack, values in life_death_recipr_df_holder.items():
    out_file_name = "./cc_churn_stats/%s_weekly_churn_CC.csv" % attack
    print(out_file_name)
    values.to_csv(out_file_name, header=True, index=True, encoding='utf-8')
print("Finished")

Saving Statistics in:
./cc_churn_stats/Spammers_weekly_churn_CC.csv
./cc_churn_stats/Malware_weekly_churn_CC.csv
./cc_churn_stats/Exploits_weekly_churn_CC.csv
./cc_churn_stats/Phishing_weekly_churn_CC.csv
./cc_churn_stats/Fraudlent Services_weekly_churn_CC.csv
./cc_churn_stats/Unwanted Programs_weekly_churn_CC.csv
./cc_churn_stats/Labelled_weekly_churn_CC.csv
Finished


# Analysis Code

## Import previously computed statistics

In [14]:
print("Loading Statistics from:")
week_l_d_s_r = {}
for attack in ['Spammers', 'Malware', 'Exploits', 'Phishing',
               'Fraudlent Services', 'Unwanted Programs', 'Labelled']:
    in_file_name = "./cc_churn_stats/%s_weekly_churn_CC.csv" % attack
    print(in_file_name)
    week_l_d_s_r[attack] = pd.read_csv(in_file_name, index_col=[0])
print("Finished")

Loading Statistics from:
./cc_churn_stats/Spammers_weekly_churn_CC.csv
./cc_churn_stats/Malware_weekly_churn_CC.csv
./cc_churn_stats/Exploits_weekly_churn_CC.csv
./cc_churn_stats/Phishing_weekly_churn_CC.csv
./cc_churn_stats/Fraudlent Services_weekly_churn_CC.csv
./cc_churn_stats/Unwanted Programs_weekly_churn_CC.csv
./cc_churn_stats/Labelled_weekly_churn_CC.csv
Finished


## Top Lifetime (Persistent)

In [15]:
temp = week_l_d_s_r['Labelled'].transpose()
temp.sort_values('Life', ascending=False).head(10)

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
US,511.0,0.0,42187270.0,0.0,82558.252446,1.0,1.0,0.001957
CN,55.75,8.125,164616.8,0.0,377.600228,8.0,8.0,0.015656
BR,54.75,9.125,7062.375,0.0,19.637648,8.0,8.0,0.015656
CA,37.833333,4.384615,4288.5,0.0,21.789258,12.0,13.0,0.023687
GB,37.666667,4.538462,10865.83,0.0,26.538813,12.0,13.0,0.023694
RU,33.142857,3.357143,23837.57,0.0,57.471188,14.0,14.0,0.027397
UA,32.142857,4.357143,17324.21,0.0,79.850993,14.0,14.0,0.027397
DE,30.4375,1.5,92083.44,0.0,212.420467,16.0,16.0,0.031311
TR,30.214286,5.866667,3338.5,0.0,22.135215,14.0,15.0,0.027715
NL,27.235294,2.823529,23842.24,0.0,57.942177,17.0,17.0,0.033268


## Lowest Deathtime (Resilient)

In [16]:
temp = week_l_d_s_r['Labelled'].transpose()
temp.sort_values('Death', ascending=True).head(10)

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
US,511.0,0.0,42187270.0,0.0,82558.252446,1.0,1.0,0.001957
DE,30.4375,1.5,92083.44,0.0,212.420467,16.0,16.0,0.031311
VG,9.041667,1.604167,4315.104,0.0,22.614989,48.0,48.0,0.093933
FR,23.75,1.8,63716.4,0.0,148.543893,20.0,20.0,0.039139
,9.333333,1.978261,1401.111,0.0,27.08783,45.0,46.0,0.088405
--,7.803922,2.215686,46276.29,0.0,216.887902,51.0,51.0,0.099804
AU,13.625,2.34375,1903.531,0.0,12.819946,32.0,32.0,0.062622
SG,7.09434,2.54717,630.7358,0.0,6.385917,53.0,53.0,0.103718
NL,27.235294,2.823529,23842.24,0.0,57.942177,17.0,17.0,0.033268
HK,11.628571,2.888889,1287.286,0.0,10.608218,35.0,36.0,0.068883


## Top Rate of Arrival (Recurrent)

In [17]:
temp = week_l_d_s_r['Labelled'].transpose()
temp.sort_values('Reciprocal Mean Duration', ascending=False).head(10)

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
CO,2.708861,3.7125,46.367089,0.0,7.923564,79.0,80.0,0.15573
PA,3.373333,3.394737,62.64,0.0,10.8809,75.0,76.0,0.147753
BS,2.111111,4.917808,121.236111,0.0,19.56189,72.0,73.0,0.142269
NO,3.528571,3.71831,48.928571,0.0,6.759195,70.0,71.0,0.13799
MX,3.685714,3.56338,85.557143,0.0,11.777853,70.0,71.0,0.137948
GE,3.086957,4.257143,55.115942,0.0,6.935507,69.0,70.0,0.136164
PH,1.705882,5.724638,9.205882,0.0,3.617647,68.0,69.0,0.13458
NZ,2.546875,5.353846,19.796875,0.0,6.45395,64.0,65.0,0.126571
ID,4.734375,3.2,140.65625,0.0,10.632659,64.0,65.0,0.126034
KZ,4.5,3.95082,85.15,0.0,7.347385,60.0,61.0,0.118332


## Highest Severity (reports per week)

In [22]:
temp = week_l_d_s_r['Labelled'].transpose()
temp.sort_values('Severity', ascending=False).head(10)

Unnamed: 0,Life,Death,Life_S,Death_S,Severity,Number of Lives,Number of Deaths,Reciprocal Mean Duration
US,511.0,0.0,42187270.0,0.0,82558.252446,1.0,1.0,0.001957
CN,55.75,8.125,164616.8,0.0,377.600228,8.0,8.0,0.015656
--,7.803922,2.215686,46276.29,0.0,216.887902,51.0,51.0,0.099804
DE,30.4375,1.5,92083.44,0.0,212.420467,16.0,16.0,0.031311
FR,23.75,1.8,63716.4,0.0,148.543893,20.0,20.0,0.039139
UA,32.142857,4.357143,17324.21,0.0,79.850993,14.0,14.0,0.027397
EU,11.176471,3.742857,806.8235,0.0,73.239067,34.0,35.0,0.067027
NL,27.235294,2.823529,23842.24,0.0,57.942177,17.0,17.0,0.033268
RU,33.142857,3.357143,23837.57,0.0,57.471188,14.0,14.0,0.027397
BO,1.5,14.030303,84.3125,0.0,55.880208,32.0,33.0,0.06439


# Additional Analysis

## Average Severity between types of mal-activity

In [19]:
for attack in week_l_d_s_r:
    print(attack)
    print(week_l_d_s_r[attack].transpose()['Severity'].mean())

Spammers
1.93917364459
Malware
101.466734584
Exploits
8.42484051842
Phishing
56.61024681
Fraudlent Services
9.02474139794
Unwanted Programs
5.15943519268
Labelled
424.396367791


## Proportion of (average) 1 Week offenders

In [21]:
temp = week_l_d_s_r['Labelled'].transpose()['Life']

onewk = len(temp[temp==1])
all_hosts = len(temp)

print(1.0*onewk/all_hosts)

0.174129353234


In [23]:
# Proportion with 1 severity (one report per week)
temp = week_l_d_s_r['Labelled'].transpose()['Severity']

onewk = len(temp[temp==1])
all_hosts = len(temp)

print(1.0*onewk/all_hosts)

0.0945273631841


In [25]:
# Number of hosts with >10K severity (reports per week)
temp = week_l_d_s_r['Labelled'].transpose()['Severity']

found_hosts = len(temp[temp > 10000])
all_hosts = len(temp)

print(found_hosts)
print(all_hosts)
print(1.0*found_hosts/all_hosts)

1
201
0.00497512437811
