# Raw Extraction of required data 

Note that the raw extraction of the required data takes a very long time; pre-computed values can be imported instead in the Plotting Code section.

We are evaluating the specialisation of hosts within the VT Blacklist; as such, the unlabelled reports are not considered during the data extraction process.

In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime

In [2]:
# Attack category label -> list index.
# The labels are runtime strings (they are also interpolated into the report
# file names further down), so their spelling is kept exactly as-is.
indexes = {
    'Malware': 0,
    'Phishing': 1,
    'Exploits': 2,
    'Fraudlent Services': 3,
    'Unwanted Programs': 4,
    'Spammers': 5,
    'Unlabelled': 6,
}

# Unlabelled report, IP relabel mapping.
# Every non-empty CSV row is handed to dict() as one key/value pair; blank
# rows are skipped.
with open("./../IP_activity_relabel.csv") as relabel_file:
    ip_activity_relabel = dict(
        row for row in csv.reader(relabel_file) if row
    )

In [3]:
# Memo of already-parsed dates, keyed by the 'day-month-year' string.
date_calculation_cache = {}
def make_date(day, month, year):
    """Return the ISO calendar triple (ISO year, ISO week, ISO weekday).

    The three fields are joined as 'day-month-year' and parsed with the
    "%d-%m-%Y" format. Results are memoised in ``date_calculation_cache``
    so that each distinct date string is parsed only once.

    Raises:
        ValueError: if the joined string does not match "%d-%m-%Y".
    """
    key = '%s-%s-%s' % (day, month, year)
    iso_cal = date_calculation_cache.get(key)
    if iso_cal is None:
        # First time this date is seen: parse it and remember the result.
        iso_cal = datetime.strptime(key, "%d-%m-%Y").isocalendar()
        date_calculation_cache[key] = iso_cal
    return iso_cal

In [4]:
# Per-category report counters: host_attacks[label][ip] -> number of reports.
host_attacks = {attk: {} for attk in indexes}
# NOTE(review): nothing in this cell ever writes to the 'Labelled' bucket, yet
# it still ends up as an (all-zero) entry next to the real categories --
# confirm that this is intended.
host_attacks['Labelled'] = {}

for attack in indexes:
    print(attack)
    # Unlabelled reports are not part of this evaluation.
    if attack == 'Unlabelled':
        continue

    file_name = './../Datastore/%s_reports.csv' % attack
    print(file_name)
    attack_counts = host_attacks[attack]

    with open(file_name) as data_file:
        for report in csv.reader(data_file):
            try:
                # Row layout by position: ip, cc, asn, org, <unused>, day,
                # month, year, datasource. Unpacking the first nine fields
                # also rejects rows that are too short.
                (ip, _cc, _asn, _org, _unused,
                 day, month, year, _datasource) = report[0:9]
                year_num = make_date(day, month, year)[0]
            except ValueError as e:
                # Malformed row (too few fields or unparseable date): report
                # it and skip it. Without the `continue`, the previous row's
                # IP would be counted a second time.
                print(str(e))
                continue

            # Invalid Timestamp, likely 0.
            if year_num == 1970:
                continue

            if not ip:
                print("Skipping invalid IP: %s" % ip)
                continue

            # No relabelling: the report is counted under its own category,
            # with the reporting IP as the host under scrutiny.
            attack_counts[ip] = attack_counts.get(ip, 0) + 1

print('Finished')

Spammers
./../Datastore/Spammers_reports.csv
Malware
./../Datastore/Malware_reports.csv
Exploits
./../Datastore/Exploits_reports.csv
Phishing
./../Datastore/Phishing_reports.csv
Fraudlent Services
./../Datastore/Fraudlent Services_reports.csv
Unwanted Programs
./../Datastore/Unwanted Programs_reports.csv
Unlabelled
Finished


In [5]:
# Turn the nested counters into a pandas table: build it with one row per
# category, replace the missing host/category combinations by 0, then flip it
# so that each row is a host and each column a category.
df = (
    pd.DataFrame.from_dict(host_attacks, orient='index')
    .fillna(0)
    .transpose()
)

In [6]:
# Persist the host-by-category count table as CSV.
df.to_csv(path_or_buf='./data/ip_VTbl_reports.csv')

# Specialization Entropies

## Remove hosts with no labelled samples

In [7]:
import pandas as pd
# Reload the saved count table; the first CSV column becomes the row index.
host_attack_frame = pd.read_csv(
    filepath_or_buffer='./data/ip_VTbl_reports.csv',
    index_col=0,
)

In [8]:
# The attack frame may contain IPs whose only entries are unlabelled;
# those rows are removed from the dataframe here.
a = host_attack_frame
# Discard a left-over 'Totals' column and the 'Unlabelled' counts if present.
for stale_column in ('Totals', 'Unlabelled'):
    if stale_column in a:
        a.drop(stale_column, axis=1, inplace=True)

# Row-wise sum over the remaining columns = reports per IP.
a['Totals'] = a.sum(axis=1)
# Keep only the IPs with at least one report, then drop the helper column.
a = a[a['Totals'] > 0].drop('Totals', axis=1)

# Flip the table so that every column holds the counts of a single IP.
host_attack_frame = a.transpose()

## Entropy computation

In [9]:
import math
def shannon_entropy(arr):
    """Return the Shannon entropy of the counts in *arr*, normalised to [0, 1].

    The counts are turned into proportions ``p``; the entropy
    ``-sum(p * log2(p))`` is then divided by ``log2(n)``, where ``n`` is the
    number of entries in *arr* (zero counts included), so that a uniform
    distribution scores 1.0.

    Args:
        arr: iterable of non-negative counts, one per category.

    Returns:
        The normalised entropy as a float. Inputs with fewer than two
        categories return 0.0: they carry no uncertainty, and the
        normalising term ``log2(n)`` would be zero or undefined.

    Raises:
        ZeroDivisionError: if there are two or more categories and the
            counts sum to zero.
    """
    counts = list(arr)
    n_categories = len(counts)
    if n_categories < 2:
        return 0.0

    total = sum(counts)
    probabilities = [float(c) / total for c in counts]
    # Zero-probability categories are skipped: p * log2(p) -> 0 as p -> 0.
    # Negating each term (rather than the final sum) keeps a fully
    # concentrated distribution at 0.0 instead of -0.0.
    raw_entropy = sum(-p * math.log(p, 2) for p in probabilities if p != 0.0)

    return raw_entropy / math.log(n_categories, 2)

def entropy(arr):
    """Return the negated sum of log2(p) over the non-zero proportions in *arr*.

    Each count is divided by the total of *arr* to obtain its proportion
    ``p``; proportions equal to zero contribute nothing. Note that the
    terms are plain ``log2(p)`` values -- they are not weighted by ``p``.

    Raises:
        ZeroDivisionError: if *arr* is non-empty and its counts sum to zero.
    """
    total = sum(arr)
    log_sum = 0
    for count in arr:
        proportion = float(count) / total
        if proportion != 0.0:
            log_sum += math.log(proportion, 2)

    return -1 * log_sum

In [10]:
# Compute both entropy measures for every host; each column of
# host_attack_frame holds the per-category counts of one host.
host_entropies = {
    host: (
        shannon_entropy(host_attack_frame[host]),
        entropy(host_attack_frame[host]),
    )
    for host in host_attack_frame
}

In [11]:
import pandas as pd
# One row per host; the two tuple members become the two named columns.
entropy_columns = ['Shannon Entropy', 'Entropy']
entropies = pd.DataFrame.from_dict(host_entropies, orient='index')
entropies.columns = entropy_columns

## Save Output intermediaries

In [12]:
import pandas as pd
# Write the per-host entropy table out as CSV.
entropies.to_csv(path_or_buf='./entropies/ip_entropies.csv')