In [4]:
import ipaddress
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import re
import pickle
import os

In [1]:
# Minimum destination diversity (size of a cluster's dd_set) for a cluster to be flagged as P2P.
DD_THRESHOLD = 20
# Only referenced by a commented-out COVERAGE_THRESHOLD formula further down.
ARG_COVERAGE_THRESHOLD = 1
# Default minimum MCR for construct_graph to connect two clusters with an edge.
MCR_THRESHOLD = 0.1
# Default minimum average ddr_i a community needs in evaluate_community.
AVG_DD_THRESHOLD = 0.6
# Default minimum average MCR a community needs in evaluate_community.
AVG_MCR_THRESHOLD = 0.15
# Network used to tell local addresses apart from external ones.
LOCAL_SUBNET = "163.194.0.0/16"
# True: cluster on the G_Bpp* columns and measure diversity by ASN.
# False: cluster on the Bpp* columns and measure diversity by /16 prefix.
new_way = True

# Define Cluster

In [2]:
# TODO: get clusters out to file, and save the hash GUIDs / P2P list to read later
# TODO: pre-load the cluster a flow is part of, with time

# Module-level state shared by the clustering cells; clean() resets all of it.
cluster = {}        # flow GUID -> cluster record currently held in memory
p2p_clusters = {}   # flow GUID -> cluster record whose ddi reached DD_THRESHOLD
list_p2p = set()    # source addresses of the clusters flagged as P2P
len_cluster = 0     # number of cluster records created so far
num_clusters = 0    # number of cluster dumps written to disk so far

def clean():
    """Reset the shared clustering state and delete cluster dumps left on disk.

    Re-initialises the module-level ``cluster``, ``p2p_clusters``, ``list_p2p``,
    ``len_cluster`` and ``num_clusters``, then removes every
    ``out_cluster/cluster_<n>.pickle`` file, which is the naming scheme
    ``save_cluster`` writes. Other files in the directory are left untouched.
    Filesystem errors are reported with a message instead of being raised.
    """
    global cluster, p2p_clusters, list_p2p, len_cluster, num_clusters
    cluster = {}
    p2p_clusters = {}
    list_p2p = set()
    len_cluster = 0
    num_clusters = 0

    dump_dir = 'out_cluster'
    if not os.path.isdir(dump_dir):
        # Nothing has been dumped yet.
        return
    try:
        # Listing the directory (rather than counting up from 0) also removes
        # dumps that sit after a gap in the numbering.
        for name in os.listdir(dump_dir):
            if name.startswith('cluster_') and name.endswith('.pickle'):
                os.remove(os.path.join(dump_dir, name))
    except OSError as e:
        print(f"Error removing old clusters: {e}")

In [5]:
# Start from an empty in-memory state before the dataset is (re)processed.
clean()

In [None]:
import ipinfo_db
# `client.getASN(ip)` is used by later cells to map addresses to an ASN.
# NOTE(review): the client is built with an empty string argument — presumably an
# access token; confirm, and load it from the environment rather than hard-coding it.
client = ipinfo_db.Client('')

# Read dataset

In [7]:
def find_cluster(flow_guid, num_clusters):
    """Search the cluster dumps on disk for the one holding ``flow_guid``.

    Args:
        flow_guid: key of the cluster record to look for.
        num_clusters: number of dumps written so far; the files
            ``out_cluster/cluster_0.pickle`` .. ``cluster_<num_clusters-1>.pickle``
            are scanned in order.

    Returns:
        ``(cluster_dict, file_index)`` for the first dump that contains the key,
        or ``(None, -1)`` when no dump contains it (or none exists).
    """
    # Check if saved clusters exist
    if not os.path.exists('out_cluster'):
        return None, -1

    # Scan every dump: indices run from 0 to num_clusters - 1 inclusive.
    for i in range(num_clusters):
        path = 'out_cluster/cluster_{i}.pickle'.format(i=i)
        if not os.path.exists(path):
            # A missing dump cannot contain the key; keep looking in the others.
            continue
        with open(path, 'rb') as handle:
            # SECURITY: pickle.load can execute code embedded in the file; only
            # dumps produced by this notebook should live in out_cluster/.
            stored = pickle.load(handle)
        if flow_guid in stored:
            return stored, i
    return None, -1

def save_cluster(cluster, num_cluster):
    """Pickle ``cluster`` to ``out_cluster/cluster_<num_cluster>.pickle``.

    The ``out_cluster`` directory is created first when it does not exist yet.
    An existing dump with the same number is overwritten.
    """
    dump_dir = 'out_cluster'
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    dump_path = 'out_cluster/cluster_{i}.pickle'.format(i=num_cluster)
    with open(dump_path, 'wb') as sink:
        pickle.dump(cluster, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Column names expected in the header of the flow CSV.
SRC_LABEL = "SrcAddr"
DST_LABEL = "DstAddr"
SPORT_LABEL = "Sport"
DPORT_LABEL = "Dport"
PROTO_LABEL = "Proto"
BPP_IN_LABEL = "BppIn"
BPP_OUT_LABEL = "BppOut"
G_BPP_IN_LABEL = "G_BppIn"
G_BPP_OUT_LABEL = "G_BppOut"

def define_index(line):
    """Locate the columns of interest in a header line.

    The line is split on commas, tabs and newlines. Each field is compared to
    the label constants after removing surrounding whitespace (so a trailing
    ``\\r`` from CRLF files or padding around a name does not hide a column)
    and a leading byte-order mark.

    Args:
        line: the header line of the flow file.

    Returns:
        A 7-tuple ``(src, dst, proto, bpp_in, bpp_out, g_bpp_in, g_bpp_out)`` of
        field positions; a column that is not present is reported as ``-1``.
        If a label occurs more than once, the last occurrence wins.
    """
    wanted = (SRC_LABEL, DST_LABEL, PROTO_LABEL, BPP_IN_LABEL, BPP_OUT_LABEL,
              G_BPP_IN_LABEL, G_BPP_OUT_LABEL)
    positions = dict.fromkeys(wanted, -1)
    for i, raw in enumerate(re.split(r',|\n|\t', line)):
        # Positions are those of the raw split, so they stay aligned with data
        # rows split using the same pattern.
        name = raw.lstrip('\ufeff').strip()
        if name in positions:
            positions[name] = i
    return tuple(positions[label] for label in wanted)

In [9]:
# Build the flow clusters from the CSV.
# Flows are grouped by (source address, protocol, bytes-per-packet out, bytes-per-packet in).
# For every group the destinations and their diversity (distinct ASNs or /16 prefixes)
# are tracked; a group whose diversity reaches DD_THRESHOLD is recorded as P2P.

# Placeholders; the real column positions are read from the header line below.
src_index, dst_index, proto_index, bpp_in_index, bpp_out_index, g_bpp_in_index, g_bpp_out_index = 0, 0, 0, 0, 0, 0, 0
with open('src/out_file/new_infile.csv') as f:
    new_way = True
    l_index = 0
    for line in f:
        # If first line (header)
        if (l_index == 0):
            src_index, dst_index, proto_index, bpp_in_index, bpp_out_index, g_bpp_in_index, g_bpp_out_index = define_index(line)
            l_index += 1
            continue

        # Split line into list
        l = re.split(r',|\n|\t', line)

        # Keep flow TCP and UDP only
        if (l[proto_index] != '6') and (l[proto_index] != '17'):
            continue

        # Cluster key: (source, protocol, bpp out, bpp in)
        if new_way:
            temp_key = (l[src_index], l[proto_index], int(l[g_bpp_out_index]), int(l[g_bpp_in_index]),)
        else:
            temp_key = (l[src_index], l[proto_index], int(l[bpp_out_index]), int(l[bpp_in_index]),)

        # Generate GUID for flow.
        # NOTE: hash() of a tuple containing str values is salted per interpreter
        # process unless PYTHONHASHSEED is fixed, so GUIDs stored in dumps from an
        # earlier kernel session will not match GUIDs computed in this one.
        flow_guid = hash(temp_key).__str__()

        temp_cluster = cluster
        cluster_filenum = -1

        # Add to cluster
        if flow_guid not in cluster:
            temp_cluster,cluster_filenum = find_cluster(flow_guid, num_clusters)
            if cluster_filenum == -1:
                # Unknown GUID: start a new record in the in-memory cluster
                temp_cluster = cluster
                temp_cluster[flow_guid] = dict(cls_def=temp_key, ddi=0, dd_set = set(), list_flows=[(l[dst_index],)])
                len_cluster += 1
            else:
                # The record lives in an on-disk dump: the destination must be
                # recorded there too, otherwise this flow is silently dropped.
                temp_cluster[flow_guid]['list_flows'].append((l[dst_index],))
        else:
            temp_cluster[flow_guid]['list_flows'].append((l[dst_index],))

        # Update DDI count
        if new_way:
            temp_cluster[flow_guid]['dd_set'].add(client.getASN(l[dst_index]))
        else:
            temp_cluster[flow_guid]['dd_set'].add(ipaddress.ip_network(l[dst_index]+'/16', strict= False).network_address)
        temp_cluster[flow_guid]['ddi'] = len(temp_cluster[flow_guid]['dd_set'])

        # Check if DDI meet threshold, then it is P2P
        if temp_cluster[flow_guid]['ddi'] >= DD_THRESHOLD:
            p2p_clusters[flow_guid] = temp_cluster[flow_guid]
            list_p2p.add(l[src_index])

        # The record came from a dump: write the updated dump back
        if cluster_filenum != -1:
            save_cluster(temp_cluster, cluster_filenum)

        # Dump the in-memory cluster to file every 2,000,000 new records
        if len_cluster % 2000000 == 0 and cluster_filenum == -1:
            save_cluster(cluster, num_clusters)
            cluster = {}
            num_clusters += 1
            print("Dumped clusters: ", num_clusters)

        # Increment line index (only lines that passed the protocol filter are counted)
        l_index += 1
        if l_index % 500000 == 0:
            print("Processed: ", l_index)

Processed:  500000
Processed:  1000000
Processed:  1500000
Processed:  2000000
Processed:  2500000
Processed:  3000000


In [10]:
# Drop destinations whose ASN is contacted by a large share of the P2P sources.
# Pass 1 records, per ASN of each non-local destination, the set of source
# addresses that contacted it. Pass 2 removes destinations whose ASN covers at
# least COVERAGE_THRESHOLD of all P2P sources and recomputes each cluster's
# destination diversity from what remains.
map_ip = {}         # non-local destination address -> ASN
coverage = {}       # ASN -> set of source addresses that contacted it
remove_set = set()  # destination addresses removed for high coverage

# Parsed once; it does not change inside the loops.
local_network = ipaddress.ip_network(LOCAL_SUBNET, strict=False)

# The loop variable is deliberately NOT called `cluster`: that name is the
# notebook-wide in-memory cluster dict and must not be overwritten here.
for flow_guid, p2p_cluster in p2p_clusters.items():
    # Get the list of flows
    flows = p2p_cluster['list_flows']

    # Iterate through each flow
    for flow in flows:
        # Ignore local IPs
        if ipaddress.ip_address(flow[0]) not in local_network:
            asn = client.getASN(flow[0])
            map_ip[flow[0]] = asn
            if asn not in coverage:
                coverage[asn] = set()
            coverage[asn].add(p2p_cluster['cls_def'][0])

# COVERAGE_THRESHOLD = (1-(len(list_p2p)/len(map_ip)))*ARG_COVERAGE_THRESHOLD
COVERAGE_THRESHOLD = 0.6

# Re-iterate through p2p clusters to remove destinations with high coverage
for flow_guid, p2p_cluster in p2p_clusters.items():
    # Get the list of flows
    flows = p2p_cluster['list_flows']

    new_flows = []
    new_dd_set = set()

    # Iterate through each flow
    for flow in flows:
        # Get IP ASN (None for local destinations, which were not mapped above)
        asn = map_ip.get(flow[0])
        # Check coverage
        if asn is not None and len(coverage[asn])/len(list_p2p) >= COVERAGE_THRESHOLD:
            remove_set.add(flow[0])
            continue
        new_flows.append(flow)
        # new_dd_set.add(ipaddress.ip_network(flow[0]+'/16', strict= False).network_address)
        new_dd_set.add(client.getASN(flow[0]))
    # Update the cluster with the new flows
    p2p_cluster['list_flows'] = new_flows
    # Update the DDI count
    p2p_cluster['dd_set'] = new_dd_set
    p2p_cluster['ddi'] = len(new_dd_set)


# Community

In [11]:
# Calculate ddr_i that divide dd_i by the number of dest ip
def get_len_list_ip(flows):
    """Return the set of distinct first elements of the entries in ``flows``.

    Despite the name, the result is the set itself, not its size.
    """
    return {entry[0] for entry in flows}

def ddr_i(ddi, dsts):
    """Return ``ddi`` divided by the number of distinct destinations in ``dsts``.

    Yields 0 when ``dsts`` holds no destination at all.
    """
    distinct_count = len(get_len_list_ip(dsts))
    return ddi / distinct_count if distinct_count != 0 else 0

# Given 2 cluster, if they have same stats, they are a pair, then calculate MCR of 2 clusters
def mcr(cluster1, cluster2):
    """Return the mutual contact ratio of two clusters.

    Two clusters only form a pair when positions 1, 2 and 3 of their ``cls_def``
    keys are equal (position 0, the source address, is ignored); otherwise the
    result is 0. For a pair, the result is the Jaccard index of their
    destination sets: shared destinations divided by all distinct destinations.
    0 is also returned when both destination sets are empty.

    An earlier variant, kept out of the code, accepted bpp values within a
    0.75-1.25 ratio of each other instead of requiring equality.
    """
    key_a = cluster1['cls_def']
    key_b = cluster2['cls_def']

    # Compared in order 1, 2, 3, stopping at the first mismatch.
    if any(key_a[pos] != key_b[pos] for pos in (1, 2, 3)):
        return 0

    peers_a = get_len_list_ip(cluster1['list_flows'])
    peers_b = get_len_list_ip(cluster2['list_flows'])
    shared = len(peers_a & peers_b)
    combined = len(peers_a | peers_b)

    return shared / combined if combined != 0 else 0

# Main construct graph function
def construct_graph(G, clusters, MCR_THRESHOLD=MCR_THRESHOLD):
    """Populate ``G`` with one node per cluster and MCR-weighted edges.

    Every cluster becomes a node keyed by its GUID, carrying the cluster record
    (``cls``), its ``ddr_i`` and its ``cls_def``. Each unordered pair of
    clusters is evaluated once; an edge with attribute ``mcr_ij`` is added when
    the pair's MCR is strictly greater than ``MCR_THRESHOLD``.

    Returns:
        The same graph object ``G``.
    """
    entries = list(clusters.items())
    total = len(entries)
    for pos, (guid, cls) in enumerate(entries):
        G.add_node(guid, cls=cls, ddr_i=ddr_i(cls['ddi'], cls['list_flows']), cls_def=cls['cls_def'])
        # Pair this cluster only with the ones that come after it.
        for later in range(pos + 1, total):
            guid_j, cls_j = entries[later]
            mcr_ij = mcr(cls, cls_j)
            if mcr_ij > MCR_THRESHOLD:
                G.add_edge(guid, guid_j, mcr_ij=mcr_ij)
    return G
                
# Louvain community detection
def louvain_community_detection(G, seed=None):
    """Partition ``G`` into communities with the Louvain method.

    Edges are weighted by their ``mcr_ij`` attribute.

    Args:
        G: graph built by ``construct_graph``.
        seed: optional random seed forwarded to networkx. Louvain is a
            randomised algorithm; pass a fixed value for reproducible
            partitions. The default ``None`` keeps the previous behaviour.

    Returns:
        The list of communities (sets of nodes) returned by networkx.
    """
    return nx.community.louvain_communities(G, weight='mcr_ij', seed=seed)

# Clique detection for each community
def clique_detection(G, comm):
    """Collect source addresses of community nodes found in cliques of 3+ nodes.

    Cliques are enumerated on the subgraph induced by the remaining community
    nodes. Members of every clique with at least three nodes are recorded, then
    removed from the community, and the enumeration restarts on what is left.

    Args:
        G: graph whose nodes carry a ``cls`` attribute with a ``cls_def`` key.
        comm: set of node ids forming one community (it is copied, not mutated).

    Returns:
        Set of ``cls_def[0]`` values of all nodes seen in a clique of size >= 3.

    NOTE(review): a pass stops at the first clique smaller than three nodes, and
    the whole search ends if that happens on the first clique of a pass. This
    appears to assume larger cliques are yielded first — confirm against the
    ordering guarantees of ``find_cliques``; otherwise qualifying cliques that
    come later in the enumeration are missed.
    """
    temp_comm = comm.copy()
    list_bot = set()
    flag_exit = False
    while True:
        # With fewer than three nodes left no clique of size >= 3 can exist.
        if len(temp_comm) < 3:
            break
        iterator = nx.clique.find_cliques(G.subgraph(temp_comm))
        set_vertex = set()  # nodes covered by qualifying cliques in this pass
        index = 0           # number of qualifying cliques seen in this pass
        while True:
            try:
                node = next(iterator)  # one clique, as a list of node ids
                # print("Clique: ", node)
                if len(node) < 3:
                    # First clique of the pass is already too small: stop the search entirely.
                    if index == 0:
                        flag_exit = True
                    break
                for i in node:
                    set_vertex.add(i)
                    list_bot.add(G.nodes[i]['cls']['cls_def'][0])
                    # print("Add botnet: ", G.nodes[i]['cls']['cls_def'][0])
                index += 1
            except StopIteration:
                break
        # Continue with the nodes not yet assigned to a qualifying clique.
        temp_comm = temp_comm.difference(set_vertex)
        if flag_exit:
            break
    return list_bot

# Other way to evaluate, by examine degree of each node. 

# Evaluate community and output botnet list, include check comm mcr with threshold, cliques detection
def evaluate_community(G, comm_set, new_way=True, AVG_DD_THRESHOLD=AVG_DD_THRESHOLD, AVG_MCR_THRESHOLD=AVG_MCR_THRESHOLD):
    """Select botnet communities and return the source addresses they contain.

    A community is kept when its average ``ddr_i`` is at least
    ``AVG_DD_THRESHOLD`` and its average MCR is at least ``AVG_MCR_THRESHOLD``.
    The average MCR is the sum of ``mcr_ij`` over the edges returned by
    ``G.edges(comm)`` divided by the number of node pairs in the community.
    Communities with fewer than two nodes have no pair and are skipped.

    Args:
        G: graph built by ``construct_graph``.
        comm_set: iterable of communities (collections of node ids).
        new_way: when True, only nodes found by ``clique_detection`` in a kept
            community are reported; when False, every node of a kept community is.
        AVG_DD_THRESHOLD: minimum average ``ddr_i`` of a community.
        AVG_MCR_THRESHOLD: minimum average MCR of a community.

    Returns:
        Set of source addresses (``cls_def[0]``) classified as bots.
    """
    print('Evaluate community', len(comm_set))
    list_bot = set()
    bot_comm = []
    for comm in comm_set:
        size = len(comm)
        # The pair count size*(size-1)/2 is zero for a single node, which used to
        # raise ZeroDivisionError when that node still had edges.
        if size < 2:
            continue

        avg_ddri = sum(G.nodes[node]['ddr_i'] for node in comm) / size
        if avg_ddri < AVG_DD_THRESHOLD:
            continue

        # NOTE(review): G.edges(comm) also returns edges that leave the community,
        # while the denominator counts pairs inside it — confirm this is intended.
        edges = G.edges(comm)
        if len(edges) == 0:
            continue
        total_mcr = 0
        for edge in edges:
            total_mcr += G[edge[0]][edge[1]]['mcr_ij']
        avg_mcr = (2 * total_mcr) / (size * (size - 1))
        if avg_mcr < AVG_MCR_THRESHOLD:
            continue
        bot_comm.append(comm)

    print('Botnet community', len(bot_comm))

    if not new_way:
        for comm in bot_comm:
            for node in comm:
                list_bot.add(G.nodes[node]['cls']['cls_def'][0])
    else:
        for comm in bot_comm:
            list_bot.update(clique_detection(G, comm))

    return list_bot

# Evaluate

In [12]:
# Column names expected in the header of the bot mapping file.
ADDR_LABEL = "Addr"
TYPE_LABEL = "Type"

# Define mapping
mapping = set()       # every source/destination address seen in the flow file
with_mapping = set()  # addresses listed in the bot mapping file
mapper = {}           # bot address -> value of its Type column

src_index, dst_index = 0, 0
with open('src/out_file/new_infile.csv') as f:
    l_index = 0
    for line in f:
        # If first line (header)
        if (l_index == 0):
            # Slicing avoids binding `_`, which IPython uses for the last output.
            src_index, dst_index = define_index(line)[:2]
            l_index += 1
            continue

        # Split with the same delimiters define_index applies to the header;
        # splitting rows on spaces as well would shift the columns of any row
        # that contains a space in an earlier field.
        l = re.split(r',|\n|\t', line)
        mapping.add(l[src_index].strip())
        mapping.add(l[dst_index].strip())

addr_index, type_index = -1, -1
with open('../FullDS/bot_mapping.txt') as f:
    l_index = 0
    for line in f:
        l = re.split(r',|\n|\t| ', line)
        if (l_index == 0):
            # Header: locate the address and type columns (first occurrence).
            if ADDR_LABEL in l:
                addr_index = l.index(ADDR_LABEL)
            if TYPE_LABEL in l:
                type_index = l.index(TYPE_LABEL)
            l_index += 1
            continue

        # Blank lines carry no address; skip them instead of reporting them.
        if not line.strip():
            continue

        try:
            # Raises ValueError when the field is not a valid IP address.
            ipaddress.ip_address(l[addr_index])
        except ValueError:
            print(f"Invalid IP address: {l[addr_index]}")
        else:
            with_mapping.add(l[addr_index])
            if type_index != -1:
                mapper[l[addr_index]] = l[type_index]

Invalid IP address: 
Invalid IP address: 
Invalid IP address: 


In [13]:
def get_scores(flow_files, bots_ip_files, local_subnet, bots_set):
    """Score a predicted bot set against the ground truth.

    Args:
        flow_files: either a collection (list/set/tuple/frozenset) of all
            addresses seen, or the path of a flow CSV from which source and
            destination addresses are read.
        bots_ip_files: either a collection of ground-truth bot addresses, or
            the path of a mapping file with an ``Addr`` column.
        local_subnet: network in CIDR notation; only addresses inside it are
            counted as true negatives.
        bots_set: iterable of addresses predicted to be bots.

    Returns:
        ``(precision, recall, f1_score, [[tp, fp], [fn, tn]])``. A ratio whose
        denominator is zero is reported as 0.

    Raises:
        ValueError: if ``local_subnet`` is not a valid network.
    """
    containers = (list, set, tuple, frozenset)
    mapping = set()
    info_bots = set()
    # Accept any iterable of predictions; the set algebra below needs a set.
    bots_set = set(bots_set)

    # Read file -> get all IPs
    if isinstance(flow_files, containers):
        mapping.update(flow_files)
    else:
        with open(flow_files) as f:
            src_index, dst_index = 0, 0
            for l_index, line in enumerate(f):
                # If first line (header)
                if l_index == 0:
                    src_index, dst_index = define_index(line)[:2]
                    continue
                l = re.split(r',|\t|\n', line)
                mapping.add(l[src_index])
                mapping.add(l[dst_index])

    if isinstance(bots_ip_files, containers):
        info_bots.update(bots_ip_files)
    else:
        with open(bots_ip_files) as f:
            addr_index = 0
            for l_index, line in enumerate(f):
                l = re.split(r',|\n|\t| ', line)
                if l_index == 0:
                    if ADDR_LABEL in l:
                        addr_index = l.index(ADDR_LABEL)
                    continue
                # Blank lines carry no address; skip them instead of reporting them.
                if not line.strip():
                    continue
                try:
                    ipaddress.ip_address(l[addr_index])
                except ValueError:
                    print(f"Invalid IP address: {l[addr_index]}")
                else:
                    info_bots.add(l[addr_index])

    # Parsed once, outside the loop; an invalid subnet now fails loudly instead
    # of silently discarding every address.
    local_network = ipaddress.ip_network(local_subnet, strict=False)

    def _is_local(ip):
        """True when ``ip`` parses as an address inside the local network."""
        try:
            return ipaddress.ip_address(ip) in local_network
        except ValueError:
            return False

    # Remove public IPs (and unparsable entries) from the mapping
    mapping = {ip for ip in mapping if _is_local(ip)}

    # True Positive: real bots that were predicted
    number_tp = len(info_bots.intersection(bots_set))

    # True Negative: local addresses that are neither real nor predicted bots
    number_tn = len(mapping.difference(info_bots).difference(bots_set))

    # False Positive: predicted but not real bots
    number_fp = len(bots_set.difference(info_bots))

    # False Negative: real bots that were not predicted
    number_fn = len(info_bots.difference(bots_set))

    precision = number_tp / (number_tp + number_fp) if (number_tp + number_fp) != 0 else 0
    recall = number_tp / (number_tp + number_fn) if (number_tp + number_fn) != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return precision, recall, f1_score, [[number_tp, number_fp], [number_fn, number_tn]]

# Skopt

In [14]:
from sklearn.base import BaseEstimator, ClassifierMixin

class DetectSystem(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around the graph/community detection stage.

    The estimator does not cluster flows itself: ``predict`` works on the
    notebook-global ``p2p_clusters``, which must have been populated by the
    clustering cells beforehand.

    Parameters stored on the instance:
        DD, RM_POR: kept for parameter bookkeeping; not read by ``predict``.
        MCR: minimum MCR for an edge between two clusters.
        AVG_DD: minimum average ``ddr_i`` of a botnet community.
        AVG_MCR: minimum average MCR of a botnet community.
        new_way: forwarded to ``evaluate_community``.
    """

    def __init__(self, DD=17, MCR=0.1, RM_POR =0.6, AVG_DD=0.1, AVG_MCR=0.1, new_way=True):
        self.DD = DD
        self.MCR = MCR
        self.RM_POR = RM_POR
        self.AVG_DD = AVG_DD
        self.AVG_MCR = AVG_MCR
        self.new_way = new_way

    def fit(self, X, y):
        """Store ``X`` and ``y`` on the instance; no training is performed."""
        self.X_ = X
        self.y_ = y
        return self

    def predict(self, X):
        """Return the set of source addresses classified as bots.

        ``X`` is ignored. The graph is built from the global ``p2p_clusters``,
        split into Louvain communities and filtered by ``evaluate_community``
        using this instance's thresholds.
        """
        G = construct_graph(nx.Graph(), p2p_clusters, MCR_THRESHOLD=self.MCR)
        communities = louvain_community_detection(G)

        list_ip_bots = evaluate_community(G, communities, new_way=self.new_way, AVG_DD_THRESHOLD=self.AVG_DD, AVG_MCR_THRESHOLD=self.AVG_MCR)
        return list_ip_bots

    @staticmethod
    def clean():
        """Reset the shared clustering state and delete cluster dumps on disk.

        Declared static so that both ``DetectSystem.clean()`` and
        ``instance.clean()`` work. Removes the ``out_cluster/cluster_<n>.pickle``
        files, which is the naming scheme ``save_cluster`` writes.
        """
        global cluster, p2p_clusters, list_p2p, len_cluster, num_clusters
        cluster = {}
        p2p_clusters = {}
        list_p2p = set()
        len_cluster = 0
        num_clusters = 0

        dump_dir = 'out_cluster'
        if not os.path.isdir(dump_dir):
            return
        try:
            for name in os.listdir(dump_dir):
                if name.startswith('cluster_') and name.endswith('.pickle'):
                    os.remove(os.path.join(dump_dir, name))
        except OSError as e:
            print(f"Error removing old clusters: {e}")

In [43]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args

# Search space of the optimiser; the order defines the order of a point's values.
space = [
    Real(0.001, 0.05, name='MCR'),
    Real(0.8, 1.0, name='AVG_DD'),
    Real(0.001, 0.1, name='AVG_MCR'),
]

@use_named_args(space)
def objective(**params):
    """Run detection with the given thresholds and return the negated F1 score.

    ``params`` holds ``MCR``, ``AVG_DD`` and ``AVG_MCR``. The score is computed
    by ``get_scores`` against the notebook-global ``mapping`` and
    ``with_mapping``; it is negated because the optimiser minimises.
    """
    detector = DetectSystem(new_way=True, DD=17, RM_POR=0.6, **params)
    predicted_bots = detector.predict(X=None)
    scores = get_scores(mapping, with_mapping, LOCAL_SUBNET, predicted_bots)
    print("MCR: ", params['MCR'], "AVG_DD: ", params['AVG_DD'], "AVG_MCR: ", params['AVG_MCR'])
    print("Scores: ", scores)
    f1_score = scores[2]
    return -f1_score

In [45]:
from skopt import gp_minimize
# Starting point, in the order of `space`: MCR, AVG_DD, AVG_MCR.
init_space = [0.001, 0.8, 0.07]
# init_space = [0.8]
# Evaluate the starting point once so its score can be handed to the optimiser as y0.
init_scores = [objective(init_space)]
# 500 evaluations of `objective`, seeded with the starting point above.
# NOTE(review): `objective` is not seeded (Louvain is randomised), so repeated
# evaluations of the same point may differ despite random_state=42 — confirm.
cls_gp = gp_minimize(objective, space, n_calls=500, n_jobs=-1, random_state=42, x0=init_space, y0=init_scores)

Evaluate community 124
Botnet community 8
MCR:  0.001 AVG_DD:  0.8 AVG_MCR:  0.07
Scores:  (0.8878504672897196, 0.9595959595959596, 0.9223300970873787, [[95, 12], [4, 65425]])
Evaluate community 147
Botnet community 5
MCR:  0.040030606356151424 AVG_DD:  0.8366869579732328 AVG_MCR:  0.07818940902700418
Scores:  (0.8962264150943396, 0.9595959595959596, 0.926829268292683, [[95, 11], [4, 65426]])
Evaluate community 145
Botnet community 13
MCR:  0.03024565773937787 AVG_DD:  0.8891665505707183 AVG_MCR:  0.01089751666598229
Scores:  (0.9545454545454546, 0.8484848484848485, 0.8983957219251337, [[84, 4], [15, 65433]])
Evaluate community 144
Botnet community 16
MCR:  0.023503195706327498 AVG_DD:  0.8667417222278044 AVG_MCR:  0.01514381497427214
Scores:  (0.8867924528301887, 0.9494949494949495, 0.9170731707317074, [[94, 12], [5, 65425]])
Evaluate community 146
Botnet community 9
MCR:  0.032893535174493796 AVG_DD:  0.8112823158054201 AVG_MCR:  0.07247787845441567
Scores:  (0.8962264150943396, 0.95

In [46]:
# Best objective value reported by the optimiser; `objective` returns the negated F1 score.
cls_gp.fun

np.float64(-0.9353233830845771)