In [None]:
import os
import pickle

import pandas as pd
import numpy as np

from ipaddress import IPv4Address
from tqdm.notebook import tqdm
from datetime import datetime
from scipy.stats import entropy

In [None]:
# IPv4 addresses / CIDR blocks that are treated as "inside" hosts. The list is
# expanded into individual address strings by inflate_subnet_ip() below.
# NOTE(review): 222.111.164.0 is not aligned to a /21 boundary (the enclosing
# /21 network starts at 222.111.160.0) - confirm the intended range.
INSIDE_IP_LIST = ["10.0.0.0/8", "172.22.0.0/16", "192.168.0.0/16", "222.111.164.0/21",
              "113.198.184.0/21", "113.198.208.0/20",
              "203.246.112.0/22", "203.246.116.0/24", "210.121.152.0/21",
              "210.123.32.0/20", "211.192.0.0/16", "1.209.0.0/16"]

def inflate_subnet_ip(inside_ip_list):
    """Expand IPv4 addresses and CIDR blocks into a set of dotted-quad strings.

    Args:
        inside_ip_list: iterable of strings, each either a single address
            ("8.8.8.8") or a CIDR block ("192.168.0.0/16").

    Returns:
        set[str]: every individual address covered by the input. Entries
        without a '/' are added verbatim (they are not validated).

    Raises:
        ValueError: if a prefix length is outside 0..32 or an address is
            malformed (raised by ``IPv4Address`` as ``AddressValueError``,
            a ``ValueError`` subclass).

    Note:
        The base address of a CIDR entry is masked down to the network
        boundary, so an entry with host bits set (e.g. "192.168.1.6/30")
        expands to the network that contains it (192.168.1.4 - 192.168.1.7)
        instead of a shifted range that starts at the given address.
        Large blocks are fully materialised: a /8 yields ~16.7M strings.
    """
    total_ip_set = set()
    for inside_ip in inside_ip_list:
        if '/' not in inside_ip:
            total_ip_set.add(inside_ip)
            continue

        base_ip, cidr = inside_ip.split('/')
        prefix_len = int(cidr)
        if not 0 <= prefix_len <= 32:
            raise ValueError("invalid IPv4 prefix length in {!r}".format(inside_ip))

        block_size = 2 ** (32 - prefix_len)
        # Clear the host bits so the range starts at the network address.
        start_address = int(IPv4Address(base_ip)) & ~(block_size - 1)
        for subnet_address in range(start_address, start_address + block_size):
            total_ip_set.add(str(IPv4Address(subnet_address)))
    return total_ip_set

# Set of every individual "inside" address string; used for membership tests
# (`ip in inside_ip_set`) when classifying flow endpoints.
inside_ip_set = inflate_subnet_ip(INSIDE_IP_LIST)

In [None]:
# Directory holding the per-day NetFlow CSV files (listed with os.listdir further down).
data_path = r"C:\jupyter_project\Workshop\kookmin2Netflow_dataset(0221)"
# Directory the feature / key arrays are written to with np.save.
save_path = r"D:\behavior(no rest)"
# Alternative output directory (disabled).
# save_path = r"D:\tmp_save"

In [None]:
# CSV files used to build the training profiles.
train_data = [r"C:\jupyter_project\Workshop\kookmin2Netflow_dataset(0221)\2023-02-14.csv",
             r"C:\jupyter_project\Workshop\kookmin2Netflow_dataset(0221)\2023-02-15.csv"]

# CSV files reserved for testing. The path must be a raw string: without the
# r-prefix "\202" is parsed as an octal escape and the file name is corrupted.
test_data = [r"C:\jupyter_project\Workshop\kookmin2Netflow_dataset(0221)\2023-02-16.csv"]

In [None]:
class Profile:
    """Column-oriented container for the flows that belong to one profile.

    Each attribute in ``attr_list`` maps to a list; ``add`` appends one value
    per attribute, so index ``k`` across all lists describes the k-th flow.
    ``profile[attr]`` returns the full column for ``attr``.
    """

    # Column names, in the order used by debug() output.
    attr_list = ['target_ip', 'target_port', 'opposite_ip', 'opposite_port', 'duration',
                 'target_pkts', 'opposite_pkts', 'target_bytes', 'opposite_bytes',
                'start_time', 'end_time']

    # Converter applied to each raw value in add(). Integer columns go through
    # float() first so textual values such as "80.0" are accepted.
    attr_typing_map = {
        'target_ip': lambda x: str(x),
        'target_port': lambda x: int(float(x)),
        'opposite_ip': lambda x: str(x),
        'opposite_port': lambda x: int(float(x)),
        'duration': lambda x: float(x),
        'target_pkts': lambda x: int(float(x)),
        'opposite_pkts': lambda x: int(float(x)),
        'target_bytes': lambda x: int(float(x)),
        'opposite_bytes': lambda x: int(float(x)),
        'start_time' : lambda x: str(x),
        'end_time' : lambda x: str(x)
    }

    def __init__(self, profile_key):
        """Create an empty profile identified by ``profile_key``."""
        self.__profile_key = profile_key
        self.__table = {attr: [] for attr in self.attr_list}

    def add(self, attr_dict: dict):
        """Append one flow.

        Args:
            attr_dict: mapping that contains every name in ``attr_list``.

        Raises:
            KeyError: if an attribute is missing from ``attr_dict``.
            ValueError / TypeError: if a value cannot be converted.

        All values are converted before anything is stored, so a failing
        conversion never leaves the columns with different lengths.
        """
        converted = [self.attr_typing_map[attr](attr_dict[attr]) for attr in self.attr_list]
        for attr, value in zip(self.attr_list, converted):
            self.__table[attr].append(value)

    @property
    def flow_count(self) -> int:
        """Number of flows stored so far (length of every column)."""
        return len(self.__table[self.attr_list[0]])

    def debug(self):
        """Return the profile as CSV-like text: a header line plus one line per flow."""
        lines = [self.__str_attr()]
        lines.extend(self.__str_row(idx) for idx in range(self.flow_count))
        return "\n".join(lines) + "\n"

    def __str_attr(self) -> str:
        # Header line: attribute names joined by commas.
        return ",".join(self.attr_list)

    def __str_row(self, idx: int) -> str:
        # One flow rendered as comma-separated values in attr_list order.
        return ",".join(str(self.__table[attr][idx]) for attr in self.attr_list)

    def __len__(self) -> int:
        # NOTE: this is the number of attribute columns, not the number of
        # flows; use ``flow_count`` for the latter.
        return len(self.table)

    def __getitem__(self, attr) -> list:
        """Return the list of stored values for column ``attr``."""
        return self.table[attr]

    @property
    def table(self) -> dict:
        """The underlying ``{attribute: [values...]}`` mapping (not a copy)."""
        return self.__table

    @property
    def profile_key(self) -> str:
        """Key passed to the constructor."""
        return self.__profile_key

    @property
    def target_ip(self) -> str:
        """Portion of ``profile_key`` before the first underscore."""
        return self.profile_key.split('_')[0]


In [None]:
def get_int_time(start_time: str):
    """Parse ``start_time`` with the module-level ``datetime_format`` and
    return the result of ``datetime.timestamp()`` (a float, in seconds)."""
    return datetime.strptime(start_time, datetime_format).timestamp()

def get_str_time(int_time):
    """Format the timestamp ``int_time`` (as accepted by
    ``datetime.fromtimestamp``) using the module-level ``datetime_format``."""
    moment = datetime.fromtimestamp(int_time)
    return moment.strftime(datetime_format)

In [None]:
def get_int(st_list, end_list):
    """Compute per-pair time differences between consecutive flows.

    For each position ``k`` the value is
    ``get_int_time(end_list[k]) - get_int_time(st_list[k + 1])``.

    Returns:
        The scalar ``0`` when ``st_list`` holds exactly one entry, otherwise a
        list with ``len(end_list) - 1`` differences.

    NOTE(review): the difference is "end of flow k minus start of flow k+1",
    so it is negative when the next flow starts after the previous one ended -
    confirm that this sign is intended.
    """
    if len(st_list) == 1:
        return 0
    return [
        get_int_time(end_list[pos]) - get_int_time(st_list[pos + 1])
        for pos in range(len(end_list) - 1)
    ]
        

# Names of the features computed for every profile. Each name is looked up in
# feature_func_map, and the position in this list is the row index used in
# feature_matrix by the extraction loops below.
feature_list = ['target_ip', 'target_key_port', 'opposite_key_port', 'card_target_port', 'card_opposite_ip',
                'card_opposite_port',
                'sum_target_pkts', 'sum_opposite_pkts', 'sum_target_bytes', 'sum_opposite_bytes', 'sum_dur',
                'avg_target_pkts', 'avg_opposite_pkts', 'avg_target_bytes', 'avg_opposite_bytes', 'avg_dur',
                'std_target_pkts', 'std_opposite_pkts', 'std_target_bytes', 'std_opposite_bytes', 'std_dur',
                'len_flows', 'pst_per_flows',
                'etp_target_pkts', 'etp_opposite_pkts', 'etp_target_bytes', 'etp_opposite_bytes',
                'in_bytes_per_pkt', 'in_pkts_per_byte', 'out_bytes_per_pkt', 'out_pkts_per_byte',
                'in_out_bytes', 'in_out_pkts', 'out_in_bytes', 'out_in_pkts',
                'in_bytes_per_sec', 'in_pkts_per_sec', 'out_bytes_per_sec', 'out_pkts_per_sec',
                'etp_opposite_ip', 'etp_opposite_port', 'etp_target_port', 'etp_pattern',
                'freq_target_port', 'freq_opposite_port', 'freq_opposite_ip',
                'sum_int', 'avg_int', 'std_int', 'etp_int']

def _most_common(values):
    """Return ``(value, count)`` for the most frequent element of ``values``.

    Ties are resolved in favour of the smallest value (first in np.unique order).
    """
    uniques, counts = np.unique(values, return_counts=True)
    top = int(np.argmax(counts))
    return uniques[top], counts[top]


def _flow_patterns(x):
    """Return one "tpkts_opkts_tbytes_obytes" string per flow of profile ``x``."""
    return ["{}_{}_{}_{}".format(*row)
            for row in zip(x['target_pkts'], x['opposite_pkts'],
                           x['target_bytes'], x['opposite_bytes'])]


# Maps each feature name to a callable that takes a profile (anything that
# supports ``x[attr]`` -> list of per-flow values, plus ``x.target_ip`` for the
# 'target_ip' feature) and returns the feature value.
#
# Fixes relative to the previous version:
#   * 'pst_per_flows' / 'etp_pattern' formatted the entire columns into a single
#     string, so the most-common-pattern count was always 1; patterns are now
#     built per flow.
#   * 'freq_*' divided the most common *value* (e.g. the port number) by the
#     flow count; they now divide its *occurrence count*.
#   * 'freq_opposite_ip' read the 'target_port' column instead of 'opposite_ip'.
feature_func_map = {
    # Shift by (32 - 32) == 0 bits: the address is kept at full /32 precision.
    'target_ip':
        lambda x: str(IPv4Address(int(IPv4Address(x.target_ip)) >> (32 - 32) << (32 - 32))),
    # Most frequent port on each side.
    'target_key_port':
        lambda x: _most_common(x['target_port'])[0],
    'opposite_key_port':
        lambda x: _most_common(x['opposite_port'])[0],
    # Cardinalities (number of distinct values).
    'card_target_port':
        lambda x: len(set(x['target_port'])),
    'card_opposite_ip':
        lambda x: len(set(x['opposite_ip'])),
    'card_opposite_port':
        lambda x: len(set(x['opposite_port'])),
    # Totals.
    'sum_target_pkts':
        lambda x: np.sum(x['target_pkts']),
    'sum_opposite_pkts':
        lambda x: np.sum(x['opposite_pkts']),
    'sum_target_bytes':
        lambda x: np.sum(x['target_bytes']),
    'sum_opposite_bytes':
        lambda x: np.sum(x['opposite_bytes']),
    'sum_dur':
        lambda x: np.sum(x['duration']),
    # Means.
    'avg_target_pkts':
        lambda x: np.mean(x['target_pkts']),
    'avg_opposite_pkts':
        lambda x: np.mean(x['opposite_pkts']),
    'avg_target_bytes':
        lambda x: np.mean(x['target_bytes']),
    'avg_opposite_bytes':
        lambda x: np.mean(x['opposite_bytes']),
    'avg_dur':
        lambda x: np.mean(x['duration']),
    # Population standard deviations (np.std default, ddof=0).
    'std_target_pkts':
        lambda x: np.std(x['target_pkts']),
    'std_opposite_pkts':
        lambda x: np.std(x['opposite_pkts']),
    'std_target_bytes':
        lambda x: np.std(x['target_bytes']),
    'std_opposite_bytes':
        lambda x: np.std(x['opposite_bytes']),
    'std_dur':
        lambda x: np.std(x['duration']),
    'len_flows':
        lambda x: len(x['opposite_ip']),
    # Number of flows that share the most common pkts/bytes pattern.
    'pst_per_flows':
        lambda x: _most_common(_flow_patterns(x))[1],
    # Entropy of the value-count distribution of each column.
    'etp_target_pkts':
        lambda x: entropy(np.unique(x['target_pkts'], return_counts=True)[1]),
    'etp_opposite_pkts':
        lambda x: entropy(np.unique(x['opposite_pkts'], return_counts=True)[1]),
    'etp_target_bytes':
        lambda x: entropy(np.unique(x['target_bytes'], return_counts=True)[1]),
    'etp_opposite_bytes':
        lambda x: entropy(np.unique(x['opposite_bytes'], return_counts=True)[1]),
    # Ratios; the denominator is floored at 0.1 to avoid division by zero.
    'in_bytes_per_pkt':
        lambda x: np.sum(x['target_bytes']) / max(np.sum(x['target_pkts']), 0.1),
    'in_pkts_per_byte':
        lambda x: np.sum(x['target_pkts']) / max(np.sum(x['target_bytes']), 0.1),
    'out_bytes_per_pkt':
        lambda x: np.sum(x['opposite_bytes']) / max(np.sum(x['opposite_pkts']), 0.1),
    'out_pkts_per_byte':
        lambda x: np.sum(x['opposite_pkts']) / max(np.sum(x['opposite_bytes']), 0.1),
    'in_out_bytes':
        lambda x: np.sum(x['target_bytes']) / max(np.sum(x['opposite_bytes']), 0.1),
    'in_out_pkts':
        lambda x: np.sum(x['target_pkts']) / max(np.sum(x['opposite_pkts']), 0.1),
    'out_in_bytes':
        lambda x: np.sum(x['opposite_bytes']) / max(np.sum(x['target_bytes']), 0.1),
    'out_in_pkts':
        lambda x: np.sum(x['opposite_pkts']) / max(np.sum(x['target_pkts']), 0.1),
    'in_bytes_per_sec':
        lambda x: np.sum(x['target_bytes']) / max(np.sum(x['duration']), 0.1),
    'in_pkts_per_sec':
        lambda x: np.sum(x['target_pkts']) / max(np.sum(x['duration']), 0.1),
    'out_bytes_per_sec':
        lambda x: np.sum(x['opposite_bytes']) / max(np.sum(x['duration']), 0.1),
    'out_pkts_per_sec':
        lambda x: np.sum(x['opposite_pkts']) / max(np.sum(x['duration']), 0.1),
    'etp_opposite_ip':
        lambda x: entropy(np.unique(x['opposite_ip'], return_counts=True)[1]),
    'etp_opposite_port':
        lambda x: entropy(np.unique(x['opposite_port'], return_counts=True)[1]),
    'etp_target_port':
        lambda x: entropy(np.unique(x['target_port'], return_counts=True)[1]),
    # Share of flows that carry the most common pkts/bytes pattern.
    'etp_pattern':
        lambda x: _most_common(_flow_patterns(x))[1] / len(x['target_pkts']),
    # Share of flows that use the most common port / peer address.
    'freq_target_port':
        lambda x: _most_common(x['target_port'])[1] / len(x['target_port']),
    'freq_opposite_port':
        lambda x: _most_common(x['opposite_port'])[1] / len(x['opposite_port']),
    'freq_opposite_ip':
        lambda x: _most_common(x['opposite_ip'])[1] / len(x['opposite_ip']),
    # Statistics over the time differences returned by get_int().
    'sum_int':
        lambda x: np.sum(get_int(x['start_time'], x['end_time'])),
    'avg_int':
        lambda x: np.mean(get_int(x['start_time'], x['end_time'])),
    'std_int':
        lambda x: np.std(get_int(x['start_time'], x['end_time'])),
    # Guarded: entropy is only evaluated when the differences do not sum to zero.
    'etp_int':
        lambda x: entropy(get_int(x['start_time'], x['end_time'])) if np.sum(get_int(x['start_time'], x['end_time'])) != 0 else 0
}

In [None]:
# Profile attribute -> CSV column, for flows whose profiled ("target") endpoint
# is read from the `destination` column.
attribute_map = dict(
    target_ip='destination',
    target_port='dst_port',
    opposite_ip='source',
    opposite_port='src_port',
    duration='duration',
    target_pkts='out_pkts',
    opposite_pkts='in_pkts',
    target_bytes='out_bytes',
    opposite_bytes='in_bytes',
    start_time='first',
    end_time='last',
)

# Same attributes with the endpoint roles swapped: the target endpoint is read
# from the `source` column.
attribute_map_inv = dict(
    target_ip='source',
    target_port='src_port',
    opposite_ip='destination',
    opposite_port='dst_port',
    duration='duration',
    target_pkts='in_pkts',
    opposite_pkts='out_pkts',
    target_bytes='in_bytes',
    opposite_bytes='out_bytes',
    start_time='first',
    end_time='last',
)

In [None]:
def profiling(flow_list, target_ip, st_time, end_time):
    """Build a Profile from a list of raw flow rows.

    Args:
        flow_list: iterable of flow rows (lists of CSV fields) handed to add_flow().
        target_ip: address the profile is built for; first component of the key.
        st_time, end_time: timestamps accepted by get_str_time(); they are
            formatted into the profile key.

    Returns:
        tuple: ``(profile, profile_key)`` where the key has the form
        ``"<target_ip>_<start>_<end>"``.

    Flows for which add_flow() returns None (neither endpoint is an inside
    address) are skipped instead of being passed on to Profile.add(), which
    cannot handle None.
    """
    profile_key = '{}_{}_{}'.format(target_ip, get_str_time(st_time), get_str_time(end_time))
    new_pf = Profile(profile_key)
    for flow in flow_list:
        attr_dict = add_flow(flow)
        if attr_dict is None:
            continue
        new_pf.add(attr_dict)
    return (new_pf, profile_key)

def add_flow(flow: list):
    """Translate one raw flow row into a ``{profile attribute: raw value}`` dict.

    The module-level ``column_index`` maps CSV column names to positions in
    ``flow``. Which mapping is used (``attribute_map`` or ``attribute_map_inv``)
    depends on which endpoint is in ``inside_ip_set``; ``profiling_target``
    only decides which check wins when both endpoints are inside.

    Args:
        flow: list of CSV fields for a single flow.

    Returns:
        dict with one entry per profile attribute, or None when neither
        endpoint is an inside address.

    Raises:
        ValueError: if ``profiling_target`` is neither 'destination' nor
            'source' (previously this silently produced an empty dict that
            failed later inside Profile.add()).
    """
    sip, dip = flow[column_index['source']], flow[column_index['destination']]
    sip_inside = sip in inside_ip_set
    dip_inside = dip in inside_ip_set

    if profiling_target == 'destination':
        if sip_inside:
            attr_map = attribute_map
        elif dip_inside:
            attr_map = attribute_map_inv
        else:
            return None
    elif profiling_target == 'source':
        if dip_inside:
            attr_map = attribute_map_inv
        elif sip_inside:
            attr_map = attribute_map
        else:
            return None
    else:
        raise ValueError("unsupported profiling_target: {!r}".format(profiling_target))

    return {attr: flow[column_index[column]] for attr, column in attr_map.items()}

In [None]:
# strptime/strftime pattern used by get_int_time() and get_str_time().
datetime_format = '%Y-%m-%d %H:%M:%S.%f'
# Read by add_flow(); the values it handles are 'destination' and 'source'.
profiling_target = 'destination'

In [None]:
# Values iterated as `min_data` by the training loop: a profile is emitted when
# a per-IP flow window reaches this many flows.
profiling_min_data = [10]
# Values iterated as `timeout` by the training loop; compared against
# differences of get_int_time() timestamps.
profiling_timeout = [60, 4*60, 12*60, 48*60]
# TODO: 3 - 48*60 still needs to be run additionally.

In [None]:
# Build training profiles and their feature matrix for every
# (min_data, timeout) combination and save both with np.save.
#
# Per target IP a window of flows is kept in flow_stack[ip]:
#   * when the window start is older than `timeout` relative to the incoming
#     flow, the window is emitted as a profile and restarted;
#   * when the window reaches `min_data` flows, it is emitted as a profile and
#     its oldest flow is dropped (sliding window).
# Windows still open after all files are read are flushed at the end.
for min_data in profiling_min_data:
    for timeout in profiling_timeout:

        print("Start MIN :", min_data, "TIMEOUT :", timeout)

        profile_list = []
        profile_key_list = []
        flow_stack = {}
        for file in train_data:
            with open(file, "r", encoding='utf-8') as f:
                col = f.readline().strip().split(',')
                # Module-level on purpose: add_flow() reads column_index.
                column_index = {i : idx for idx, i in enumerate(col)}
                for tmp_flow in tqdm(f.readlines()):
                    flow = tmp_flow.strip().split(',')
                    sip, dip = flow[column_index['source']], flow[column_index['destination']]
                    if sip not in inside_ip_set and dip not in inside_ip_set:
                        continue
                    elif sip in inside_ip_set and dip not in inside_ip_set:
                        target_ip = dip
                    else:
                        target_ip = sip
                    now_time = get_int_time(flow[column_index['first']])

                    stack = flow_stack.get(target_ip)
                    if stack is None:
                        stack = flow_stack[target_ip] = {'flow': [], 'st_time': now_time}
                    if stack['st_time'] < now_time - timeout:
                        # Window timed out: emit it and start a fresh one.
                        profile, profile_key = profiling(stack['flow'], target_ip, stack['st_time'], stack['end_time'])
                        profile_list.append(profile)
                        profile_key_list.append(profile_key)
                        stack = flow_stack[target_ip] = {'flow': [], 'st_time': now_time}

                    stack['flow'].append(flow)
                    stack['end_time'] = now_time
                    if len(stack['flow']) == min_data:
                        # Window is full: emit it, then slide by one flow.
                        profile, profile_key = profiling(stack['flow'], target_ip, stack['st_time'], stack['end_time'])
                        profile_list.append(profile)
                        profile_key_list.append(profile_key)
                        stack['flow'].pop(0)
                        if stack['flow']:
                            stack['st_time'] = get_int_time(stack['flow'][0][column_index['first']])
                        else:
                            # Only reachable when min_data == 1: nothing is left
                            # to anchor the window, so drop it.
                            del flow_stack[target_ip]


        # Flush every window that is still open. The stack must be looked up
        # with the loop variable `ip`; using `target_ip` here would re-emit the
        # window of the last processed flow once per remaining IP.
        for ip in tqdm(flow_stack):
            stack = flow_stack[ip]
            profile, profile_key = profiling(stack['flow'], ip, stack['st_time'], stack['end_time'])
            profile_list.append(profile)
            profile_key_list.append(profile_key)


        # feature_matrix[i] holds feature_list[i] for every profile, in
        # profile_list order; the key list is rebuilt in the same order.
        feature_matrix = [[] for _ in range(len(feature_list))]
        profile_key_list = []

        for profile in tqdm(profile_list):
            for i, feature in enumerate(feature_list):
                feature_matrix[i].append(feature_func_map[feature](profile))
            profile_key_list.append(profile.profile_key)

        np.save(rf"{save_path}\min_{min_data}_to_{timeout}_feature_train.npy", feature_matrix)
        np.save(rf"{save_path}\min_{min_data}_to_{timeout}_key_train.npy", profile_key_list)

In [None]:
# Re-save whatever feature_matrix / profile_key_list are currently in the
# kernel, using the current values of min_data and timeout for the file names.
# np.save appends ".npy" when the file name does not already end with it, so
# the second call writes "..._key_train.pkl.npy" (a NumPy file, not a pickle).
np.save(rf"{save_path}\min_{min_data}_to_{timeout}_feature_train.npy", feature_matrix)
np.save(rf"{save_path}\min_{min_data}_to_{timeout}_key_train.pkl", profile_key_list)


# Disabled alternative that writes through explicit file handles.
# NOTE(review): np.save expects (file, arr); the calls below pass (arr, file).
# with open(rf"{save_path}\min_{min_data}_to_{timeout}_feature_train.npy", "wb") as f:
#     np.save(feature_matrix, f)

# with open(rf"{save_path}\min_{min_data}_to_{timeout}_key_train.pkl", "wb") as f:
#     np.save(profile_key_list, f)

In [None]:
# Sliding-window profiling over every file found in data_path.
# NOTE(review): this cell looks like an older variant of the training loop and
# probably does not run as written:
#   * get_time_window() is not defined in the visible part of the notebook;
#   * profiling_timeout and profiling_min_data are lists in the config cell,
#     but are used here as scalars (`now_time - profiling_timeout`,
#     `len(...) == profiling_min_data`).
# Confirm the intended scalar values before relying on its output.
profile_list = []
profile_key_list = []
flow_stack = {}
for file in tqdm(os.listdir(data_path)):
    with open(rf"{data_path}\{file}", "r", encoding='utf-8') as f:
        # First line is the CSV header; column_index maps column name -> position
        # and is read as a global by add_flow().
        col = f.readline().strip().split(',')
        column_index = {i : idx for idx, i in enumerate(col)}
        for tmp_flow in tqdm(f.readlines()):
            flow = tmp_flow.strip().split(',')
            sip, dip = flow[column_index['source']], flow[column_index['destination']]
            # Skip flows with no inside endpoint; otherwise pick the target IP:
            # the destination when only the source is inside, else the source.
            if sip not in inside_ip_set and dip not in inside_ip_set:
                continue
            elif sip in inside_ip_set and dip not in inside_ip_set:
                target_ip = dip
            else:
                target_ip = sip
            now_time = get_time_window(flow[column_index['first']])

            if target_ip not in flow_stack:
                flow_stack[target_ip] = {'flow':[]}
                flow_stack[target_ip]['st_time'] = now_time
            elif flow_stack[target_ip]['st_time'] < now_time - profiling_timeout:
                # Window start is too old: emit a profile and drop the oldest
                # flow, repeating until the window start is recent enough or
                # the window is empty.
                while(flow_stack[target_ip]['st_time'] < now_time - profiling_timeout):
                    profile, profile_key = profiling(flow_stack[target_ip]['flow'], target_ip, flow_stack[target_ip]['st_time'], flow_stack[target_ip]['end_time'])
                    profile_list.append(profile)
                    profile_key_list.append(profile_key)
                    flow_stack[target_ip]['flow'].pop(0)
                    if len(flow_stack[target_ip]['flow']) == 0:
                        flow_stack[target_ip]['st_time'] = now_time
                        break
                    else:
                        flow_stack[target_ip]['st_time'] = get_time_window(flow_stack[target_ip]['flow'][0][column_index['first']])

            flow_stack[target_ip]['flow'].append(flow)
            flow_stack[target_ip]['end_time'] = now_time
            if len(flow_stack[target_ip]['flow']) == profiling_min_data:
                # Window reached the size threshold: emit a profile and slide by one flow.
                profile, profile_key = profiling(flow_stack[target_ip]['flow'], target_ip, flow_stack[target_ip]['st_time'], flow_stack[target_ip]['end_time'])
                profile_list.append(profile)
                profile_key_list.append(profile_key)
                flow_stack[target_ip]['flow'].pop(0)
                if len(flow_stack[target_ip]['flow']) == 0:
                    del flow_stack[target_ip]
                    # NOTE(review): this `break` leaves the per-line loop, so the
                    # rest of the current file is skipped - confirm this is intended.
                    break
                else:
                    flow_stack[target_ip]['st_time'] = get_time_window(flow_stack[target_ip]['flow'][0][column_index['first']])

                    
# Flush every per-IP window that is still open into a final profile.
# The stack is looked up with the loop variable `ip`; indexing with
# `target_ip` would emit the window of the last processed flow once per IP
# and never flush the others.
for ip in tqdm(flow_stack):
    profile, profile_key = profiling(flow_stack[ip]['flow'], ip, flow_stack[ip]['st_time'], flow_stack[ip]['end_time'])
    profile_list.append(profile)
    profile_key_list.append(profile_key)
    


In [None]:
# One empty column per entry of feature_list, plus a fresh key list that is
# filled alongside it by the extraction loop.
feature_matrix = [[] for _ in feature_list]
profile_key_list = list()

In [None]:
# Evaluate every feature for every profile: feature_matrix[i] collects the
# values of feature_list[i], and profile_key_list records the matching keys.
for profile in tqdm(profile_list):
    for i, feature in enumerate(feature_list):
        extractor = feature_func_map[feature]
        feature_matrix[i].append(extractor(profile))
    profile_key_list.append(profile.profile_key)