In [None]:
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

In [None]:
# Load the raw connection log. Later cells read Rdate, src_ip/dest_ip,
# src_country/dest_country, the ports, protocol and action from this frame.
data = pd.read_csv("class_data.csv")
data.head()

In [None]:
# Earliest and latest raw Rdate values; the windowing cells further down are
# written for 2021-04-10 and 2021-04-11 specifically.
data.Rdate.min(), data.Rdate.max()

In [None]:
# (rows, columns) of the raw log.
data.shape

In [None]:
# Derive a network label for each address by replacing everything after the
# last dot with "0" (e.g. "1.2.3.4" -> "1.2.3.0").
for ip_col, net_col in (('src_ip', 'src_net'), ('dest_ip', 'dest_net')):
    data[net_col] = data[ip_col].apply(lambda ip: ip.rpartition(".")[0] + ".0")

# Outbounds: traffic whose destination country is not KR

In [None]:
# Outbound subset: keep rows whose destination country is not 'KR' and whose
# source country is not 'US'.  A missing (NaN) country compares unequal to both
# literals, so such rows pass the filter.
# .copy() makes `outbounds` an independent frame, so the column assignments
# below never write into a slice of `data` (the pattern pandas reports as
# SettingWithCopyWarning).
outbounds = data[(data.dest_country != 'KR') & (data.src_country != 'US')].copy()

# Optional integer encodings, currently disabled for the outbound set:
# outbounds['protocol'] = outbounds['protocol'].apply(lambda x: 0 if x==17 else 1)
# outbounds['src_country'] = outbounds['src_country'].apply(lambda x: 0 if x=='KR' else 1)
# outbounds['action'] = outbounds['action'].apply(lambda x: 3 if x==0 else x)

# Rdate is parsed from its textual form YYYYmmddHHMMSS.<fraction>.
outbounds['Rdate'] = pd.to_datetime(outbounds.Rdate.apply(str), format="%Y%m%d%H%M%S.%f")
# Give missing destination countries an explicit "None" label so they can be
# counted like any other category.
outbounds['dest_country'] = outbounds.dest_country.fillna("None")

# Keep only the columns used downstream, in a fixed order.
outbounds = outbounds[['Rdate','src_net','dest_net','protocol','src_port','dest_port','action','src_country','dest_country']]
outbounds

In [None]:
WINDOW_SIZE = 3   # look-back window length in hours, current hour included
win_agg = {}      # (day, hour) -> [cumulative share per dest_net, dest_port, dest_country]

OUTBOUND_CDF_COLUMNS = ['dest_net', 'dest_port', 'dest_country']
LOG_FIRST_DAY = pd.Timestamp(2021, 4, 10)   # windows are built for this day and the next
LOG_HOURS = 48
ONE_HOUR = pd.Timedelta(hours=1)

# One entry per hour that has WINDOW_SIZE full hours of history available:
# (10, WINDOW_SIZE-1) ... (10, 23), then (11, 0) ... (11, 23).
for hours_in in range(WINDOW_SIZE - 1, LOG_HOURS):
    hour_start = LOG_FIRST_DAY + hours_in * ONE_HOUR
    window_start = hour_start - (WINDOW_SIZE - 1) * ONE_HOUR
    # Half-open [window_start, window_end): every timestamp of the final hour
    # is counted, including those in its last fractions of a second, so any
    # row later looked up under this (day, hour) key was part of the counts.
    window_end = hour_start + ONE_HOUR
    in_window = outbounds[(outbounds.Rdate >= window_start) & (outbounds.Rdate < window_end)]

    # For each column: value counts sorted rarest-first, accumulated and
    # divided by the number of rows in the window.  Rare values therefore get
    # small scores and the most common value gets the largest.
    win_agg[(hour_start.day, hour_start.hour)] = [
        in_window[col].value_counts(ascending=True).cumsum() / len(in_window)
        for col in OUTBOUND_CDF_COLUMNS
    ]

In [None]:
# For one record, return the cumulative-share value of its dest_net, dest_port
# and dest_country taken from the window that matches the record's Rdate.
def get_cdf(seq, windows=None, columns=('dest_net', 'dest_port', 'dest_country')):
    """Look up the windowed cumulative-share scores of a single record.

    Parameters
    ----------
    seq : object with attribute access (e.g. a DataFrame row)
        Must expose ``Rdate`` (with ``.day`` and ``.hour``) and one attribute
        per name in ``columns``.
    windows : mapping, optional
        ``(day, hour) -> sequence of per-column lookup tables``.  Defaults to
        the notebook-level ``win_agg`` when omitted.
    columns : sequence of str, optional
        Attribute names to score, in the same order as the tables stored in
        each window entry.

    Returns
    -------
    tuple
        One score per entry of ``columns``.

    Raises
    ------
    KeyError
        If there is no window for the record's (day, hour), or the record's
        value does not occur in that window's table.
    """
    if windows is None:
        windows = win_agg
    rdate = seq.Rdate
    window = windows[(rdate.day, rdate.hour)]
    return tuple(table[getattr(seq, name)] for table, name in zip(window, columns))

In [None]:
# Rows strictly after 2021-04-10 (WINDOW_SIZE-1):00:00, i.e. from the first
# hour for which win_agg holds a window.
first_scored = pd.to_datetime(f"20210410{str(WINDOW_SIZE-1).zfill(2)}0000.00", format="%Y%m%d%H%M%S.%f")
target = outbounds[outbounds.Rdate > first_scored]

# Score every row against its window, then put the source network in front
# for reference.
cdf_rows = [get_cdf(target.loc[row_label, :]) for row_label in tqdm.tqdm(target.index)]
x = pd.DataFrame(data=cdf_rows, index=target.index, columns=['dest_net','dest_port','dest_country'])
x.insert(0, 'src_net', target.src_net)
x

In [None]:
# Rows strictly after 2021-04-10 (WINDOW_SIZE-1):00:00, i.e. from the first
# hour for which win_agg holds a window.
target = outbounds[outbounds.Rdate > pd.to_datetime(f"20210410{str(WINDOW_SIZE-1).zfill(2)}0000.00", format="%Y%m%d%H%M%S.%f")]

BASE = 1  # aggregate slots stored ahead of the score triples (only the action mean is active)

# Both output files are written under pjdata/; create it up front so the run
# cannot fail at write time after the long loops.
os.makedirs("pjdata", exist_ok=True)

# A row's score triple does not depend on SEQ_LEN, so look it up once and
# reuse it for every sequence length instead of recomputing it on each pass.
row_labels = list(target.index)
row_cdfs = [get_cdf(target.loc[i, :]) for i in tqdm.tqdm(row_labels)]
row_nets = target.src_net.tolist()
row_actions = target.action.tolist()

for SEQ_LEN in [10,20,30,40,50]:
    sequences = []        # one flat feature row per completed sequence
    sequence_match = []   # [src_net, label_1, ..., label_SEQ_LEN]: rows that formed each sequence
    # Per source network: [row labels, action sum, then 3 scores appended per row].
    seq_builder = {net:[[]]+[0 for _ in range(BASE)] for net in target.src_net.unique()}

    for i, net, action, cdf in zip(row_labels, row_nets, row_actions, row_cdfs):
        seq = seq_builder[net]

        seq[0].append(i)
        # Original note: none/deny == 1, allow == 0 -- TODO confirm against the
        # raw action codes (the outbound action recode is disabled).
        seq[1] += action
        seq.extend(cdf)

        # A sequence is complete once SEQ_LEN rows of this network are collected.
        if len(seq) == 1 + BASE + SEQ_LEN * 3:
            for j in range(1, BASE+1):
                seq[j] /= SEQ_LEN          # turn the running sum into a mean
            sequences.append(seq[1:])
            sequence_match.append([net] + seq[0])
            seq_builder[net] = [[]] + [0 for _ in range(BASE)]
    # Partially filled accumulators left in seq_builder are discarded.

    col = ['actions']
    for step in range(1, SEQ_LEN + 1):
        col += [f'q{step}_net', f'q{step}_port', f'q{step}_cty']

    assert len(col) == BASE + SEQ_LEN*3, "column length unmatching"
    table = pd.DataFrame(data=sequences, columns=col)

    save_pth = f"_SEQ_{str(SEQ_LEN).zfill(2)}_WIN_{str(WINDOW_SIZE).zfill(2)}"
    table.to_csv("pjdata/data" + save_pth + ".csv", index=False)
    with open("pjdata/match" + save_pth + ".pickle", "wb") as f:
        pickle.dump(sequence_match, f)

In [None]:
# Rebuild the feature table from the values the previous cell left behind
# (SEQ_LEN, BASE and sequences of its last pass).
# Only the 'actions' aggregate is active; 'protocol' and 'src_cty' are disabled.
col = ['actions'] + [
    f'q{step}_{field}'
    for step in range(1, SEQ_LEN + 1)
    for field in ('net', 'port', 'cty')
]

assert len(col) == BASE + SEQ_LEN*3, "column length unmatching"
table = pd.DataFrame(sequences, columns=col)
table

# Inbounds: traffic whose destination country is KR

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session.
# Presumably it was added for the slice-assignment warning that the .copy()
# below now avoids -- consider removing it once confirmed.
warnings.filterwarnings(action='ignore')

# Reload the raw log so this section starts from an unmodified frame, then
# derive the network labels (last dotted component replaced by "0").
data = pd.read_csv("class_data.csv")
data['src_net'] = data['src_ip'].apply(lambda x: ".".join(x.split(".")[:-1]) + ".0")
data['dest_net'] = data['dest_ip'].apply(lambda x: ".".join(x.split(".")[:-1]) + ".0")

# Inbound subset: destination country is 'KR'.  .copy() makes it an independent
# frame so the column assignments below do not write into a slice of `data`.
inbounds = data[data.dest_country == 'KR'].copy()
inbounds['protocol'] = inbounds['protocol'].apply(lambda x: 0 if x==17 else 1)   # 17 -> 0, anything else -> 1
inbounds['action'] = inbounds['action'].apply(lambda x: 3 if x==0 else x)        # recode 0 as 3, keep other codes
# Rdate is parsed from its textual form YYYYmmddHHMMSS.<fraction>.
inbounds['Rdate'] = pd.to_datetime(inbounds.Rdate.apply(str), format="%Y%m%d%H%M%S.%f")
# Give missing source countries an explicit "None" label so they can be counted.
inbounds['src_country'] = inbounds.src_country.fillna("None")

# Keep only the columns used downstream, in a fixed order.
inbounds = inbounds[['Rdate','src_country','src_net','dest_net','dest_port','protocol','action']]
inbounds

In [None]:
WINDOW_SIZE = 6   # look-back window length in hours, current hour included
win_agg = {}      # (day, hour) -> [cumulative share per src_net, dest_port, src_country]

INBOUND_CDF_COLUMNS = ['src_net', 'dest_port', 'src_country']
LOG_FIRST_DAY = pd.Timestamp(2021, 4, 10)   # windows are built for this day and the next
LOG_HOURS = 48
ONE_HOUR = pd.Timedelta(hours=1)

# One entry per hour that has WINDOW_SIZE full hours of history available:
# (10, WINDOW_SIZE-1) ... (10, 23), then (11, 0) ... (11, 23).
for hours_in in range(WINDOW_SIZE - 1, LOG_HOURS):
    hour_start = LOG_FIRST_DAY + hours_in * ONE_HOUR
    window_start = hour_start - (WINDOW_SIZE - 1) * ONE_HOUR
    # Half-open [window_start, window_end): every timestamp of the final hour
    # is counted, including those in its last fractions of a second, so any
    # row later looked up under this (day, hour) key was part of the counts.
    window_end = hour_start + ONE_HOUR
    in_window = inbounds[(inbounds.Rdate >= window_start) & (inbounds.Rdate < window_end)]

    # For each column: value counts sorted rarest-first, accumulated and
    # divided by the number of rows in the window.  Rare values therefore get
    # small scores and the most common value gets the largest.
    win_agg[(hour_start.day, hour_start.hour)] = [
        in_window[col].value_counts(ascending=True).cumsum() / len(in_window)
        for col in INBOUND_CDF_COLUMNS
    ]

In [None]:
def get_cdf(seq, windows=None, columns=('src_net', 'dest_port', 'src_country')):
    """Look up the windowed cumulative-share scores of a single inbound record.

    Parameters
    ----------
    seq : object with attribute access (e.g. a DataFrame row)
        Must expose ``Rdate`` (with ``.day`` and ``.hour``) and one attribute
        per name in ``columns``.
    windows : mapping, optional
        ``(day, hour) -> sequence of per-column lookup tables``.  Defaults to
        the notebook-level ``win_agg`` when omitted.
    columns : sequence of str, optional
        Attribute names to score, in the same order as the tables stored in
        each window entry.

    Returns
    -------
    tuple
        One score per entry of ``columns``.

    Raises
    ------
    KeyError
        If there is no window for the record's (day, hour), or the record's
        value does not occur in that window's table.
    """
    if windows is None:
        windows = win_agg
    rdate = seq.Rdate
    window = windows[(rdate.day, rdate.hour)]
    return tuple(table[getattr(seq, name)] for table, name in zip(window, columns))

In [None]:
# Rows strictly after 2021-04-10 (WINDOW_SIZE-1):00:00, i.e. from the first
# hour for which win_agg holds a window.
target = inbounds[inbounds.Rdate > pd.to_datetime(f"20210410{str(WINDOW_SIZE-1).zfill(2)}0000.00", format="%Y%m%d%H%M%S.%f")]

BASE = 1  # aggregate slots stored ahead of the score triples (only the action mean is active)

# Both output files are written under pjdata/; create it up front so the run
# cannot fail at write time after the long loops.
os.makedirs("pjdata", exist_ok=True)

# A row's score triple does not depend on SEQ_LEN, so look it up once and
# reuse it for every sequence length instead of recomputing it on each pass.
row_labels = list(target.index)
row_cdfs = [get_cdf(target.loc[i, :]) for i in tqdm.tqdm(row_labels)]
row_nets = target.dest_net.tolist()
row_actions = target.action.tolist()

for SEQ_LEN in [10,20,30,40,50]:
    sequences = []        # one flat feature row per completed sequence
    sequence_match = []   # [dest_net, label_1, ..., label_SEQ_LEN]: rows that formed each sequence
    # Per destination network: [row labels, action sum, then 3 scores appended per row].
    seq_builder = {net:[[]]+[0 for _ in range(BASE)] for net in target.dest_net.unique()}

    for i, net, action, cdf in zip(row_labels, row_nets, row_actions, row_cdfs):
        seq = seq_builder[net]

        seq[0].append(i)
        # Original note: none/deny == 1, allow == 0.
        # NOTE(review): the inbound prep cell recodes action 0 as 3, so this
        # mean is not a plain deny ratio -- confirm the intended encoding.
        seq[1] += action
        seq.extend(cdf)

        # A sequence is complete once SEQ_LEN rows of this network are collected.
        if len(seq) == 1 + BASE + SEQ_LEN * 3:
            for j in range(1, BASE+1):
                seq[j] /= SEQ_LEN          # turn the running sum into a mean
            sequences.append(seq[1:])
            sequence_match.append([net] + seq[0])
            seq_builder[net] = [[]] + [0 for _ in range(BASE)]
    # Partially filled accumulators left in seq_builder are discarded.

    col = ['actions']
    for step in range(1, SEQ_LEN + 1):
        col += [f'q{step}_net', f'q{step}_port', f'q{step}_cty']

    assert len(col) == BASE + SEQ_LEN*3, "column length unmatching"
    table = pd.DataFrame(data=sequences, columns=col)

    save_pth = f"_SEQ_{str(SEQ_LEN).zfill(2)}_WIN_{str(WINDOW_SIZE).zfill(2)}"
    table.to_csv("pjdata/dataIN" + save_pth + ".csv", index=False)
    with open("pjdata/matchIN" + save_pth + ".pickle", "wb") as f:
        pickle.dump(sequence_match, f)