In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from,
# and write the feature file to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import numpy as np

# --- Configuration -----------------------------------------------------------
USE_SUBLABEL = False   # True: label each URL individually; False: label by site
URL_PER_SITE = 10      # number of consecutive URL indices that share one site label
TOTAL_URLS   = 950     # number of top-level entries of the dataset to iterate over

# Path to mon_standard.pkl on the mounted Drive (Colab).
DATA_PATH = "/content/drive/MyDrive/mon_standard.pkl"

# Load the pickle file.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load a dataset file obtained from a trusted source.
print("Loading datafile...")
with open(DATA_PATH, 'rb') as fi:
    data = pickle.load(fi)

def _size_bursts(sample, packet_size=512):
    """Collapse a trace into signed burst sizes.

    A burst is a maximal run of consecutive values with the same direction,
    where a value > 0 counts as direction +1 and anything else (including 0)
    as direction -1. Each packet contributes ``direction * packet_size`` to
    its burst.

    Args:
        sample: iterable of signed numbers, one per packet.
        packet_size: size attributed to every packet.

    Returns:
        List with one signed total per burst, in order of occurrence. The
        trailing burst is included; an empty ``sample`` yields ``[]``.
    """
    bursts = []
    burst_sum = 0
    last_sign = None

    for c in sample:
        sign = 1 if c > 0 else -1
        if sign == last_sign:
            # Same direction: keep accumulating the current burst.
            burst_sum += sign * packet_size
        else:
            # Direction changed: flush the finished burst (there is none
            # before the very first packet) and start a new one.
            if last_sign is not None:
                bursts.append(burst_sum)
            burst_sum = sign * packet_size
        last_sign = sign

    # Flush the final burst. Without this the last run of every trace is
    # silently dropped, and a single-direction trace yields no bursts at all.
    if last_sign is not None:
        bursts.append(burst_sum)

    return bursts


# Per-sample feature containers (one entry per sample, all in the same order).
X1 = []  # timestamp sequences, e.g. [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], ...]
X2 = []  # direction*size sequences
X3 = []  # burst sequences (signed burst sizes)

y = []   # label of each sample, e.g. [0, 0, 0, ..., 94, 94, 94]


# Walk every URL entry, derive its label, and extract the sequences of each
# sample recorded for that URL.
for i in range(TOTAL_URLS):
    # With sublabels every URL is its own class; otherwise URLs fetched from
    # the same site (URL_PER_SITE consecutive indices) share one label.
    label = i if USE_SUBLABEL else i // URL_PER_SITE

    for sample in data[i]:
        # X1: magnitude of each value is used as the packet timestamp.
        X1.append([abs(c) for c in sample])

        # X2: NOTE(review): the per-packet direction*512 append was commented
        # out in the original code, so every X2 entry is an empty list.
        # Confirm whether X2 is still needed before populating it (ragged
        # sequences would also need padding before np.array conversion).
        X2.append([])

        # X3: signed burst sizes, including the trailing burst.
        X3.append(_size_bursts(sample))

        y.append(label)


# Build X1_1: for every timestamp sequence in X1, the mean gap between
# consecutive values. A sequence with fewer than two values has no gap,
# so it is recorded as 0.
X1_1 = [
    np.mean(np.diff(timestamps)) if len(timestamps) > 1 else 0
    for timestamps in X1
]

# Build X3_1: for every burst sequence in X3, the mean of its 10 largest
# absolute values (or of all of them when there are fewer than 10).
# An empty burst sequence is recorded as 0.
X3_1 = [
    np.mean(sorted((abs(burst) for burst in bursts), reverse=True)[:10])
    if bursts
    else 0
    for bursts in X3
]



size = len(y)

print(f'Total samples: {size}') # Output: 19000
# Distinct labels present in y
unique_y = list(set(y))

# Convert the per-sample features and labels to NumPy arrays.
X1_np = np.array(X1_1)  # mean inter-arrival gap per sample
# NOTE(review): as the file stands, every entry of X2 is an empty list (the
# size_seq append in the extraction loop is commented out), so X2_np carries
# no per-packet data. Confirm whether it should be saved at all.
X2_np = np.array(X2)
X3_np = np.array(X3_1)  # mean of the top-10 absolute burst sizes per sample
y_np = np.array(y)

# Every feature array must line up one-to-one with the labels.
assert X1_np.shape[0] == X3_np.shape[0] == y_np.shape[0], "feature/label length mismatch"

# Save under the keys X1 / X2 / X3 / y. The previous call referenced the
# undefined names X4_np and X5_np, which raises NameError on a fresh kernel.
np.savez('/content/drive/MyDrive/data_1_final.npz', X1=X1_np, X2=X2_np, X3=X3_np, y=y_np)



Loading datafile...
Total samples: 19000
X1 shape: (0,)
y shape: (19000,)
X4: [0.007140845070422536, 0.01965183752417795, 0.008187177597641856, 0.009245674740484428, 0.007572953736654804, 0.01924731182795699, 0.01152, 0.006312410841654779, 0.02099467140319716, 0.021936056838365896, 0.008966244725738396, 0.021643109540636044, 0.025355871886120998, 0.007399445214979195, 0.007525844245348036, 0.021306715063520873, 0.01857904085257549, 0.007521186440677966, 0.041359570661896244, 0.024443500424808837, 0.06304140127388536, 0.01030565963320844, 0.003928286852589641, 0.0035344827586206895, 0.009458937198067632, 0.010726915520628684, 0.003834426862215361, 0.009717624148003895, 0.0046791990192071925, 0.003974540311173975, 0.003914851485148515, 0.0032197580645161286, 0.02881679389312977, 0.01993103448275862, 0.008176838810641627, 0.011080392156862743, 0.003622397412573277, 0.009651500484027109, 0.004007071302298173, 0.011119952257807836, 0.0016923406076166025, 0.0031854911115101452, 0.00275399361