In [None]:
# Mount Google Drive at /content/drive (Colab only). The dataset is read from,
# and the result file is written to, paths under /content/drive/MyDrive below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pickle
import numpy as np

# Labeling mode used by the parsing loop below: when True the URL index itself
# is the label; when False the label is the URL index // URL_PER_SITE.
USE_SUBLABEL = False
# Number of consecutive URL indices that share one site label.
URL_PER_SITE = 10
# Number of URL indices (0 .. TOTAL_URLS - 1) read from the loaded data.
TOTAL_URLS   = 950

# Load the pickle file
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
print("Loading datafile...")
with open("/content/drive/MyDrive/mon_standard.pkl", 'rb') as fi: # Path of mon_standard.pkl on the mounted Drive (Colab)
    data = pickle.load(fi)

# Per-trace containers. Every list below receives exactly one entry per trace,
# in the same order, so index k of X1/X2/X3/y always refers to the same trace.
X1 = []  # timestamp sequence of each trace (absolute value of every raw entry)
X2 = []  # direction*size sequence of each trace; currently always empty (see loop)
X3 = []  # burst sequence of each trace (signed burst sizes)

y = []  # label of each trace: site index, or URL index when USE_SUBLABEL is set


def _trace_sequences(sample, packet_size=512):
    """Split one raw trace into its timestamp and burst sequences.

    Each entry of *sample* describes one packet: its sign gives the direction
    (any entry that is not ``> 0`` counts as negative) and its magnitude is
    stored as the timestamp.  A burst is a maximal run of consecutive packets
    with the same direction; its value is the signed sum of ``packet_size``
    over that run.

    Args:
        sample: iterable of signed numbers, one per packet.
        packet_size: size attributed to every packet.

    Returns:
        A ``(time_seq, burst_seq)`` tuple of lists.  ``burst_seq`` is empty
        only when *sample* itself is empty.
    """
    time_seq = []
    burst_seq = []

    burst_sum = 0
    last_sign = None  # direction of the previous packet; None before the first

    for c in sample:
        time_seq.append(abs(c))

        sign = 1 if c > 0 else -1
        signed_size = sign * packet_size

        if sign == last_sign:
            # Same direction: keep growing the current burst.
            burst_sum += signed_size
        else:
            # Direction changed (or this is the first packet): close the
            # previous burst, if there was one, and start a new one.
            if last_sign is not None:
                burst_seq.append(burst_sum)
            burst_sum = signed_size

        last_sign = sign

    # Flush the burst that is still open when the trace ends; without this the
    # final burst of every trace would be silently dropped.
    if last_sign is not None:
        burst_seq.append(burst_sum)

    return time_seq, burst_seq


# Walk over every URL index, derive its label, and split each of its traces
# into the feature sequences.
for i in range(TOTAL_URLS):
    # With sub-labels every URL is its own class; otherwise URL_PER_SITE
    # consecutive URL indices share one site label.
    label = i if USE_SUBLABEL else i // URL_PER_SITE

    for sample in data[i]:
        time_seq, size_burst_seq = _trace_sequences(sample)

        X1.append(time_seq)
        # NOTE(review): the per-packet direction*size sequence was disabled in
        # the original code, so X2 holds one empty list per trace. This is
        # kept unchanged: variable-length rows here would make the later
        # np.array(X2) ragged. Confirm whether X2 is still needed.
        X2.append([])
        X3.append(size_burst_seq)

        y.append(label)


# Create X1_1: for every timestamp sequence in X1, the average gap between
# consecutive timestamps (one scalar per trace).
# A sequence with fewer than two timestamps has no gap and is recorded as 0.
X1_1 = [
    np.mean(np.diff(seq)) if len(seq) > 1 else 0
    for seq in X1
]

# Create X3_1: for every burst sequence in X3, the mean of its (up to) ten
# largest burst magnitudes. An empty burst sequence is recorded as 0.
X3_1 = [
    np.mean(sorted((abs(burst) for burst in bursts), reverse=True)[:10])
    if bursts
    else 0
    for bursts in X3
]



# Report how many traces were collected (one label per trace).
size = len(y)
print(f'Total samples: {size}')

# Distinct labels present in y.
unique_y = list(set(y))

# Convert the per-trace features and labels to NumPy arrays:
# X1 -> mean timestamp gap, X2 -> direction*size sequences, X3 -> mean of the
# largest burst magnitudes, y -> labels.
X1_np, X2_np, X3_np, y_np = (
    np.array(column) for column in (X1_1, X2, X3_1, y)
)

# Persist everything in a single .npz archive on the mounted Drive.
np.savez(
    '/content/drive/MyDrive/data_1_final.npz',
    X1=X1_np,
    X2=X2_np,
    X3=X3_np,
    y=y_np,
)

