# Cram School Easter Round: Cybersecurity AI Challenge

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from datetime import time
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

## Load the data

In [2]:
df_train = pd.read_csv("./train_data.csv", parse_dates=["Timestamp"])
df_test = pd.read_csv("./test_data.csv", parse_dates=["Timestamp"])

In [3]:
df_train.isna().any()

ID                          False
Timestamp                   False
Suspicious_Port_Activity    False
Traffic_Volume_Variation    False
Packet_Length_Anomaly       False
Malware_Score               False
Threat_Level_Index          False
User_Behavior_Score          True
Geo_Dispersion              False
Payload_Entropy             False
Login_Attempts              False
Device_Response_Time        False
Session_Duration            False
Packet_Retry_Rate           False
Anomaly_Tendency            False
Attack Type                 False
dtype: bool

In [4]:
df_test.isna().any()

ID                          False
Timestamp                   False
Suspicious_Port_Activity    False
Traffic_Volume_Variation    False
Packet_Length_Anomaly       False
Malware_Score               False
Threat_Level_Index          False
User_Behavior_Score          True
Geo_Dispersion              False
Payload_Entropy             False
Login_Attempts              False
Device_Response_Time        False
Session_Duration            False
Packet_Retry_Rate           False
Anomaly_Tendency            False
dtype: bool

In [5]:
df_train.fillna({'User_Behavior_Score': df_train['User_Behavior_Score'].mean()}, inplace=True)
df_test.fillna({'User_Behavior_Score': df_test['User_Behavior_Score'].mean()}, inplace=True)

In [6]:
df_train.describe()

Unnamed: 0,ID,Timestamp,Suspicious_Port_Activity,Traffic_Volume_Variation,Packet_Length_Anomaly,Malware_Score,Threat_Level_Index,User_Behavior_Score,Geo_Dispersion,Payload_Entropy,Login_Attempts,Device_Response_Time,Session_Duration,Packet_Retry_Rate,Anomaly_Tendency,Attack Type
count,13356.0,13356,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0,13356.0
mean,19904.8236,2023-01-28 19:12:46.451033344,49.791904,49.788837,49.807553,7.223436,4.070842,-0.001211,50.172048,11.14877,140773.7,4828117.0,15.143093,19.936004,1.604717,0.745283
min,7.0,2023-01-01 00:00:00,9.830626,11.44717,9.704399,-40.898692,-20.865728,-4.207929,0.002713,-102.528539,-842866900.0,0.0,-35.197389,-6.683795,-73.746635,0.0
25%,9985.75,2023-01-15 00:00:15,43.117314,43.043988,42.978653,-2.796696,-0.800068,-0.597838,25.838683,-9.625675,3.686437,0.0,5.96713,15.66537,-8.751905,0.0
50%,19880.5,2023-01-28 17:56:30,49.912484,49.946217,49.918006,6.839157,3.734464,-0.001211,50.284962,11.211696,1449.021,0.0,15.215888,19.921653,1.568656,0.0
75%,29923.5,2023-02-11 16:48:15,56.573445,56.550146,56.620733,16.848907,8.794495,0.592959,74.787214,31.65616,291332.7,7785192.0,24.25457,24.321041,11.893369,1.0
max,39996.0,2023-02-25 13:10:00,97.241773,97.398056,98.353307,63.855348,27.272845,4.185208,99.989204,138.23509,881837200.0,39992180.0,65.088692,44.933708,58.459195,2.0
std,11550.807566,,10.075306,10.080638,10.209363,14.450361,6.802323,0.958667,28.621882,30.631642,87483050.0,7278543.0,13.130384,6.446938,15.255836,0.827976


In [7]:
df_test.describe()

Unnamed: 0,ID,Timestamp,Suspicious_Port_Activity,Traffic_Volume_Variation,Packet_Length_Anomaly,Malware_Score,Threat_Level_Index,User_Behavior_Score,Geo_Dispersion,Payload_Entropy,Login_Attempts,Device_Response_Time,Session_Duration,Packet_Retry_Rate,Anomaly_Tendency
count,7500.0,7500,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0
mean,19848.977333,2023-01-28 11:51:49.119999744,50.13861,50.120978,50.155341,9.905555,5.377551,0.011313,50.118944,11.010308,2365471.0,6563752.0,15.135375,20.04867,1.918573
min,0.0,2023-01-01 00:07:00,12.056187,11.257373,12.671509,-40.898692,-16.382542,-3.403769,0.021645,-102.528539,-1116288000.0,0.0,-30.442588,-3.958067,-53.664412
25%,9770.75,2023-01-14 20:59:30,43.056456,43.248044,42.988945,-0.087082,0.609952,-0.566398,26.149742,-9.103703,-12877960.0,0.0,5.797551,15.706623,-8.43339
50%,19922.5,2023-01-28 00:49:00,50.140781,50.179219,50.26825,9.86308,5.315728,0.011313,50.032403,11.404631,1517.705,4033571.0,15.073778,20.029074,1.778833
75%,29615.5,2023-02-11 07:01:15,57.044083,57.119216,57.197588,19.833414,10.372933,0.594068,74.732869,31.328711,17215210.0,10576710.0,24.415288,24.510951,12.137416
max,39997.0,2023-02-25 13:16:00,86.440026,86.422501,86.579586,75.400715,27.815069,3.780323,99.941817,123.655156,900363800.0,38235560.0,69.376365,41.173987,58.960541
std,11560.13315,,10.305109,10.305518,10.412693,14.444796,6.8277,0.937631,28.590484,30.136054,101362200.0,7755322.0,13.233457,6.479375,15.491881


## Subtask 1

In [8]:
value_subtask1 = (df_test["Timestamp"].dt.time < time(12, 0, 0)).map({True: "AM", False: "PM"}).to_list()
value_subtask1[:5]  # first 5

['PM', 'PM', 'AM', 'AM', 'AM']

In [9]:
task1 = pd.DataFrame({
    'subtaskID': 1,
    'datapointID': df_test.ID,
    'answer':  value_subtask1
})

## Subtask 2

In [10]:
X = df_train.drop(columns=["Attack Type", "ID", "Timestamp"]).to_numpy()
y = df_train["Attack Type"].to_numpy()

In [11]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
clf = RandomForestClassifier(random_state=42)

In [13]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    acc = f1_score(y_val, y_pred, average="macro")
    print(f"Fold {fold} F1 score: {acc}")

Fold 1 F1 score: 0.8625513527224303
Fold 2 F1 score: 0.8625469946097749
Fold 3 F1 score: 0.8674975074775673
Fold 4 F1 score: 0.862116676978863
Fold 5 F1 score: 0.8620215125612661


In [14]:
X_test = df_test.drop(columns=["ID", "Timestamp"]).to_numpy()

In [15]:
y_pred = clf.predict(X_test).tolist()
y_pred[: 5]  # first 5

[1, 2, 1, 1, 2]

In [16]:
task2 = pd.DataFrame({
    'subtaskID': 2,
    'datapointID': df_test.ID,
    'answer': y_pred
})

## Save answers

In [17]:
submission_df = pd.concat([task1, task2], ignore_index=True)
submission_df.to_csv("submission.csv", index=False)

## Submission results

Subtask 1:
- Accuracy: 1
- Score: 20/20

Subtask 2:
- F1 score: 0.8931
- Score: 80/80