In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [96]:
# Parse the CSV once and derive both the feature frame and the label frame
# from the same in-memory table (the file was previously read twice).
raw = pd.read_csv("Threats (1).csv")

# Features: everything except the row-index/id columns and the `label` /
# `attack_cat` columns, so the target never appears among the inputs.
data = raw.drop(columns=["Unnamed: 0", "id", "label", "attack_cat"])

# Target: one-column DataFrame holding the attack category. `.copy()` makes it
# independent of `raw`, so later column assignment on `y` is unambiguous.
y = raw[["attack_cat"]].copy()

In [97]:
# Preview the feature frame (index/id/label/attack_cat columns already dropped).
data

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,0.012996,tcp,-,FIN,16,18,1540,1644,2539.242797,31,...,6,2,1,6,0,0,0,5,7,0
1,0.004119,tcp,-,FIN,12,12,1064,2260,5583.879675,31,...,2,1,1,1,0,0,0,3,8,0
2,0.000988,udp,dns,CON,2,2,146,178,3036.437382,31,...,3,2,1,2,0,0,0,7,2,0
3,3.415787,tcp,ssh,FIN,230,238,24344,29556,136.718127,31,...,4,1,1,1,0,0,0,4,1,0
4,0.193943,tcp,-,FIN,72,74,4238,63618,747.642372,31,...,6,1,1,5,0,0,0,6,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135336,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,...,24,24,13,24,0,0,0,24,24,0
135337,0.505762,tcp,-,FIN,10,8,620,354,33.612649,254,...,1,1,1,2,0,0,0,1,1,0
135338,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,...,3,3,3,13,0,0,0,3,12,0
135339,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,...,30,30,14,30,0,0,0,30,30,0


In [98]:
# (rows, columns) of the feature frame before one-hot encoding.
data.shape

(135341, 42)

In [99]:
# One-hot encode the three categorical columns; every other column is passed
# through unchanged.
# NOTE(review): `data` is rebound here, so this cell is not re-runnable on its
# own -- a second run fails because 'proto'/'service'/'state' no longer exist.
data = pd.get_dummies(data, columns=['proto','service','state'])

In [100]:
# Shape of the feature frame after one-hot encoding proto/service/state.
print(data.shape)

(135341, 188)


In [101]:
# Integer class id assigned to each attack category name ("Normal" -> 0).
attack_map = {
    "Normal": 0,
    "Backdoor" : 1,
    "Analysis" : 2,
    "Fuzzers" : 3,
    "Shellcode" : 4,
    "Reconnaissance" : 5,
    "Exploits" : 6,
    "DoS" : 7,
    "Worms" : 8,
    "Generic" : 9}

labels = y['attack_cat']

# Encode only while the column still holds category names. Without this guard,
# re-running the cell would map the already-encoded integers to NaN.
if not pd.api.types.is_numeric_dtype(labels):
    encoded = labels.map(attack_map)

    # Series.map yields NaN for any value missing from the dict; fail loudly
    # instead of letting unknown categories slip through as NaN labels.
    unmapped = labels[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"attack_cat contains labels missing from attack_map: {list(unmapped)}"
        )

    y['attack_cat'] = encoded.astype("int64")

In [102]:
# Missing-value audit: total NaN cells across all feature columns, then the
# NaN count for each column of the label frame.
feature_nan_total = data.isna().sum().sum()
label_nan_counts = y.isna().sum()

print(feature_nan_total)
print(label_nan_counts)

0
attack_cat    0
dtype: int64


In [103]:
from sklearn.model_selection import train_test_split

# Split parameters, named once so both calls stay in sync.
TEST_FRACTION = 0.2               # share of all rows held out for testing
VAL_FRACTION_OF_REMAINDER = 0.25  # 25% of the remaining 80% -> 20% overall
SPLIT_SEED = 42

# Stage 1: carve off the test set (80% train+val / 20% test), stratified on
# the attack category so class proportions are preserved.
X_temp, X_test, y_temp, y_test = train_test_split(
    data,
    y,
    test_size=TEST_FRACTION,
    stratify=y,
    random_state=SPLIT_SEED,
)

# Stage 2: divide the remainder into train and validation,
# giving 60% / 20% / 20% train / val / test overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=VAL_FRACTION_OF_REMAINDER,
    stratify=y_temp,
    random_state=SPLIT_SEED,
)


In [104]:
# Convert the training split to NumPy arrays for the hand-written tree code.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is safe (calling .to_numpy() on an already-converted array would raise
# AttributeError).
# dtype=float forces a numeric matrix; a frame mixing bool dummy columns with
# numeric columns would otherwise convert to a slow object-dtype array.
X_train = np.asarray(X_train, dtype=float)

# Labels as a flat 1-D vector, as required by np.bincount in build_tree.
y_train = np.asarray(y_train).ravel()

In [105]:
def entropy(y):
    """Return the base-2 Shannon entropy of the label collection ``y``.

    Parameters
    ----------
    y : array-like
        Class labels. An empty collection yields ``0``.

    Returns
    -------
    float
        ``-sum(p_k * log2(p_k))`` over the distinct labels in ``y``, where
        ``p_k`` is the relative frequency of label ``k``.
    """
    if len(y) == 0:
        return 0

    # Only the per-class counts matter; the label values are discarded.
    _, class_counts = np.unique(y, return_counts=True)
    class_probs = class_counts / class_counts.sum()

    weighted_logs = class_probs * np.log2(class_probs)
    return -np.sum(weighted_logs)

def information_gain_entropy(y, y_left, y_right):
    """Return the entropy reduction from splitting ``y`` into two parts.

    Parameters
    ----------
    y : array-like
        Labels of the parent node (must be non-empty).
    y_left, y_right : array-like
        Labels routed to the left and right children.

    Returns
    -------
    float
        Parent entropy minus the size-weighted entropy of the two children.
    """
    # Fraction of parent samples that go left; the remainder goes right.
    left_weight = len(y_left) / len(y)
    parent_entropy = entropy(y)
    return (
        parent_entropy
        - left_weight * entropy(y_left)
        - (1 - left_weight) * entropy(y_right)
    )


In [111]:
def best_split(X,y):
    """Exhaustively search for the (feature, threshold) pair with the highest
    information gain.

    Every distinct value of every column is tried as a ``<=`` threshold,
    except the column maximum (see below).

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix, one row per sample.
    y : np.ndarray
        1-D label vector aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no candidate yields a strictly positive gain.
    """
    best_gain=0
    split_index = None
    split_value = None

    for feature_index in range(X.shape[1]):
        column = X[:, feature_index]

        # np.unique returns the values sorted. The largest one would send every
        # sample left and leave the right side empty, so it can never be a
        # valid threshold; dropping it saves one full pass per feature.
        candidates = np.unique(column)[:-1]

        for val in candidates:
            # Each candidate is present in the column and is below its maximum,
            # so both sides of the split are guaranteed non-empty.
            left_indices = column <= val
            # The complement is the same partition build_tree applies, and it
            # avoids a second comparison over the column.
            right_indices = ~left_indices

            gain = information_gain_entropy(y, y[left_indices], y[right_indices])

            if gain > best_gain:
                best_gain = gain
                split_index = feature_index
                split_value = val

    return split_index, split_value

In [107]:
class Node:
    """A single node of the decision tree.

    An internal node carries ``feature``/``threshold`` and two children; a
    leaf carries only ``value``. A node is a leaf when ``value`` is set.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split description (internal nodes): column index and cut-off.
        self.feature, self.threshold = feature, threshold
        # Child subtrees (internal nodes).
        self.left, self.right = left, right
        # Predicted class label (leaves only); keyword-only to avoid mix-ups
        # with the positional split arguments.
        self.value = value

    def is_leaf_node(self):
        """Return True when this node stores a prediction (``value`` is not None)."""
        return not (self.value is None)


In [108]:
def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a decision tree on ``(X, y)``.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix for the samples reaching this node.
    y : np.ndarray
        1-D vector of non-negative integer labels (required by np.bincount).
    depth : int
        Depth of the node being built; the root is 0.
    max_depth : int
        Nodes at this depth become leaves.

    Returns
    -------
    Node
        Root of the (sub)tree built from these samples.
    """
    # Majority class at this node; used as the prediction if it becomes a leaf.
    majority_class = np.argmax(np.bincount(y))

    # Leaf: depth budget exhausted, or the node is already pure.
    if depth >= max_depth or np.unique(y).size == 1:
        return Node(value=majority_class)

    feature, threshold = best_split(X, y)
    # Leaf: no split produced a positive information gain.
    if feature is None:
        return Node(value=majority_class)

    goes_left = X[:, feature] <= threshold
    goes_right = ~goes_left
    next_depth = depth + 1

    left_child = build_tree(X[goes_left], y[goes_left], next_depth, max_depth)
    right_child = build_tree(X[goes_right], y[goes_right], next_depth, max_depth)

    return Node(feature, threshold, left_child, right_child)


In [None]:
def predict_one(x, tree):
    """Return the class predicted for a single sample.

    Parameters
    ----------
    x : sequence
        Feature vector, indexable by the feature indices stored in the tree.
    tree : Node
        Root of the tree (or subtree) to evaluate.

    Returns
    -------
    The ``value`` of the leaf reached by following ``<= threshold`` to the
    left and everything else to the right.
    """
    node = tree
    # Walk down from the root until a leaf is reached.
    while not node.is_leaf_node():
        if x[node.feature] <= node.threshold:
            node = node.left
        else:
            node = node.right
    return node.value

def predict(X, tree):
    """Return a list with the predicted class for each row of ``X``.

    Parameters
    ----------
    X : iterable
        Samples; each item is passed to ``predict_one`` as a feature vector.
    tree : Node
        Root of the fitted tree.
    """
    predictions = []
    for sample in X:
        predictions.append(predict_one(sample, tree))
    return predictions


In [None]:
# Fit a depth-5 tree on the training split.
tree = build_tree(X_train, y_train, max_depth=5)

# The tree code indexes plain arrays, so convert the held-out frame first,
# then predict one label per test row.
X_test_values = X_test.to_numpy()
y_pred = predict(X_test_values, tree)


In [None]:
# `y_pred` is a plain Python list with one entry per test row; displaying it
# whole would print every element, so show only the first few predictions.
y_pred[:20]