In [9]:
from pathlib import Path

import numpy as np
import pandas as pd

# Seed constant. No cell visible in this notebook reads it; presumably intended
# for later stochastic steps (TODO confirm it is still needed).
RANDOM_STATE = 42



In [10]:
# Column names for the raw files, in file order. The list is passed as
# `names=` to pd.read_csv in the loading cell; it has 43 entries, the last two
# being `label` and `difficulty`.
columns = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label",
    "difficulty",
]



In [11]:
# Read both raw splits. `names=columns` supplies the header, so the first line
# of each file is treated as a data row; `header=None` just states that
# explicitly.
train_data = pd.read_csv("../data/raw/KDDTrain+.txt", names=columns, header=None)
test_data = pd.read_csv("../data/raw/KDDTest+.txt", names=columns, header=None)

# Report the dimensions of each split before looking at actual rows.
for shape_caption, loaded_frame in (
    ("Train Shape:", train_data),
    ("Test Shape:", test_data),
):
    print(shape_caption, loaded_frame.shape)

print("\nSample rows:")
display(train_data.head())


Train Shape: (125973, 43)
Test Shape: (22544, 43)

Sample rows:


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [12]:
# Raw attack names grouped by category. `map_attack` (next cell) uses these
# lists for membership tests when converting names to class ids.

# Mapped to class 1 (DoS).
dos_attacks = [
    'neptune', 'smurf', 'back', 'teardrop', 'pod',
    'land', 'apache2', 'mailbomb', 'processtable', 'udpstorm',
]

# Mapped to class 2 (Probe).
probe_attacks = [
    'satan', 'ipsweep', 'nmap',
    'portsweep', 'mscan', 'saint',
]

# Mapped to class 3 together with `u2r_attacks`.
r2l_attacks = [
    'guess_passwd', 'ftp_write', 'imap', 'phf',
    'multihop', 'warezmaster', 'warezclient', 'spy',
    'snmpguess', 'snmpgetattack', 'httptunnel', 'sendmail',
    'named', 'worm', 'xlock', 'xsnoop',
]

# Mapped to class 3 together with `r2l_attacks`.
u2r_attacks = [
    'buffer_overflow', 'loadmodule', 'rootkit', 'perl',
    'sqlattack', 'xterm', 'ps',
]



In [13]:
def map_attack(label):
    """Collapse a raw attack name into one of four integer class ids.

    Returns:
        0 for 'normal', 1 for a name in ``dos_attacks``, 2 for a name in
        ``probe_attacks``, and 3 for a name in ``r2l_attacks`` or
        ``u2r_attacks``.

    Raises:
        ValueError: if ``label`` matches none of the known groups, so an
            unexpected name fails loudly instead of being silently
            mis-mapped.
    """
    if label == 'normal':
        return 0  # Normal

    # Checked in this order; the first group containing the label wins.
    class_groups = (
        (1, dos_attacks),    # DoS
        (2, probe_attacks),  # Probe
        (3, r2l_attacks),    # Privilege attacks
        (3, u2r_attacks),    # Privilege attacks
    )
    for class_id, known_names in class_groups:
        if label in known_names:
            return class_id

    raise ValueError(f"Unknown attack type encountered: {label}")



In [14]:
# Replace the raw attack names in `label` with the integer class ids returned
# by map_attack. The column is overwritten in place, so re-running this cell on
# a live kernel would pass already-mapped integers to map_attack, which raises
# ValueError for them. The dtype guard skips frames that are already mapped,
# making the cell safe to run more than once.
for split_frame in (train_data, test_data):
    if not pd.api.types.is_integer_dtype(split_frame['label']):
        split_frame['label'] = split_frame['label'].apply(map_attack)

print("Mapped Train Label Distribution:")
print(train_data['label'].value_counts())

print("\nMapped Test Label Distribution:")
print(test_data['label'].value_counts())



Mapped Train Label Distribution:
label
0    67343
1    45927
2    11656
3     1047
Name: count, dtype: int64

Mapped Test Label Distribution:
label
0    9711
1    7458
3    2954
2    2421
Name: count, dtype: int64


In [15]:
# Total number of missing cells across every column of each split
# (per-column counts summed into a single number).
train_null_total = train_data.isna().sum().sum()
test_null_total = test_data.isna().sum().sum()

print("\nNull values in train:")
print(train_null_total)

print("\nNull values in test:")
print(test_null_total)



Null values in train:
0

Null values in test:
0


In [16]:
# Persist the label-mapped frames under ../data/processed. DataFrame.to_csv
# does not create missing parent directories, so make sure the output folder
# exists first; otherwise this cell fails on a checkout where it is absent.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False leaves the row index out of the written files.
train_data.to_csv(processed_dir / "train_day1.csv", index=False)
test_data.to_csv(processed_dir / "test_day1.csv", index=False)

print("\n✅ Day 1 completed successfully (4-class IDS).")



✅ Day 1 completed successfully (4-class IDS).
