In [None]:
#installs
!pip3 install requests

## Imports

In [117]:
import pandas as pd
import requests
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


## Load Dataset

In [None]:
# Load the feature names from the KDD Cup 99 description file.
# Uses the same https host as the dataset download, bounds the request with a
# timeout, and fails loudly on an HTTP error instead of parsing an error page.
url = "https://kdd.ics.uci.edu/databases/kddcup99/kddcup.names"
response = requests.get(url, timeout=30)
response.raise_for_status()
lines = response.text.splitlines()

# Only lines containing ':' are used; the text before the first ':' is the
# feature name. Lines without a colon are skipped.
columns = [line.split(':')[0].strip() for line in lines if ':' in line]

# The names collected above cover the features only; the target column of the
# data file is appended by hand.
columns.append('label')

print(columns)
print(f"Total columns: {len(columns)}")

['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label']
Total columns: 42


In [None]:
# Download the gzipped 10% sample and read it as CSV, using the previously
# collected `columns` list as the header (the file itself supplies no header row
# to pandas because explicit names are given).
url = "https://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz"
df = pd.read_csv(url, header=None, names=columns)
print(df.head(5))

   duration protocol_type service flag  src_bytes  dst_bytes  land  \
0         0           tcp    http   SF        181       5450     0   
1         0           tcp    http   SF        239        486     0   
2         0           tcp    http   SF        235       1337     0   
3         0           tcp    http   SF        219       1337     0   
4         0           tcp    http   SF        217       2032     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                   9   
1               0       0    0  ...                  19   
2               0       0    0  ...                  29   
3               0       0    0  ...                  39   
4               0       0    0  ...                  49   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     1.0                     0.0   
1                     1.0                     0.0   
2                     1.0                     0.0   
3           

In [106]:
# Quick structural overview of the loaded frame: dimensions, column index, dtypes.
# Each heading and its value are printed on separate lines.
print("Shape:", df.shape, sep="\n")
print("Columns:", df.columns, sep="\n")
print("Data Types: ", df.dtypes, sep="\n")



Shape:
(494021, 42)
Columns:
Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')
Data Types: 
duration                         int64
protocol_type                   object
service                      

In [107]:
# A look at a single record. Note: .iloc is 0-based, so .iloc[1] is the
# second row of the frame (use .iloc[0] for the very first row).
print(df.iloc[1])

duration                             0
protocol_type                      tcp
service                           http
flag                                SF
src_bytes                          239
dst_bytes                          486
land                                 0
wrong_fragment                       0
urgent                               0
hot                                  0
num_failed_logins                    0
logged_in                            1
num_compromised                      0
root_shell                           0
su_attempted                         0
num_root                             0
num_file_creations                   0
num_shells                           0
num_access_files                     0
num_outbound_cmds                    0
is_host_login                        0
is_guest_login                       0
count                                8
srv_count                            8
serror_rate                        0.0
srv_serror_rate          

# Preprocessing

### Categorical vs Numeric Features

In [None]:
# Partition the columns by dtype.
# Object-dtype columns are treated as categorical / string features ...
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# ... and every non-object column is treated as numeric.
numeric_cols = list(df.select_dtypes(exclude=['object']).columns)

print(f"Categorical columns: {categorical_cols}")
print(f"Numeric columns: {numeric_cols}")


Categorical columns: ['protocol_type', 'service', 'flag', 'label']
Numeric columns: ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


### Missing Values

In [109]:
# Missing values, part 1: count NaNs per column and show only the columns that have any.
missing_counts = df.isna().sum()
print(missing_counts.loc[missing_counts.gt(0)])

# Missing values, part 2: some datasets mark missing entries with a literal '?'.
# Report every object-dtype column that contains one.
for col in df.select_dtypes(include='object').columns:
    if df[col].eq('?').any():
        print(f"Column '{col}' contains '?' values.")



Series([], dtype: int64)


No missing values: no column contains NaNs, and no text column contains a '?' placeholder.

## Verify Binary Columns

In [110]:
# Candidate binary columns: numeric dtype with at most 3 distinct values.
# The threshold is deliberately loose so that flag-like columns carrying a stray
# extra value show up for inspection.
candidate_binary_cols = [
    name for name, series in df.items()
    if pd.api.types.is_numeric_dtype(series) and series.nunique() <= 3
]

print(candidate_binary_cols)

# List the distinct (non-null) values of each candidate in ascending order.
for col in candidate_binary_cols:
    unique_vals = list(df[col].dropna().unique())
    unique_vals.sort()
    print(col + ":", unique_vals)



['land', 'wrong_fragment', 'logged_in', 'root_shell', 'su_attempted', 'num_shells', 'num_outbound_cmds', 'is_host_login', 'is_guest_login']
land: [np.int64(0), np.int64(1)]
wrong_fragment: [np.int64(0), np.int64(1), np.int64(3)]
logged_in: [np.int64(0), np.int64(1)]
root_shell: [np.int64(0), np.int64(1)]
su_attempted: [np.int64(0), np.int64(1), np.int64(2)]
num_shells: [np.int64(0), np.int64(1), np.int64(2)]
num_outbound_cmds: [np.int64(0)]
is_host_login: [np.int64(0)]
is_guest_login: [np.int64(0), np.int64(1)]


Problem with `su_attempted`: it takes three distinct values (0, 1, 2) but should only have two (0 and 1).

In [None]:
# Clean su_attempted: fold the stray value 2 into 1 so the column only holds 0/1.
# Re-running this cell is harmless because no 2s remain after the first run.
df['su_attempted'] = df['su_attempted'].replace(to_replace=2, value=1)



### Encode features

In [112]:
# Map each raw label (note the trailing '.') to one of five coarse classes:
# DoS, Probe, R2L, U2R or Normal.
attack_mapping = {
    'back.': 'DoS',
    'land.': 'DoS',
    'neptune.': 'DoS',
    'pod.': 'DoS',
    'smurf.': 'DoS',
    'teardrop.': 'DoS',
    'mailbomb.': 'DoS',
    'apache2.': 'DoS',
    'processtable.': 'DoS',
    'udpstorm.': 'DoS',

    'satan.': 'Probe',
    'ipsweep.': 'Probe',
    'nmap.': 'Probe',
    'portsweep.': 'Probe',
    'mscan.': 'Probe',
    'saint.': 'Probe',

    'guess_passwd.': 'R2L',
    'ftp_write.': 'R2L',
    'imap.': 'R2L',
    'phf.': 'R2L',
    'multihop.': 'R2L',
    'warezmaster.': 'R2L',
    'warezclient.': 'R2L',
    'spy.': 'R2L',
    'xlock.': 'R2L',
    'xsnoop.': 'R2L',
    'snmpguess.': 'R2L',
    'snmpgetattack.': 'R2L',
    'httptunnel.': 'R2L',
    'sendmail.': 'R2L',
    'named.': 'R2L',

    'buffer_overflow.': 'U2R',
    'loadmodule.': 'U2R',
    'rootkit.': 'U2R',
    'perl.': 'U2R',
    'sqlattack.': 'U2R',
    'xterm.': 'U2R',
    'ps.': 'U2R',

    'normal.': 'Normal'
}
# Assign each record one of the 5 classes and store it in "attack_category".
df['attack_category'] = df['label'].map(attack_mapping)

# Series.map yields NaN for any label that is missing from the dictionary.
# Fail here, naming the offending labels, rather than letting NaNs leak into
# the later encoding step.
unmapped_labels = df.loc[df['attack_category'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from attack_mapping: {list(unmapped_labels)}")



In [113]:
le = LabelEncoder()

# Preprocessing (categorical).
# One-hot encode service, protocol_type and flag: each distinct value becomes its
# own indicator column. get_dummies returns a new frame, so `df` itself is left
# unchanged and no separate copy is needed beforehand.
df_encoded = pd.get_dummies(df, columns=['service', 'protocol_type', 'flag'])
# ^^ this is good for forest models, but might hurt the performance of SVM
# PCA could be used to reduce the dimensionality for SVM

# Encode the target: replace the string attack categories with integers.
df_encoded['attack_category'] = le.fit_transform(df['attack_category'])

# Keep the class-name -> integer correspondence, both as a dict and as a small
# table for display. The integer codes are computed once and shared by both.
encoded_classes = le.transform(le.classes_)
label_mapping = dict(zip(le.classes_, encoded_classes))

label_mapping_df = pd.DataFrame({
    'original_label': le.classes_,
    'encoded_label': encoded_classes
})

print(label_mapping_df)

print(df_encoded.shape)
# High dimensionality now!
#High dimensionality now!



  original_label  encoded_label
0            DoS              0
1         Normal              1
2          Probe              2
3            R2L              3
4            U2R              4
(494021, 120)


## Preprocessing (feature scaling)

### Start by identifying the features we DON'T want to scale (binary and encoded columns)

In [115]:
# Binary columns: exactly two distinct non-null values, both taken from {0, 1}.
# Columns holding a single constant value do not satisfy the "exactly two"
# condition and are therefore not counted as binary here.
binary_cols = [
    name for name, series in df_encoded.items()
    if series.dropna().nunique() == 2
    and set(series.dropna().unique()) <= {0, 1}
]

# One-hot indicator columns (recognised by their prefix) and the label columns
# must not be scaled either.
onehot_and_label_cols = [
    name for name in df_encoded.columns
    if name.startswith(('service_', 'protocol_type_', 'flag_'))
    or name in ('label', 'attack_category', 'attack_category_encoded')
]

# Everything that must be left untouched by the scaler.
non_scaled_cols = set(onehot_and_label_cols).union(binary_cols)

# The remaining columns, in their original order, are the ones to scale.
features_to_scale = [name for name in df_encoded.columns if name not in non_scaled_cols]

print(features_to_scale)

['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


### Scale the features!

In [118]:
# Standardise the selected columns, fitting the scaler on those same columns,
# and write the scaled values back into df_encoded.
scaler = StandardScaler().fit(df_encoded[features_to_scale])
df_encoded[features_to_scale] = scaler.transform(df_encoded[features_to_scale])

# Sanity check: show the per-column mean and standard deviation after scaling.
df_encoded[features_to_scale].describe().loc[['mean', 'std']].T



Unnamed: 0,mean,std
duration,2.1718100000000002e-17,1.000001
src_bytes,6.508237e-19,1.000001
dst_bytes,4.717573e-18,1.000001
wrong_fragment,3.106694e-18,1.000001
urgent,2.277883e-18,1.000001
hot,-2.0351730000000003e-17,1.000001
num_failed_logins,-1.5749220000000002e-17,1.000001
num_compromised,-1.510199e-18,1.000001
num_root,-1.941684e-18,1.000001
num_file_creations,7.155464999999999e-19,1.000001
