## Imports

In [19]:
import time

import numpy as np
import pandas as pd
import requests
from scipy.stats import uniform
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.svm import SVC

## Load Dataset

In [20]:
# Load the column names from the KDD Cup 99 "names" file.
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names"

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being parsed into bogus column names.
response = requests.get(url, timeout=30)
response.raise_for_status()
lines = response.text.splitlines()

# Every line that contains a colon contributes the text before its first colon
# as a column name; lines without a colon are skipped.
columns = [line.split(':')[0].strip() for line in lines if ':' in line]

# 'label' is appended as the name of the final column of the data file.
columns.append('label')

print(columns)
print(f"Total columns: {len(columns)}")

['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label']
Total columns: 42


In [21]:
# Read the gzipped data file straight from its URL, labelling the columns with
# the names collected in `columns`, then preview the first rows.
url = "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz"
df = pd.read_csv(
    url,
    names=columns,
)
preview = df.head()
print(preview)

   duration protocol_type service flag  src_bytes  dst_bytes  land  \
0         0           tcp    http   SF        215      45076     0   
1         0           tcp    http   SF        162       4528     0   
2         0           tcp    http   SF        236       1228     0   
3         0           tcp    http   SF        233       2032     0   
4         0           tcp    http   SF        239        486     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                   0   
1               0       0    0  ...                   1   
2               0       0    0  ...                   2   
3               0       0    0  ...                   3   
4               0       0    0  ...                   4   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     0.0                     0.0   
1                     1.0                     0.0   
2                     1.0                     0.0   
3           

In [22]:
# Overview of the loaded frame: dimensions, column index and per-column dtypes.
for heading, summary in (
    ("Shape:", df.shape),
    ("Columns:", df.columns),
    ("Data Types: ", df.dtypes),
):
    print(heading)
    print(summary)



Shape:
(4898431, 42)
Columns:
Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')
Data Types: 
duration                         int64
protocol_type                   object
service                     

In [23]:
# A look at a single record. Note: iloc is zero-based, so iloc[1] is the row at
# position 1 (the second row of the frame); position 0 would be the first row.
print(df.iloc[1])

duration                             0
protocol_type                      tcp
service                           http
flag                                SF
src_bytes                          162
dst_bytes                         4528
land                                 0
wrong_fragment                       0
urgent                               0
hot                                  0
num_failed_logins                    0
logged_in                            1
num_compromised                      0
root_shell                           0
su_attempted                         0
num_root                             0
num_file_creations                   0
num_shells                           0
num_access_files                     0
num_outbound_cmds                    0
is_host_login                        0
is_guest_login                       0
count                                2
srv_count                            2
serror_rate                        0.0
srv_serror_rate          

# Preprocessing

### Categorical vs Numeric Features

In [24]:
# Split the column names by dtype: object-typed columns are treated as
# categorical/string features, everything else as numeric.
object_frame = df.select_dtypes(include='object')
non_object_frame = df.select_dtypes(exclude='object')

categorical_cols = list(object_frame.columns)
numeric_cols = list(non_object_frame.columns)

for caption, names in (("Categorical columns:", categorical_cols),
                       ("Numeric columns:", numeric_cols)):
    print(caption, names)


Categorical columns: ['protocol_type', 'service', 'flag', 'label']
Numeric columns: ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


### Missing Values

In [25]:
# NaN-based missing values: per-column counts, showing only columns with at least one.
missing_counts = df.isna().sum()
print(missing_counts.loc[missing_counts > 0])

# Placeholder-based missing values: report object columns that contain a literal '?'.
object_columns = df.select_dtypes(include='object')
for col in object_columns.columns:
    has_placeholder = object_columns[col].eq('?').any()
    if has_placeholder:
        print(f"Column '{col}' contains '?' values.")



Series([], dtype: int64)


No missing values!

## Verify Binary Columns

In [26]:
# Candidate binary columns: numeric dtype and at most 3 distinct values.
candidate_binary_cols = []
for col in df.columns:
    series = df[col]
    if not pd.api.types.is_numeric_dtype(series):
        continue
    if series.nunique() <= 3:
        candidate_binary_cols.append(col)

print(candidate_binary_cols)

# List the distinct non-null values of each candidate so that columns which are
# not truly 0/1 stand out.
for col in candidate_binary_cols:
    unique_vals = df[col].dropna().unique().tolist()
    unique_vals = sorted(df[col].dropna().unique())
    print(f"{col}: {unique_vals}")



['land', 'wrong_fragment', 'logged_in', 'root_shell', 'su_attempted', 'num_shells', 'num_outbound_cmds', 'is_host_login', 'is_guest_login']
land: [np.int64(0), np.int64(1)]
wrong_fragment: [np.int64(0), np.int64(1), np.int64(3)]
logged_in: [np.int64(0), np.int64(1)]
root_shell: [np.int64(0), np.int64(1)]
su_attempted: [np.int64(0), np.int64(1), np.int64(2)]
num_shells: [np.int64(0), np.int64(1), np.int64(2)]
num_outbound_cmds: [np.int64(0)]
is_host_login: [np.int64(0), np.int64(1)]
is_guest_login: [np.int64(0), np.int64(1)]


`su_attempted` should be binary (only two distinct values), but it also contains the value 2; it is recoded to 1 below.

In [27]:
# Clean column: fold the stray value 2 into 1 so su_attempted only holds 0/1.
su_attempted_recode = {2: 1}
df['su_attempted'] = df['su_attempted'].replace(su_attempted_recode)



### Encode features

In [28]:
# Map each raw label (as it appears in the 'label' column, with its trailing dot)
# to one of five categories: DoS, Probe, R2L, U2R or Normal.
attack_mapping = {
    'back.': 'DoS',
    'land.': 'DoS',
    'neptune.': 'DoS',
    'pod.': 'DoS',
    'smurf.': 'DoS',
    'teardrop.': 'DoS',
    'mailbomb.': 'DoS',
    'apache2.': 'DoS',
    'processtable.': 'DoS',
    'udpstorm.': 'DoS',

    'satan.': 'Probe',
    'ipsweep.': 'Probe',
    'nmap.': 'Probe',
    'portsweep.': 'Probe',
    'mscan.': 'Probe',
    'saint.': 'Probe',

    'guess_passwd.': 'R2L',
    'ftp_write.': 'R2L',
    'imap.': 'R2L',
    'phf.': 'R2L',
    'multihop.': 'R2L',
    'warezmaster.': 'R2L',
    'warezclient.': 'R2L',
    'spy.': 'R2L',
    'xlock.': 'R2L',
    'xsnoop.': 'R2L',
    'snmpguess.': 'R2L',
    'snmpgetattack.': 'R2L',
    'httptunnel.': 'R2L',
    'sendmail.': 'R2L',
    'named.': 'R2L',

    'buffer_overflow.': 'U2R',
    'loadmodule.': 'U2R',
    'rootkit.': 'U2R',
    'perl.': 'U2R',
    'sqlattack.': 'U2R',
    'xterm.': 'U2R',
    'ps.': 'U2R',

    'normal.': 'Normal'
}

# Assign each row one of the 5 categories and store it in "attack_category".
df['attack_category'] = df['label'].map(attack_mapping)

# Series.map yields NaN for any label that is missing from the dictionary. Fail
# here with the offending labels instead of letting NaN targets flow into the
# label encoding and the models.
unmapped_labels = df.loc[df['attack_category'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from attack_mapping: {list(unmapped_labels)}")



In [29]:
le = LabelEncoder()

# Preprocessing (categorical).
# One-hot encode service, protocol_type and flag: each distinct value of those
# columns becomes its own indicator column. get_dummies returns a new frame, so
# `df` itself is left untouched and no explicit df.copy() is needed beforehand.
df_encoded = pd.get_dummies(df, columns=['service', 'protocol_type', 'flag'])
# ^^ this is good for forest models, but might hurt the performance of SVM
# PCA could be used to reduce the dimensionality for SVM

# Encode labels: replaces the string attack categories with integers.
df_encoded['attack_category'] = le.fit_transform(df['attack_category'])

# Integer code assigned to each category, computed once and reused for both the
# lookup dict and the display table.
encoded_classes = le.transform(le.classes_)
label_mapping = dict(zip(le.classes_, encoded_classes))

label_mapping_df = pd.DataFrame({
    'original_label': le.classes_,
    'encoded_label': encoded_classes
})

print(label_mapping_df)

print(df_encoded.shape)
#High dimensionality now!
#High dimensionality now!



  original_label  encoded_label
0            DoS              0
1         Normal              1
2          Probe              2
3            R2L              3
4            U2R              4
(4898431, 124)


## Preprocessing(feature scaling)

### Start by identifying features we DON'T want to scale (binary and encoded columns)

In [30]:
# Binary columns: exactly two distinct non-null values, both drawn from {0, 1}.
binary_cols = []
for col in df_encoded.columns:
    distinct_values = df_encoded[col].dropna().unique()
    if len(distinct_values) == 2 and set(distinct_values).issubset({0, 1}):
        binary_cols.append(col)

# One-hot indicator columns (recognised by their prefix) and the target-related
# columns are excluded from scaling as well.
onehot_prefixes = ('service_', 'protocol_type_', 'flag_')
target_like_cols = ['label', 'attack_category', 'attack_category_encoded']
onehot_and_label_cols = [
    col for col in df_encoded.columns
    if col.startswith(onehot_prefixes) or col in target_like_cols
]

# Everything that must be left unscaled.
non_scaled_cols = set(onehot_and_label_cols + binary_cols)

# What remains, in original column order, gets scaled.
features_to_scale = [
    col for col in df_encoded.columns
    if col not in non_scaled_cols
]

print(features_to_scale)

['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


### Scale the features!

In [31]:
# Standardise the selected columns and write them back into df_encoded.
# NOTE(review): the scaler is fit on the whole frame, i.e. before the train/test
# splits made in the model sections, so test rows influence the scaling
# statistics. Confirm whether fitting on the training split only is wanted.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[features_to_scale])
df_encoded[features_to_scale] = scaled_values

# Sanity check: per-feature mean and standard deviation after scaling.
df_encoded[features_to_scale].describe().loc[['mean', 'std']].T



Unnamed: 0,mean,std
duration,-1.955576e-16,1.0
src_bytes,4.24105e-18,1.0
dst_bytes,-1.064705e-18,1.0
wrong_fragment,-1.8549650000000002e-17,1.0
urgent,3.1912139999999996e-20,1.0
hot,8.552163000000001e-17,1.0
num_failed_logins,2.9557890000000004e-17,1.0
num_compromised,5.751436999999999e-19,1.0
num_root,1.0953840000000001e-17,1.0
num_file_creations,2.500461e-17,1.0


# The Models

## Random Forest

In [59]:
# Features are every column except the two target-related ones; the target is
# the integer-encoded attack category.
target_related_cols = ['label', 'attack_category']
X = df_encoded.drop(columns=target_related_cols)
y = df_encoded['attack_category']

# 85/15 split, stratified on the target, with a fixed seed.
split_parts = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts


In [60]:
# Random forest with balanced class weights, using all available cores.
rf_settings = dict(
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)


In [61]:
# Score the fitted forest on the held-out split.
y_pred = rf_model.predict(X_test)

rf_report = classification_report(y_test, y_pred)
rf_confusion = confusion_matrix(y_test, y_pred)

print("Classification Report:")
print(rf_report)

print("Confusion Matrix:")
print(rf_confusion)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     58719
           1       1.00      1.00      1.00     14592
           2       1.00      1.00      1.00       616
           3       0.99      0.94      0.97       169
           4       0.64      0.88      0.74         8

    accuracy                           1.00     74104
   macro avg       0.92      0.96      0.94     74104
weighted avg       1.00      1.00      1.00     74104

Confusion Matrix:
[[58716     3     0     0     0]
 [    1 14585     3     1     2]
 [    0     0   616     0     0]
 [    1     7     0   159     2]
 [    0     1     0     0     7]]


In [62]:
# Exhaustive hyper-parameter search for the random forest.
# GridSearchCV, RandomForestClassifier and classification_report all come from
# the notebook's import cell, so nothing is imported here.

# Base model
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# 2 * 4 * 3 * 3 * 2 = 144 parameter combinations
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']
}

# Grid search
# 'f1_weighted' averages the per-class F1 weighted by class support, so the
# score is dominated by the largest classes. If the rare classes (R2L, U2R)
# should drive model selection, 'f1_macro' weights every class equally.
# NOTE(review): both the forest and the search request n_jobs=-1; confirm this
# nesting does not oversubscribe the CPU on the target machine.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Evaluate the refit best estimator on the held-out split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print(classification_report(y_test, y_pred))





Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best parameters: {'class_weight': 'balanced', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best score: 0.9998016346061069
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     58719
           1       1.00      1.00      1.00     14592
           2       1.00      1.00      1.00       616
           3       0.99      0.93      0.96       169
           4       0.64      0.88      0.74         8

    accuracy                           1.00     74104
   macro avg       0.93      0.96      0.94     74104
weighted avg       1.00      1.00      1.00     74104



In [63]:
print(best_rf)

RandomForestClassifier(class_weight='balanced', max_depth=30,
                       min_samples_split=10, n_jobs=-1, random_state=42)


## Support Vector Machines (SVM)

In [64]:
# Search space: only C varies (scipy's uniform(loc, scale) draws from
# [loc, loc + scale], i.e. [0.1, 5.1] here); kernel, gamma and class weighting are fixed.
param_dist = {
    'C': uniform(0.1, 5),
    'gamma': ['scale'],
    'kernel': ['rbf'],
    'class_weight': ['balanced'],
}

# Randomized search: 5 sampled candidates, 2-fold CV, scored by weighted F1.
svm_base = SVC(random_state=0)
random = RandomizedSearchCV(
    estimator=svm_base,
    param_distributions=param_dist,
    n_iter=5,
    cv=2,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1,
    random_state=0,
)

# Run the search on the training split
random.fit(X_train, y_train)

# Keep the best candidate found by the search
best_svm = random.best_estimator_


Fitting 2 folds for each of 5 candidates, totalling 10 fits


In [65]:
# Time the prediction pass over the held-out split (wall-clock seconds).
start = time.time()
y_pred = best_svm.predict(X_test)
end = time.time()
runtime = end - start

# Evaluation metrics
labels = np.unique(y_test)
svm_accuracy = accuracy_score(y_test, y_pred)
svm_report = classification_report(y_test, y_pred, zero_division=0)
svm_confusion = confusion_matrix(y_test, y_pred)

print(f"Overall Accuracy: {svm_accuracy:.4f}")
print(f"Runtime analysis: {runtime: .4f}")
print("Classification report: \n", svm_report)
print("Confusion matrix: \n", svm_confusion)

Overall Accuracy: 0.9986
Runtime analysis:  15.3607
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     58719
           1       1.00      0.99      1.00     14592
           2       0.97      1.00      0.99       616
           3       0.73      0.94      0.82       169
           4       0.56      0.62      0.59         8

    accuracy                           1.00     74104
   macro avg       0.85      0.91      0.88     74104
weighted avg       1.00      1.00      1.00     74104

Confusion matrix: 
 [[58713     6     0     0     0]
 [    4 14509    17    58     4]
 [    0     0   616     0     0]
 [    1     9     0   159     0]
 [    0     3     0     0     5]]


# Isolation Forests

In [36]:
# Handling Missing Values
# Numeric columns are filled with their median, every other column with its most
# frequent value. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on the df_encoded[col] selection is chained assignment,
# which pandas warns about and which no longer updates the frame under Copy-on-Write.
for col in df_encoded.columns:
    if df_encoded[col].isnull().any():
        if pd.api.types.is_numeric_dtype(df_encoded[col]):
            fill_value = df_encoded[col].median()
        else:
            fill_value = df_encoded[col].mode()[0]
        df_encoded[col] = df_encoded[col].fillna(fill_value)

# Features come from the encoded and scaled df_encoded; the binary target comes
# from the raw labels: 0 = 'normal.', 1 = anything else.
X = df_encoded.drop(['label', 'attack_category'], axis=1)
y = (df['label'] != 'normal.').astype(int)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# Normal-only training rows: the forest is fit on normal traffic only.
X_train_norm = X_train[y_train == 0]

# 10% sample of the normal training rows.
# NOTE(review): this sample is not what the forest is fit on below (the fit uses
# every normal training row, as the original cell did). Confirm whether the
# sample was meant to be the training set.
X_train_norm_sampled = X_train_norm.sample(frac=0.1, random_state=42)

# Each of the 500 trees is built on 4096 sampled rows; contamination=0.01 sets
# the expected outlier fraction used to derive the decision threshold.
iso = IsolationForest(
    n_estimators=500,
    max_samples=4096,
    contamination=0.01,
    random_state=42,
    verbose=1
)
iso.fit(X_train_norm)

# Predict and map 0=normal, 1=attack
raw_preds = iso.predict(X_test)             # +1=inlier, -1=outlier
y_pred = pd.Series(raw_preds).map({1: 0, -1: 1})

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    4.6s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:   10.4s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   11.6s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    2.1s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.3s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    3.7s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    8.4s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    9.4s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    3.7s
[Parallel(n_jobs=1)]: Done 449 tasks      | elaps

In [35]:
# Compare the isolation-forest predictions with the binary ground truth
# (0 = Normal, 1 = Attack).
iso_confusion = confusion_matrix(y_test, y_pred)
iso_report = classification_report(y_test, y_pred, target_names=['Normal','Attack'])
iso_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(iso_confusion)
print("\nClassification Report:")
print(iso_report)
print(f"\nAccuracy: {iso_accuracy:.5f}")


Confusion Matrix:
[[ 288965    2869]
 [   1467 1176229]]

Classification Report:
              precision    recall  f1-score   support

      Normal       0.99      0.99      0.99    291834
      Attack       1.00      1.00      1.00   1177696

    accuracy                           1.00   1469530
   macro avg       1.00      0.99      1.00   1469530
weighted avg       1.00      1.00      1.00   1469530


Accuracy: 0.99705
