<a href="https://colab.research.google.com/github/mrh-rakib/AI-cybersec/blob/main/load_KDD1999.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load and preprocess the KDD Cup 1999 dataset
Download the dataset from [Kaggle](https://www.kaggle.com/datasets/galaxyh/kdd-cup-1999-data?select=kddcup.data_10_percent.gz). Unzip the download and upload the *kddcup.data_10_percent.gz* file into the */content* folder of your Colab session. Alternatively, download the *.gz* file from the LMS and upload it to Colab.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column names for the dataset, in file order. Source:
# https://www.kaggle.com/datasets/galaxyh/kdd-cup-1999-data?resource=download&select=kddcup.names
# The final name, 'target', is the column holding the raw label of each record.
feature_names = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot
    num_failed_logins logged_in num_compromised root_shell su_attempted
    num_root num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login
    count srv_count
    serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate
    dst_host_serror_rate dst_host_srv_serror_rate
    dst_host_rerror_rate dst_host_srv_rerror_rate
    target
""".split()

# attack_types found from https://www.kaggle.com/datasets/galaxyh/kdd-cup-1999-data?resource=download&select=training_attack_types
# (label, category) pairs. The labels are written here without the trailing
# dot; the dot is appended below so that the keys match the dataset's labels.
_attack_categories = (
    ('normal', 'normal'),
    ('back', 'dos'),
    ('buffer_overflow', 'u2r'),
    ('ftp_write', 'r2l'),
    ('guess_passwd', 'r2l'),
    ('imap', 'r2l'),
    ('ipsweep', 'probe'),
    ('land', 'dos'),
    ('loadmodule', 'u2r'),
    ('multihop', 'r2l'),
    ('neptune', 'dos'),
    ('nmap', 'probe'),
    ('perl', 'u2r'),
    ('phf', 'r2l'),
    ('pod', 'dos'),
    ('portsweep', 'probe'),
    ('rootkit', 'u2r'),
    ('satan', 'probe'),
    ('smurf', 'dos'),
    ('spy', 'r2l'),
    ('teardrop', 'dos'),
    ('warezclient', 'r2l'),
    ('warezmaster', 'r2l'),
)

# Lookup table: dotted dataset label -> one of normal / dos / probe / r2l / u2r.
attack_types = {f"{name}.": category for name, category in _attack_categories}

In [None]:
# Read the gzip-compressed CSV into a DataFrame. Column names are supplied
# explicitly through `names`, so pandas reads every row of the file as data
# (header=None) instead of consuming the first row as a header.
df = pd.read_csv(
    "kddcup.data_10_percent.gz",
    names=feature_names,
    header=None,
    compression="gzip",
)

In [None]:
# Derive the coarse attack category from each record's raw label.
# A label that is missing from attack_types raises KeyError.
df['attack_type'] = df['target'].apply(attack_types.__getitem__)

# The raw label is no longer needed once the category column exists.
df.drop(columns=['target'], inplace=True)

In [None]:
# Discard constant columns: keep only those holding more than one distinct value.
df = df.loc[:, df.nunique() > 1]

In [None]:
# Columns removed from the analysis because each is highly correlated with
# another feature. Layout: {column to drop: feature it is correlated with}.
# NOTE(review): the pairings are the ones stated by the original author; they
# are not recomputed in this notebook.
redundant_with = {
    'num_root': 'num_compromised',
    'srv_serror_rate': 'serror_rate',
    'srv_rerror_rate': 'rerror_rate',
    'dst_host_srv_serror_rate': 'srv_serror_rate',
    'dst_host_serror_rate': 'rerror_rate',
    'dst_host_rerror_rate': 'srv_rerror_rate',
    'dst_host_srv_rerror_rate': 'rerror_rate',
    'dst_host_same_srv_rate': 'dst_host_srv_count',
    'srv_count': 'count',
}

# Drop the columns one at a time, in the order listed above.
for redundant_column in redundant_with:
    df.drop(columns=redundant_column, inplace=True)

In [None]:
# Integer codes for the categorical columns. Series.map yields NaN for any
# value that has no entry in the corresponding mapping.

# protocol_type feature mapping
pmap = dict(icmp=0, tcp=1, udp=2)

# flag feature mapping: values are numbered in the order listed
fmap = {
    flag: code
    for code, flag in enumerate(
        ['SF', 'S0', 'REJ', 'RSTR', 'RSTO', 'SH', 'S1', 'S2', 'RSTOS0', 'S3', 'OTH']
    )
}

# attack type feature mapping: categories are numbered in the order listed
amap = {
    category: code
    for code, category in enumerate(['dos', 'normal', 'probe', 'r2l', 'u2r'])
}

for column, codes in (('protocol_type', pmap), ('flag', fmap), ('attack_type', amap)):
    df[column] = df[column].map(codes)

# 'service' is removed rather than encoded.
df.drop(columns='service', inplace=True)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Features: every column except the encoded attack category.
X = df.drop(columns='attack_type')
# Labels: kept as a one-column DataFrame.
y = df.loc[:, ['attack_type']]

In [None]:
from sklearn.model_selection import train_test_split

# Split test and train data BEFORE scaling. Fitting the scaler on the full
# dataset would let the test rows' minima/maxima influence the preprocessing
# (data leakage). The split depends only on the number of rows and on
# random_state, so the same rows land in each partition as when splitting
# after scaling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# MinMaxScaler scales the data to a fixed range (by default, between 0 and 1)
# by subtracting the minimum value and dividing by the range of the data.
# The minimum and range are learned from the training rows only and then
# re-applied unchanged to the test rows, so test values may fall slightly
# outside [0, 1].
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep `X` as a scaled NumPy array, as it was before this change, for any
# later code that uses the full feature matrix.
X = scaler.transform(X)

print("Loaded \"X_train\" and \"X_test\" with shapes:")
print(X_train.shape, X_test.shape)
print("\nLoaded \"y_train\" and \"y_test\" with shapes:")
print(y_train.shape, y_test.shape)

NB: Parts of this program are taken from, and improve upon, https://www.kaggle.com/code/iamyajat/intrusion-detection-system-using-neural-networks, which has been released under the Apache 2.0 open-source license.