In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
DATA_DIR = '../data'
FILE_PATH_TRAIN = os.path.join(DATA_DIR, 'raw/KDDTrain+_20Percent.csv')
FILE_PATH_TEST = os.path.join(DATA_DIR, 'raw/KDDTest-21.csv')

In [3]:
df_train = pd.read_csv(FILE_PATH_TRAIN, header=None, skiprows=[0])
df_test = pd.read_csv(FILE_PATH_TEST, header=None, skiprows=[0])

In [4]:
# Reset column names for training set
df_train.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
                    'urgent', 'hot',
                    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
                    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
                    'num_access_files', 'num_outbound_cmds', 'is_host_login',
                    'is_guest_login', 'count', 'srv_count', 'serror_rate',
                    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
                    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
                    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                    'dst_host_same_src_port_rate',
                    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
                    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
                    'dst_host_srv_rerror_rate', 'subclass', 'difficulty_level']

# Reset column names for testing set
df_test.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
                   'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
                   'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
                   'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
                   'num_access_files', 'num_outbound_cmds', 'is_host_login',
                   'is_guest_login', 'count', 'srv_count', 'serror_rate',
                   'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
                   'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
                   'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                   'dst_host_same_src_port_rate',
                   'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
                   'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
                   'dst_host_srv_rerror_rate', 'subclass', 'difficulty_level']

In [5]:
# accessing names of training columns
lst_names = df_train.columns  # returns a list of column names
testlst_names = df_test.columns

In [6]:
# Dropping the last columns of training set
df_train = df_train.drop('difficulty_level', 1)  # we don't need it in this project

# Dropping the last columns of testing set
df_test = df_test.drop('difficulty_level', 1)

  df_train = df_train.drop('difficulty_level', 1)  # we don't need it in this project
  df_test = df_test.drop('difficulty_level', 1)


In [7]:
df_train.isnull().values.any()
df_test.isnull().values.any()

False

In [8]:
# defining col list
cols = ['protocol_type', 'service', 'flag']

In [9]:
# One-hot encoding
def one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(each, 1)
    return df

In [10]:
# Merging train and test data
#combined_data = pd.concat([df_train, df_test])

# Applying one hot encoding to combined data
df_train = one_hot(df_train, cols)
df_test = one_hot(df_test, cols)

  df = df.drop(each, 1)
  df = df.drop(each, 1)
  df = df.drop(each, 1)
  df = df.drop(each, 1)
  df = df.drop(each, 1)
  df = df.drop(each, 1)


In [11]:
# Liran: use sklearn function

# Function to min-max normalize
def normalize(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with normalized specified features
    """
    result = df.copy()  # do not touch the original df
    for feature_name in ['src_bytes']:
        try:
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()

            if max_value > min_value:
                result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
        except Exception as ex:
            print(feature_name)
            print(ex)
    return result

In [12]:
# Dropping subclass column for training set
tmp = combined_data.pop('subclass')

# Normalizing training set
new_train_df = normalize(combined_data, combined_data.columns)

NameError: name 'combined_data' is not defined

In [13]:
# Fixing labels for training set
classlist = []
check1 = (
    "apache2", "back", "land", "neptune", "mailbomb", "pod", "processtable", "smurf", "teardrop", "udpstorm", "worm")
check2 = ("ipsweep", "mscan", "nmap", "portsweep", "saint", "satan")
check3 = ("buffer_overflow", "loadmodule", "perl", "ps", "rootkit", "sqlattack", "xterm")
check4 = (
    "ftp_write", "guess_passwd", "httptunnel", "imap", "multihop", "named", "phf", "sendmail", "Snmpgetattack", "spy",
    "snmpguess", "warezclient", "warezmaster", "xlock", "xsnoop")

DoSCount = 0
ProbeCount = 0
U2RCount = 0
R2LCount = 0
NormalCount = 0

for item in tmp:
    if item in check1:
        classlist.append("dos")
        DoSCount = DoSCount + 1
    elif item in check2:
        classlist.append("probe")
        ProbeCount = ProbeCount + 1
    elif item in check3:
        classlist.append("u2r")
        U2RCount = U2RCount + 1
    elif item in check4:
        classlist.append("r2l")
        R2LCount = R2LCount + 1
    else:
        classlist.append("normal")
        NormalCount = NormalCount + 1

In [14]:
# Appending class column to training set
new_train_df["Class"] = classlist

new_train_df["Class"].value_counts()

new_train_df.isnull().values.any()

False

In [15]:
attack_dict = {
    'normal': 0,
    'dos': 1,
    'probe': 1,
    'u2r': 1,
    'r2l': 1
}

y_train = new_train_df["Class"]

In [16]:
y_train.isnull().values.any()

False

In [17]:
y_train = y_train.replace(attack_dict)

combined_data_X = new_train_df.drop('Class', 1)

  combined_data_X = new_train_df.drop('Class', 1)


In [18]:
min_max_scaler = preprocessing.MinMaxScaler()

combined_data_X_normalized_df = pd.DataFrame(min_max_scaler.fit_transform(combined_data_X))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(combined_data_X_normalized_df, y_train, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)  # 0.25 x 0.8 = 0.2

In [20]:
# Save the results
X_train.to_csv(os.path.join(DATA_DIR, 'processed_nslkdd', 'train/train_features.csv'))
X_val.to_csv(os.path.join(DATA_DIR, 'processed_nslkdd', 'val/val_features.csv'))
X_test.to_csv(os.path.join(DATA_DIR, 'processed_nslkdd', 'test/test_features.csv'))

y_train.to_csv(os.path.join(DATA_DIR, 'processed_nslkdd', 'train/train_labels.csv'))
y_val.to_csv(os.path.join(DATA_DIR, 'processed_nslkdd', 'val/val_labels.csv'))
y_test.to_csv(os.path.join(DATA_DIR, 'processed_nslkdd', 'test/test_labels.csv'))