# Load the data
dataload.py

In [2]:
import numpy as np
import pandas as pd

import pickle

def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    obj = {}
    with open(filename, 'rb') as input:
        obj = pickle.load(input)
    return obj

# process the dataset from raw file to DataFrame
# returns df
def init_dataset(filename='kddcup.data.txt'):
    # read the CSV without header
    # (discard row 0 if it contains the header)
    # fix the malformed row 4817100 by removing columns 0:14
    df = pd.read_csv(filename, error_bad_lines=False, header=None, engine='c', memory_map=True)
    df1 = pd.read_csv(filename, header=None, skiprows=4817100-1, nrows=1, engine='c', memory_map=True).iloc[:, 14:]
    #df1.columns = df.columns
    df = df.append(df1)
    if df.iloc[0, 0] == 'duration':
        df = df[1:]
    del(df1)
    df.reset_index(drop=True, inplace=True)

    # read in the headers (exclude first row)
    header = open('kddcup.names').readlines()[1:]
    header = [d.split(':')[0] for d in header]

    # set the column names on the DataFrame
    df.columns = header + ['attack']

    # remove trailing '.' in the attack labels
    df['attack'] = df['attack'].str.slice(0, -1)

    # add new column 'attack_type' containing
    #     dos, u2r, r2l, probe, normal
    attack_types = [d.split() for d in open('training_attack_types.txt').readlines()[:-1]]
    attack_types.append(['normal', 'normal'])
    attack_types = np.array(attack_types)
    attack_types = pd.DataFrame({ 'attack_type' : attack_types[:,1] },
        index=attack_types[:,0])
    df['attack_type'] = attack_types.loc[df.attack].attack_type.values
    return df

from sklearn.model_selection import train_test_split

def make_data():
    df = init_dataset()

    # the processed dataset is now in DataFrame 'df'.
    # we first split it into train/test, before doing any analysis

    x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, :-2], df.iloc[:, -2:], test_size=0.1, random_state=4129)

    save_object([x_train, y_train], 'train.dat')
    save_object([x_test, y_test], 'test.dat')

# Feature transform

In [16]:
x_train_r, y_train = load_object('train_r.dat')
x_test_r, y_test = load_object('test.dat')

In [17]:
print(x_train_r.shape)
print(y_train.shape)
print(x_test_r.shape)
print(y_test.shape)

(100000, 37)
(100000,)
(489844, 41)
(489844, 2)


In [69]:
y_train

array(['dos', 'dos', 'dos', ..., 'u2r', 'u2r', 'u2r'], dtype=object)

# Feature transform

# Scaling part two.

In [3]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
x_train, y_train = load_object('train_r.dat')
x_test,y_test = load_object('test.dat')
x_test = x_test.iloc[:,4:]

In [4]:
x_test = x_test.fillna(0)
# https://stackoverflow.com/questions/13295735/how-can-i-replace-all-the-nan-values-with-zeros-in-a-column-of-a-pandas-datafra

In [5]:
x_test = np.asarray(x_test)
np.argwhere(np.isnan(x_test))
# with thanks to... 
# https://stackoverflow.com/questions/37754948/how-to-get-the-indices-list-of-all-nan-value-in-numpy-array

array([], shape=(0, 2), dtype=int64)

In [6]:
s_scaler = StandardScaler()
m_scaler = MinMaxScaler()
ss_train = s_scaler.fit_transform(x_train)
mm_train = m_scaler.fit_transform(x_train)

In [7]:
ss_test = s_scaler.fit_transform(x_test)
mm_test = m_scaler.fit_transform(x_test)

In [8]:
y_test = y_test.iloc[:,1]
y_test.head()

3086687       dos
578347        dos
3732971       dos
3994439       dos
813098     normal
Name: attack_type, dtype: object

# Models
initial set is logistic only because i got stuck preprocessing test

In [9]:
bin_train, y_train = load_object('train_r_bin.dat')

In [10]:
mms_train, y_train = load_object('train_r_scaled.dat')

In [11]:
# basic logistic regression model
from sklearn import linear_model

logit = linear_model.LogisticRegression() 
logit.fit(x_train, y_train)
# note: x_train, y_train = load_object('train_r.dat') -- defined earlier. unscaled.
#logit.coef

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
print("R2 of the logistic model with no scaling (train set) is ", logit.score(x_train,y_train))

R2 of the logistic model with no scaling (train set) is  0.58341


In [13]:
logit.fit(ss_train,y_train)
print("R2 of the logistic model with Standard Scaling (train set) is ", logit.score(ss_train,y_train))

R2 of the logistic model with Standard Scaling (train set) is  0.9348


In [29]:
logit.fit(mm_train,y_train)
print("R2 of the logistic model with MinMax Scaling (train set) is ", logit.score(mm_train,y_train))

R2 of the logistic model with MinMax Scaling (train set) is  0.91659


In [67]:
logit.fit(mms_train,y_train)
print("R2 of the logistic model with MinMax Scaling Nic's way (train set) is ", logit.score(mm_train,y_train))

R2 of the logistic model with MinMax Scaling Nic's way (train set) is  0.9172


In [97]:
print("R2 of the logistic model with MinMax Scaler (test set) is ", logit.score(mm_test, y_test))

R2 of the logistic model with MinMax Scaler (test set) is  0.9727076375335821


In [98]:
print("R2 of the logistic model with Standard Scaler (test set) is ", logit.score(ss_test, y_test))

R2 of the logistic model with Standard Scaler (test set) is  0.7375245996684658


In [14]:
print("R2 of the logistic model with no scaling (test set) is ", logit.score(x_test, y_test))

R2 of the logistic model with no scaling (test set) is  0.8044499881594956


In [15]:
print("R2 of the logistic model with MinMax Scaling (test set) is ", logit.score(mm_test, y_test))

R2 of the logistic model with MinMax Scaling (test set) is  0.8887298813499808


In [16]:
print("R2 of the logistic model with Standard Scaling (test set) is ", logit.score(ss_test, y_test))

R2 of the logistic model with Standard Scaling (test set) is  0.70620850719821
