In [1]:
# To show the python version
import sys
assert sys.version_info >= (3,8)
print(sys.version_info)

# to show all the output from a code chunk
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# matplotlib for plotting
from matplotlib import pyplot as plt
%matplotlib inline

# Other external libraries like pandas and numpy
import pandas as pd
import numpy as np
np.random.seed(999)

# machine learning models
import sklearn
print(sklearn.__version__)

#import others
from sklearn.metrics import confusion_matrix
from sklearn import model_selection, datasets
from sklearn.tree import DecisionTreeClassifier
import joblib
import pickle
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


from scipy.io import arff
# Load the arff file
header_info = arff.loadarff("Dataset/KDDTrain+.arff")
df = pd.DataFrame(header_info[0])
header_names = df.columns.values.tolist()
header_names.append('Difficulty')
kdd_train = pd.read_csv('Dataset/KDDTrain+.txt', delimiter = ',', header = None)

#Rename the headers
kdd_train.columns = header_names
#Drop the target variables
kdd_train_x = kdd_train.drop(['class', 'Difficulty'], axis = 1)
y = ['class']
kdd_train_y = kdd_train[y]

#Identify categorical features
num_cols = kdd_train_x._get_numeric_data().columns
cate_cols = list(set(kdd_train_x.columns) - set(num_cols))

#One-Hot Encoding the categorical features
cate_features = pd.get_dummies(kdd_train_x[cate_cols], prefix = cate_cols)
one_hot_data = kdd_train_x.drop(cate_cols, axis = 1)
one_hot_data_maha = pd.concat([one_hot_data, cate_features], axis = 1)
one_hot_data_if = one_hot_data_maha
one_hot_data_if.head()
one_hot_data_maha.head()
data_array = np.array(one_hot_data_maha)
data_array.shape    # Make sure that the Numpy array dimension is consistent with the Pandas Dataframe
# function to calculate mahalanobis distance
label_map = {"normal": 0,
            "neptune": 1,
            "satan": 1,
            "ipsweep": 1,
            "portsweep": 1,
            "smurf": 1,
            "nmap": 1,
            "back": 1,
            "teardrop": 1,
            "warezclient": 1,
            "pod": 1,
            "guess_passwd": 1,
            "buffer_overflow": 1,
            "warezmaster": 1,
            "land": 1,
            "imap": 1,
            "rootkit": 1,
            "loadmodule": 1,
            "ftp_write": 1,
            "multihop": 1,
            "phf": 1,
            "perl": 1, 
            "spy": 1}
kdd_train_y["class_binary"] = kdd_train_y["class"].map(label_map)

sys.version_info(major=3, minor=10, micro=10, releaselevel='final', serial=0)
1.1.1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kdd_train_y["class_binary"] = kdd_train_y["class"].map(label_map)


# MinCovDet 

In [2]:
from sklearn.covariance import MinCovDet
from sklearn.datasets import make_gaussian_quantiles

rng = np.random.RandomState(42)


model = MinCovDet(random_state=42).fit(one_hot_data_maha)




In [3]:
model.covariance_

array([[ 4.35858604e-03,  1.22637357e-02,  1.45957861e+00, ...,
         0.00000000e+00, -1.22207874e-05, -1.36062235e-05],
       [ 1.22637357e-02,  1.71066997e+04,  2.16820742e+05, ...,
         0.00000000e+00, -1.02875368e+00, -1.14538058e+00],
       [ 1.45957861e+00,  2.16820742e+05,  1.62195579e+07, ...,
         0.00000000e+00, -1.40732690e+01, -1.56687157e+01],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.22207874e-05, -1.02875368e+00, -1.40732690e+01, ...,
         0.00000000e+00,  8.68871983e-03, -8.55455119e-05],
       [-1.36062235e-05, -1.14538058e+00, -1.56687157e+01, ...,
         0.00000000e+00, -8.55455119e-05,  9.66403739e-03]])

In [4]:
model.location_

array([1.39418299e-03, 1.17363214e+02, 1.60551948e+03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       4.31885262e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.94295715e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.01791051e+02,
       2.76586031e+01, 3.95592009e-01, 3.95659641e-01, 6.76788336e-02,
       6.76847663e-02, 5.72063984e-01, 2.96228290e-02, 4.66121353e-02,
       2.05530294e+02, 1.41266704e+02, 5.54835739e-01, 3.17415422e-02,
       1.36707058e-02, 8.19319817e-03, 3.95640805e-01, 3.95667206e-01,
       6.78673450e-02, 6.78993815e-02, 0.00000000e+00, 8.95006155e-01,
       1.04993845e-01, 0.00000000e+00, 6.76772021e-02, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.95443691e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 5.36879107e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.23844979e-02, 0.00000000e+00,
      

In [5]:
import scipy as sp
inv_cormat = sp.linalg.pinv(model.covariance_)
mu_vect = model.location_

In [7]:
mahal_dist = []
for i in range(data_array.shape[0]):
    x_minus_mu = data_array[i,:] - mu_vect
    left_term = np.dot(x_minus_mu, inv_cormat)
    mahal = np.dot(left_term, x_minus_mu.T)
    mahal = np.sqrt(mahal)
    mahal_dist.append(mahal)

In [9]:
np.mean(one_hot_data_maha)

from scipy.stats import chi2
one_hot_data_maha['p-value'] = 1 - chi2.cdf(mahal_dist, df = 121) #pdf(x, degreeOfFreedom = Number of Variables)

#Calculate number of p-value less than 0.01
print(np.count_nonzero(one_hot_data_maha['p-value'] < 0.01))

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


duration               287.144650
src_bytes            45566.743000
dst_bytes            19779.114421
land                     0.000198
wrong_fragment           0.022687
                         ...     
service_urp_i            0.004779
service_uucp             0.006192
service_uucp_path        0.005469
service_vmnet            0.004898
service_whois            0.005501
Length: 122, dtype: float64

29525


In [10]:
pvalue = one_hot_data_maha['p-value']

pvalue = np.where(pvalue < 0.01, 1, pvalue)
pvalue = np.where(pvalue >= 0.01, 0, pvalue)

from sklearn.metrics import accuracy_score

train_acc_maha = accuracy_score(kdd_train_y['class_binary'], pvalue)
print("Training accuracy: {:.4f}".format(train_acc_maha))
cm_maha = confusion_matrix (kdd_train_y["class_binary"], pvalue)
print(cm_maha)

Training accuracy: 0.5346
[[67343     0]
 [58630     0]]
