In [1]:
# To show the python version
import sys
assert sys.version_info >= (3,8)
print(sys.version_info)

# to show all the output from a code chunk
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# matplotlib for plotting
from matplotlib import pyplot as plt
%matplotlib inline

# Other external libraries like pandas and numpy
import pandas as pd
import numpy as np
np.random.seed(999)

# machine learning models
import sklearn
print(sklearn.__version__)

#import others
from sklearn.metrics import confusion_matrix
from sklearn import model_selection, datasets
from sklearn.tree import DecisionTreeClassifier
import joblib
import pickle
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


from scipy.io import arff
# Load the arff file
header_info = arff.loadarff("Dataset/KDDTrain+.arff")
df = pd.DataFrame(header_info[0])
header_names = df.columns.values.tolist()
header_names.append('Difficulty')
kdd_train = pd.read_csv('Dataset/KDDTrain+.txt', delimiter = ',', header = None)

#Rename the headers
kdd_train.columns = header_names
#Drop the target variables
kdd_train_x = kdd_train.drop(['class', 'Difficulty'], axis = 1)
y = ['class']
kdd_train_y = kdd_train[y]

#Identify categorical features
num_cols = kdd_train_x._get_numeric_data().columns
cate_cols = list(set(kdd_train_x.columns) - set(num_cols))

#One-Hot Encoding the categorical features
cate_features = pd.get_dummies(kdd_train_x[cate_cols], prefix = cate_cols)
one_hot_data = kdd_train_x.drop(cate_cols, axis = 1)
one_hot_data_maha = pd.concat([one_hot_data, cate_features], axis = 1)
one_hot_data_if = one_hot_data_maha
one_hot_data_if.head()
one_hot_data_maha.head()
data_array = np.array(one_hot_data_maha)
data_array.shape    # Make sure that the Numpy array dimension is consistent with the Pandas Dataframe
# function to calculate mahalanobis distance
label_map = {"normal": 0,
            "neptune": 1,
            "satan": 1,
            "ipsweep": 1,
            "portsweep": 1,
            "smurf": 1,
            "nmap": 1,
            "back": 1,
            "teardrop": 1,
            "warezclient": 1,
            "pod": 1,
            "guess_passwd": 1,
            "buffer_overflow": 1,
            "warezmaster": 1,
            "land": 1,
            "imap": 1,
            "rootkit": 1,
            "loadmodule": 1,
            "ftp_write": 1,
            "multihop": 1,
            "phf": 1,
            "perl": 1, 
            "spy": 1}
kdd_train_y["class_binary"] = kdd_train_y["class"].map(label_map)

sys.version_info(major=3, minor=10, micro=10, releaselevel='final', serial=0)
1.1.1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kdd_train_y["class_binary"] = kdd_train_y["class"].map(label_map)


# MinCovDet 

In [2]:
from sklearn.covariance import MinCovDet
from sklearn.datasets import make_gaussian_quantiles

rng = np.random.RandomState(42)


model = MinCovDet(random_state=42).fit(one_hot_data_maha)




In [None]:
model.covariance_

In [None]:
model.location_

In [23]:
import scipy as sp
inv_cormat = sp.linalg.pinv(model.covariance_)
mu_vect = model.location_

In [24]:
one_hot_data_maha.shape

(125973, 123)

In [25]:
mahal_dist = []
for i in range(data_array.shape[0]):
    x_minus_mu = data_array[i,:] - mu_vect
    left_term = np.dot(x_minus_mu, inv_cormat)
    mahal = np.dot(left_term, x_minus_mu.T)
    mahal = np.sqrt(mahal)
    mahal_dist.append(mahal)

In [26]:
one_hot_data_maha['p-value']

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
125968    0.0
125969    0.0
125970    0.0
125971    0.0
125972    0.0
Name: p-value, Length: 125973, dtype: float64

In [27]:
np.mean(one_hot_data_maha)

from scipy.stats import chi2
one_hot_data_maha['p-value'] = 1 - chi2.cdf(mahal_dist, df = 121) #pdf(x, degreeOfFreedom = Number of Variables)

#Calculate number of p-value less than 0.01
print(np.count_nonzero(one_hot_data_maha['p-value'] < 0.01))

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


duration             2.871447e+02
src_bytes            4.556674e+04
dst_bytes            1.977911e+04
land                 1.984552e-04
wrong_fragment       2.268740e-02
                         ...     
service_uucp         6.191803e-03
service_uucp_path    5.469426e-03
service_vmnet        4.897875e-03
service_whois        5.501179e-03
p-value              3.945954e-09
Length: 123, dtype: float64

125973


In [30]:
pvalue = one_hot_data_maha['p-value']

pvalue = np.where(pvalue < 0.01, 1, pvalue)
pvalue = np.where(pvalue >= 0.01, 0, pvalue)

from sklearn.metrics import accuracy_score

train_acc_maha = accuracy_score(kdd_train_y['class_binary'], pvalue)
print("Training accuracy: {:.4f}".format(train_acc_maha))
cm_maha = confusion_matrix (kdd_train_y["class_binary"], pvalue)
print(cm_maha)

Training accuracy: 0.5346
[[67343     0]
 [58630     0]]


In [10]:
from pyod.models.mcd import MCD

ModuleNotFoundError: No module named 'pyod'

In [13]:
model.mahalanobis(one_hot_data)

array([4.00507835e+03, 5.20527854e+03, 9.42144079e+00, ...,
       1.21507598e+05, 8.75733059e+00, 4.97681382e+02])

In [16]:
from scipy.stats import chi2
one_hot_data['p-value'] = 1 - chi2.cdf(model.dist_,df = 121)

In [17]:
pvalue = one_hot_data['p-value']

pvalue = np.where(pvalue < 0.01, 1, pvalue)
pvalue = np.where(pvalue >= 0.01, 0, pvalue)

from sklearn.metrics import accuracy_score
train_acc_mcd = accuracy_score(kdd_train_y['class_binary'], pvalue)
print("Training accuracy: {:.4f}".format(train_acc_mcd))
cm_mcd = confusion_matrix (kdd_train_y["class_binary"], pvalue)
print(cm_mcd)

Training accuracy: 0.5346
[[67343     0]
 [58630     0]]
