## Testing UMAP Feature Reduction
This will reduce the dataset to 2 dimensions by default

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction import FeatureHasher

# Common imports
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import umap
import umap.plot

# to make this notebook's output stable across runs
np.random.seed(42)

In [2]:
# Splitting the data
# mndata = MNIST('fashion-mnist/data/fashion')
# Get data from csv
DATA_DIR =  "./datasets/unsw/"
training_fname = "UNSW-NB15_1_ColHeaders.csv"

df = pd.read_csv(DATA_DIR + training_fname)
df = df[:-630001]

from sklearn.model_selection import train_test_split
y = df['label'].tolist()
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
label_column = ['label']
categorical_columns = ['proto', 'service', 'state']
drop_columns = ['sttl', 'dttl', 'swin', 'dwin', 'trans_depth', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'is_sm_ips_ports', 'Attack_cat']
# UNSW-NB15 has unlabelled data with different headers
drop_columns_2 = ['srcip', 'sport', 'dstip', 'dsport']
numeric_columns = list(set(df.columns) - set(label_column) - set(categorical_columns) - set(drop_columns) - set(drop_columns_2))

scaler = sklearn.preprocessing.MinMaxScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.fit_transform(X_test[numeric_columns])

from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('hash_proto', 'drop', 'proto'),
                      ('hash_service', 'drop', 'service'),
                      ('hash_state', 'drop', 'state'),
                      ('numeric_cols', 'passthrough', numeric_columns),
                       ('dropped', 'drop', drop_columns),
                       ('label_drop', 'drop', 'label')])

X_train = ct.fit_transform(X_train)
X_test = ct.fit_transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = v

In [4]:
df

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_scv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Attack_cat,label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,59.166.0.8,56701,149.171.126.5,25,tcp,FIN,0.524064,37502,3172,31,...,0,2,1,2,2,1,1,1,,0
69996,59.166.0.9,43384,149.171.126.0,49546,tcp,FIN,0.031841,4238,63618,31,...,0,7,7,3,4,1,1,2,,0
69997,59.166.0.9,44387,149.171.126.7,5190,tcp,FIN,0.006032,1920,4312,31,...,0,7,3,1,4,1,1,1,,0
69998,59.166.0.6,59733,149.171.126.9,21,tcp,FIN,2.400760,2934,3742,31,...,0,1,1,1,2,1,1,1,,0


In [5]:
print("X_train size: " + str(len(X_train)))
print("X_test size: " + str(len(X_test)))
X_test

X_train size: 52500
X_test size: 17500


array([[5.31754213e-01, 5.11409480e-05, 4.45417895e-04, ...,
        5.07757405e-03, 4.43915704e-01, 2.63157895e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 6.05030591e-02, 1.05263158e-01],
       [3.78130143e-01, 1.52342603e-06, 4.71722101e-04, ...,
        2.25669958e-03, 5.43847723e-02, 0.00000000e+00],
       ...,
       [5.34643890e-01, 2.78941390e-03, 4.92765466e-04, ...,
        1.12834979e-03, 4.99660095e-01, 5.26315789e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 6.05030591e-02, 2.63157895e-02],
       [3.58393196e-01, 9.26704227e-05, 4.50678736e-04, ...,
        4.23131171e-03, 4.69068661e-02, 0.00000000e+00]])

In [None]:
# Running the embedding
embedding = umap.UMAP(n_neighbors=5).fit_transform(X_test, y=y_test)

In [None]:
# Make Plot
classes = ['Normal', 'Anomaly']
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*embedding.T, s=0.3, c=y_train, cmap='Spectral', alpha=0.6)
plt.setp(ax, xticks=[], yticks=[])
cbar = plt.colorbar(boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(2))
cbar.set_ticklabels(classes)
plt.title('UMAP UNSW NB15 (HASHTRICK) TEST - No Target')

In [None]:
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(X_train)

In [None]:
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1],
            c=y_train, s=0.1, cmap='Spectral');

In [None]:
import umap.plot
p = umap.plot.interactive(embedding, labels=y_train, hover_data=hover_data, point_size=2)
umap.plot.show(p)