## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
import copy
from sklearn.feature_extraction import FeatureHasher
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'tf'

## File System

In [None]:
# Location of the UNSW-NB15 CSV files. The directory keeps its trailing slash
# because file paths are built from it by plain string concatenation.
DATA_DIR = "/Users/jackgraham/misc/handson-ml2/datasets/unsw/"
training_fname = "UNSW_NB15_training-set.csv"
testing_fname = "UNSW_NB15_testing-set.csv"

## Reading and Feature Labeling

In [None]:
# Load both published CSV files and stack them into a single frame; the
# notebook performs its own train/validation/test split further down.
df1 = pd.read_csv(DATA_DIR + training_fname)
df2 = pd.read_csv(DATA_DIR + testing_fname)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the combined frame unique row labels
# instead of repeating each file's 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)

# Column roles: the target, the string-valued inputs, and the columns that are
# discarded before modelling.
label_feature = ['attack_cat']
categorical_features = ['proto', 'service', 'state']
drop_features = ['id', 'sttl', 'dttl', 'swin', 'dwin', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label']
# Every remaining column is treated as numeric. Walking df.columns (instead of
# list(set(...) - ...)) keeps the feature order deterministic between runs.
non_numeric = set(label_feature) | set(categorical_features) | set(drop_features)
numerical_features = [col for col in df.columns if col not in non_numeric]

In [None]:
# Histogram (20 bins) of the 'rate' column as loaded, before any scaling cell runs.
df.hist(column='rate',bins=20)

### Normalize

In [None]:
# Min-max scale the 'rate' column on its own into a stand-alone frame
# (single unnamed column 0); the main frame `df` is not modified here.
float_array = df[['rate']].to_numpy().astype(float)
min_max_scaler = preprocessing.MinMaxScaler()
scaled_array = min_max_scaler.fit(float_array).transform(float_array)
df_normalized = pd.DataFrame(data=scaled_array)
df_normalized.head()

In [None]:
# Min-max scale every numeric feature column of `df` in place.
# NOTE(review): the scaler is fitted on the full frame *before* the
# train/validation/test split further down, so min/max statistics from the
# held-out rows leak into the training features. Consider fitting on the
# training split only and applying transform() to the other splits.
scaler = preprocessing.MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [None]:
# Preview the first rows of the frame after the numeric columns were scaled.
df.head()

In [None]:
# A notebook cell only echoes its final expression, so the class count used to
# be computed and silently discarded. Print both values explicitly.
print(len(df['attack_cat'].unique()), 'distinct attack_cat values')
print(type(df['attack_cat']))

In [None]:
# Histogram (20 bins) of the min-max scaled 'rate' values; df_normalized was
# built from a bare array, so its only column is labelled 0.
df_normalized.hist(column=0,bins=20)

## Train, Validation, Test Split

In [None]:
# Hold out 20% of the rows for testing, then carve 20% of the remainder off
# as the validation split.
train, test = train_test_split(df, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(f"{len(train)} train examples")
print(f"{len(val)} validation examples")
print(f"{len(test)} test examples")

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32, classes=None):
    """Build a batched ``tf.data.Dataset`` of ``(features, one_hot_label)`` pairs.

    Args:
        dataframe: Frame holding an ``attack_cat`` label column plus the
            feature columns. It is copied up front, so the caller's frame is
            never modified.
        shuffle: When true, shuffle the examples with a buffer as large as
            the frame.
        batch_size: Number of examples per batch.
        classes: Optional iterable with every possible ``attack_cat`` value.
            Passing the same collection for each split guarantees identical
            label indices and one-hot width across splits. When omitted, the
            values present in ``dataframe`` are used.

    Returns:
        A ``tf.data.Dataset`` yielding ``(dict of feature tensors, labels)``,
        where ``labels`` is one-hot with one column per class.

    Raises:
        ValueError: If ``dataframe`` contains a label that is not in
            ``classes``.
    """
    # Copy *before* touching any column; mutating first would rewrite the
    # caller's 'attack_cat' column as a side effect.
    dataframe = dataframe.copy()
    raw_labels = dataframe.pop('attack_cat')

    # Map category names to integer ids (LabelEncoder sorts the classes).
    encoder = preprocessing.LabelEncoder()
    encoder.fit(raw_labels if classes is None else list(classes))
    label_ids = encoder.transform(raw_labels)

    # Keep the one-hot labels as their own 2-D array: an (n, k) matrix cannot
    # be stored back into a single DataFrame column.
    labels = tf.keras.utils.to_categorical(
        label_ids, num_classes=len(encoder.classes_)
    )

    dataframe = dataframe.drop(drop_features, axis=1)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [None]:
# Only the training split is shuffled; validation and test keep their row order.
train_ds = df_to_dataset(train, shuffle=True)
val_ds = df_to_dataset(val, shuffle=False)
test_ds = df_to_dataset(test, shuffle=False)

In [None]:
# Peek at a single training batch: the feature names and the label tensor.
for feature_batch, label_batch in train_ds.take(1):
    print('Every feature:', [*feature_batch.keys()])
    print('A batch of labels:', label_batch)

### Making Feature Columns

In [None]:
def makeIndicatorFeatureColumn(header, categories):
    """Return a one-hot (indicator) feature column for a categorical input.

    Args:
        header: Name of the column.
        categories: All category names occurring in that column; used as the
            vocabulary list.

    Returns:
        The indicator column wrapping the vocabulary-list categorical column.
    """
    return feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(header, categories)
    )

In [None]:
# categorical columns: one indicator column per categorical feature, with the
# vocabulary taken from the values observed in the full frame
one_hot_feature_columns = [
    makeIndicatorFeatureColumn(name, df[name].unique())
    for name in categorical_features
]

In [None]:
# Start from a *copy* of the one-hot columns. A plain assignment would alias
# the list, so appending numeric columns would also grow
# one_hot_feature_columns, and re-running this cell would keep stacking
# duplicate numeric columns onto it.
feature_columns = list(one_hot_feature_columns)

# numeric columns
feature_columns.extend(
    feature_column.numeric_column(header) for header in numerical_features
)


Now that we have defined our feature columns, we will use a DenseFeatures layer to input them to our Keras model.

In [None]:
# Keras layer built from the categorical and numeric feature columns; it is
# used as the first layer of the Sequential model below.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Feed-forward classifier: feature layer -> two 128-unit ReLU layers ->
# softmax output.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  # NOTE(review): 10 is presumably the number of distinct attack_cat values;
  # confirm against the class-count cell above.
  layers.Dense(10, activation='softmax')
])

# The output layer already applies softmax, so the loss must consume
# probabilities (from_logits=False). This is a single-label multi-class
# problem with one-hot targets, which calls for categorical rather than
# binary cross-entropy.
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

history = model.fit(train_ds,
          validation_data=val_ds,
          epochs=5)

## Test

In [None]:
# Score the trained model on the held-out test split; evaluate() yields the
# loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(test_ds)
print(f"Accuracy {accuracy}")

In [None]:
# Show the raw per-epoch values recorded by model.fit().
history.history

In [None]:
# list all data in history
print(history.history.keys())
# One figure per tracked quantity, training curve against validation curve.
# The 'val_*' series come from validation_data passed to fit(), so the second
# curve is labelled 'validation'; the test split is never seen during fit.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()