# UNSW-NB15 Random Forest

In [25]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction import FeatureHasher

# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## File System

In [26]:
DATA_DIR =  "/Users/jackgraham/misc/handson-ml2/datasets/unsw/"
testing_fname = "UNSW_NB15_testing-set.csv"
training_fname = "UNSW_NB15_training-set.csv"

## Reading Files and Column Labeling

In [27]:
df1 = pd.read_csv(DATA_DIR + training_fname)
df2 =  pd.read_csv(DATA_DIR  + testing_fname)
df = df1.append(df2)


label_column = ['label']
categorical_columns = ['proto', 'service', 'state']
drop_columns = ['id', 'sttl', 'dttl', 'swin', 'dwin', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat']
numeric_columns = list(set(df.columns) - set(label_column) - set(categorical_columns) - set(drop_columns))

## Preprocessing

In [28]:
print(type(df['proto']))
len(df['proto'].unique())

<class 'pandas.core.series.Series'>


133

### Categorical Columns Hash Trick

In [48]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html
def column_hash_trick(categorical_column, n_hash_features):
    """Turns categorical columns into hashed with hash trick.
    
    Args:
        categorical_column: Pandas Series.
        n_hash_features: Integer for number of hash features desired.
        
    Returns:
        scipy.sparse.csr.csr_matrix of hash features.
    
    """
    categories = categorical_column.unique()
    h = FeatureHasher(n_features=n_hash_features, input_type='string')
    return h.transform(categories)
    

In [50]:
f = column_hash_trick(df['proto'])
print(f.toarray())
np.array(f.toarray()).shape
type(f)

[[ 0. -1. -1.  0. -1.]
 [ 0. -1.  1.  0.  1.]
 [ 2. -1.  0.  0.  0.]
 [ 0. -1.  0.  2. -1.]
 [ 0. -1. -1.  0.  0.]
 [ 0. -1. -1.  0.  0.]
 [ 1. -1.  0.  0. -1.]
 [ 0. -1.  0.  0.  2.]
 [ 3. -2. -1.  0. -1.]
 [ 0. -1. -1.  1. -1.]
 [ 0. -1.  0.  0. -2.]
 [ 0. -2.  0. -1.  0.]
 [ 1. -4.  0.  1.  0.]
 [ 0.  0.  0.  0. -1.]
 [ 1. -2. -1.  0. -1.]
 [ 1.  1.  0.  1. -4.]
 [-1. -2. -1.  0.  0.]
 [ 0.  0.  1.  0. -3.]
 [ 0. -3.  0.  1. -2.]
 [ 0. -1.  0.  1. -1.]
 [ 0. -1.  1.  0.  0.]
 [ 1.  0.  1.  0. -2.]
 [ 0.  0.  0.  1. -2.]
 [ 1. -1. -1.  0.  0.]
 [ 2. -2.  1.  0. -1.]
 [ 1. -2.  0.  0.  1.]
 [ 1. -1.  0.  0. -1.]
 [ 0. -3.  0.  0.  0.]
 [ 2. -2.  1.  1. -1.]
 [ 0. -1. -1.  1. -1.]
 [ 1. -1.  0. -1. -1.]
 [ 0. -1.  0.  1. -2.]
 [ 0.  0.  0. -1. -1.]
 [ 0. -2.  0.  0.  0.]
 [ 0. -1. -1.  0.  1.]
 [ 1. -2. -1.  0.  1.]
 [ 2.  1.  0.  0.  0.]
 [ 0.  0. -1.  0. -1.]
 [ 0. -2.  0.  0. -2.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 1. -2. -1.  0. -1.]
 [ 0.  0.  0.  1.  1.]
 [ 1.  0.  

scipy.sparse.csr.csr_matrix

In [None]:
# Add to dataframe
#pd.concat([df[['Genre', 'Publisher']], pd.DataFrame(hashed_features1),pd.DataFrame(hashed_features2)],
#axis=1)
ct = ColumnTransformer(['hash_proto', FeatureHasher(5, input_type='string'), 'proto'])

