# Random Forest - Method 4

## (1) Library Imports

In [1]:
# Importing Libraries: Data processing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Importing Libraries: Model
import sklearn
from sklearn import preprocessing 
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# metrics are used to find accuracy or error
from sklearn import metrics

## (2) Pre-Processing

In [None]:
# Loading the training and testing datasets.
train = pd.read_csv("UNSW_NB15_testing-set.csv", sep=',', header=0)
test = pd.read_csv("UNSW_NB15_training-set.csv", sep=',', header=0)

# Combining the testing and training dataset so we can split it more even.
#combined_data = pd.concat([train, test]).drop(["id"], axis = 1)

combined_data = pd.concat([train, test]).drop_duplicates().reset_index(drop=True)
combined_data.duplicated().sum()

0

### Dropping Columns

In [None]:
# Feature Selection based on XGBoost Feature Selection method
drop_XGBoost_cols = ['attack_cat','ct_ftp_cmd', 'swin', 'is_ftp_login', 'dttl', 'state', 'trans_depth', 'ct_flw_http_mthd', 'dpkts', 'spkts', 'dloss', 'ct_state_ttl', 'sloss', 'service']
combined_data.drop(drop_XGBoost_cols, axis=1, inplace=True)

### Splitting the dataset, label-encoding & Normalizing

In [None]:
combined_data.duplicated().sum()

0

In [None]:
combined_data.drop_duplicates(keep='last', inplace=True)
combined_data.duplicated().sum()

In [None]:
# Splitting the datset into X and y
X = combined_data.drop(['label'], axis=1)
y = combined_data.loc[:, ['label']]

X_train Shape:  (175341, 30)
y_train Shape:  (175341, 1)
X_test Shape:  (82332, 30)
y_test Shape:  (82332, 1)


In [None]:
# Selecting all categorical columns (object)
le_cols = X.select_dtypes(include=[object]).columns

In [None]:
# Label Encoding X
le = preprocessing.LabelEncoder()
X[le_cols] = X[le_cols].apply(le.fit_transform)

#### Finding and dropping duplicated rows in X

In [None]:
print('X Duplicated: ', X.duplicated().sum())
print('y Duplicated: ', y.duplicated().sum())

In [None]:
# Use keep='last' to keep the last occurrence 
# y.drop_duplicates(keep='last', inplace=True)
# y.duplicated().sum()

In [None]:
# Use keep='last' to keep the last occurrence 
# X.drop_duplicates(keep='last', inplace=True)
# X.duplicated().sum()

In [None]:
print('X shape: ', X.shape)
print('y shape: ', y.shape)

In [None]:
# Applying StandardScaler on X
ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X),columns = X.columns)

In [None]:
# Splitting the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

## (3) Random Forest Classifier

In [None]:
# random forest model creation
clf = RandomForestClassifier(n_estimators = 300, verbose = 1, n_jobs=-1)

In [None]:
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train.values.ravel())

In [None]:
# performing predictions on the test dataset
y_pred = clf.predict(X_test)

In [None]:
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import r2_score
sklearn.metrics.r2_score(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred)

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
model = LogisticRegression(solver='liblinear')
scoring = 'neg_log_loss'
results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)

In [None]:
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))

## Evaluation of the model

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [None]:
predicted = model.predict(X_test)
matrix = confusion_matrix(y_test, predicted)

In [None]:
print(matrix)