# Random Forest Classifier (Label Encode Object Columns)

https://www.geeksforgeeks.org/random-forest-classifier-using-scikit-learn/

In [2]:
# Importing Libraries: Data processing
import pandas as pd
import numpy as np

# Importing Libraries: Model
import sklearn
from sklearn import preprocessing 
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# metrics are used to find accuracy or error
from sklearn import metrics  

In [3]:
# Loading the training and testing datasets.
test = pd.read_csv("UNSW_NB15_testing-set.csv", sep=',', header=0)
train = pd.read_csv("UNSW_NB15_training-set.csv", sep=',', header=0)

# Printing the shape of the datasets
print('Training Set Shape:', '\t\t', train.shape)
print('Testing Set Shape:', '\t\t', test.shape)

Training Set Shape: 		 (82332, 45)
Testing Set Shape: 		 (175341, 45)


In [5]:
test

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.087490,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,175337,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,13,24,0,0,0,24,24,0,Generic,1
175337,175338,0.505762,tcp,-,FIN,10,8,620,354,33.612649,...,1,2,0,0,0,1,1,0,Shellcode,1
175338,175339,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,3,13,0,0,0,3,12,0,Generic,1
175339,175340,0.000009,udp,dns,INT,2,0,114,0,111111.107200,...,14,30,0,0,0,30,30,0,Generic,1


## Label Encoder

We can se from test.info() and train.info() that proto, service, state and attack_cat columns are of the type 'object'. 
Label encode these.

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   proto              175341 non-null  object 
 3   service            175341 non-null  object 
 4   state              175341 non-null  object 
 5   spkts              175341 non-null  int64  
 6   dpkts              175341 non-null  int64  
 7   sbytes             175341 non-null  int64  
 8   dbytes             175341 non-null  int64  
 9   rate               175341 non-null  float64
 10  sttl               175341 non-null  int64  
 11  dttl               175341 non-null  int64  
 12  sload              175341 non-null  float64
 13  dload              175341 non-null  float64
 14  sloss              175341 non-null  int64  
 15  dloss              175341 non-null  int64  
 16  si

In [8]:
# Defining the columns that need to be label encoded.
cols = ['proto', 'service', 'state', 'attack_cat']
le = preprocessing.LabelEncoder()

In [9]:
# Label encoding the columns for the test and training set
test[cols] = test[cols].apply(le.fit_transform)
train[cols] = train[cols].apply(le.fit_transform)

In [10]:
test

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,113,0,2,6,4,258,172,74.087490,...,1,1,0,0,0,1,1,0,6,0
1,2,0.649902,113,0,2,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,6,0
2,3,1.623129,113,0,2,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,6,0
3,4,1.681642,113,3,2,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,6,0
4,5,0.449454,113,0,2,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,175337,0.000009,119,2,3,2,0,114,0,111111.107200,...,13,24,0,0,0,24,24,0,5,1
175337,175338,0.505762,113,0,2,10,8,620,354,33.612649,...,1,2,0,0,0,1,1,0,8,1
175338,175339,0.000009,119,2,3,2,0,114,0,111111.107200,...,3,13,0,0,0,3,12,0,5,1
175339,175340,0.000009,119,2,3,2,0,114,0,111111.107200,...,14,30,0,0,0,30,30,0,5,1


In [12]:
# Splitting the dataset into inputs and outputs
X_train = train.iloc[:, 0:44]
X_test = test.iloc[:, 0:44]
y_train = train.iloc[:, 44]
y_test = test.iloc[:, 44]

In [None]:
y_train

## Creating the Classifier

In [6]:
# random forest model creation
clf = RandomForestClassifier(n_estimators = 1000, verbose = 1)

In [7]:
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  1.1min finished


RandomForestClassifier(n_estimators=1000, verbose=1)

In [8]:
# performing predictions on the test dataset
y_pred = clf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:   11.9s finished


In [9]:
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.9476220621531758


In [10]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92     56000
           1       1.00      0.92      0.96    119341

    accuracy                           0.95    175341
   macro avg       0.93      0.96      0.94    175341
weighted avg       0.95      0.95      0.95    175341

