<a href="https://colab.research.google.com/github/jvitorc/TCC/blob/main/ExplorandoTecnicasIA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### *João Vitor Cardoso <2021>*

# **Explorando Técnicas de IA para Detecção de Intrusão**

  Usando a base [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) para detecção de intrusão com redes neurais e outras técnicas de IA.

In [None]:
!lscpu

In [None]:
!free

In [None]:
!lsb_release -a

In [None]:
!python --version

Python 3.7.10


## Baixando a Base de Dados

#### Baixando aws-cli

In [None]:
# Download and install AWS CLI v2.
# `unzip -o -q` overwrites a previous extraction without prompting and
# `--update` lets the installer run over an existing installation, so the
# cell can be re-executed in the same runtime without hanging or failing.
!curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
!unzip -o -q awscliv2.zip
!sudo ./aws/install --update

#### Baixando os CSVs processados (todos os dias de captura)

In [None]:
!aws s3 sync --no-sign-request --region sa-east-1 "s3://cse-cic-ids2018/Processed Traffic Data for ML Algorithms" "./CSE-CIC-IDS2018"

In [None]:
!ls -l --block-size=M "CSE-CIC-IDS2018"

## Conectar com o drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Importando Bibliotecas

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import  keras
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier

# Reduzir a base para uma amostra de 10%

### Carregando e salvando dados

In [None]:
FILEPATH = 'CSE-CIC-IDS2018/'

In [None]:
def carregar_arquivos(filename):
  """Read one CSV from FILEPATH and return its numeric features plus 'Label'.

  Rows whose 'Protocol' cell is the literal string 'Protocol' are dropped
  (presumably header lines repeated inside the file -- TODO confirm). The
  'Timestamp' column is discarded, every remaining feature column goes through
  pd.to_numeric, and 'Label' is re-attached as the last column.
  """
  bruto = pd.read_csv(FILEPATH + filename)
  linhas_validas = bruto[bruto['Protocol'] != 'Protocol']
  rotulos = linhas_validas['Label']
  atributos = linhas_validas.drop(columns=['Label', 'Timestamp']).apply(pd.to_numeric)
  atributos['Label'] = rotulos
  return atributos


In [None]:
from sklearn.model_selection import train_test_split

CONJUNTO_10 = '/content/drive/MyDrive/UFSC/TCC/Arquivos/ids2018/conjunto10/'
def separar_salvar(dataset, name):
  """Write `dataset` to CONJUNTO_10 + name + '.csv' as UTF-8, without the index column."""
  destino = CONJUNTO_10 + name + '.csv'
  dataset.to_csv(destino, index=False, encoding='utf-8')

## Wednesday 14/02/2018 - Brute Force (FTP-BruteForce, SSH-BruteForce)

In [None]:
# Load the 14/02 capture and merge both brute-force variants into a single class.
dataset = carregar_arquivos('Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv')
dataset['Label'] = dataset['Label'].replace({
    'SSH-Bruteforce': 'Brute Force',
    'FTP-BruteForce': 'Brute Force',
})

In [None]:
dataset['Label'].value_counts()

Benign         667626
Brute Force    380949
Name: Label, dtype: int64

In [None]:
target = dataset.pop('Label')

X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=0.10, random_state=42, stratify=target) 

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((943717, 78), (943717,), (104858, 78), (104858,))

In [None]:
# Re-attach the labels to the 10% stratified sample and persist it.
# .assign returns a new frame instead of writing into the object returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'brute_force')

## Friday 02/03/2018 - Bot

In [None]:
dataset = carregar_arquivos('Friday-02-03-2018_TrafficForML_CICFlowMeter.csv')

In [None]:
dataset['Label'].value_counts()

Benign    762384
Bot       286191
Name: Label, dtype: int64

In [None]:
target = dataset.pop('Label')

X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=0.10, random_state=42, stratify=target) 

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((943717, 78), (943717,), (104858, 78), (104858,))

In [None]:
# Re-attach the labels to the 10% stratified sample and persist it.
# .assign returns a new frame instead of writing into the object returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'bot')

## Wed 28/02/2018 and Thursday 01/03/2018- Infiltration

In [None]:
# Join the two capture days listed in this section's heading.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; row indices are kept as they were with ignore_index=False.
dataset = pd.concat(
    [
        carregar_arquivos('Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv'),
        carregar_arquivos('Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv'),
    ],
    ignore_index=False,
)

In [None]:
dataset['Label'].value_counts()

Benign           782237
Infilteration    161934
Name: Label, dtype: int64

In [None]:
target = dataset.pop('Label')

X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=0.10, random_state=42, stratify=target) 

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((849753, 78), (849753,), (94418, 78), (94418,))

In [None]:
# Re-attach the labels to the 10% stratified sample and persist it.
# .assign returns a new frame instead of writing into the object returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'infiltration')

## Fri 23/02/2018 and 	Thurs 22/02/2018 - Web Attack (Brute Force -Web, Brute Force -XSS, SQL Injection)

In [None]:
# Join the two capture days listed in this section's heading.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; row indices are kept as they were with ignore_index=False.
dataset = pd.concat(
    [
        carregar_arquivos('Friday-23-02-2018_TrafficForML_CICFlowMeter.csv'),
        carregar_arquivos('Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv'),
    ],
    ignore_index=False,
)

In [None]:
dataset['Label'].value_counts()

Benign              2096222
Brute Force -Web        611
Brute Force -XSS        230
SQL Injection            87
Name: Label, dtype: int64

In [None]:
# Collapse the three web-attack variants into a single 'Web Attack' class.
dataset['Label'] = dataset['Label'].replace(
    ['Brute Force -Web', 'Brute Force -XSS', 'SQL Injection'],
    'Web Attack',
)

In [None]:
dataset['Label'].value_counts()

Benign        2096222
Web Attack        928
Name: Label, dtype: int64

In [None]:
target = dataset.pop('Label')

X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=0.10, random_state=42, stratify=target) 

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((1887435, 78), (1887435,), (209715, 78), (209715,))

In [None]:
# Re-attach the labels to the 10% stratified sample and persist it.
# .assign returns a new frame instead of writing into the object returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'web')

## Wed 21/02/2018 and 	Tues 20/02/2018 - DDOS (LOIC-HTTP, LOIC-UDP, HOIC)

In [None]:
# Count the physical lines of the 20/02 capture (header line included).
# NOTE(review): 'Thuesday' is spelled this way everywhere in the notebook;
# presumably it matches the real file name -- confirm before "fixing" it.
filename = 'CSE-CIC-IDS2018/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'

quantidade_linhas = 0
with open(filename) as f:
    for _ in f:
        quantidade_linhas += 1

print(quantidade_linhas)

7948749


### Verificar diferença de colunas

In [None]:
# Read only the first rows to inspect the column layout of the file.
tamanho_conjunto = 300
data = pd.read_csv(filename, nrows=tamanho_conjunto)

print(data.shape)

# Column names in file order.
header = list(data.columns)

print(header)

(300, 84)
['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/

In [None]:
# Load the 21/02 capture to compare its column layout with the 20/02 file.
data_aux = pd.read_csv('CSE-CIC-IDS2018/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv')
print(data_aux.shape)

# Column names in file order.
header_aux = list(data_aux.columns)

print(header_aux)

É preciso remover as seguintes colunas extras do arquivo Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv:

'Flow ID'

'Src IP'

'Src Port'

'Dst IP'



### Linhas até 3000000

In [None]:
# First slice of the 20/02 file: the header line plus the first
# `tamanho_conjunto` data rows.
tamanho_conjunto = 3000000
filename = 'CSE-CIC-IDS2018/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'
data = pd.read_csv(filename, nrows=tamanho_conjunto)

# Column names, reused by the cells that read the rest of the file with header=None.
header = list(data.columns)

print(data.shape)

data = data[data['Protocol'] != 'Protocol']
target = data.pop('Label')
# Discard the timestamp and the flow-identifier columns in one step instead of
# popping each of them into an unused variable.
data = data.drop(columns=['Timestamp', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP'])
data = data.apply(pd.to_numeric)
data['Label'] = target
print(data.shape)


(3000000, 84)
(3000000, 79)


In [None]:
data['Label'].value_counts()

Benign                    2423809
DDoS attacks-LOIC-HTTP     576191
Name: Label, dtype: int64

In [None]:
# Keep a stratified 10% sample of this slice and persist it.
target = data.pop('Label')
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.10, random_state=42, stratify=target)

# .assign builds a new frame, avoiding the SettingWithCopyWarning raised when
# the column is written directly into the split result.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'ddos1')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Linhas 3000001 a 6000000

In [None]:
filename = 'CSE-CIC-IDS2018/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'
# The first slice consumed the header line plus `tamanho_conjunto` data rows,
# i.e. physical lines 1 .. tamanho_conjunto + 1. Skipping only
# `tamanho_conjunto` lines re-read the last row of that slice, so start one
# line later. The window still ends at physical line 2 * tamanho_conjunto,
# which is where the "above 6000000" cell starts (skiprows=tamanho_conjunto*2).
data = pd.read_csv(
    filename,
    names=header,
    header=None,
    skiprows=tamanho_conjunto + 1,
    nrows=tamanho_conjunto - 1,
)

print(data.shape)

data = data[data['Protocol'] != 'Protocol']
target = data.pop('Label')
# Discard the timestamp and the flow-identifier columns in one step.
data = data.drop(columns=['Timestamp', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP'])
data = data.apply(pd.to_numeric)
data['Label'] = target
print(data.shape)


(3000000, 84)
(3000000, 79)


In [None]:
print(data['Label'].value_counts())

Benign    3000000
Name: Label, dtype: int64


In [None]:
# Keep a stratified 10% sample of this slice and persist it.
target = data.pop('Label')
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.10, random_state=42, stratify=target)

# .assign builds a new frame, avoiding the SettingWithCopyWarning raised when
# the column is written directly into the split result.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'ddos2')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Linhas acima de 6000000

In [None]:
filename = 'CSE-CIC-IDS2018/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'
# Remainder of the file: everything after the first 2 * tamanho_conjunto
# physical lines, read without a header row and named with `header`.
data = pd.read_csv(filename, names=header, header=None, skiprows= (tamanho_conjunto*2))
print(data.shape)

data = data[data['Protocol'] != 'Protocol']
target = data.pop('Label')
# Discard the timestamp and the flow-identifier columns in one step instead of
# popping each of them into an unused variable.
data = data.drop(columns=['Timestamp', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP'])
data = data.apply(pd.to_numeric)
data['Label'] = target
print(data.shape)

(1948749, 84)
(1948749, 79)


In [None]:
print(data['Label'].value_counts())

Benign    1948749
Name: Label, dtype: int64


In [None]:
# Keep a stratified 10% sample of this slice and persist it.
target = data.pop('Label')
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.10, random_state=42, stratify=target)

# .assign builds a new frame, avoiding the SettingWithCopyWarning raised when
# the column is written directly into the split result.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'ddos3')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Info arquivos

In [None]:
dos_path ='/content/drive/MyDrive/UFSC/TCC/Arquivos/ids2018/conjunto10/'

dataset = pd.read_csv(dos_path + 'ddos1.csv')

dataset.shape

(300000, 79)

In [None]:
dataset = pd.read_csv(dos_path + 'ddos2.csv')

dataset.shape

(300000, 79)

In [None]:
dataset = pd.read_csv(dos_path + 'ddos3.csv')

dataset.shape

(194875, 79)

### Juntar arquivos

Carrega e junta arquivos de 10% do Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv

In [None]:
dos_path ='/content/drive/MyDrive/UFSC/TCC/Arquivos/ids2018/conjunto10/'

# Stack the three saved 10% samples with a single pd.concat; DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0. Row indices are kept, as
# append did by default.
dataset = pd.concat(
    [pd.read_csv(dos_path + nome) for nome in ('ddos1.csv', 'ddos2.csv', 'ddos3.csv')]
)

In [None]:
dataset.shape

(794875, 79)

Carrega Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv

In [None]:
data = carregar_arquivos('Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv')
# Shape of the frame built in the previous step (the joined ddos1..3 samples).
print(dataset.shape)

# Keep a stratified 10% sample of the 21/02 capture.
target = data.pop('Label')
X_train, data_wednesday, y_train, y_test = train_test_split(data, target, test_size=0.10, random_state=42, stratify=target)

# .assign builds a new frame, avoiding the SettingWithCopyWarning raised when
# the column is written directly into the split result.
data_wednesday = data_wednesday.assign(Label=y_test.values)
print(data_wednesday.shape)


Junta os dois arquivos

In [None]:
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
# NOTE: re-running this cell stacks data_wednesday again on top of dataset.
dataset = pd.concat([dataset, data_wednesday])

In [None]:
dataset['Label'].value_counts()

Benign                    773339
DDOS attack-HOIC           68602
DDoS attacks-LOIC-HTTP     57619
DDOS attack-LOIC-UDP         173
Name: Label, dtype: int64

In [None]:
# Collapse the three DDoS variants into a single 'DDOS' class.
dataset['Label'] = dataset['Label'].replace(
    ['DDOS attack-HOIC', 'DDOS attack-LOIC-UDP', 'DDoS attacks-LOIC-HTTP'],
    'DDOS',
)

In [None]:
dataset['Label'].value_counts()

Benign    773339
DDOS      126394
Name: Label, dtype: int64

In [None]:
separar_salvar(dataset, 'ddos')

## Thurs 15/02/2018 and 	Fri 16/02/2018 - DOS (GoldenEye, Slowloris, SlowHTTPTest, Hulk)

In [None]:
# Join the two capture days listed in this section's heading.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; row indices are kept as they were with ignore_index=False.
dataset = pd.concat(
    [
        carregar_arquivos('Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv'),
        carregar_arquivos('Friday-16-02-2018_TrafficForML_CICFlowMeter.csv'),
    ],
    ignore_index=False,
)

In [None]:
dataset['Label'].value_counts()

Benign                      1442849
DoS attacks-Hulk             461912
DoS attacks-SlowHTTPTest     139890
DoS attacks-GoldenEye         41508
DoS attacks-Slowloris         10990
Name: Label, dtype: int64

In [None]:
# Collapse the four DoS variants into a single 'DoS' class.
dataset['Label'] = dataset['Label'].replace(
    [
        'DoS attacks-GoldenEye',
        'DoS attacks-Slowloris',
        'DoS attacks-Hulk',
        'DoS attacks-SlowHTTPTest',
    ],
    'DoS',
)

In [None]:
dataset['Label'].value_counts()

Benign    1442849
DoS        654300
Name: Label, dtype: int64

In [None]:
target = dataset.pop('Label')

X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=0.10, random_state=42, stratify=target) 

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((1887434, 78), (1887434,), (209715, 78), (209715,))

In [None]:
# Re-attach the labels to the 10% stratified sample and persist it.
# .assign returns a new frame instead of writing into the object returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
X_test = X_test.assign(Label=y_test.values)

separar_salvar(X_test, 'dos')

# **Execução Experimento**

In [None]:
def carregar_arquivo(name, path):
  """Read the CSV at path + name + '.csv' into a DataFrame (path is a plain string prefix)."""
  caminho_csv = path + name + '.csv'
  return pd.read_csv(caminho_csv)

In [None]:
def carregar_arquivo_conjunto10(name):
  """Load '<name>.csv' from the CONJUNTO_10 directory through carregar_arquivo."""
  return carregar_arquivo(name=name, path=CONJUNTO_10)

### Juntar 

In [None]:
CONJUNTO_10 = '/content/drive/MyDrive/UFSC/TCC/Arquivos/ids2018/conjunto10/'


In [None]:
# Stack the six saved 10% samples in one pd.concat call. The chained
# DataFrame.append calls copied the growing frame at every step, and append
# was removed in pandas 2.0. Order and row indices match the original.
train = pd.concat(
    [
        carregar_arquivo_conjunto10(nome)
        for nome in ('bot', 'brute_force', 'ddos', 'dos', 'infiltration', 'web')
    ],
    ignore_index=False,
)

In [None]:
train['Label'].value_counts()

Benign           1348472
DDOS              126394
DoS                65430
Brute Force        38095
Bot                28619
Infilteration      16194
Web Attack            93
Name: Label, dtype: int64

In [None]:
# Treat +/-inf as missing and drop every row that has any missing value.
# The 'mode.use_inf_as_na' option used before is deprecated since pandas 2.1;
# converting inf to NaN explicitly gives the same rows and also yields a fresh
# frame rather than a view of `train`.
dataset = train.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
CODIGOS_LABEL = {'Benign': 0, 'Bot': 1, 'Brute Force': 2, 'DDOS': 3, 'DoS': 4,'Infilteration': 5, 'Web Attack': 6 }

In [None]:
# Encode the class names as the integer codes defined in CODIGOS_LABEL.
dataset['Label'] = dataset['Label'].replace(CODIGOS_LABEL)

In [None]:
train['Label'].value_counts()

Benign           1348472
DDOS              126394
DoS                65430
Brute Force        38095
Bot                28619
Infilteration      16194
Web Attack            93
Name: Label, dtype: int64

In [None]:
dataset.shape

(1613742, 79)

In [None]:
target = pd.Categorical(dataset.pop('Label'))
target.shape

(1613742,)

In [None]:
target.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1339059,0.829785
1,28619,0.017735
2,38095,0.023607
3,126394,0.078324
4,65430,0.040546
5,16052,0.009947
6,93,5.8e-05


## Normalizar 

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
transformer = StandardScaler()

In [None]:
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split done later in the notebook, so test rows contribute to the
# mean/std used for scaling (information leak into evaluation). Consider
# splitting first and fitting the scaler on the training part only.
transformer = transformer.fit(dataset.values)

In [None]:
normalized_dataset = transformer.transform(dataset.values)

In [None]:
normalized_dataset  = pd.DataFrame(normalized_dataset)
normalized_dataset['Label'] = target

In [None]:
normalized_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,Label
0,-0.044693,-0.562277,-0.395816,-0.013588,-0.015447,-0.013042,-0.021063,0.408003,-0.457578,0.949763,1.002518,-0.483553,-0.521281,-0.497294,-0.384792,-0.056255,-0.194874,-0.24826,-0.275738,-0.358994,-0.217159,-0.38769,-0.268322,-0.269351,-0.349929,-0.220923,-0.294366,-0.189651,-0.251591,-0.254785,-0.075835,-0.209331,0.0,-0.012694,0.0,-0.015221,-0.013781,-0.171094,-0.161612,-0.485597,-0.12825,-0.204519,-0.042065,-0.310398,-0.065587,-0.209331,2.075457,1.23897,-0.698645,-0.208865,-0.012694,2.075445,0.447882,-0.231446,0.949763,-0.497294,0.0,0.0,0.0,0.0,0.0,0.0,-0.013588,-0.013042,-0.015447,-0.021064,-0.039085,-0.412347,-0.012548,0.261957,-0.069261,-0.057579,-0.079411,-0.05475,-0.30279,-0.090838,-0.306095,-0.296124,1
1,-0.473974,1.668682,-0.395563,-0.014889,-0.035327,-0.018963,-0.021366,-0.562409,0.814182,-0.320954,-0.610446,-0.582066,0.710776,-0.30997,-0.64774,-0.067106,-0.197114,-0.247099,-0.27676,-0.358494,-0.215877,-0.387706,-0.26834,-0.269415,-0.349954,-0.220927,-0.294732,-0.19038,-0.25312,-0.255675,-0.075839,-0.209331,0.0,-0.012694,0.0,-0.020403,-0.041606,-0.172247,-0.165334,0.880848,-0.639863,-0.35086,-0.638174,-0.453919,-0.065587,-0.209331,-0.481821,-0.807122,-0.698645,-0.208865,-0.012694,-0.481824,0.447882,-0.254498,-0.320954,-0.30997,0.0,0.0,0.0,0.0,0.0,0.0,-0.014889,-0.018963,-0.035327,-0.021366,-0.542893,-0.42299,-0.013198,-1.293584,-0.069261,-0.057579,-0.079411,-0.05475,-0.30279,-0.090838,-0.306095,-0.296124,0
2,2.546605,-0.562277,-0.396135,-0.014239,-0.041953,-0.019585,-0.021655,-0.664384,-0.457578,-0.828149,-0.610446,-0.708725,-0.521281,-0.693757,-0.64774,-0.068522,0.024456,-0.248377,-0.27676,-0.35952,-0.217157,-0.387704,-0.268337,-0.269415,-0.349952,-0.220925,-0.294732,-0.19038,-0.25312,-0.255675,-0.075839,-0.209331,0.0,-0.012694,0.0,-0.017812,-0.044256,0.10276,-0.165938,-0.485597,-0.762416,-0.751793,-0.751793,-0.457694,-0.065587,-0.209331,-0.481821,-0.807122,1.431343,-0.208865,-0.012694,-0.481824,-0.448223,-0.830784,-0.828149,-0.693757,0.0,0.0,0.0,0.0,0.0,0.0,-0.014239,-0.019585,-0.041953,-0.021655,-0.52709,-0.42299,-0.013198,0.261957,-0.069261,-0.057579,-0.079411,-0.05475,-0.30279,-0.090838,-0.306095,-0.296124,0
3,-0.044693,-0.562277,-0.395795,-0.013588,-0.015447,-0.013042,-0.021063,0.408003,-0.457578,0.949763,1.002518,-0.483553,-0.521281,-0.497294,-0.384792,-0.057019,-0.19504,-0.248252,-0.275669,-0.358958,-0.21716,-0.387687,-0.26832,-0.269337,-0.349924,-0.220923,-0.294344,-0.189606,-0.251485,-0.254726,-0.075839,-0.209331,0.0,-0.012694,0.0,-0.015221,-0.013781,-0.171182,-0.161882,-0.485597,-0.12825,-0.204519,-0.042065,-0.310398,-0.065587,-0.209331,2.075457,1.23897,-0.698645,-0.208865,-0.012694,2.075445,0.447882,-0.231446,0.949763,-0.497294,0.0,0.0,0.0,0.0,0.0,0.0,-0.013588,-0.013042,-0.015447,-0.021064,-0.039085,-0.412347,-0.012548,0.261957,-0.069261,-0.057579,-0.079411,-0.05475,-0.30279,-0.090838,-0.306095,-0.296124,1
4,-0.295566,-0.562277,-0.304923,-0.010337,0.011059,0.003052,-0.014234,1.509997,-0.457578,1.478773,1.297378,1.649547,-0.521281,0.53832,1.32681,-0.068262,-0.197518,-0.234785,-0.206297,-0.304296,-0.21716,-0.29625,-0.239596,-0.131563,-0.251712,-0.220927,-0.19471,-0.104961,-0.173246,-0.162722,-0.020355,-0.209331,0.0,-0.012694,0.0,-0.007124,0.012718,-0.172497,-0.165908,-0.485597,1.519416,0.802508,1.15041,0.600392,-0.065587,-0.209331,2.075457,1.23897,-0.698645,-0.208865,-0.012694,2.075445,0.447882,0.751698,1.478773,0.53832,0.0,0.0,0.0,0.0,0.0,0.0,-0.010337,0.003052,0.011059,-0.014234,-0.039085,2.618475,-0.009945,0.261957,-0.069261,-0.057579,-0.079411,-0.05475,-0.30279,-0.090838,-0.306095,-0.296124,0


## Treinamento

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

In [None]:
target = normalized_dataset.pop('Label')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(normalized_dataset.values, target, test_size=0.3, random_state=42, stratify=target) 

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((1129619, 78), (1129619,), (484123, 78), (484123,))

In [None]:
pd.Categorical(y_test).describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
0,401718,0.829785
1,8586,0.017735
2,11428,0.023606
3,37918,0.078323
4,19629,0.040545
5,4816,0.009948
6,28,5.8e-05


In [None]:
pd.Categorical(y_train).describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
0,937341,0.829785
1,20033,0.017734
2,26667,0.023607
3,88476,0.078324
4,45801,0.040546
5,11236,0.009947
6,65,5.8e-05


### **Balancear dados de treinamento**

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((1129619, 78), (1129619,), (484123, 78), (484123,))

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.under_sampling import ClusterCentroids 

#0	937341	0.829785
#1	20033	0.017734
#2	26667	0.023607
#3	88476	0.078324
#4	45801	0.040546
#5	11236	0.009947
#6	65	0.000058


# Oversample only the attack classes (codes 1-6) up to 100000 rows each.
# Classes not listed in the dict are left untouched by SMOTE, so the Benign
# class (0) keeps its current size without hard-coding its row count, which
# would raise as soon as the upstream sample changed.
# random_state makes the synthetic samples reproducible between runs.
quantidade_exemplos_por_classe = {classe: 100000 for classe in range(1, 7)}
oversample = SMOTE(sampling_strategy=quantidade_exemplos_por_classe, random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

#sme = SMOTEENN(random_state=42)
#X_train, y_train = sme.fit_resample(X_train, y_train)

#rus = RandomUnderSampler(random_state=42,sampling_strategy='majority')
#X_train, y_train = rus.fit_resample(X_train, y_train)

#cnn = CondensedNearestNeighbour(random_state=42) 
#X_train, y_train = cnn.fit_resample(X_train, y_train)

#cc = ClusterCentroids(random_state=42, sampling_strategy='majority')
#X_train, y_train = cc.fit_resample(X_train, y_train)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((1537341, 78), (1537341,), (484123, 78), (484123,))

In [None]:
pd.Categorical(y_train).describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
0,937341,0.609716
1,100000,0.065047
2,100000,0.065047
3,100000,0.065047
4,100000,0.065047
5,100000,0.065047
6,100000,0.065047


In [None]:
pd.Categorical(y_test).describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
0,401718,0.829785
1,8586,0.017735
2,11428,0.023606
3,37918,0.078323
4,19629,0.040545
5,4816,0.009948
6,28,5.8e-05


### MultilayerPerceptron

In [None]:
def criarModeloMLP():
  """Return a new, unfitted MLPClassifier with the notebook's fixed hyper-parameters."""
  parametros = dict(
      hidden_layer_sizes=(78, 78),
      max_iter=300,
      activation='relu',
      solver='adam',
      random_state=1,
      verbose=50,
  )
  return MLPClassifier(**parametros)

In [None]:
mlpClassifier = criarModeloMLP().fit(X_train, y_train)

### One Vs All com MLP

In [None]:
ova = OneVsRestClassifier(criarModeloMLP()).fit(X_train, y_train)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)


### ExtraTree

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification

clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

## Pós processamento

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

### Métricas

In [None]:
nome_classes = ['Benign','Bot','Brute Force','DDOS','DoS','Infilteration','Web Attack']

def _razao_segura(numerador, denominador):
  """Return numerador / denominador, or 0.0 when denominador is zero."""
  return numerador / denominador if denominador else 0.0


def salvar_informacoes(clf):
  """Evaluate a fitted classifier on the notebook's test split and return a text report.

  Reads the module-level X_test, y_test and nome_classes. The report holds the
  overall accuracy, balanced accuracy and weighted precision, then one section
  per class with its one-vs-rest confusion matrix and the derived accuracy,
  loss (1 - accuracy), precision, recall, TNR and F1, and finally a compact
  "Results" recap with one line per class.

  A metric whose denominator is zero (for example the precision of a class
  that is never predicted) is reported as 0.0 instead of nan.

  Args:
    clf: fitted estimator exposing predict().

  Returns:
    str: the formatted report.
  """
  y_pred = clf.predict(X_test)
  # One 2x2 matrix per class, indexed below as [[tn, fp], [fn, tp]].
  matriz = multilabel_confusion_matrix(y_test, y_pred)
  acc = accuracy_score(y_test, y_pred)
  acc_balanced = balanced_accuracy_score(y_test, y_pred)
  # Kept under its own name so the per-class precision below does not shadow it.
  precision_ponderada = precision_score(y_test, y_pred, average='weighted')

  texto = '=====================================================================\n\n'
  texto += f'Acurácia: {acc}\n'
  texto += f'Acurácia Balanceada: {acc_balanced}\n'
  texto += f'Precision : {precision_ponderada}\n'
  texto += '\n\n'
  texto += 'RESULTADOS POR CLASSE\n\n'

  # Recap block appended after all per-class sections.
  resumo = 'Results: \n '
  for j, nome in enumerate(nome_classes):
    texto += '\n\n'
    texto += f'Classe {j}: {nome}\n'

    # Split the matrix of class j into tn, fp, fn, tp.
    tn, fp = matriz[j][0]
    fn, tp = matriz[j][1]

    # Confusion matrix of the class.
    texto += '\n\n-- N --|-- P --\n'
    texto += f'N| {tn} | {fp} |\n'
    texto += '-----------------------\n'
    texto += f'P| {fn} | {tp} |\n'
    texto += '\n\n'

    # Metrics derived from tp, tn, fn, fp; zero denominators yield 0.0.
    acc_classe = _razao_segura(tn + tp, tn + tp + fn + fp)
    precision = _razao_segura(tp, tp + fp)
    recall = _razao_segura(tp, tp + fn)
    tnr = _razao_segura(tn, tn + fp)
    f1 = 2 * _razao_segura(precision * recall, precision + recall)

    # Same metrics line goes into the class section and into the recap.
    linha = f'Classe {nome}:  {acc_classe},   {1-acc_classe},   {precision},   {recall},     {tnr},  {f1}\n'
    texto += '             acc,                     loss,                   precision,            recall,                TNR,              f1-score\n'
    texto += linha
    resumo += linha

  texto += '\n\n\n\n'
  texto += resumo

  return texto

In [None]:
ova_info = salvar_informacoes(ova)

In [None]:
# NOTE(review): `ovo` is not assigned anywhere in the visible notebook
# (OneVsOneClassifier is imported but never fitted), so this cell fails with
# NameError on a fresh kernel -- TODO add the training cell or remove this one.
ovo_info = salvar_informacoes(ovo)

In [None]:
# NOTE(review): `evot` is not assigned anywhere in the visible notebook, so
# this cell fails with NameError on a fresh kernel -- TODO add the training
# cell or remove this one.
evot_info = salvar_informacoes(evot)

In [None]:
# NOTE(review): `evot3` is not assigned anywhere in the visible notebook, so
# this cell fails with NameError on a fresh kernel -- TODO add the training
# cell or remove this one.
evot3_info = salvar_informacoes(evot3)

In [None]:
mlp_info = salvar_informacoes(mlpClassifier)

In [None]:
# NOTE(review): `clf` is assigned by the KNN cell and then rebound by the
# ExtraTree cell, so on a top-to-bottom run this evaluates the ExtraTrees
# model, not KNN -- TODO give each model its own variable name.
knn_info = salvar_informacoes(clf)

In [None]:
# NOTE(review): `dt` is not assigned anywhere in the visible notebook, so this
# cell fails with NameError on a fresh kernel -- TODO add the training cell or
# remove this one.
dt_info = salvar_informacoes(dt)

In [None]:
# NOTE(review): no random-forest model is trained in the visible notebook;
# `clf` here is whatever was bound last (the ExtraTrees model on a
# top-to-bottom run) -- TODO train the model under its own name.
rf_info = salvar_informacoes(clf)

In [None]:
extraTree_info = salvar_informacoes(clf)

In [None]:
# NOTE(review): `gnb` is not assigned anywhere in the visible notebook, so
# this cell fails with NameError on a fresh kernel -- TODO add the training
# cell or remove this one.
NB_info = salvar_informacoes(gnb)

In [None]:
def salvar(texto, nome, caminho):
  """Write `texto` to the file caminho + nome, replacing any existing content.

  The file is opened in a `with` block so it is flushed and closed even if the
  write fails. It is written as UTF-8 explicitly, since the reports contain
  accented characters.

  Args:
    texto: string to write.
    nome: file name, appended to `caminho` as-is.
    caminho: directory prefix; it is concatenated, so it must end with a separator.
  """
  with open(caminho + nome, 'w', encoding='utf-8') as arquivo:
    arquivo.write(texto)

In [None]:
salvar(ova_info,'ovaInfo78_78_relu_balanced_SMOTE_especific_100.txt', CONJUNTO_10)

In [None]:
salvar(ovo_info,'ovoInfo78_78_relu.txt', CONJUNTO_10)

In [None]:
salvar(evot3_info,'evot3Info_mlp78_78_relu_knn_rf.txt', CONJUNTO_10)

In [None]:
salvar(evot_info,'evot_soft_Info78_78_relu.txt', CONJUNTO_10)

In [None]:
salvar(mlp_info,'mlpClassifierInfo_78_78_relu_normalizado2.txt', CONJUNTO_10)

In [None]:
salvar(knn_info,'KNNInfo.txt', CONJUNTO_10)

In [None]:
salvar(dt_info,'DTInfo.txt', CONJUNTO_10)

In [None]:
salvar(rf_info,'RFInfo.txt', CONJUNTO_10)

In [None]:
salvar(extraTree_info,'ExtraTreeInfo.txt', CONJUNTO_10)

In [None]:
salvar(NB_info,'NaiveBayes_Info.txt', CONJUNTO_10)