<a href="https://colab.research.google.com/github/krugerleo/CDadosSeg/blob/master/Final/datascience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introdução
## Autores: [Leonardo Krüger](https://github.com/krugerleo/CDadosSeg), [Lucas Block](https://github.com/lucasvillatore/CDadosSeg), [João Picolo](https://github.com/JoaoPicolo/CDadosSeg)
### Algoritmos utilizados
- K-nearest neighbors
- Random Forests
- Multilayer perceptron

### Dataset Utilizado e informações
##### Foi utilizado um dataset relacionado a ataques de negação de serviço (DDoS). Esse dataset está disponível neste link: [ddos-dataset](https://www.kaggle.com/devendra416/ddos-datasets)
##### O dataset foi gerado e rotulado automaticamente utilizando o [CICFlow](https://www.unb.ca/cic/research/applications.html#CICFlowMeter); desta forma, é possível extrair estatísticas sobre características do tráfego de rede.


### Campos utilizados
- timestamp
- fwd seg size min
- source ip
- dst ip
- flow iat min
- source port
- tot fwd pkts
- init bwd win bytes


# Instalação

> Para execução correta:

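1.   Monte o drive com a célula abaixo.
2.   O **final_dataset.csv** (nome deve ser igual) precisa estar na pasta raiz do drive.
3.   Na primeira execução ainda não existe dataset montado, então a resposta para essa opção deve ser "não" (`n`).
4.   Para futuras execuções, é recomendado salvar os fragmentos do dataset que serão gerados, para poder utilizar a opção de dataset montado.
5.   Antes de executar a célula de seleção do dataset, execute as células de imports, de variáveis globais e de "Separação e montagem de dataframe", pois ela utiliza funções definidas nessas células.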

In [None]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive under /content/drive; force_remount=True requests a fresh mount
# even when Drive is already attached.
drive.mount('/content/drive', force_remount=True)

In [None]:
# NOTE: loadDatasets, mountDatasets and saveDatasets are defined in a later
# cell of this notebook; that cell has to be executed before this one.
use_mounted = input("Use mounted dataset? [y/n] ")

if use_mounted.lower() != 'y':
  # Build the fragments from the full CSV and optionally persist them.
  phase_one, phase_two = mountDatasets()

  save_df = input("Save created datasets? [y/n]")
  if save_df.lower() == 'y':
    saveDatasets(phase_one, phase_two)
else:
  # Reuse the fragments saved by a previous run.
  phase_one, phase_two = loadDatasets()




# Imports e definição de variáveis


In [None]:
import sys

import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt; plt.rcdefaults()
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, plot_roc_curve
from sklearn.utils import shuffle
from sklearn import preprocessing


In [None]:
# Global variables

# Base directory (inside the mounted Drive) where the CSV files live.
path = '/content/drive/MyDrive/'

# Columns requested from the CSV files (passed as `usecols` to pd.read_csv).
# NOTE(review): the introduction lists "init bwd win bytes" among the fields,
# but this list contains 'Flow Duration' instead; confirm which is intended.
columns_df = [
    'Timestamp',
    'Fwd Seg Size Min',
    "Src IP",
    "Dst IP",
    'Flow IAT Min',
    'Src Port',
    'Tot Fwd Pkts',
    'Flow Duration',
    'Label',
]

# Separação e montagem de dataframe

In [None]:
def loadDatasets():
  """Read the previously saved dataset fragments from ``path``.

  Three CSV files are read, in this order: the phase-one train set, the
  phase-one test set and the phase-two test set. Only the columns listed in
  ``columns_df`` are loaded.

  Returns:
    A ``(phase_one, phase_two)`` tuple of dicts. ``phase_one`` maps
    ``'train'`` and ``'test'`` to DataFrames; ``phase_two`` maps ``'test'``
    to a DataFrame.
  """
  print("Loading datasets...")

  def read_fragment(file_name):
    # Every fragment is read the same way, from the shared base directory.
    return pd.read_csv(path + file_name, usecols=columns_df)

  phase_one = {
      'train': read_fragment("train_dataset_phase_one.csv"),
      'test': read_fragment("test_dataset_phase_one.csv"),
  }
  phase_two = {
      'test': read_fragment("test_dataset_phase_two.csv"),
  }

  print("Datasets recovered")

  return phase_one, phase_two


def _splitClassFrame(class_df):
  """Split the rows of one class into phase-one train/test and phase-two test.

  Only the 20% side of the first split is kept as the working sample. That
  sample is divided 80/20 into the phase-one pool and the phase-two test
  set, and the pool is divided 80/20 again into train and test.

  Args:
    class_df: DataFrame holding the rows of a single label.

  Returns:
    A ``(train, test, phase_two_test)`` tuple of DataFrames.
  """
  # The 80% side of this first split is intentionally discarded.
  _, sample_df = train_test_split(class_df, test_size=0.2)
  pool_df, phase_two_df = train_test_split(sample_df, test_size=0.2)
  train_df, test_df = train_test_split(pool_df, test_size=0.2)

  return train_df, test_df, phase_two_df


def mountDatasets():
  """Build the phase-one and phase-two datasets from ``final_dataset.csv``.

  The CSV is read from ``path`` (only the columns in ``columns_df``),
  shuffled and separated by ``Label`` into 'ddos' and 'Benign' rows. Each
  class is split independently, so both classes contribute the same
  fractions: roughly 12.8% of a class goes to phase-one train, 3.2% to
  phase-one test and 4% to phase-two test. The remaining 80% is not used.

  No random seed is passed to the shuffle or to the splits, so the
  partition is not reproducible across runs.

  Returns:
    A ``(phase_one, phase_two)`` tuple of dicts. ``phase_one`` maps
    ``'train'`` and ``'test'`` to DataFrames; ``phase_two`` maps ``'test'``
    to a DataFrame.
  """
  print("Creating dataset...")

  data_frame = pd.read_csv(path + "final_dataset.csv", usecols=columns_df)

  # Separate df by type
  data_frame = shuffle(data_frame)
  ddos_df = data_frame.loc[data_frame['Label'] == 'ddos']
  benign_df = data_frame.loc[data_frame['Label'] == 'Benign']

  # Split each class on its own (ddos first, then Benign).
  ddos_train, ddos_test, ddos_phase_two = _splitClassFrame(ddos_df)
  benign_train, benign_test, benign_phase_two = _splitClassFrame(benign_df)

  # Recombine the per-class parts into the final frames.
  phase_one = {
      'train': pd.concat([ddos_train, benign_train]),
      'test': pd.concat([ddos_test, benign_test]),
  }
  phase_two = {
      'test': pd.concat([ddos_phase_two, benign_phase_two]),
  }
  print("Dataset created")

  return phase_one, phase_two


def saveDatasets(phase_one, phase_two):
  """Write the dataset fragments as CSV files under ``path``.

  Args:
    phase_one: dict with ``'train'`` and ``'test'`` DataFrames.
    phase_two: dict with a ``'test'`` DataFrame.

  The files are written without the index column, in this order:
  phase-one train, phase-one test, phase-two test.
  """
  print("Saving datasets...")

  targets = (
      (phase_one, 'train', "train_dataset_phase_one.csv"),
      (phase_one, 'test', "test_dataset_phase_one.csv"),
      (phase_two, 'test', "test_dataset_phase_two.csv"),
  )
  for phase, key, file_name in targets:
    # The frame is looked up only when its turn comes, one file at a time.
    phase[key].to_csv(path + file_name, index=False)

  print("Datasets saved")

# Dataframes

**train_df_one:** Porcentagem para treino -> 80% de x% do dataset original  
**test_df_one:** Porcentagem para teste -> 20% de x% do dataset original  
**test_df_two:** Porcentagem para teste na segunda fase -> 20% de x% do dataset original  

In [None]:
# Pull the individual frames out of the phase dictionaries.
train_df_one, test_df_one = phase_one['train'], phase_one['test']
test_df_two = phase_two['test']

# Distribuição dataframes
  Distribuição dos dataframes de teste e treino pelo label ["ddos", "Benign"]

## 1 Treino

In [None]:
# Count the phase-one training rows per label and draw them as a bar chart.
ddos_df = train_df_one[train_df_one['Label'] == 'ddos']
benign_df = train_df_one[train_df_one['Label'] == 'Benign']

objects = ("Ataque DDoS", "Benigno")
distribution = [len(ddos_df), len(benign_df)]
y_pos = np.arange(len(objects))  # one bar position per class

plt.bar(y_pos, distribution, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.title('Distribuição de amostras no treino')
plt.ylabel('Quantidade')
plt.show()

## 2 Teste

In [None]:
# Count the phase-one test rows per label and draw them as a bar chart.
ddos_df = test_df_one[test_df_one['Label'] == 'ddos']
benign_df = test_df_one[test_df_one['Label'] == 'Benign']

objects = ("Ataque DDoS", "Benigno")
distribution = [len(ddos_df), len(benign_df)]
y_pos = np.arange(len(objects))  # one bar position per class

plt.bar(y_pos, distribution, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.title('Distribuição de amostras no teste')
plt.ylabel('Quantidade')
plt.show()

#  Algoritmos
 Utilizados com validação cruzada (k-fold, k = 5)
1.   K-nearest neighbors
2.   Random Forests
3.   Multilayer perceptron




## 1.1 KNeighbors

In [None]:
print("Executing KNN classification")

# Encode train and test together so that every value (IP, port, timestamp,
# label, ...) maps to the same integer code in both sets. Fitting a separate
# LabelEncoder on each frame would give unrelated codes to the same value.
n_train = len(train_df_one)
combined = pd.concat([train_df_one, test_df_one], ignore_index=True)
encoded = combined.apply(LabelEncoder().fit_transform)

# Rows were stacked train-first, so a positional slice recovers each set.
fit_train = encoded.iloc[:n_train]
fit_test = encoded.iloc[n_train:]

n_columns = len(fit_train.columns)

# Every column except the last is a feature; the last column is taken as the
# label (assumes 'Label' is the last column of the frames).
x_train = fit_train.iloc[:, :-1].values
y_train = fit_train.iloc[:, n_columns-1].values

x_test = fit_test.iloc[:, :-1].values
y_test = fit_test.iloc[:, n_columns-1].values

# Executes algorithm
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(classification_report(y_test, y_pred))
plot_roc_curve(classifier, x_test, y_test)

## 1.2 KNeighbors cross validation

In [None]:
print("Cross validating KNNN")

# Integer-encode every column of the phase-one training frame.
fit_train = train_df_one.apply(LabelEncoder().fit_transform)

n_columns = len(fit_train.columns)
x = fit_train.iloc[:, :n_columns-1].to_numpy()  # all columns but the last
y = fit_train.iloc[:, -1].to_numpy()            # last column

# 5 stratified folds over the training data only.
kf = StratifiedKFold(n_splits=5)

for train_index, test_index in kf.split(x, y):
  x_train, y_train = x[train_index], y[train_index]
  x_test, y_test = x[test_index], y[test_index]

  # A fresh classifier is trained for every fold.
  classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
  y_pred = classifier.predict(x_test)

  # Same three reports, printed in the same order for each fold.
  for report in (confusion_matrix, mean_absolute_error, classification_report):
    print(report(y_test, y_pred))
  plot_roc_curve(classifier, x_test, y_test)

## 2.1 Random forests

In [None]:
print("Executing Random Forest classification")

# Encode train and test together so that every value (IP, port, timestamp,
# label, ...) maps to the same integer code in both sets. Fitting a separate
# LabelEncoder on each frame would give unrelated codes to the same value.
n_train = len(train_df_one)
combined = pd.concat([train_df_one, test_df_one], ignore_index=True)
encoded = combined.apply(LabelEncoder().fit_transform)

# Rows were stacked train-first, so a positional slice recovers each set.
fit_train = encoded.iloc[:n_train]
fit_test = encoded.iloc[n_train:]

n_columns = len(fit_train.columns)

# Every column except the last is a feature; the last column is taken as the
# label (assumes 'Label' is the last column of the frames).
x_train = fit_train.iloc[:, :-1].values
y_train = fit_train.iloc[:, n_columns-1].values

x_test = fit_test.iloc[:, :-1].values
y_test = fit_test.iloc[:, n_columns-1].values

# Executes algorithm
classifier = RandomForestClassifier(n_estimators=50)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(classification_report(y_test, y_pred))
plot_roc_curve(classifier, x_test, y_test)

## 2.2 Random forests cross validation

In [None]:
print("Cross validating Random Forest")

# Integer-encode every column of the phase-one training frame.
fit_train = train_df_one.apply(LabelEncoder().fit_transform)

n_columns = len(fit_train.columns)
x = fit_train.iloc[:, :n_columns-1].to_numpy()  # all columns but the last
y = fit_train.iloc[:, -1].to_numpy()            # last column

# 5 stratified folds over the training data only.
kf = StratifiedKFold(n_splits=5)

idx = 0
for idx, (train_index, test_index) in enumerate(kf.split(x, y), start=1):
  x_train, y_train = x[train_index], y[train_index]
  x_test, y_test = x[test_index], y[test_index]
  print(f"Fold {idx}\n")

  # A fresh forest is trained for every fold.
  classifier = RandomForestClassifier(n_estimators=50).fit(x_train, y_train)
  y_pred = classifier.predict(x_test)

  # Same three reports, printed in the same order for each fold.
  for report in (confusion_matrix, mean_absolute_error, classification_report):
    print(report(y_test, y_pred))
  plot_roc_curve(classifier, x_test, y_test)

## 3.1 Multilayer perceptron

In [None]:
print("Executing MLP classification")

# Encode train and test together so that every value (IP, port, timestamp,
# label, ...) maps to the same integer code in both sets. Fitting a separate
# LabelEncoder on each frame would give unrelated codes to the same value.
n_train = len(train_df_one)
combined = pd.concat([train_df_one, test_df_one], ignore_index=True)
encoded = combined.apply(LabelEncoder().fit_transform)

# Rows were stacked train-first, so a positional slice recovers each set.
fit_train = encoded.iloc[:n_train]
fit_test = encoded.iloc[n_train:]

n_columns = len(fit_train.columns)

# Every column except the last is a feature; the last column is taken as the
# label (assumes 'Label' is the last column of the frames).
x_train = fit_train.iloc[:, :-1].values
y_train = fit_train.iloc[:, n_columns-1].values

x_test = fit_test.iloc[:, :-1].values
y_test = fit_test.iloc[:, n_columns-1].values

# Executes algorithm
classifier = MLPClassifier(hidden_layer_sizes=6)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(classification_report(y_test, y_pred))
plot_roc_curve(classifier, x_test, y_test)


## 3.2 Multilayer perceptron cross validation

In [None]:
print("Cross validating MLP")

# Integer-encode every column of the phase-one training frame.
fit_train = train_df_one.apply(LabelEncoder().fit_transform)

n_columns = len(fit_train.columns)
x = fit_train.iloc[:, :n_columns-1].to_numpy()  # all columns but the last
y = fit_train.iloc[:, -1].to_numpy()            # last column

# 5 stratified folds over the training data only.
kf = StratifiedKFold(n_splits=5)

idx = 0
for idx, (train_index, test_index) in enumerate(kf.split(x, y), start=1):
  x_train, y_train = x[train_index], y[train_index]
  x_test, y_test = x[test_index], y[test_index]
  print(f"Fold {idx}\n")

  # A fresh network is trained for every fold.
  classifier = MLPClassifier(hidden_layer_sizes=6).fit(x_train, y_train)
  y_pred = classifier.predict(x_test)

  # Same three reports, printed in the same order for each fold.
  for report in (confusion_matrix, mean_absolute_error, classification_report):
    print(report(y_test, y_pred))
  plot_roc_curve(classifier, x_test, y_test)