<a href="https://colab.research.google.com/github/ks-abel/FLearning-Cybersecurity/blob/master/FLearning_Cybersecurity_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet --upgrade tensorflow_federated

In [None]:
!pip install --quiet --upgrade nest_asyncio

In [None]:
# Author: Komlan Sessofia

# nest_asyncio is applied before TensorFlow / TFF are imported.
# NOTE(review): presumably required so TFF can run inside the notebook's
# already-running event loop -- confirm.
import nest_asyncio
nest_asyncio.apply()
import collections
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# NOTE(review): duplicate of the `import collections` above (harmless).
import collections

import tensorflow as tf
import datetime
# Debug aid (disabled): log the device each op is placed on.
#tf.debugging.set_log_device_placement(True)
tf.compat.v1.enable_v2_behavior()
import tensorflow_federated as tff
# NOTE(review): the two imports below use private `tensorflow.python.*` paths and
# neither `Dataset` nor `regularizers` is referenced in the visible notebook.
from tensorflow.python.data import Dataset
from tensorflow.python.keras import regularizers
from sklearn.metrics import classification_report

%load_ext tensorboard

In [None]:
# Columns treated as relevant model inputs.
# NOTE(review): `feat` is not referenced anywhere else in the visible notebook.
feat = [
    'sourceType',
    'sourceAddress',
    'destinationServiceAddress',
    'destinationServiceType',
    'accessedNodeType',
    'operation',
    'value',
]
# Columns treated as irrelevant; they are dropped in the cleaning cell.
Nfeat = [
    'sourceID',
    'sourceLocation',
    'destinationLocation',
    'accessedNodeAddress',
    'timestamp',
]
# Load the DS2OS traffic traces straight from the remote CSV file.
CSVFile = "https://flearning-cybersecurity.komlansessofia.com/resources/DataSet/mainSimulationAccessTraces.csv"
DS2OS_traffic_traces_DataFrame = pd.read_csv(CSVFile, sep=",")

In [None]:
le = LabelEncoder()
sc = StandardScaler()
enc = OneHotEncoder()

BATCH_SIZE = 40
REPEAT_NUM = 20
SHUFFLE_BUFFER = 79
PREFETCH_BUFFER = 40
CLIENTS = 500

!rm -R /tmp/logs/*
train_logdir = "/tmp/logs/scalars/training/"
train_summary_writer = tf.summary.create_file_writer(train_logdir)

class_names = DS2OS_traffic_traces_DataFrame['normality'].unique()
federated_dataset_spec = ''

rm: cannot remove '/tmp/logs/*': No such file or directory


In [None]:
## Fonctions pour le prétraitement des données

def missing_values_table(df):
    """Summarise the missing values of `df`, column by column.

    Prints the number of columns of `df` and how many of them contain missing
    values, then returns a DataFrame indexed by column name with two columns:
    'Missing Values' (count) and '% of Total Values' (percentage of rows,
    rounded to 2 decimals). Only columns whose percentage differs from zero are
    kept, sorted by decreasing percentage.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    # Side-by-side table: column 0 holds the counts, column 1 the percentages.
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the columns that actually miss values, worst first.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(2)

    print("Your slelected dataframe has {} columns.".format(df.shape[1]) + '\n' +
          "There are {} columns that have missing values.".format(summary.shape[0]))
    return summary

# Turn the categorical feature columns into integer codes
def preprocess_features(DataFrame):
  """Return the feature columns of `DataFrame` with the categoricals label-encoded.

  The 'normality' column is dropped and each of the six categorical columns is
  replaced by the output of the shared encoder `le`, re-fitted on that column.
  Every other column is returned unchanged. The input frame is not modified.
  """
  # Encoded in this order; `le` stays fitted on the last column afterwards.
  categorical_columns = (
      "sourceAddress",
      "sourceType",
      "destinationServiceAddress",
      "destinationServiceType",
      "accessedNodeType",
      "operation",
  )
  raw_features = DataFrame.drop(columns=['normality'])
  processed_features = raw_features.copy()
  for column in categorical_columns:
    processed_features[column] = le.fit_transform(raw_features[column])
  return processed_features

# Feature scaling
def scaling_features(fts):
  """Fit the shared scaler `sc` on `fts` and return the scaled values.

  The result is a new DataFrame built from the scaler output, so it carries
  default integer column labels and a default index rather than those of `fts`.
  """
  return pd.DataFrame(sc.fit_transform(fts))

# Turn the target labels into integer codes
def preprocess_targets(DataFrame):
  """Return a one-column DataFrame 'normality' holding the encoded labels.

  The shared encoder `le` is re-fitted on the 'normality' column of `DataFrame`.
  """
  encoded_labels = le.fit_transform(DataFrame['normality'])
  return pd.DataFrame({"normality": encoded_labels})


# Turn every row into its own tensor
def numpy_line_to_tensor(numpy_lines):
  """Return a list holding one `tf.constant` per element of `numpy_lines`."""
  return [tf.constant(row) for row in numpy_lines]


# Input pipeline applied to every (features, labels) dataset
def preprocess(dataset):
  """Repeat, shuffle, batch and reformat `dataset` for training.

  The dataset is repeated REPEAT_NUM times, shuffled with a buffer of
  SHUFFLE_BUFFER elements and grouped into batches of BATCH_SIZE (incomplete
  batches are dropped). Each batch is then mapped to an OrderedDict with
  float32 entries `x` (reshaped to [-1, 7]) and `y` (reshaped to [-1, 1]).
  """
  autotune = tf.data.experimental.AUTOTUNE

  def to_model_input(features, labels):
    batch = collections.OrderedDict()
    batch['x'] = tf.cast(tf.reshape(features, [-1, 7]), tf.float32)
    batch['y'] = tf.cast(tf.reshape(labels, [-1, 1]), tf.float32)
    return batch

  repeated = dataset.repeat(REPEAT_NUM)
  shuffled = repeated.shuffle(SHUFFLE_BUFFER)
  batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
  formatted = batched.map(to_model_input, num_parallel_calls=autotune)
  return formatted.prefetch(autotune)


# Build one dataset per client from consecutive, equally sized slices of the data
def generate_clients_datasets(n, source_x, source_y):
  """Split (source_x, source_y) into `n` client datasets.

  Each client receives `len(source_x) // n` consecutive samples, taken in the
  order of the inputs (no shuffling happens here); the trailing samples that do
  not fill a whole slice are left out. Every slice is wrapped in a
  `tf.data.Dataset` and passed through `preprocess`.

  Args:
    n: number of clients, a positive integer not larger than `len(source_x)`.
    source_x: sequence of per-sample features.
    source_y: sequence of per-sample labels, same length as `source_x`.

  Returns:
    A list of `n` preprocessed datasets.

  Raises:
    ValueError: if `n` is not positive, if the two sequences differ in length,
      or if there are fewer samples than clients.
  """
  if n <= 0:
    raise ValueError("n must be a positive number of clients, got {}".format(n))
  if len(source_x) != len(source_y):
    raise ValueError("source_x and source_y must have the same length, got {} and {}".format(
        len(source_x), len(source_y)))
  size = len(source_x) // n
  if size == 0:
    # Without this guard the slicing range below would be built with a zero step.
    raise ValueError("cannot split {} samples across {} clients".format(len(source_x), n))

  clients_dataset = []
  for start in range(0, size * n, size):
    stop = start + size
    dataset = tf.data.Dataset.from_tensor_slices((source_x[start:stop], source_y[start:stop]))
    clients_dataset.append(preprocess(dataset))
  return clients_dataset

# Build a single preprocessed dataset from features and targets
def _preprocess_dataset(features, targets):
  """Wrap (features, targets) in a `tf.data.Dataset` and run it through `preprocess`."""
  return preprocess(tf.data.Dataset.from_tensor_slices((features, targets)))

# Keras model shared by the centralized and the federated experiments
def create_keras_model():
  """Return a new, uncompiled Sequential classifier.

  Layout: 7 inputs -> Dense(7, relu) -> Dense(5, relu) -> Dense(8, softmax).
  """
  layers = tf.keras.layers
  network = [
      layers.InputLayer(input_shape=(7,)),
      layers.Dense(7, activation='relu'),
      layers.Dense(5, activation='relu'),
      layers.Dense(8, activation='softmax'),
  ]
  return tf.keras.models.Sequential(network)


# Wrap a new Keras model for TFF
def model_fn():
  """Build a new Keras model and wrap it with `tff.learning.from_keras_model`.

  The input spec is read from the module-level `federated_dataset_spec` at call
  time. The model is wrapped with a sparse categorical cross-entropy loss and a
  sparse categorical accuracy metric.
  """
  keras_model = create_keras_model()
  loss = tf.keras.losses.SparseCategoricalCrossentropy()
  metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
  return tff.learning.from_keras_model(
      keras_model,
      input_spec=federated_dataset_spec,
      loss=loss,
      metrics=metrics)
  

# Flatten a metrics mapping into [short_name, value] pairs
def display_metrics(metrics):
  """Return the metrics as a list of `[short_name, value]` pairs.

  When `metrics` has a 'train' entry, the pairs come from that nested mapping;
  otherwise they come from `metrics` itself. The short name is the part of the
  metric name after its last underscore.
  """
  source = metrics['train'] if 'train' in metrics else metrics
  return [[name.split("_")[-1], value] for name, value in source.items()]


# Federated evaluation (test)
def evaluation_process(model_fn, state, federated_test_dataset):
  """Evaluate `state.model` on `federated_test_dataset` and return the metrics.

  The evaluation computation is built from `model_fn` with
  `tff.learning.build_federated_evaluation` on every call.
  """
  federated_eval = tff.learning.build_federated_evaluation(model_fn)
  return federated_eval(state.model, federated_test_dataset)


# Federated training rounds (validation is currently disabled)
def training_process(_state, federated_train_dataset, epoch=1):
  """Run `epoch` federated rounds starting from `_state`.

  Each round calls `iterative_process.next` (the module-level process) on
  `federated_train_dataset`, writes every training metric to
  `train_summary_writer` with the round number as step, and prints the elapsed
  time and the sorted metrics.

  Args:
    _state: server state to start from.
    federated_train_dataset: the client datasets used in every round.
    epoch: number of rounds to run.

  Returns:
    The server state after the last round. The module-level `state` is also
    rebound to it, which is what existing callers rely on.
  """
  import time  # imported once here instead of once per round

  global state
  state = _state
  for epoch_num in range(1, epoch + 1):
    start = time.perf_counter()
    state, train_metrics = iterative_process.next(state, federated_train_dataset)
    elapsed = time.perf_counter() - start

    tm = display_metrics(train_metrics)
    with train_summary_writer.as_default():
      for metric_name, metric_value in tm:
        tf.summary.scalar(metric_name, metric_value, step=epoch_num)

    tm.sort()
    print('Elapsed %.3f s/tour' % elapsed)
    print('Tour {:2d}/{:2d}, train= {}'.format(epoch_num, epoch, str(tm)))
  return state


# Federated averaging
def build_federated_averaging_process(model_fn, local_learning_rate, server_learning_rate=1.0):
  """Build a federated averaging process with SGD on both clients and server.

  Args:
    model_fn: no-argument callable returning the TFF model.
    local_learning_rate: learning rate of the SGD optimizer created for the clients.
    server_learning_rate: learning rate of the SGD optimizer created for the server.

  Returns:
    The object returned by `tff.learning.build_federated_averaging_process`.
  """
  def make_client_optimizer():
    # Computes the local model updates on each client.
    return tf.keras.optimizers.SGD(learning_rate=local_learning_rate)

  def make_server_optimizer():
    # Applies the averaged update to the global model on the server.
    return tf.keras.optimizers.SGD(learning_rate=server_learning_rate)

  return tff.learning.build_federated_averaging_process(
      model_fn,
      client_optimizer_fn=make_client_optimizer,
      server_optimizer_fn=make_server_optimizer)


In [None]:
# Cleaning and missing-value handling, expressed as one chain on a copy of the raw frame:
#  - missing 'accessedNodeType' becomes '/malicious', missing 'value' becomes 0;
#  - the textual values 'false'/'true'/'twenty'/'none' become '0'/'1'/'20'/'0';
#  - text matching the regex 'org.+' in 'value' is replaced by '1';
#  - the columns listed in Nfeat are dropped.
DS2OS = (
    DS2OS_traffic_traces_DataFrame
    .assign(
        accessedNodeType=lambda frame: frame['accessedNodeType'].fillna('/malicious'),
        value=lambda frame: frame['value'].fillna(0),
    )
    .replace({'value': {'false': '0', 'true': '1', 'twenty': '20', 'none': '0'}})
    .replace({'value': {'org.+': '1'}}, regex=True)
    .drop(Nfeat, axis=1)
)

In [None]:
# Encode the categorical columns as integers, then scale every feature column.
fts = preprocess_features(DS2OS)
# NOTE(review): the scaler is fitted on the whole dataset here, i.e. before the
# train/test split done in a later cell, so test rows influence the scaling.
features = scaling_features(fts)
# Encode the 'normality' labels as integer class ids.
targets = preprocess_targets(DS2OS)

In [None]:
# Random, stratified 70/30 split into training and test subsets
x_train, x_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.3, stratify=targets, random_state=42)
# NumPy arrays of each subset (used by the centralized baseline and the reports)
DS2OS_train_features, DS2OS_train_labels = x_train.to_numpy(), y_train.to_numpy()
DS2OS_test_features, DS2OS_test_labels = x_test.to_numpy(), y_test.to_numpy()
# One tensor per row of each subset
_DS2OS_train_features = numpy_line_to_tensor(DS2OS_train_features)
_DS2OS_train_labels = numpy_line_to_tensor(DS2OS_train_labels)
_DS2OS_test_features = numpy_line_to_tensor(DS2OS_test_features)
_DS2OS_test_labels = numpy_line_to_tensor(DS2OS_test_labels)
# Distribute both subsets over the CLIENTS simulated clients
federated_train_dataset = generate_clients_datasets(
    CLIENTS, _DS2OS_train_features, _DS2OS_train_labels)
federated_test_dataset = generate_clients_datasets(
    CLIENTS, _DS2OS_test_features, _DS2OS_test_labels)
# Element structure the TFF model expects as input
federated_dataset_spec = federated_train_dataset[0].element_spec

In [None]:
# Centralized baseline: train the same Keras architecture on the whole training set
_model = create_keras_model()
_model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
print('\nEntrainement')
_history = _model.fit(DS2OS_train_features, DS2OS_train_labels, epochs=30, batch_size=BATCH_SIZE)
print('\nEvaluation finale')
# NOTE(review): despite its name, `_mse` presumably holds the loss and the accuracy
# (the model is compiled with metrics=["accuracy"]), not a mean squared error -- confirm.
_mse = _model.evaluate(DS2OS_test_features, DS2OS_test_labels)
# Disabled: `plot_history` is not defined in the visible notebook.
#plot_history(_history)


Entrainement
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

Evaluation finale


In [None]:
# Confusion matrix of the centralized model on the test set:
# the predicted class is the index of the highest softmax output.
predictions = np.argmax(_model.predict(DS2OS_test_features), axis=-1)
res = tf.math.confusion_matrix(DS2OS_test_labels, predictions)
print('Confusion_matrix: ',res)

# Classification report
# NOTE(review): target_names are hard-coded; this assumes they are listed in the
# same order as the integer ids produced by the label encoder -- confirm.
print('\nClassification Report\n')
print(classification_report(DS2OS_test_labels, predictions, target_names = ["DoS","DP","MC","MO","SC","SP","WS","NL"]))

Confusion_matrix:  tf.Tensor(
[[  1151      0      0      0      0      0      0    583]
 [     0      0      0      0     58      0      0     45]
 [     0      0    248      0      0      0      0     19]
 [     0      0      0    169      0      0      0     72]
 [    10      0      5      5    346      3      0     95]
 [     0      0      0      0     25     31      0    104]
 [     0      0      0      0      0      0      0     36]
 [    57      0     57      0      8      0      0 104259]], shape=(8, 8), dtype=int32)

Classification Report

              precision    recall  f1-score   support

         DoS       0.94      0.66      0.78      1734
          DP       0.00      0.00      0.00       103
          MC       0.80      0.93      0.86       267
          MO       0.97      0.70      0.81       241
          SC       0.79      0.75      0.77       464
          SP       0.91      0.19      0.32       160
          WS       0.00      0.00      0.00        36
          NL

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Federated learning with a quarter of the clients.
# The count is derived from CLIENTS instead of a hard-coded 125, so it follows
# any change of the number of simulated clients.
n_quarter_clients = CLIENTS // 4
train_datasets = federated_train_dataset[:n_quarter_clients]
test_datasets = federated_test_dataset[:n_quarter_clients]
# Iterative process performing federated averaging over the client models
iterative_process = build_federated_averaging_process(model_fn, 0.5, 1)
# Initial server state
state = iterative_process.initialize()
# training_process rebinds the module-level `state` after every round, so `state`
# below refers to the trained server state.
training_process(state, train_datasets, epoch=30)
eval_metrics = evaluation_process(model_fn, state, test_datasets)
ev = display_metrics(eval_metrics)
print('eval = {}'.format(str(ev)))

# Confusion matrix: copy the federated weights into a plain Keras model and
# predict on the full centralized test set.
modelK = create_keras_model()
modelK.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
state.model.assign_weights_to(modelK)
predictions = np.argmax(modelK.predict(DS2OS_test_features), axis=-1)
res = tf.math.confusion_matrix(DS2OS_test_labels, predictions)
print('Confusion_matrix: ',res)

# Classification report
print('\nClassification Report\n')
print(classification_report(DS2OS_test_labels, predictions, target_names = ["DoS","DP","MC","MO","SC","SP","WS","NL"]))

In [None]:
# Federated learning with all the clients

# Iterative process performing federated averaging over the client models
iterative_process = build_federated_averaging_process(model_fn, 0.5, 1)
# Initial server state
state = iterative_process.initialize()
# training_process rebinds the module-level `state` after every round, so `state`
# below refers to the trained server state.
training_process(state, federated_train_dataset, epoch=30)
eval_metrics = evaluation_process(model_fn, state, federated_test_dataset)
ev = display_metrics(eval_metrics)
print('eval = {}'.format(str(ev)))

# Confusion matrix: copy the federated weights into a plain Keras model and
# predict on the full centralized test set.
modelK = create_keras_model()
modelK.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
state.model.assign_weights_to(modelK)
predictions = np.argmax(modelK.predict(DS2OS_test_features), axis=-1)
res = tf.math.confusion_matrix(DS2OS_test_labels, predictions)
print('Confusion_matrix: ',res)

# Classification report
# NOTE(review): target_names are hard-coded; this assumes they are listed in the
# same order as the integer ids produced by the label encoder -- confirm.
print('\nClassification Report\n')
print(classification_report(DS2OS_test_labels, predictions, target_names = ["DoS","DP","MC","MO","SC","SP","WS","NL"]))

Elapsed 538.161 s/tour
Tour  1/30, train= [['accuracy', 0.9685446], ['loss', 0.14860874]]
Elapsed 373.754 s/tour
Tour  2/30, train= [['accuracy', 0.9755994], ['loss', 0.09216008]]
Elapsed 373.362 s/tour
Tour  3/30, train= [['accuracy', 0.9806374], ['loss', 0.07244383]]
Elapsed 376.282 s/tour
Tour  4/30, train= [['accuracy', 0.9826086], ['loss', 0.06289039]]
Elapsed 381.030 s/tour
Tour  5/30, train= [['accuracy', 0.9830852], ['loss', 0.057616394]]
Elapsed 378.743 s/tour
Tour  6/30, train= [['accuracy', 0.9837786], ['loss', 0.054241817]]
Elapsed 374.332 s/tour
Tour  7/30, train= [['accuracy', 0.984598], ['loss', 0.052520342]]
Elapsed 380.767 s/tour
Tour  8/30, train= [['accuracy', 0.985208], ['loss', 0.050819308]]
Elapsed 391.191 s/tour
Tour  9/30, train= [['accuracy', 0.9856754], ['loss', 0.048677973]]
Elapsed 392.401 s/tour
Tour 10/30, train= [['accuracy', 0.9859144], ['loss', 0.047239084]]
Elapsed 416.557 s/tour
Tour 11/30, train= [['accuracy', 0.9864], ['loss', 0.044918206]]
Elapsed 

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
!pip3 install ann_visualizer
!pip install graphviz
from ann_visualizer.visualize import ann_viz;
ann_viz(_model, title="My first neural network")